# Preparation

## Libs

In [1]:

import pandas as pd
import numpy as np
from sklearn.svm import SVC

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.metrics import accuracy_score

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from sklearn.model_selection import cross_val_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from collections import Counter

device = torch.device('cpu')

In [2]:
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/roesman_raja/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/roesman_raja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/roesman_raja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/roesman_raja/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Preprocessing

In [4]:
# prompt: make me the slangremover function for english language with working slang_dict from github

slang_dict = {
    "afaik": "as far as I know",
    "brb": "be right back",
    "btw": "by the way",
    "lol": "laugh out loud",
    "omg": "oh my god",
    "ttyl": "talk to you later",
    "imo": "in my opinion",
    "idk": "I don't know",
    "rofl": "rolling on the floor laughing",
    "wtf": "what the f***",
    "lmfao": "laughing my f***ing ass off",
    "asap": "as soon as possible",
    "tbh": "to be honest",
    "diy": "do it yourself",
    "np": "no problem",
    "thx": "thanks",
    "pls": "please",
    "yolo": "you only live once",
    "gtg": "got to go",
    "btw": "by the way",
    "irl": "in real life",
    "omg": "oh my god",
    "lmk": "let me know",
    "imo": "in my opinion",
    "afaik": "as far as I know",
    "rofl": "rolling on the floor laughing",
    "lmao": "laughing my ass off",
    "tbh": "to be honest",
    "brb": "be right back",
    "idk": "I don't know",
    "ily": "I love you",
    "jk": "just kidding",
    "thx": "thanks",
    "asap": "as soon as possible",
    "nvm": "nevermind",
    "np": "no problem",
    "pls": "please",
    "ttyl": "talk to you later",
    "lol": "laughing out loud",
    "omg": "oh my god",
    "wtf": "what the f***",
    "gtg": "got to go",
    "btw": "by the way",
    "irl": "in real life",
    "lmk": "let me know",
    "imo": "in my opinion",
    "afaik": "as far as I know",
    "rofl": "rolling on the floor laughing",
    "lmao": "laughing my ass off",
    "tbh": "to be honest",
    "brb": "be right back",
    "idk": "I don't know",
    "ily": "I love you",
    "jk": "just kidding",
    "thx": "thanks",
    "asap": "as soon as possible",
    "nvm": "nevermind",
    "np": "no problem",
    "pls": "please",
    "ttyl": "talk to you later"
}

In [5]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # remove mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # remove hashtag
    text = re.sub(r'RT[\s]', '', text) # remove RT
    text = re.sub(r"http\S+", '', text) # remove link
    text = re.sub(r'[0-9]+', '', text) # remove numbers

    text = text.replace('\n', ' ') # replace new line into space
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove all punctuations
    text = text.strip(' ') # remove characters space from both left and right text
    return text

def casefoldingText(text): # Converting all the characters in a text into lower case
    text = text.lower()
    return text

import spacy


nlp = spacy.blank("en") 

def tokenizingText(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]  
    return tokens

def deSlangText(text):
    new_text = []
    for word in text.split():
        if word.lower() in slang_dict:
            new_text.append(slang_dict[word.lower()])
        else:
            new_text.append(word)
    return " ".join(new_text)

def filteringText(text):  # Remove stopwords in a text
    listStopwords = set(stopwords.words('english'))
    filtered = [w for w in text if not w.lower() in listStopwords]
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

# Lemmatizer object
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    return lemmatized_words

def stemmingText(text):  # Reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words
    stemmer = PorterStemmer()
    text = [stemmer.stem(word) for word in text]
    return text

def toSentence(list_words): # Convert list of words into sentence
    sentence = ' '.join(word for word in list_words)
    return sentence


In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."

stop_words = set(stopwords.words('english'))

word_tokens = tokenizingText(example_sent)
# converts the words in word_tokens to lower case and then checks whether 
#they are present in stop_words or not
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
#with no lower case conversion
filtered_sentence = []

for w in word_tokens:
	if w not in stop_words:
		filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)


['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


## Dataset Preparation

In [7]:

df = pd.read_csv('output_sentences.csv')
df

Unnamed: 0,sentence,sentiment
0,bad . bad .,0
1,bad .,0
2,that one word seems to pretty much sums up bey...,0
3,"if that summary isn't enough for you , how abo...",0
4,still haven't got the point ?,0
...,...,...
64715,people are shot with guns but there is little ...,1
64716,seeing black silhouettes on computer screens i...,1
64717,kurosawa is not going for and easy visual shoc...,1
64718,"of any horror filmmaker in the world , kiyoshi...",1


In [8]:
df['text_clean'] = df['sentence'].astype(str).apply(cleaningText)
df['casefolding'] = df['text_clean'].apply(casefoldingText)
df['text_deslanged'] = df['casefolding'].apply(deSlangText)
df['text_preprocessed'] = df['text_deslanged'].apply(tokenizingText)
df['text_filtered'] = df['text_preprocessed'].apply(filteringText)
df['text_stemmed'] = df['text_filtered'].apply(lemmatize_text)
df['text_classifier'] = df['text_filtered'].apply(toSentence)

In [9]:
df

Unnamed: 0,sentence,sentiment,text_clean,casefolding,text_deslanged,text_preprocessed,text_filtered,text_stemmed,text_classifier
0,bad . bad .,0,bad bad,bad bad,bad bad,"[bad, bad]","[bad, bad, bad, bad]","[bad, bad, bad, bad]",bad bad bad bad
1,bad .,0,bad,bad,bad,[bad],"[bad, bad]","[bad, bad]",bad bad
2,that one word seems to pretty much sums up bey...,0,that one word seems to pretty much sums up bey...,that one word seems to pretty much sums up bey...,that one word seems to pretty much sums up bey...,"[that, one, word, seems, to, pretty, much, sum...","[one, word, seems, pretty, much, sums, beyond,...","[one, word, seems, pretty, much, sum, beyond, ...",one word seems pretty much sums beyond valley ...
3,"if that summary isn't enough for you , how abo...",0,if that summary isnt enough for you how about...,if that summary isnt enough for you how about...,if that summary isnt enough for you how about ...,"[if, that, summary, is, nt, enough, for, you, ...","[summary, nt, enough, ta, ta, ta, summary, nt,...","[summary, nt, enough, ta, ta, ta, summary, nt,...",summary nt enough ta ta ta summary nt enough t...
4,still haven't got the point ?,0,still havent got the point,still havent got the point,still havent got the point,"[still, have, nt, got, the, point]","[still, nt, got, point, still, nt, got, point]","[still, nt, got, point, still, nt, got, point]",still nt got point still nt got point
...,...,...,...,...,...,...,...,...,...
64715,people are shot with guns but there is little ...,1,people are shot with guns but there is little ...,people are shot with guns but there is little ...,people are shot with guns but there is little ...,"[people, are, shot, with, guns, but, there, is...","[people, shot, guns, little, blood, evidence, ...","[people, shot, gun, little, blood, evidence, p...",people shot guns little blood evidence people ...
64716,seeing black silhouettes on computer screens i...,1,seeing black silhouettes on computer screens i...,seeing black silhouettes on computer screens i...,seeing black silhouettes on computer screens i...,"[seeing, black, silhouettes, on, computer, scr...","[seeing, black, silhouettes, computer, screens...","[seeing, black, silhouette, computer, screen, ...",seeing black silhouettes computer screens imme...
64717,kurosawa is not going for and easy visual shoc...,1,kurosawa is not going for and easy visual shoc...,kurosawa is not going for and easy visual shoc...,kurosawa is not going for and easy visual shoc...,"[kurosawa, is, not, going, for, and, easy, vis...","[kurosawa, going, easy, visual, shock, deeper,...","[kurosawa, going, easy, visual, shock, deeper,...",kurosawa going easy visual shock deeper metaph...
64718,"of any horror filmmaker in the world , kiyoshi...",1,of any horror filmmaker in the world kiyoshi ...,of any horror filmmaker in the world kiyoshi ...,of any horror filmmaker in the world kiyoshi k...,"[of, any, horror, filmmaker, in, the, world, k...","[horror, filmmaker, world, kiyoshi, kurosawa, ...","[horror, filmmaker, world, kiyoshi, kurosawa, ...",horror filmmaker world kiyoshi kurosawa one wa...


In [10]:
tf_idf_ngram_vectorizer = TfidfVectorizer()
X = df['text_classifier'].values
y = df['sentiment'].values

X_bi = tf_idf_ngram_vectorizer.fit_transform(X)

# X_train, X_test, y_train, y_test = train_test_split(X_bi, y, test_size = 0.4, random_state = 42)

# Experiment

## SVM

In [61]:
# Initialize the SVM model
svm_model = SVC(kernel='linear')  # You can experiment with different kernels (e.g., 'rbf', 'poly')

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)



In [64]:
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=10)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.6784243  0.6791967  0.65825393 0.67937162 0.67911409 0.67164563
 0.68658254 0.67808396 0.68503734 0.68297708]
Mean cross-validation score: 0.6778687195207502


In [65]:

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Generate the classification report
cr = classification_report(y_test, y_pred)
print("Classification Report:\n", cr)


Accuracy: 0.6812036464771323
Confusion Matrix:
 [[8521 4151]
 [4102 9114]]
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.67      0.67     12672
           1       0.69      0.69      0.69     13216

    accuracy                           0.68     25888
   macro avg       0.68      0.68      0.68     25888
weighted avg       0.68      0.68      0.68     25888



## Another LSTM

In [73]:
# Convert sparse matrix to dense and then to PyTorch tensors
X_bi_dense = X_bi.toarray()
X_tensor = torch.tensor(X_bi_dense, dtype=torch.float32).to(device)  # Move to GPU
y_tensor = torch.tensor(y, dtype=torch.float32).to(device)

In [74]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tensor.cpu(), y_tensor.cpu(), test_size=0.4, random_state=42)

In [47]:
# Create DataLoader for training and test sets
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [62]:

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # 1 layer
        c0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # 1 layer
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        out = self.sigmoid(out)
        return out



In [75]:
# Define input dimensions and create model
input_dim = X_bi_dense.shape[1]
hidden_dim = 128
output_dim = 1

model = LSTMModel(input_dim, hidden_dim, output_dim).to(device)  # Move model to GPU

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training parameters
num_epochs = 5
train_losses = []

# Train the model
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    num_batches = 0  # To calculate average loss per epoch
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move batch to GPU
        optimizer.zero_grad()
        outputs = model(X_batch.unsqueeze(1))  # Add sequence dimension
        loss = criterion(outputs.squeeze(), y_batch)
        loss.backward()
        optimizer.step()
        
        # Accumulate loss
        epoch_loss += loss.item()
        num_batches += 1

    # Calculate average loss for the epoch
    avg_loss = epoch_loss / num_batches
    train_losses.append(avg_loss)
    print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}')

# Optionally, print final training losses
print("Training Losses per Epoch:", train_losses)

Epoch [1/5], Average Loss: 0.6321
Epoch [2/5], Average Loss: 0.4535
Epoch [3/5], Average Loss: 0.3506
Epoch [4/5], Average Loss: 0.2909
Epoch [5/5], Average Loss: 0.2499
Training Losses per Epoch: [0.6321140209080949, 0.45353933119351625, 0.3505608503518034, 0.29092366034794087, 0.24985271856139676]


In [76]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()  # Set the model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Move to GPU
            outputs = model(X_batch.unsqueeze(1))  # Add sequence dimension
            preds = (outputs.squeeze() > 0.5).float()  # Convert probabilities to binary class labels
            
            all_preds.extend(preds.cpu().numpy())  # Move predictions to CPU and convert to numpy
            all_labels.extend(y_batch.cpu().numpy())  # Move labels to CPU and convert to numpy

    # Convert lists to numpy arrays for metric calculations
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    # Confusion matrix
    cm = confusion_matrix(all_labels, all_preds)

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print('Confusion Matrix:')
    print(cm)

# Call the evaluation function
evaluate_model(model, test_loader, device)


Accuracy: 0.6706
Precision: 0.6819
Recall: 0.6650
F1 Score: 0.6733
Confusion Matrix:
[[8572 4100]
 [4428 8788]]


## BERT

In [115]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="HooshvareLab/bert-fa-base-uncased-sentiment-snappfood")

In [116]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased-sentiment-snappfood")
model = AutoModelForSequenceClassification.from_pretrained("HooshvareLab/bert-fa-base-uncased-sentiment-snappfood")

In [126]:
# Define a function for sentiment analysis
def analyze_sentiment(text):
    result = pipe(text)
    return result

# Example usage
texts = [
    "I love this product! It's amazing and works perfectly.",
    "This is the worst experience I've ever had.",
    "It's okay, nothing special but not bad either.",
]

# Analyze sentiment for each text
for text in texts:
    sentiment = analyze_sentiment(text)
    print(f"Text: {text}\nSentiment: {sentiment}\n")

Text: I love this product! It's amazing and works perfectly.
Sentiment: [{'label': 'HAPPY', 'score': 0.9787024259567261}]

Text: This is the worst experience I've ever had.
Sentiment: [{'label': 'SAD', 'score': 0.9854721426963806}]

Text: It's okay, nothing special but not bad either.
Sentiment: [{'label': 'SAD', 'score': 0.8439469933509827}]



### Finetuned

In [127]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [152]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        # Resetting indices to avoid KeyErrors
        self.texts = texts.reset_index(drop=True).tolist()
        self.labels = labels.reset_index(drop=True).tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [153]:
# Example data: replace with your actual data
texts = df['text_classifier']
labels = df['sentiment']
# Split the data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [154]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased-sentiment-snappfood")
model = AutoModelForSequenceClassification.from_pretrained("HooshvareLab/bert-fa-base-uncased-sentiment-snappfood")

# Create DataLoader
max_len = 128
train_dataset = SentimentDataset(X_train, y_train, tokenizer, max_len)
test_dataset = SentimentDataset(X_test, y_test, tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

model = model.to(device)

In [155]:
# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 4  # Number of epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
num_epochs = 4

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = epoch_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}')

100%|██████████| 1618/1618 [06:38<00:00,  4.06it/s]


Epoch 1/4, Average Loss: 0.6717


100%|██████████| 1618/1618 [06:39<00:00,  4.05it/s]


Epoch 2/4, Average Loss: 0.5929


100%|██████████| 1618/1618 [06:39<00:00,  4.05it/s]


Epoch 3/4, Average Loss: 0.5094


100%|██████████| 1618/1618 [06:39<00:00,  4.05it/s]

Epoch 4/4, Average Loss: 0.3838





In [156]:
# Optionally evaluate the model
def evaluate_model(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            total += labels.size(0)
            correct += (preds == labels).sum().item()

    accuracy = correct / total
    print(f'Accuracy: {accuracy:.4f}')

# Evaluate the model on the test set
evaluate_model(model, test_loader, device)   

Accuracy: 0.6428
