# import Necessary Libraries

In [3]:
from gensim.models import Word2Vec
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re

In [4]:
data = pd.read_csv('Preprocessed_Final_dataset.csv',encoding='ISO-8859-1',low_memory=False)

In [5]:
data.tail()

Unnamed: 0,comment,label,clean_text,tokens,text
41093,Indian culture is so backwards,N,indian culture is so backwards,"['indian', 'culture', 'backwards']",indian culture backwards
41094,"Women don't want equality, they just want to b...",N,women do not want equality they just want to b...,"['woman', 'want', 'equality', 'want', 'charge']",woman want equality want charge
41095,fuck covid,P,fuck covid,"['fuck', 'covid']",fuck covid
41096,This computer is garbage,P,this computer is garbage,"['computer', 'garbage']",computer garbage
41097,The only good Muslim is a dead one,N,the only good muslim is a dead one,"['good', 'muslim', 'dead', 'one']",good muslim dead one


In [6]:
data['tokens'].tail()

41093                 ['indian', 'culture', 'backwards']
41094    ['woman', 'want', 'equality', 'want', 'charge']
41095                                  ['fuck', 'covid']
41096                            ['computer', 'garbage']
41097                  ['good', 'muslim', 'dead', 'one']
Name: tokens, dtype: object

# Word2vec Embedding
### Word2Vec is a neural network-based technique used to generate dense vector representations of words (word embeddings) by analyzing the context in which words appear in a large corpus. It captures semantic relationships between words, allowing similar words to have similar vector representations.

In [7]:
import numpy as np
# Train the Word2Vec model
model = Word2Vec(sentences=data['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to get the average Word2Vec vector for a list of tokens
def get_average_word2vec(tokens, model):
    vector_size = model.wv.vector_size
    vec = np.zeros(vector_size)
    count = 0
    for token in tokens:
        if token in model.wv:
            vec += model.wv[token]
            count += 1
    if count != 0:
        vec /= count
    return vec

# Apply the Word2Vec embedding to the 'tokens' column and create a new column 'embedding'
data['word2vec'] = data['tokens'].apply(lambda tokens: get_average_word2vec(tokens, model))

# Print the DataFrame with the 'embedding' column
print(data[['tokens', 'word2vec']])

                                                  tokens  \
0                                 ['dalits', 'lowlives']   
1                 ['gay', 'people', 'burden', 'society']   
2                                    ['arab', 'welcome']   
3      ['say', 'actually', 'eliminate', 'heebs', 'wis...   
4                         ['bananas', 'black', 'people']   
...                                                  ...   
41093                 ['indian', 'culture', 'backwards']   
41094    ['woman', 'want', 'equality', 'want', 'charge']   
41095                                  ['fuck', 'covid']   
41096                            ['computer', 'garbage']   
41097                  ['good', 'muslim', 'dead', 'one']   

                                                word2vec  
0      [-0.2778713961809196, 0.1314378326589411, 0.26...  
1      [-0.3402217598631978, 0.028047197204279273, 0....  
2      [-0.2702828518262035, 0.10874713839668977, 0.2...  
3      [-0.3521344596632274, 0.099587038473

In [8]:
print(data['word2vec'].iloc[1])

[-0.34022176  0.0280472   0.22755992  0.338593    0.24280386  0.35456144
 -0.06333581 -0.07450576 -0.39754212  0.64561914 -0.54379885  0.10505109
 -0.35141737 -0.03588455 -0.824682    0.52561221 -0.02166811  0.63311345
 -0.02961401 -0.0348969  -0.39919733 -0.12866064  0.11712232 -0.23891418
  0.13580278 -0.29576098  0.04176102  0.06649251  0.02023623  0.31140289
  0.27302766 -0.04220435  0.63909201  0.26196713 -0.33441275 -0.30604809
  0.0068475  -0.24134395  0.14092216 -0.13869322 -0.02181389  0.3157464
 -0.14411364 -0.28157006  0.0965307  -0.09637904  0.47842981 -0.39011551
 -0.0430036  -0.2808932  -0.06220733 -0.05262495 -0.08578807 -0.06792285
 -0.23700072 -0.20065238 -0.24785129 -0.29552251 -0.42779953  0.03286628
  0.61237776 -0.27098842 -0.1721415  -0.08117192 -0.11439796  0.89349133
  0.42009643 -0.34609759 -0.63537831  0.54390265 -0.38526412  0.00952082
  0.12777224 -0.34580731 -0.15579754  0.35573361 -0.06659776  0.1183549
 -0.10265812  0.12814695  0.67741666 -0.42451472  0.0

In [9]:
data.to_csv("word2vec_embedding.csv",index = False)

In [12]:
from sklearn.preprocessing import LabelEncoder
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Convert 'N' to 0 and 'P' to 1 in the label column
data['label'] = label_encoder.fit_transform(data['label'])

# Logistic Regression

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Extract the features and labels
X = np.vstack(data['word2vec'].values)
y = data['label'].values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the logistic regression model
logistic_regression_model = LogisticRegression(max_iter=1000) 

# Train the model
logistic_regression_model.fit(X_train, y_train)

# Predict on the test set
y_pred = logistic_regression_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report_str)

Accuracy: 0.5715328467153284
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.77      0.66      4416
           1       0.56      0.34      0.43      3804

    accuracy                           0.57      8220
   macro avg       0.57      0.56      0.54      8220
weighted avg       0.57      0.57      0.55      8220



# SVM Model

In [13]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
X = np.vstack(data['word2vec'].values)
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svm_model = SVC(kernel='linear', max_iter=1000) 
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report_str)



Accuracy: 0.45888077858880777
Classification Report:
              precision    recall  f1-score   support

           0       0.17      0.00      0.00      4416
           1       0.46      0.99      0.63      3804

    accuracy                           0.46      8220
   macro avg       0.31      0.50      0.32      8220
weighted avg       0.30      0.46      0.29      8220



### "After evaluating the performance of different models using TF-IDF and Word2Vec embeddings, we have decided to proceed with TF-IDF embeddings due to its superior performance."