The first step in our hate speech detection model involves collecting and preprocessing the data. This step is crucial as the quality and cleanliness of the data significantly affect the model's performance.

LOAD DATASET

In [2]:
import pandas as pd
import numpy as np

data= pd.read_csv("dataset.csv")
data

Unnamed: 0,Text,Label
0,Damn I thought they had strict gun laws in Ger...,0
1,I dont care about what it stands for or anythi...,0
2,It's not a group it's an idea lol,0
3,So it's not just America!,0
4,The dog is a spectacular dancer considering he...,0
...,...,...
17591,I find rats nicer and cleaner than most Chinese,1
17592,"Check out this niggar, they hit things like wi...",1
17593,"this country has become an absolute shambles, ...",0
17594,Me aged 16 = anti-Semitism is bad Me aged 18 =...,1


CLEANING

After observing the dataset, we can infer that we need to clean and transform the raw text data into a format suitable for our model.

In [3]:
import re
def data_cleaning(text):

  text = re.sub(r'\s+', ' ', text) #the re.sub function replaces one or more whitespace characters (\s+) with a single space.

  text = re.sub(r"@\S+", "",text)

  text = re.sub(r'#', '', text)

  import contractions
  text=contractions.fix(text)

  text = text.lower()

  text = re.sub(r'[^\w\s]', '', text)

  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

  text = ' '.join([word for word in text.split() if len(word) > 2 or word.isnumeric()])

  from nltk.stem import WordNetLemmatizer
  lemmatizer = WordNetLemmatizer()
  text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

  return text

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
data['Text']=data['Text'].apply(data_cleaning)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Split for train and test

In [4]:
from sklearn.model_selection import train_test_split

X=data['Text']
y=data['Label']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train.shape, X_test.shape, X_val.shape

((12317,), (2640,), (2639,))

Encoding Target Label using LabelEncoder:

In [5]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)
y_val = label_encoder.transform(y_val)
y_train, y_val, y_test

(array([0, 0, 0, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 0, ..., 0, 0, 0], dtype=int64),
 array([1, 0, 1, ..., 1, 0, 1], dtype=int64))

Tokenizing and Padding

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_val_seq = tokenizer.texts_to_sequences(X_val)

X_train_pad = pad_sequences(X_train_seq, maxlen=70,padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=70,padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=70,padding='post')



EMBEDDING

Embedding in the context of deep learning and natural language processing (NLP) is a way of representing words or phrases as dense vectors in a continuous vector space. These vectors capture semantic meanings and relationships between words. Embeddings transform the sparse, high-dimensional data of words into a lower-dimensional space, where similar words have similar vector representations.

ONE HOT ENCODING

In [2]:
import re
import pandas as pd
data  = pd.read_csv(r"D:\dataset.csv")
import nltk
nltk.download('punkt') 
from nltk.tokenize import word_tokenize

data['Tokens']=data['Text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ry981\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
texts = data['Text'].values
labels = data['Label'].values

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split 

def one_hot_encoding(texts):
    vectorizer = CountVectorizer(binary=True)
    embeddings = vectorizer.fit_transform(texts)
    return embeddings

embeddings_one_hot = one_hot_encoding(texts)
X_train_one_hot, X_test_one_hot, y_train_one_hot, y_test_one_hot = train_test_split(embeddings_one_hot, labels, test_size=0.2, random_state=42)
print(embeddings_one_hot)

  (0, 4634)	1
  (0, 18506)	1
  (0, 18453)	1
  (0, 8227)	1
  (0, 17690)	1
  (0, 8173)	1
  (0, 10461)	1
  (0, 9215)	1
  (0, 7736)	1
  (1, 5579)	1
  (1, 2972)	1
  (1, 394)	1
  (1, 20131)	1
  (1, 9788)	1
  (1, 17485)	1
  (1, 7220)	1
  (1, 13004)	1
  (1, 1096)	1
  (1, 9800)	1
  (1, 3977)	1
  (1, 18653)	1
  (1, 10719)	1
  (1, 18404)	1
  (1, 16541)	1
  (2, 9788)	1
  :	:
  (17594, 6498)	1
  (17594, 123)	1
  (17594, 53)	1
  (17594, 6298)	1
  (17594, 3914)	1
  (17594, 15001)	1
  (17594, 17530)	1
  (17594, 16061)	1
  (17594, 9902)	1
  (17594, 1058)	1
  (17594, 697)	1
  (17594, 68)	1
  (17594, 12285)	1
  (17594, 16320)	1
  (17594, 1074)	1
  (17594, 15570)	1
  (17594, 11487)	1
  (17595, 17055)	1
  (17595, 19474)	1
  (17595, 5562)	1
  (17595, 2188)	1
  (17595, 5030)	1
  (17595, 16061)	1
  (17595, 11517)	1
  (17595, 15632)	1


In [8]:
def term_frequency_encoding(texts):
    vectorizer = CountVectorizer()
    embeddings = vectorizer.fit_transform(texts)
    return embeddings
embeddings_tf = term_frequency_encoding(texts)
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(embeddings_tf, labels, test_size=0.2, random_state=42)
print(embeddings_tf)

  (0, 4634)	1
  (0, 18506)	1
  (0, 18453)	1
  (0, 8227)	1
  (0, 17690)	1
  (0, 8173)	1
  (0, 10461)	1
  (0, 9215)	1
  (0, 7736)	1
  (1, 5579)	1
  (1, 2972)	1
  (1, 394)	1
  (1, 20131)	1
  (1, 9788)	1
  (1, 17485)	1
  (1, 7220)	1
  (1, 13004)	1
  (1, 1096)	1
  (1, 9800)	1
  (1, 3977)	1
  (1, 18653)	1
  (1, 10719)	1
  (1, 18404)	1
  (1, 16541)	1
  (2, 9788)	2
  :	:
  (17594, 6498)	1
  (17594, 123)	1
  (17594, 53)	1
  (17594, 6298)	1
  (17594, 3914)	1
  (17594, 15001)	1
  (17594, 17530)	1
  (17594, 16061)	1
  (17594, 9902)	1
  (17594, 1058)	3
  (17594, 697)	3
  (17594, 68)	1
  (17594, 12285)	1
  (17594, 16320)	3
  (17594, 1074)	1
  (17594, 15570)	1
  (17594, 11487)	1
  (17595, 17055)	1
  (17595, 19474)	1
  (17595, 5562)	1
  (17595, 2188)	1
  (17595, 5030)	1
  (17595, 16061)	1
  (17595, 11517)	1
  (17595, 15632)	1


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_embedding(texts):
    vectorizer = TfidfVectorizer()
    embeddings = vectorizer.fit_transform(texts)
    return embeddings

embeddings_tfidf = tfidf_embedding(texts)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(embeddings_tfidf, labels, test_size=0.2, random_state=42)


print(embeddings_tfidf)
print(embeddings_tfidf.shape)

  (0, 7736)	0.3699627936790537
  (0, 9215)	0.1388506619303749
  (0, 10461)	0.3752596291015289
  (0, 8173)	0.41649038488028367
  (0, 17690)	0.47406059292584823
  (0, 8227)	0.2567293667498387
  (0, 18453)	0.15165197423800875
  (0, 18506)	0.3200116412881795
  (0, 4634)	0.3369195893330076
  (1, 16541)	0.4642690706501393
  (1, 18404)	0.09161042575119856
  (1, 10719)	0.16223888994565513
  (1, 18653)	0.0996453675495539
  (1, 3977)	0.4299948872975778
  (1, 9800)	0.23717831092225916
  (1, 1096)	0.25979668623580054
  (1, 13004)	0.18352720527959232
  (1, 7220)	0.144594064993865
  (1, 17485)	0.38989666328939804
  (1, 9788)	0.1259968016515918
  (1, 20131)	0.17923531261954773
  (1, 394)	0.18577715954742052
  (1, 2972)	0.26841797205609746
  (1, 5579)	0.2645688794841402
  (2, 10867)	0.44695575945206567
  :	:
  (17594, 5562)	0.06812392525584945
  (17594, 5524)	0.09892682229959995
  (17594, 8418)	0.05777122162500571
  (17594, 13091)	0.07406630708554074
  (17594, 11373)	0.20105832019565742
  (17594, 1213

WORD2VEC

CBOW

In [15]:
from gensim.models import Word2Vec
import numpy as np
def word2vec_embedding_cbow(texts):
    model = Word2Vec(texts, vector_size=300, window=5, min_count=1, workers=4,sg=0)
    word_vectors = model.wv
    #print(word_vectors)

    def get_word2vec_embeddings(text, word_vectors):
        embeddings = [word_vectors[word] for word in text if word in word_vectors]
        if embeddings:
            return np.mean(embeddings, axis=0)
        else:
            return np.zeros(300)

    embeddings = np.array([get_word2vec_embeddings(text, word_vectors) for text in texts])
    return embeddings

embeddings_w2v_cbow = word2vec_embedding_cbow(data['Tokens'])
print(embeddings_w2v_cbow)
X_train_w2v1, X_test_w2v1, y_train_w2v1, y_test_w2v1 = train_test_split(embeddings_w2v_cbow, labels, test_size=0.2, random_state=42)

[[ 0.17965496  0.21911526  0.07153209 ... -0.03757478  0.22783396
  -0.3270442 ]
 [ 0.2585517   0.29105872  0.11257217 ... -0.03062421  0.52885604
  -0.5240098 ]
 [ 0.1331658   0.27762625  0.38334575 ...  0.06959226  0.69906306
  -0.5330936 ]
 ...
 [ 0.0509892   0.32648292  0.23927818 ... -0.05263434  0.3443746
  -0.29707268]
 [ 0.13551329  0.26237762  0.1619319  ... -0.02249849  0.491493
  -0.4308026 ]
 [ 0.21669094  0.3248966   0.02764771 ... -0.03923914  0.545161
  -0.43017298]]


Word2Vec Skip-Gram

In [18]:
from gensim.models import Word2Vec

def word2vec_embedding_sg(texts):
    model = Word2Vec(texts, vector_size=200, window=6, min_count=1, workers=4,sg=1)
    word_vectors = model.wv
    #print(word_vectors)

    def get_word2vec_embeddings(text, word_vectors):
        embeddings = [word_vectors[word] for word in text if word in word_vectors]
        if embeddings:
            return np.mean(embeddings, axis=0)
        else:
            return np.zeros(200)

    embeddings = np.array([get_word2vec_embeddings(text, word_vectors) for text in texts])
    return embeddings

embeddings_w2v_sg = word2vec_embedding_sg(data['Tokens'])
print(embeddings_w2v_sg)
X_train_w2v2, X_test_w2v2, y_train_w2v2, y_test_w2v2 = train_test_split(embeddings_w2v_sg, labels, test_size=0.2, random_state=42)

[[ 0.00337505 -0.08936203  0.01494806 ... -0.06897631 -0.11581796
  -0.16027392]
 [ 0.01793438 -0.10054299  0.05521449 ... -0.09753586 -0.17237748
  -0.24999352]
 [ 0.08088573 -0.0762444  -0.02521128 ... -0.09672859 -0.05041833
  -0.27526566]
 ...
 [ 0.06851795 -0.03317465 -0.01209099 ... -0.02324978  0.02682451
  -0.21082322]
 [ 0.06000196 -0.09641755  0.06982504 ... -0.09364914 -0.1057099
  -0.22627196]
 [ 0.09124887 -0.05448456  0.0516753  ... -0.10454585 -0.1026437
  -0.2512715 ]]


MODEL EVALUATION

Logistic Regression Model

In [9]:
from sklearn.model_selection import train_test_split
import pandas as pd
data = pd.read_csv("Embedded_data.csv")
X=data.drop(columns=['Label','Text','Tokens'])
y=data['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score,accuracy_score,confusion_matrix,roc_auc_score

# Define the hyperparameter grid
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.05, 0.1, 0.5, 1.0, 10.0],
    'solver': ['liblinear', 'lbfgs']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the classifier
print("Best Logistic Regression using Grid Search")
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Logistic Regression using Grid Search
Precision: 0.639018691588785
Recall: 0.38359046283309955
Accuracy: 0.6625
ROC-AUC Score: 0.6180129964595297
Confusion Matrix:
 [[1785  309]
 [ 879  547]]


RANDOM FOREST Model

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Example dataset (you should replace this with your own dataset)
# Assume you have a CSV file 'data.csv' with features and target columns
# Replace 'Embedded_data.csv' with your dataset file path
data = pd.read_csv('Embedded_data.csv')

# Separate features and target variable
X=data.drop(columns=['Label','Text','Tokens'])# Features
y = data['Label']                # Target variable

# Encoding categorical data (if 'Label' is categorical)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Training the classifier
rf_classifier.fit(X_train, y_train)

# Predicting on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluating the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Printing the classification report
# print(classification_report(y_test, y_pred))

Accuracy: 0.6488636363636363


KNN Model

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

data= pd.read_csv("Embedded_data.csv")


X=data.drop(columns=['Label','Text','Tokens'])# Features
y = data['Label']# Features


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)  # K=3, you can choose any value of K

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Predict on the test data
y_pred = knn.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.58


Hyperparameter Tuning

Logistic Regression Model

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

data= pd.read_csv("Embedded_data.csv")


X=data.drop(columns=['Label','Text','Tokens'])# Features
y = data['Label']# Features

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the Logistic Regression classifier
logreg = LogisticRegression(max_iter=1000)

# Define the grid of hyperparameters to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Regularization penalty
    'solver': ['liblinear', 'saga']  # Optimization algorithm
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Print best hyperparameters and best score
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_accuracy)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best hyperparameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation accuracy: 0.6683454740549747
Test set accuracy: 0.6673612426595946


KNN Model

In [15]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

data= pd.read_csv("Embedded_data.csv")


X=data.drop(columns=['Label','Text','Tokens'])# Features
y = data['Label']# Features

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define KNN classifier
knn = KNeighborsClassifier()

# Define grid of hyperparameters
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # K values to try
    'weights': ['uniform', 'distance'],  # Weighting scheme for neighbors
    'metric': ['euclidean', 'manhattan']  # Distance metric
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print best hyperparameters and best score
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_accuracy)

Best hyperparameters: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}
Best cross-validation accuracy: 0.6341651902705496
Test set accuracy: 0.6181094904337943


RANDOM FOREST Model

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
import pandas as pd

data = pd.read_csv('Embedded_data.csv')

# Separate features and target variable
X=data.drop(columns=['Label','Text','Tokens'])# Features
y = data['Label']  

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the Random Forest classifier
rf = RandomForestClassifier()

# Define the grid of hyperparameters to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Print best hyperparameters and best score
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_accuracy)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits


540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python

Best hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation accuracy: 0.666963956161581
Test set accuracy: 0.6635726463345331


DL MODEL

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip glove.6B.zip

In [None]:
%ls

In [22]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file=r"C:\Users\ry981\glove.6B.200d.txt"
word2vec_output_file=r"D:\word2vec.txt.txt"
glove2word2vec(glove_input_file,word2vec_output_file)

  glove2word2vec(glove_input_file,word2vec_output_file)


(400000, 200)

In [24]:
from gensim.models import KeyedVectors
model= KeyedVectors.load_word2vec_format(word2vec_output_file)

In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

# Load the dataset
data = pd.read_csv(r"D:\dataset.csv")

# Preprocess the data
texts = data['Text'].values
labels = data['Label'].values

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Tokenize the text
tokenizer = Tokenizer(num_words=15000)
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=200)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

Embedding using GloVe Embeddings


GloVe.6B.200d.txt is one of the pre-trained GloVe models, where:

6B: The model was trained on a corpus of 6 billion tokens (words).

200d: Each word is represented by a 200-dimensional vector.

Details of GloVe.6B.200d.txt:

Corpus: Common Crawl (a dataset containing 6 billion tokens).

Vocabulary Size: 400,000 unique words.

In [28]:
def glove_embeddings(filepath, word_index, embedding_dim):
    embeddings_index = {}
    with open(filepath, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

embedding_dim = 200
glove_filepath =r"C:\Users\ry981\glove.6B.200d.txt"
embedding_matrix = glove_embeddings(glove_filepath, tokenizer.word_index, embedding_dim)

Bidirectional-LSTM model

In [30]:
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Activation
# Build the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix], input_length=200))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.3))
model.add(Dense(256,activation='relu'))

model.add(Dropout(0.4))
model.add(Dense(256,activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(32,activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64,callbacks=[early_stopping])



Epoch 1/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 898ms/step - accuracy: 0.6106 - loss: 0.6545
Epoch 2/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 965ms/step - accuracy: 0.7179 - loss: 0.5210
Epoch 3/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 939ms/step - accuracy: 0.7947 - loss: 0.4189
Epoch 4/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 966ms/step - accuracy: 0.8474 - loss: 0.3389
Epoch 5/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 963ms/step - accuracy: 0.8984 - loss: 0.2458
Epoch 6/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 980ms/step - accuracy: 0.9292 - loss: 0.1779
Epoch 7/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 988ms/step - accuracy: 0.9431 - loss: 0.1372
Epoch 8/10
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 1s/step - accuracy: 0.9618 - loss: 0.1004
Epoch 9/10


<keras.src.callbacks.history.History at 0x19bc217bf10>

Classification Report

In [32]:
from sklearn.metrics import classification_report
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32")
y_pred = y_pred.flatten()
print(classification_report(y_test,y_pred))

[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 104ms/step
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      2094
           1       0.62      0.63      0.63      1426

    accuracy                           0.70      3520
   macro avg       0.69      0.69      0.69      3520
weighted avg       0.70      0.70      0.70      3520

