In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
data = pd.read_csv("Tokenized_HateSpeechDetection.csv")
X=data['Tokens']
y=data['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [2]:
from gensim.models import Word2Vec
import numpy as np
def word2vec_embedding_sg(texts):
    model = Word2Vec(sentences=X_train, vector_size=200, window=6, min_count=1, workers=4,sg=1)
    word_vectors = model.wv
    #print(word_vectors)

    def get_word2vec_embeddings(text, word_vectors):
        embeddings = [word_vectors[word] for word in text if word in word_vectors]
        if embeddings:
            return np.mean(embeddings, axis=0)
        else:
            return np.zeros(200)

    embeddings = np.array([get_word2vec_embeddings(text, word_vectors) for text in texts])
    return embeddings
X_train_w2v=word2vec_embedding_sg(X_train)
X_test_w2v=word2vec_embedding_sg(X_test)



# Naive Bayes Model:
Naive Bayes models typically have fewer hyperparameters to tune compared to other models like SVM or XGBoost. However, it is still beneficial to tune these hyperparameters to achieve optimal performance.

In [12]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix, precision_recall_curve, auc
# Train the model
gnb_clf = GaussianNB()
gnb_clf.fit(X_train_w2v, y_train)

# Make predictions
y_pred = gnb_clf.predict(X_test_w2v)

# Evaluate the classifier
print("Gaussian Naive Bayes")
# Precision
precision= precision_score(y_test, y_pred)
print("Precision:", precision)

# Recall
recall = recall_score(y_test, y_pred)
print("Recall:", recall)


# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score:", roc_auc)

#f1-score
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)



Gaussian Naive Bayes
Precision: 0.5964783929134537
Recall: 0.001402524544179523
Accuracy: 0.6948863636363636
ROC-AUC Score: 0.6200030060067009
F1 Score: 0.0028011204481792717


In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

# Define the hyperparameter grid
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(GaussianNB(), param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train_w2v, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test_w2v)

# Evaluate the classifier
print("Best Gaussian Naive Bayes using Grid Search")
print("Precision:",precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))
print("f1:", f1_score(y_test, y_pred))


Best Gaussian Naive Bayes using Grid Search
Precision: 0.6986037234042554
Recall: 0.5948863636363636
Accuracy: 0.7948863636363637
ROC-AUC Score: 0.6154537264538263
f1: 0.6568863636363635
