In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

# Read the preprocessed tweets and preview the first rows before any cleaning.
data = pd.read_csv('new_processed_dataset.csv')
print(data.head())

# Drop exact duplicate rows first, then every row that still has a missing value.
data = data.drop_duplicates().dropna()

# Features come from the 'tweet' text column; targets from the 'class' label column.
X, y = data['tweet'], data['class']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Function to print metrics
def print_metrics(y_test, y_pred):
    """Print accuracy plus support-weighted precision, recall and F1.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    None
        The four scores are written to stdout, one per line.
    """
    # A class the model never predicts has undefined precision (0/0). Score it
    # as 0 explicitly instead of relying on sklearn's warn-and-substitute-0
    # default, so the printed numbers stay the same but no warning is raised.
    weighted = {"average": "weighted", "zero_division": 0}

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, **weighted))
    print("Recall:", recall_score(y_test, y_pred, **weighted))
    print("F1 Score:", f1_score(y_test, y_pred, **weighted))

   hate_speech  offensive_language  neither  class  \
0            0                   0        3      2   
1            0                   3        0      1   
2            0                   3        0      1   
3            0                   2        1      1   
4            0                   6        0      1   

                                               tweet  \
0  retwet as a woman you should not complain abou...   
1  retwet boy dats coldtyga dwn bad for cufin dat...   
2  retwet dawg retwet you ever fuck a bitch and s...   
3                        retwet she lok like a trany   
4  retwet the shit you hear about me might be tru...   

                                              tokens  
0  ['retwet', 'woman', 'complain', 'cleaning', 'h...  
1  ['retwet', 'boy', 'dat', 'coldtyga', 'dwn', 'b...  
2  ['retwet', 'dawg', 'retwet', 'ever', 'fuck', '...  
3                 ['retwet', 'lok', 'like', 'trany']  
4  ['retwet', 'shit', 'hear', 'might', 'true', 'm...  


In [19]:
# One-Hot Encoding Pipeline
# binary=True records only whether a term occurs in a tweet (not how often),
# over a vocabulary capped at 1000 terms.
pipeline_ohe = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=1000, binary=True)),
    ('classifier', MultinomialNB()),
])

# Fit on the training split and predict the held-out tweets in one chain
# (Pipeline.fit returns the fitted pipeline itself).
y_pred_ohe = pipeline_ohe.fit(X_train, y_train).predict(X_test)

print("One Hot Encoding with Naive Bayes")
print("Classification Report:\n", classification_report(y_test, y_pred_ohe))
print_metrics(y_test, y_pred_ohe)

One Hot Encoding with Naive Bayes
Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.27      0.34       285
           1       0.91      0.95      0.93      3793
           2       0.81      0.76      0.78       815

    accuracy                           0.88      4893
   macro avg       0.73      0.66      0.68      4893
weighted avg       0.87      0.88      0.87      4893

Accuracy: 0.8775802166360106
Precision: 0.8655834300398574
Recall: 0.8775802166360106
F1 Score: 0.8696114134545931


In [20]:
# TF-IDF Encoding Pipeline
# Each tweet becomes a vector of TF-IDF weights over a vocabulary capped at
# 1000 terms.
pipeline_tfidf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(max_features=1000)),
    ('classifier', MultinomialNB()),
])

# Fit on the training split and predict the held-out tweets in one chain
# (Pipeline.fit returns the fitted pipeline itself).
y_pred_tfidf = pipeline_tfidf.fit(X_train, y_train).predict(X_test)

print("TF-IDF Encoding with Naive Bayes")
print("Classification Report:\n", classification_report(y_test, y_pred_tfidf))
print_metrics(y_test, y_pred_tfidf)

TF-IDF Encoding with Naive Bayes
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       285
           1       0.83      1.00      0.90      3793
           2       0.91      0.38      0.53       815

    accuracy                           0.83      4893
   macro avg       0.58      0.46      0.48      4893
weighted avg       0.79      0.83      0.79      4893

Accuracy: 0.8340486409155937
Precision: 0.793381501500123
Recall: 0.8340486409155937
F1 Score: 0.7895819888582178


In [22]:
# Word2Vec Transformer
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Embed whitespace-tokenised texts as the mean of their Word2Vec vectors.

    ``fit`` trains a gensim ``Word2Vec`` model on the given texts; ``transform``
    maps each text to the average of the vectors of its in-vocabulary words.
    A text with no in-vocabulary word maps to the zero vector.

    Parameters
    ----------
    vector_size : int, default=100
        Dimensionality of the word vectors (and of each output row).
    window : int, default=5
        Context window size forwarded to ``Word2Vec``.
    min_count : int, default=1
        Minimum word frequency forwarded to ``Word2Vec``.
    seed : int, default=1
        Random seed forwarded to ``Word2Vec``.
        NOTE(review): default chosen to match gensim's own default so existing
        callers are unaffected - confirm against the installed gensim version.
    workers : int, default=3
        Number of training threads forwarded to ``Word2Vec``.
        NOTE(review): default chosen to match gensim's own default - confirm.
        gensim documents that fully repeatable training also needs
        ``workers=1`` (and a fixed ``PYTHONHASHSEED``).

    Attributes
    ----------
    model : gensim.models.Word2Vec or None
        The trained model; ``None`` until ``fit`` has been called.
    """

    def __init__(self, vector_size=100, window=5, min_count=1, seed=1, workers=3):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.seed = seed
        self.workers = workers
        self.model = None

    def fit(self, X, y=None):
        """Train the Word2Vec model on the whitespace-split texts in ``X``.

        Parameters
        ----------
        X : iterable of str
            Texts to learn word vectors from.
        y : ignored
            Present only for scikit-learn pipeline compatibility.

        Returns
        -------
        self
        """
        tokenized_X = [tweet.split() for tweet in X]
        self.model = Word2Vec(
            sentences=tokenized_X,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            seed=self.seed,
            workers=self.workers,
        )
        return self

    def transform(self, X):
        """Return one averaged word vector per text in ``X``.

        Parameters
        ----------
        X : iterable of str
            Texts to embed.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(n_texts, vector_size)``; also 2-D when ``X`` is
            empty.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. This exception subclasses both
            ``ValueError`` and ``AttributeError``, so callers that caught the
            previous ``AttributeError`` keep working.
        """
        if self.model is None:
            # Local import: keeps the dependency inside this block.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This Word2VecTransformer instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )

        word_vectors = self.model.wv

        def get_word2vec_features(text):
            known = [word_vectors[word] for word in text.split() if word in word_vectors]
            # No known word: fall back to a single zero vector so the mean is
            # well-defined and has the right length.
            return np.mean(known or [np.zeros(self.vector_size)], axis=0)

        features = [get_word2vec_features(tweet) for tweet in X]
        if not features:
            # np.array([]) would be 1-D with shape (0,); keep the output 2-D.
            return np.empty((0, self.vector_size))
        return np.array(features)

# Word2Vec Encoding Pipeline
# The transformer emits dense averaged word vectors, which are fed to GaussianNB.
pipeline_w2v = Pipeline(steps=[
    ('word2vec', Word2VecTransformer(vector_size=100)),  # 100 is also the transformer's default size
    ('classifier', GaussianNB()),
])

# Fit on the training split and predict the held-out tweets in one chain
# (Pipeline.fit returns the fitted pipeline itself).
y_pred_w2v = pipeline_w2v.fit(X_train, y_train).predict(X_test)

print("Word2Vec Encoding with Naive Bayes")
print("Classification Report:\n", classification_report(y_test, y_pred_w2v))
print_metrics(y_test, y_pred_w2v)

Word2Vec Encoding with Naive Bayes
Classification Report:
               precision    recall  f1-score   support

           0       0.09      0.19      0.13       285
           1       0.87      0.72      0.79      3793
           2       0.42      0.62      0.50       815

    accuracy                           0.67      4893
   macro avg       0.46      0.51      0.47      4893
weighted avg       0.75      0.67      0.70      4893

Accuracy: 0.6711628857551605
Precision: 0.7527806416336142
Recall: 0.6711628857551605
F1 Score: 0.7017245472209079


In [21]:
# Term Frequency Encoding Pipeline
# Each tweet becomes a vector of raw term counts over a vocabulary capped at
# 1000 terms.
pipeline_tf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=1000)),
    ('classifier', MultinomialNB()),
])

# Fit on the training split and predict the held-out tweets in one chain
# (Pipeline.fit returns the fitted pipeline itself).
y_pred_tf = pipeline_tf.fit(X_train, y_train).predict(X_test)

print("Term Frequency Encoding with Naive Bayes")
print("Classification Report:\n", classification_report(y_test, y_pred_tf))
print_metrics(y_test, y_pred_tf)

Term Frequency Encoding with Naive Bayes
Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.27      0.34       285
           1       0.91      0.94      0.93      3793
           2       0.80      0.76      0.78       815

    accuracy                           0.87      4893
   macro avg       0.72      0.66      0.68      4893
weighted avg       0.86      0.87      0.87      4893

Accuracy: 0.8745146127120376
Precision: 0.8632751024350692
Recall: 0.8745146127120376
F1 Score: 0.8674222013334313
