In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

# Read the preprocessed tweets and preview the first few rows.
data = pd.read_csv('new_processed_dataset.csv')
print(data.head())

# Discard rows whose tweet text is missing (only the 'tweet' column is checked).
data = data.dropna(subset=['tweet'])

# The tweet text is the model input; the 'class' column is the label.
X, y = data['tweet'], data['class']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Helper that prints the summary metrics shared by every experiment below
def print_metrics(y_test, y_pred):
    """Print accuracy plus support-weighted precision, recall and F1.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model for the same samples.

    Returns
    -------
    None
        The four metrics are written to standard output, one per line.
    """
    print("Accuracy:", accuracy_score(y_test, y_pred))

    # The remaining metrics all use the same 'weighted' averaging across classes.
    weighted_scorers = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, scorer in weighted_scorers:
        print(label, scorer(y_test, y_pred, average='weighted'))


   hate_speech  offensive_language  neither  class  \
0            0                   0        3      2   
1            0                   3        0      1   
2            0                   3        0      1   
3            0                   2        1      1   
4            0                   6        0      1   

                                               tweet  \
0  retwet as a woman you should not complain abou...   
1  retwet boy dats coldtyga dwn bad for cufin dat...   
2  retwet dawg retwet you ever fuck a bitch and s...   
3                        retwet she lok like a trany   
4  retwet the shit you hear about me might be tru...   

                                              tokens  
0  ['retwet', 'woman', 'complain', 'cleaning', 'h...  
1  ['retwet', 'boy', 'dat', 'coldtyga', 'dwn', 'b...  
2  ['retwet', 'dawg', 'retwet', 'ever', 'fuck', '...  
3                 ['retwet', 'lok', 'like', 'trany']  
4  ['retwet', 'shit', 'hear', 'might', 'true', 'm...  


In [41]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Binary bag-of-words ("one-hot") pipeline: binary=True records only whether a
# vocabulary term occurs in a tweet, not how many times it occurs.
pipeline_ohe = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=1000, binary=True)),
    ('classifier', LogisticRegression()),
])

# Fit on the training tweets, then predict labels for the held-out tweets.
y_pred_ohe = pipeline_ohe.fit(X_train, y_train).predict(X_test)

# Report accuracy, the per-class breakdown, and the weighted summary metrics.
print("One Hot Encoding")
print("Accuracy:", accuracy_score(y_test, y_pred_ohe))
print("Classification Report:\n", classification_report(y_test, y_pred_ohe))
print_metrics(y_test, y_pred_ohe)


One Hot Encoding
Accuracy: 0.8932176019378281
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.22      0.30       305
           1       0.93      0.94      0.94      3807
           2       0.81      0.91      0.86       842

    accuracy                           0.89      4954
   macro avg       0.74      0.69      0.70      4954
weighted avg       0.88      0.89      0.88      4954

Accuracy: 0.8932176019378281
Precision: 0.8810039246136164
Recall: 0.8932176019378281
F1 Score: 0.8836092495429437


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF pipeline: tweets become TF-IDF weighted term vectors (vocabulary
# capped at 1000 features) that feed a logistic-regression classifier.
pipeline_tfidf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(max_features=1000)),
    ('classifier', LogisticRegression()),
])

# Fit on the training tweets, then predict labels for the held-out tweets.
y_pred_tfidf = pipeline_tfidf.fit(X_train, y_train).predict(X_test)

# Report accuracy, the per-class breakdown, and the weighted summary metrics.
print("TF-IDF Encoding")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report:\n", classification_report(y_test, y_pred_tfidf))
print_metrics(y_test, y_pred_tfidf)


TF-IDF Encoding
Accuracy: 0.8891804602341542
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.17      0.26       305
           1       0.91      0.96      0.93      3807
           2       0.82      0.83      0.83       842

    accuracy                           0.89      4954
   macro avg       0.77      0.65      0.67      4954
weighted avg       0.88      0.89      0.87      4954

Accuracy: 0.8891804602341542
Precision: 0.8750691000110776
Recall: 0.8891804602341542
F1 Score: 0.8744271581080336


In [45]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Represent whitespace-tokenised texts as mean Word2Vec embeddings.

    ``fit`` trains a gensim ``Word2Vec`` model on the supplied texts and
    ``transform`` maps every text to the average of the vectors of its
    in-vocabulary words.

    Parameters
    ----------
    vector_size : int, default=100
        Dimensionality of the word vectors, and therefore of each output row.
    window : int, default=5
        Context window size, forwarded to ``Word2Vec``.
    min_count : int, default=1
        Minimum word frequency, forwarded to ``Word2Vec``.
    workers : int, default=3
        Number of training threads, forwarded to ``Word2Vec``. The default
        matches gensim's own default; use ``workers=1`` when run-to-run
        reproducibility matters, since multi-threaded training is not
        deterministic.
    seed : int, default=1
        Random seed forwarded to ``Word2Vec``. The default matches gensim's
        own default.

    Attributes
    ----------
    model : gensim.models.Word2Vec or None
        The trained model; ``None`` until ``fit`` has been called.
    """

    def __init__(self, vector_size=100, window=5, min_count=1, workers=3, seed=1):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.seed = seed
        self.model = None

    def fit(self, X, y=None):
        """Train the Word2Vec model on the texts in ``X``.

        Parameters
        ----------
        X : iterable of str
            Texts; each one is tokenised with ``str.split()``.
        y : ignored
            Present only for scikit-learn pipeline compatibility.

        Returns
        -------
        self
        """
        tokenized_X = [tweet.split() for tweet in X]
        self.model = Word2Vec(
            sentences=tokenized_X,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
            seed=self.seed,
        )
        return self

    def transform(self, X):
        """Convert each text in ``X`` to its mean word vector.

        Parameters
        ----------
        X : iterable of str
            Texts; each one is tokenised with ``str.split()``.

        Returns
        -------
        numpy.ndarray
            One row per text with ``vector_size`` columns. A text with no
            in-vocabulary word becomes an all-zero row, and an empty ``X``
            yields an array of shape ``(0, vector_size)``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. This exception subclasses
            ``AttributeError``, the error the unfitted call used to raise.
        """
        if self.model is None:
            # Local import keeps the class usable without touching file-level imports.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This Word2VecTransformer instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )

        # Look the keyed vectors up once instead of on every word.
        word_vectors = self.model.wv

        def get_word2vec_features(text):
            known = [word_vectors[word] for word in text.split() if word in word_vectors]
            if not known:
                # No known word: fall back to a zero vector of the right width.
                return np.zeros(self.vector_size)
            return np.mean(known, axis=0)

        features = [get_word2vec_features(tweet) for tweet in X]
        if not features:
            # np.array([]) would be 1-D; keep the result 2-D for downstream steps.
            return np.empty((0, self.vector_size))
        return np.array(features)

# Word2Vec Encoding Pipeline
# StandardScaler is used below, so import it in this cell to keep the cell
# runnable on a fresh kernel (Restart & Run All) instead of relying on
# leftover kernel state.
from sklearn.preprocessing import StandardScaler

pipeline_w2v = Pipeline([
    # Word2Vec yields dense fixed-size vectors, so there is no max_features to set.
    ('word2vec', Word2VecTransformer(vector_size=100)),
    # Standardise each embedding dimension before the linear classifier.
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train and evaluate the model
pipeline_w2v.fit(X_train, y_train)
y_pred_w2v = pipeline_w2v.predict(X_test)
print("Word2Vec Encoding")
print("Accuracy:", accuracy_score(y_test, y_pred_w2v))
print("Classification Report:\n", classification_report(y_test, y_pred_w2v))
print_metrics(y_test, y_pred_w2v)


Word2Vec Encoding
Accuracy: 0.8538554703270085
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.03      0.06       305
           1       0.88      0.95      0.92      3807
           2       0.71      0.71      0.71       842

    accuracy                           0.85      4954
   macro avg       0.74      0.57      0.56      4954
weighted avg       0.84      0.85      0.83      4954

Accuracy: 0.8538554703270085
Precision: 0.8389230389063165
Recall: 0.8538554703270085
F1 Score: 0.8290693914477676


In [44]:
# Term-frequency pipeline: raw per-tweet term counts (vocabulary capped at
# 1000 features) feeding a logistic-regression classifier.
pipeline_tf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(max_features=1000)),
    ('classifier', LogisticRegression()),
])

# Fit on the training tweets, then predict labels for the held-out tweets.
y_pred_tf = pipeline_tf.fit(X_train, y_train).predict(X_test)

# Report accuracy, the per-class breakdown, and the weighted summary metrics.
print("Term Frequency Encoding")
print("Accuracy:", accuracy_score(y_test, y_pred_tf))
print("Classification Report:\n", classification_report(y_test, y_pred_tf))
print_metrics(y_test, y_pred_tf)


Term Frequency Encoding
Accuracy: 0.8909971740008075
Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.21      0.28       305
           1       0.93      0.94      0.94      3807
           2       0.80      0.91      0.85       842

    accuracy                           0.89      4954
   macro avg       0.73      0.69      0.69      4954
weighted avg       0.88      0.89      0.88      4954

Accuracy: 0.8909971740008075
Precision: 0.8780762825958154
Recall: 0.8909971740008075
F1 Score: 0.8810098340399694
