In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Read the CSV whose 'word2vec' column holds the pre-computed embeddings
data = pd.read_csv(filepath_or_buffer='hatespeech_word2vec.csv')

In [3]:
# Discard every row whose 'word2vec' entry is missing (NaN); other columns are not checked
data = data.dropna(how='any', subset=['word2vec'])

In [4]:
# Ensure all values in 'word2vec' column are strings.
# Build a new frame with assign() instead of writing into `data` in place: `data` is the
# result of dropna(), and assigning a column on such a derived frame can raise
# SettingWithCopyWarning on older pandas versions.
data = data.assign(word2vec=data['word2vec'].astype(str))

In [5]:
# Start with two empty accumulators: X collects the parsed embedding vectors,
# valid_indices records the positions of the rows that parsed successfully
X, valid_indices = [], []

In [6]:
# Parse each string representation of a Word2Vec embedding ("[0.1 0.2 ...]") into a
# float array, keeping the positional index of every row that parsed cleanly.
#
# The accumulators are (re)initialised here so the cell can be re-run safely: without
# this, a second run would append duplicate rows, or fail outright once X has been
# rebound to a numpy array.
X = []
valid_indices = []
expected_dim = None  # length of the first non-empty vector; later rows must match it

for idx, emb_str in enumerate(data['word2vec']):
    # Remove surrounding whitespace first, then the '[' and ']' characters
    emb_str = emb_str.strip().strip('[]')

    # Split the string by whitespace and convert each part to float
    try:
        emb_array = np.array([float(num_str) for num_str in emb_str.split()])
    except ValueError as e:
        print(f"Error converting to float: {e}")
        print(f"String: {emb_str}")
        continue

    # An empty vector, or one whose length differs from the others, would make X
    # ragged and break the conversion to a 2D array, so such rows are skipped.
    if emb_array.size == 0:
        print(f"Skipping row {idx}: empty embedding")
        continue
    if expected_dim is None:
        expected_dim = emb_array.size
    elif emb_array.size != expected_dim:
        print(f"Skipping row {idx}: expected {expected_dim} values, got {emb_array.size}")
        continue

    # Keep the vector and remember which row it came from
    X.append(emb_array)
    valid_indices.append(idx)

In [7]:
# Convert list of arrays to a 2D float numpy array (one row per embedding).
# Forcing dtype=float makes numpy raise immediately if the rows have different lengths
# instead of building an object array; the ndim check catches the remaining bad case
# (no rows parsed at all), which would otherwise surface later as an obscure error.
X = np.array(X, dtype=float)
if X.ndim != 2:
    raise ValueError(
        f"Expected a 2D embedding matrix, got an array with shape {X.shape}; "
        "check that at least one embedding was parsed"
    )

In [8]:
# Keep only the labels of the rows whose embedding parsed; valid_indices are positions,
# hence .iloc. The index is then renumbered from 0.
y = (
    data['label']
    .iloc[valid_indices]
    .reset_index(drop=True)
)

In [9]:
# Ensure X and y have matching lengths before splitting.
# Raise an exception rather than calling exit(): exit() is a helper for the interactive
# shell and, inside a notebook, shuts the kernel down instead of reporting the problem.
if len(X) != len(y):
    raise ValueError(
        "Mismatch between lengths of X and y even after filtering: "
        f"len(X)={len(X)}, len(y)={len(y)}"
    )

In [10]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [11]:
# Define a pipeline with scaling, PCA, and the classifier.
# PCA is seeded as well as the forest: with its default svd_solver='auto', PCA may pick
# the randomized solver for large inputs, and without a random_state the projected
# features (and therefore every downstream score) could vary from run to run.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=10, random_state=42)),  # Adjust n_components based on your dataset
    ('classifier', RandomForestClassifier(
        random_state=42,
        n_estimators=100,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=1,
    )),
])

In [12]:
# Parameter grid for GridSearchCV. Keys are prefixed with `classifier__`; every list
# holds a single value, so the grid describes exactly one candidate.
param_grid = dict(
    classifier__n_estimators=[100],
    classifier__max_depth=[10],
    classifier__min_samples_split=[2],
    classifier__min_samples_leaf=[1],
)

In [13]:
# Grid search over the pipeline: 3-fold cross-validation, all available cores,
# verbose progress output
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [14]:
# Run the cross-validated search on the training split
grid_search.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [15]:
# Get the best model: the estimator the fitted grid search exposes as `best_estimator_`.
# NOTE(review): with GridSearchCV's default refit=True this should be the pipeline refit
# on all of X_train using the best parameters — confirm against the installed scikit-learn.
best_model = grid_search.best_estimator_

In [16]:
# Predict labels for the held-out test samples with the selected model
y_pred = best_model.predict(X=X_test)

In [17]:
# Score the predictions against the true test labels and report accuracy to 3 decimals
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model accuracy: {accuracy:.3f}")

Model accuracy: 0.605
