In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv('/Users/CRYPTO/Infosys Internship - Hate Speech Detection/Tokenization & Encoding/Reddit_Encoded.csv')
print(df.head())

                                             comment  hate_speech  \
0  subsection retarded hungarians ohh boy brace l...            1   
1  hiii just got work Foundation and grounding ma...            0   
2                    wow guess soyboys every country            0   
3  owen benjamins soyboy song goes every country ...            0   
4   yall hear sumn means live small town rn for w...            0   

                                  lemmatized_comment  \
0  subsection retard hungarians ohh boy brace liv...   
1  hiii just get work Foundation and ground mainl...   
2                    wow guess soyboys every country   
3  owen benjamins soyboy song go every country amaze   
4  yall hear sumn mean live small town rn for wor...   

                                document_vector_flat  
0  0.014043219,-0.01809359,0.017145459,0.08062436...  
1  -0.0030388932,-0.035133556,0.020659983,0.07383...  
2  0.017362628,0.005587179,0.0297773,0.109146975,...  
3  0.018085241,0.0011954829,

In [2]:
# Convert the document vectors from their string representation back into numerical form
df['document_vector'] = df['document_vector_flat'].apply(lambda x: np.fromstring(x, sep=','))

print(df['document_vector'].head())

0    [0.014043219, -0.01809359, 0.017145459, 0.0806...
1    [-0.0030388932, -0.035133556, 0.020659983, 0.0...
2    [0.017362628, 0.005587179, 0.0297773, 0.109146...
3    [0.018085241, 0.0011954829, 2.8959475e-05, 0.0...
4    [0.023993038, -0.00060867134, 0.005239945, 0.0...
Name: document_vector, dtype: object


In [3]:
# Extract features (document vectors) and target variable (assuming hate_speech is a continuous variable here)
X = np.array(df['document_vector'].tolist())
y = df['hate_speech']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Initialize the Linear Regression model
lr = LinearRegression()

# Train the model
lr.fit(X_train, y_train)

# Make predictions
y_pred_continuous = lr.predict(X_test)

## Linear regression is typically used for regression tasks (predicting a continuous value) rather than classification.

In [5]:
# Threshold the predictions at 0.5 to get binary outcomes
y_pred = (y_pred_continuous >= 0.5).astype(int)

In [6]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Mean Squared Error: 0.19
R^2 Score: -0.06


In [7]:
# Calculate classification metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Accuracy: 0.81
Precision: 0.84
Recall: 0.24
F1-Score: 0.38
