In [218]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
import re
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score



In [220]:
df = pd.read_csv("labeled_data.csv")

In [221]:
df

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [222]:
df.isnull().sum()

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

In [226]:
df["class"].value_counts()

class
1    19190
2     4163
0     1430
Name: count, dtype: int64

In [227]:
for class_label in [ 0, 1, 2]:
    sample_tweet = df[df["class"] == class_label]['tweet'].iloc[0]
    print(f"Class {class_label}: {sample_tweet}")

Class 0: "@Blackman38Tide: @WhaleLookyHere @HowdyDowdy11 queer" gaywad
Class 1: !!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
Class 2: !!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...


In [228]:
STOPWORDS = {"a","the","is","to","of","and","in","on","for","it","you","i","he","she","we","they"}
def clean_tweet(text):
    text = re.sub(r"http\S+|www\.\S+", " ", text)     # remove URLs
    text = re.sub(r"@\w+", " ", text)                 # remove @mentions
    text = re.sub(r"#(\w+)", r"\1", text)             # keep hashtag word
    text = re.sub(r"[^a-z\s]", " ", text)             # keep letters/spaces only
    text = re.sub(r"\s+", " ", text).strip()          # collapse spaces
    words = [w for w in text.split() if w not in STOPWORDS]
    return " ".join(words)
   
    

In [229]:
df["clean_tweet"] = df['tweet'].apply(clean_tweet)

In [230]:
df["clean_tweet"]

0        s woman shouldn t complain about cleaning up y...
1        boy dats cold tyga dwn bad cuffin dat hoe st p...
2        awg ou ever fuck bitch start cry ou be confuse...
3                                         look like tranny
4        shit hear about me might be true or might be f...
                               ...                        
24778    s muthaf lie right trash ow mine ible scriptur...
24779    ve gone broke wrong heart baby drove me rednec...
24780    young buck wanna eat dat nigguh like aint fuck...
24781                    youu got wild bitches tellin lies
24782    uffled tac ileen ahlia eautiful color combinat...
Name: clean_tweet, Length: 24783, dtype: object

In [231]:
def binary_label(x):
    if x==2:
        return 0
    else: 
        return 1
df["label"] = df["class"].apply(binary_label)
print(df["label"].value_counts())

label
1    20620
0     4163
Name: count, dtype: int64


In [232]:
df

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,clean_tweet,label
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,s woman shouldn t complain about cleaning up y...,0
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,boy dats cold tyga dwn bad cuffin dat hoe st p...,1
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,awg ou ever fuck bitch start cry ou be confuse...,1
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranny,1
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear about me might be true or might be f...,1
...,...,...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,s muthaf lie right trash ow mine ible scriptur...,1
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an...",ve gone broke wrong heart baby drove me rednec...,0
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wanna eat dat nigguh like aint fuck...,1
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies,youu got wild bitches tellin lies,1


In [233]:
df_clean = df[["clean_tweet","label"]].copy()

In [237]:
df_clean

Unnamed: 0,clean_tweet,label
0,s woman shouldn t complain about cleaning up y...,0
1,boy dats cold tyga dwn bad cuffin dat hoe st p...,1
2,awg ou ever fuck bitch start cry ou be confuse...,1
3,look like tranny,1
4,shit hear about me might be true or might be f...,1
...,...,...
24778,s muthaf lie right trash ow mine ible scriptur...,1
24779,ve gone broke wrong heart baby drove me rednec...,0
24780,young buck wanna eat dat nigguh like aint fuck...,1
24781,youu got wild bitches tellin lies,1


In [238]:
print(df_clean["label"].value_counts())

label
1    20620
0     4163
Name: count, dtype: int64


# Tokenize tweets

In [249]:
X_text = df_clean["clean_tweet"].tolist()
y = df_clean["label"].to_numpy()

# Splitting the data
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

In [251]:
vectorizer = TfidfVectorizer(
    max_features=10000,      # limit vocabulary size
    ngram_range=(1, 2),      # use unigrams and bigrams
    min_df=2,                # ignore terms that appear in < 2 documents
    max_df=0.95              # ignore terms that appear in > 95% of documents
)

In [253]:
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

print(f"Feature matrix shape: {X_train.shape}")
print(f"Vocabulary size: {len(vectorizer.get_feature_names_out())}")

Feature matrix shape: (19826, 10000)
Vocabulary size: 10000


In [255]:
svm_model = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_model.fit(X_train, y_train)

In [257]:
y_pred = svm_model.predict(X_test)


In [259]:
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.947

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.95      0.86       833
           1       0.99      0.95      0.97      4124

    accuracy                           0.95      4957
   macro avg       0.89      0.95      0.91      4957
weighted avg       0.95      0.95      0.95      4957



In [261]:
import pickle



In [265]:
model_data = {
    'vectorizer': vectorizer,
    'model': svm_model
}
with open('hate_speech_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)
print("Model saved!")

Model saved!
