In [2]:
import pandas as pd

# Load dataset
train_df = pd.read_csv('/content/train_E6oV3lV.csv')

# Display first few rows
train_df.head()


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
# Check for missing values
print(train_df.isnull().sum())

# Fill missing values using different techniques
train_df['tweet'].fillna("No text available", inplace=True)  # Example replacement for missing tweets


id       0
label    0
tweet    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['tweet'].fillna("No text available", inplace=True)  # Example replacement for missing tweets


In [4]:
import numpy as np

# Define a function to remove outliers
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Example (if applicable numeric column exists)
# train_df = remove_outliers(train_df, 'column_name')


In [5]:
from scipy import stats
train_df['tweet_length'] = train_df['tweet'].apply(lambda x: len(x.split()))
train_df = train_df[(np.abs(stats.zscore(train_df['tweet_length'])) < 3)]


In [6]:
import re

def clean_text(text):
    text = re.sub(r'@\w+', '', text)  # Remove @mentions
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

train_df['cleaned_tweet'] = train_df['tweet'].apply(clean_text)
train_df.head()


Unnamed: 0,id,label,tweet,tweet_length,cleaned_tweet
0,1,0,@user when a father is dysfunctional and is s...,18,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,19,thanks for lyft credit i cant use cause they d...
2,3,0,bihday your majesty,3,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,14,model i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,4,factsguide society now motivation


In [7]:
!pip install -U spacy
!python -m spacy download en_core_web_sm  # Download small English model


Collecting spacy
  Downloading spacy-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<1.3.0,>=1.2.0 (from thinc<8.4.0,>=8.3.4->spacy)
  Downloading blis-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading spacy-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m84.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading blis-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
import spacy

# Load English NLP model
nlp = spacy.load("en_core_web_sm")


In [9]:
import re

def preprocess_text(text):
    # Remove user mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Convert to lowercase
    text = text.lower()

    # Process text with spaCy NLP model
    doc = nlp(text)

    # Lemmatization & Stopword Removal
    processed_tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

    # Join tokens back into a string
    return " ".join(processed_tokens)

# Example usage
sample_text = "@user This is an example of #hate speech detection using spaCy! Visit http://example.com"
processed_text = preprocess_text(sample_text)
print(processed_text)


example hate speech detection spacy visit


In [None]:
train_df['processed_tweet'] = train_df['tweet'].apply(preprocess_text)
train_df.head()


In [11]:
train_df.to_csv('/content/processed_train.csv', index=False)


In [12]:
train_df['label'].value_counts(normalize=True)


Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
0,0.929828
1,0.070172


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(train_df['processed_tweet'])


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000, stop_words='english')
X_count = count_vectorizer.fit_transform(train_df['processed_tweet'])


In [16]:
print("TF-IDF Shape:", X_tfidf.shape)
print("CountVectorizer Shape:", X_count.shape)

TF-IDF Shape: (31936, 5000)
CountVectorizer Shape: (31936, 5000)


In [18]:
import gensim
from gensim.models import Word2Vec

# Tokenize sentences for Word2Vec
sentences = [tweet.split() for tweet in train_df['processed_tweet']]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

# Convert tweets into embeddings (average word vectors per sentence)
def get_avg_word2vec(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return sum(vectors) / len(vectors) if vectors else np.zeros(model.vector_size)

# Apply Word2Vec embeddings
train_df['word2vec_features'] = train_df['processed_tweet'].apply(lambda x: get_avg_word2vec(x.split(), word2vec_model))

Model Building & Training

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Tokenization
tokenizer = Tokenizer(num_words=5000)  # Limit vocabulary to 5000 words
tokenizer.fit_on_texts(train_df['processed_tweet'])
X_seq = tokenizer.texts_to_sequences(train_df['processed_tweet'])

# Padding sequences to make them of equal length
X_padded = pad_sequences(X_seq, maxlen=100)  # Choose max length

# Prepare labels
y = train_df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Build LSTM Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"LSTM Model Accuracy: {accuracy:.4f}")

Epoch 1/5




[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 308ms/step - accuracy: 0.9311 - loss: 0.2414 - val_accuracy: 0.9588 - val_loss: 0.1238
Epoch 2/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 279ms/step - accuracy: 0.9661 - loss: 0.0943 - val_accuracy: 0.9623 - val_loss: 0.1180
Epoch 3/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 278ms/step - accuracy: 0.9752 - loss: 0.0716 - val_accuracy: 0.9637 - val_loss: 0.1374
Epoch 4/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 273ms/step - accuracy: 0.9822 - loss: 0.0532 - val_accuracy: 0.9612 - val_loss: 0.1351
Epoch 5/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 275ms/step - accuracy: 0.9847 - loss: 0.0468 - val_accuracy: 0.9601 - val_loss: 0.1516
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 44ms/step - accuracy: 0.9611 - loss: 0.1439
LSTM Model Accuracy: 0.9601


In [21]:
from tensorflow.keras.layers import Bidirectional

# Build BiLSTM Model
model_bilstm = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(1, activation='sigmoid')
])

# Compile model
model_bilstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
model_bilstm.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Evaluate model
loss, accuracy = model_bilstm.evaluate(X_test, y_test)
print(f"BiLSTM Model Accuracy: {accuracy:.4f}")

Epoch 1/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 755ms/step - accuracy: 0.9289 - loss: 0.2352 - val_accuracy: 0.9596 - val_loss: 0.1242
Epoch 2/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 611ms/step - accuracy: 0.9684 - loss: 0.0892 - val_accuracy: 0.9590 - val_loss: 0.1244
Epoch 3/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 612ms/step - accuracy: 0.9754 - loss: 0.0688 - val_accuracy: 0.9635 - val_loss: 0.1261
Epoch 4/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m224s[0m 559ms/step - accuracy: 0.9794 - loss: 0.0577 - val_accuracy: 0.9613 - val_loss: 0.1313
Epoch 5/5
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 558ms/step - accuracy: 0.9838 - loss: 0.0466 - val_accuracy: 0.9648 - val_loss: 0.1497
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 77ms/step - accuracy: 0.9660 - loss: 0.1369
BiLSTM Model Accuracy: 0.9648


In [23]:
import pickle

with open("nlp_hate_speech_model.pkl", "wb") as model_file:
    pickle.dump(model_bilstm, model_file)  # Save BiLSTM model

In [24]:
def predict_hate_speech(text):
    text_seq = tokenizer.texts_to_sequences([text])
    text_padded = pad_sequences(text_seq, maxlen=100)
    prediction = model_bilstm.predict(text_padded)
    return "Hate Speech" if prediction[0] > 0.5 else "Non-Hate Speech"

print(predict_hate_speech("I hate everything!"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Non-Hate Speech
