# Project: - Twitter Hate Speech Detection

# Name: - Siddhant Agrawal

# Roll no:- C201

# Class:- B.tech AIML

## Importing all required Libraries

In [None]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SIDDHANT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SIDDHANT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading the Data

In [3]:
data = pd.read_csv("train.csv")

In [4]:
stop_words = set(stopwords.words('english'))

## Data Preprocessing

In [5]:
def clean_tweet(text):
    #removing mentions or any urls
    text = re.sub(r'@[\w]*', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'#', '', text)
    
    # lowercasing and removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    
    # removing stopwords
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    
    return ' '.join(words)

In [6]:
data['cleaned_tweet'] = data['tweet'].apply(clean_tweet)

In [7]:
data.to_csv('cleaned_twitter_data.csv', index=False)

In [8]:
print(data[['cleaned_tweet']].head())

                                       cleaned_tweet
0  father dysfunctional selfish drags kids dysfun...
1  thanks lyft credit cant use cause dont offer w...
2                                     bihday majesty
3  model love u take u time urð± ðððð...
4                      factsguide society motivation


In [9]:
test_data = pd.read_csv('test.csv')

In [10]:
test_data['cleaned_tweet'] = test_data['tweet'].apply(clean_tweet)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
train_tweets = data['cleaned_tweet'] 
test_tweets = test_data['cleaned_tweet']

## Vectorization

In [13]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [14]:
X_train = tfidf_vectorizer.fit_transform(train_tweets)
X_test = tfidf_vectorizer.transform(test_tweets)

In [15]:
y_train = data['label'].values

In [16]:
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

In [17]:
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train: (31962, 5000)
Shape of X_test: (17197, 5000)


## Model Building

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.metrics import classification_report

In [19]:
model = Sequential()

In [20]:
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [21]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [22]:
model.fit(X_train_dense, y_train, epochs=5, batch_size=32, validation_split=0.1)

Epoch 1/5
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.9352 - loss: 0.2397 - val_accuracy: 0.9581 - val_loss: 0.1281
Epoch 2/5
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.9718 - loss: 0.0770 - val_accuracy: 0.9571 - val_loss: 0.1365
Epoch 3/5
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.9841 - loss: 0.0489 - val_accuracy: 0.9550 - val_loss: 0.1656
Epoch 4/5
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.9914 - loss: 0.0270 - val_accuracy: 0.9556 - val_loss: 0.1947
Epoch 5/5
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.9957 - loss: 0.0144 - val_accuracy: 0.9537 - val_loss: 0.2461


<keras.src.callbacks.history.History at 0x1843eb941d0>

In [23]:
y_pred = (model.predict(X_test_dense) > 0.5).astype("int32")

[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


## Model testing

In [24]:
def predict_hate_speech(text):
    # Step 1: Clean the text (using your previously defined cleaning function)
    cleaned_text = clean_tweet(text)  # Make sure this function is defined in your code

    # Step 2: Transform the cleaned text to TF-IDF features
    text_tfidf = tfidf_vectorizer.transform([cleaned_text]).toarray()  # Wrap in a list for single input

    # Step 3: Predict using the trained model
    prediction = model.predict(text_tfidf)

    # Step 4: Return the predicted label (1 or 0)
    return (prediction > 0.5).astype(int)[0][0]

In [31]:
sample_text = "thanks for helping me"
predicted_label = predict_hate_speech(sample_text)
if(predicted_label==0):
    print("Not a Hate Speech.")
else:
    print("Hate Speech!!!")
#print(f"Predicted Label: {predicted_label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Not a Hate Speech.
