The objective of this task is to detect hate speech in tweets. For the sake of simplicity, we say a tweet contains hate speech if it has a racist or sexist sentiment associated with it. So, the task is to distinguish racist or sexist tweets from all other tweets.

Formally, given a training sample of tweets and labels, where label '1' denotes that the tweet is racist/sexist and label '0' denotes that it is not, your objective is to predict the labels on the test dataset.


Import

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

List input files

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

Read

In [None]:
# Read the training and test CSV files into two DataFrames in one step.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv",
        "/kaggle/input/twitter-sentiment-analysis-hatred-speech/test.csv",
    )
)

In [None]:
# Cell output: the training frame loaded from train.csv.
train

In [None]:
# Cell output: the test frame loaded from test.csv.
test

Analyse label

In [None]:
# Plot the distribution of the values in the training 'label' column.
sns.displot(train['label'])

In [None]:
# Number of training rows for each distinct label value.
label_cnt = train.loc[:, 'label'].value_counts()
label_cnt

In [None]:
# Fraction of the training rows taken by each label value
# (per-label count divided by the total number of rows).
label_pct = train['label'].value_counts().div(len(train))
label_pct

Drop label

In [None]:
# Detach the target column: pop() returns it as a Series and removes it from
# `train` in place, so `train` keeps only the non-label columns.
label = train.pop('label')
train

Combine train and test

In [None]:
# Stack train on top of test so both are cleaned with exactly the same steps.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each frame's original index and row order
# (train rows first, then test rows).
combi = pd.concat([train, test])
combi

Clean tweets

In [None]:
# Work on the raw tweet text of the combined train + test frame.
tweets = combi['tweet']

# Total number of word tokens before any cleaning, as a reference point for
# the counts printed after each cleaning step.
word_lists = tweets.str.findall(r'(\w+)')
count_words = word_lists.str.len()
print(count_words.sum())

In [None]:
import re
from nltk.corpus import stopwords

# Clean the tweets: lowercase, split hashtags, keep letters only, drop very
# short tokens and English stopwords, then print the remaining word count.
tweets = tweets.str.lower()

# Turn hashtag markers into spaces BEFORE stripping other characters. The
# character filter below deletes '#', so replacing it afterwards would be a
# no-op and adjacent hashtags ("#a#b") would be glued into one token ("ab").
tweets = tweets.str.replace("#", " ", regex=False)

# Remove everything that is not a lowercase ASCII letter or whitespace
# (digits, punctuation, symbols). Raw string avoids the invalid-escape
# warning that "\s" triggers in a normal string literal.
non_letter = re.compile(r"[^a-z\s]")
tweets = tweets.apply(lambda x: non_letter.sub("", x))

# Remove words of one or two characters (only tokens longer than 2 are kept).
tweets = tweets.apply(lambda x: ' '.join([w for w in x.split() if len(w) > 2]))

# Remove English stopwords. The set gets its own name so the imported
# `stopwords` corpus module is not overwritten. The stopwords are passed
# through the same character filter as the tweets: the text no longer has
# apostrophes, so entries such as "don't" must become "dont" to match.
english_stopwords = {non_letter.sub("", word) for word in stopwords.words("english")}
tweets = tweets.apply(
    lambda x: " ".join(word for word in x.split() if word not in english_stopwords)
)

# Word count after cleaning, comparable with the count printed before it.
count_words = tweets.str.findall(r'(\w+)').str.len()
print(count_words.sum())

Remove frequently used words

In [None]:
# Count every token in the corpus and keep the 25 most frequent ones.
token_counts = pd.Series(' '.join(tweets).lower().split()).value_counts()
most_freq_words = token_counts.head(25)

# The words are the index labels of `most_freq_words`; materialise them as a
# set so the membership test below is explicit.
frequent_tokens = set(most_freq_words.index)
tweets = tweets.apply(
    lambda text: " ".join(tok for tok in text.split() if tok not in frequent_tokens)
)
print(most_freq_words)

# Word count after removing the most frequent tokens.
token_lists = tweets.str.findall(r'(\w+)')
count_words = token_lists.str.len()
print(count_words.sum())

Remove rare words

In [None]:
from collections import Counter
from itertools import chain

# Tokenise every tweet into a list of words.
v = tweets.str.split().tolist()

# Corpus-wide frequency of each word, accumulated tweet by tweet.
c = Counter()
for tokens in v:
    c.update(tokens)

# Keep only words that occur more than once in the whole corpus and rebuild
# each tweet as a space-separated string (`tweets` is a plain list from here on).
tweets = [' '.join(tok for tok in tokens if c[tok] > 1) for tokens in v]

# Word count after removing the rare words.
total_word = sum(len(text.split()) for text in tweets)
print(total_word)

Define X and y

In [None]:
# The first len(train) cleaned tweets are the training rows, because train was
# placed before test when the two frames were combined.
n_train = len(train)
X = np.array(tweets[:n_train])

# Target: the label column that was separated from `train` earlier.
y = label

Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled data for validation, stratified on the label,
# with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)
tuple(part.shape for part in (X_train, y_train, X_val, y_val))

TfIdf

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: ignore sklearn's English stop words, terms present in more
# than 70% of the documents, and terms present in fewer than 1% of them.
vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_df=0.7, min_df=0.01)

# Fit the vocabulary on the training split only, then reuse it for validation.
train_tfIdf = vectorizer_tfidf.fit_transform(X_train.astype('U'))
val_tfIdf = vectorizer_tfidf.transform(X_val.astype('U'))

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. It returns an array, so convert to a list
# to keep the printed output in the same format as before.
print(list(vectorizer_tfidf.get_feature_names_out()[:5]))


Select model

In [None]:
# Cell output: shapes of the training and validation TF-IDF matrices.
train_tfIdf.shape,  val_tfIdf.shape

In [None]:

from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features, with C=10 and up to 1000
# solver iterations.
model = LogisticRegression(C=10, max_iter=1000)
model.fit(train_tfIdf, y_train)

# Mean accuracy on the data the model was trained on.
train_accuracy = model.score(train_tfIdf, y_train)
print(train_accuracy)

Predict on validation set

In [None]:
# Predicted labels for the held-out validation split (used by later cells).
y_pred = model.predict(val_tfIdf)

# Mean accuracy on the validation split.
val_accuracy = model.score(val_tfIdf, y_val)
print(val_accuracy)

Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true validation labels against predicted labels.
val_confusion = confusion_matrix(y_true=y_val, y_pred=y_pred)
print(val_confusion)
