In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Formally, given a training sample of tweets and labels, where label '1' denotes the tweet is racist/sexist and label '0' denotes the tweet is not racist/sexist, your objective is to predict the labels on the test dataset.

In [None]:
train=pd.read_csv('/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv',encoding="utf-8")

In [None]:
test=pd.read_csv('/kaggle/input/twitter-sentiment-analysis-hatred-speech/test.csv',encoding="utf-8")

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.isnull().sum()

In [None]:
train['label'].value_counts()

In [None]:
train.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Bar chart of how many training tweets carry each label.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=train, x='label', hue='label', ax=ax)


In [None]:
train['number_words']=train['tweet'].apply(lambda x:len(str(x).split()))

In [None]:
test['number_words']=test['tweet'].apply(lambda x:len(str(x).split()))

In [None]:
sns.distplot(train['number_words'],bins=30)

In [None]:
sns.__version__

In [None]:
plt.figure(figsize=(10,8))
p1=sns.kdeplot(train[train['label']==0]['number_words'],shade=True,color='b')
p2=sns.kdeplot(train[train['label']==1]['number_words'],shade=True,color='r')

In [None]:
train['tweet']

In [None]:
import nltk
lemma = nltk.WordNetLemmatizer()
stopwords=nltk.corpus.stopwords.words('english')
import re
import string

In [None]:
string.punctuation

In [None]:
def cleantext(x):
    """Normalise one raw tweet into a lowercase, lemmatized, space-separated string.

    Whitespace-separated tokens that begin with '@' are dropped, every
    character outside a-z/A-Z is replaced by a space, the text is lowercased,
    and each remaining word is passed through the module-level ``lemma``
    lemmatizer before being re-joined with single spaces.
    """
    kept = [token for token in x.split() if not token.startswith('@')]
    letters_only = re.sub('[^a-zA-Z]', ' ', " ".join(kept)).lower()
    return " ".join(lemma.lemmatize(word) for word in letters_only.split())
def remove_stopword(x):
    """Tokenize cleaned text and return its lemmatized non-stopword tokens.

    Args:
        x: A string (in this notebook, the output of ``cleantext``).

    Returns:
        list[str]: the tokens of ``x`` in their original order, excluding
        empty strings and any token found in the module-level ``stopwords``
        list; each kept token is passed through ``lemma.lemmatize``.
    """
    # Raw string: '\W' inside a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', x)
    # re.split yields '' for empty input and for leading/trailing separators;
    # skip those so they are never counted as words downstream.
    return [lemma.lemmatize(word) for word in tokens if word and word not in stopwords]

In [None]:
train['clean_data']=train['tweet'].apply(lambda x:cleantext(x))
test['clean_data']=test['tweet'].apply(lambda x:cleantext(x))

In [None]:
train['clean_data'][2:20]

In [None]:
train['clean_text']=train['clean_data'].apply(lambda x:remove_stopword(x))
test['clean_text']=test['clean_data'].apply(lambda x:remove_stopword(x))


In [None]:
train.head()

In [None]:
from collections import Counter

# Count every token across all cleaned training tweets.
top = Counter()
for token_list in train['clean_text']:
    top.update(token_list)

# Keep the 20 most frequent tokens, then discard the first row (the single most
# frequent token). NOTE(review): presumably that entry is an unwanted token — confirm.
df = pd.DataFrame(top.most_common(20), columns=['Common_words', 'count']).iloc[1:, :]
df.style.background_gradient(cmap='OrRd')

In [None]:
import plotly.express as px

In [None]:
figure=px.treemap(df,path=['Common_words'],values='count')
figure.show()

## Most common words by sentiment


In [None]:
train.head()

In [None]:
Positive_sent=train[train['label']==0]
Racist_sent=train[train['label']==1]

In [None]:
# Count every token across the cleaned tweets labelled 0.
top = Counter()
for token_list in Positive_sent['clean_text']:
    top.update(token_list)

# Keep the 20 most frequent tokens, then discard the first row (the single most
# frequent token). NOTE(review): presumably that entry is an unwanted token — confirm.
df_pos = pd.DataFrame(top.most_common(20), columns=['Common_words', 'count']).iloc[1:, :]
df_pos.style.background_gradient(cmap='OrRd')

In [None]:
# Count every token across the cleaned tweets labelled 1.
top = Counter()
for token_list in Racist_sent['clean_text']:
    top.update(token_list)

# Keep the 20 most frequent tokens, then discard the first row (the single most
# frequent token). NOTE(review): presumably that entry is an unwanted token — confirm.
df_racist = pd.DataFrame(top.most_common(20), columns=['Common_words', 'count']).iloc[1:, :]
df_racist.style.background_gradient(cmap='Greens')

In [None]:
figure=px.treemap(df_racist,path=['Common_words'],values='count')
figure.show()

Let's plot a word cloud
==

In [None]:
word_data_race=Racist_sent['clean_text']
word_data_race[1:10]

In [None]:
def words(data):
    """Flatten an iterable of token lists into one space-separated string.

    Every token has leading and trailing ``string.punctuation`` characters
    stripped. Tokens are joined with single spaces within each list, and the
    resulting per-list strings are joined with single spaces in turn.
    """
    return " ".join(
        " ".join(token.strip(string.punctuation) for token in tokens)
        for tokens in data
    )

In [None]:
final_racist=words(word_data_race)


In [None]:
from wordcloud import WordCloud

In [None]:
def word_cloud(x):
    """Render the text ``x`` as a word cloud on a black background and show it."""
    cloud_image = WordCloud(background_color="black").generate(x)
    _, ax = plt.subplots(figsize=(10, 7))
    ax.imshow(cloud_image, interpolation='bilinear')
    ax.axis("off")  # hide ticks and frame; only the image matters
    plt.show()

In [None]:
word_cloud(final_racist)

In [None]:
word_data_pos=Positive_sent['clean_text']
word_data_pos[0:10]

In [None]:
final_pos=words(word_data_pos)

In [None]:
word_cloud(final_pos)

Recurrent neural network
====

In [None]:
# training the tokenizer and use tokenizer to convert the sentences to sequences of numbers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['clean_data'])
X_train_seq = tokenizer.texts_to_sequences(train['clean_data'])
X_test_seq = tokenizer.texts_to_sequences(test['clean_data'])


In [None]:
# Bring every token-id sequence to a fixed length of 50 so they stack into one 2-D array.
train_seq_padded = pad_sequences(X_train_seq, maxlen=50)
test_seq_padded = pad_sequences(X_test_seq, maxlen=50)

In [None]:
df1=pd.DataFrame(train['number_words'])
df2=pd.DataFrame(test['number_words'])

In [None]:
from scipy.sparse import hstack
# Append the per-tweet word count (df1 / df2) as one extra column after the padded token ids.
# NOTE(review): these matrices are later densified and passed whole to a model whose
# first layer is an Embedding, so the word-count column will be looked up as if it
# were a token id — confirm this is intended.
train_rnn= hstack((train_seq_padded,df1))
test_rnn= hstack((test_seq_padded,df2))

In [None]:
print(train_rnn.shape,test_rnn.shape)
y_train=train['label']

In [None]:
len(tokenizer.index_word)

In [None]:
import keras
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
# Binary classifier: embedding -> LSTM -> dense hidden layer -> sigmoid output.
# The +1 presumably leaves room for id 0 (the padding value) — confirm.
vocab_size = len(tokenizer.index_word) + 1
model = Sequential([
    Embedding(vocab_size, 50),
    LSTM(32, dropout=0, recurrent_dropout=0),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
train_data=train_rnn.toarray()
test_data=test_rnn.toarray()

In [None]:
model.compile(optimizer='adam',loss='binary_crossentropy',
              metrics=['accuracy',keras.metrics.Precision(),keras.metrics.Recall()])

In [None]:
# Train for 10 epochs in mini-batches of 50. No validation data is passed, so
# every metric recorded in `history` is measured on the training data itself.
history = model.fit(train_data,
                    y_train,
                    batch_size=50, 
                    epochs=10)

In [None]:
# Training curves: one line per metric recorded by model.fit, indexed by epoch.
# All four series share the axes, so the title names them all rather than only the loss.
plt.title('Training loss and metrics')
plt.xlabel('epoch')
plt.plot(history.history['loss'],label='train_loss')
plt.plot(history.history['accuracy'],label='train_accuracy')
plt.plot(history.history['precision'],label='train_precision')
# Recall needs its own legend entry so it can be told apart from precision.
plt.plot(history.history['recall'],label='train_recall')
plt.legend()
plt.show()

In [None]:
pip install h5py

In [None]:
model.save('Rnn_model.h5')

### By the end of the 10th epoch we got the results below (measured on the training data):
Epoch 10/10
loss: 0.0032 - accuracy: 0.9991 - precision: 0.9951 - recall: 0.9924

In [None]:
from sklearn.metrics import precision_score
y_pred = model.predict(test_data,verbose=1)


In [None]:
# The network ends in a single sigmoid unit, so y_pred holds one probability per
# row. Threshold at 0.5 so the column contains 0/1 labels, as its name states.
test['Predicted labels_rnn'] = (y_pred.ravel() > 0.5).astype(int)

Random forest classifier
=====

TF-IDF vectorization
======

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram + bigram TF-IDF features, capped at 1000 terms (min_df=2).
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=1000)
# Learn the vocabulary and vectorize the training text in a single call.
tfidf_df = tfidf.fit_transform(train['clean_data']).toarray()
print('shape', tfidf_df.shape)

In [None]:
tfidf_test=tfidf.transform(test['clean_data']).toarray()

In [None]:
y_train=train['label']

In [None]:
from scipy.sparse import hstack
# Append the word-count column (df1 / df2) to the TF-IDF features with scipy's hstack.
# Both inputs are dense at this point: the TF-IDF matrices were already converted with .toarray().
train_feat= hstack((tfidf_df,df1))
test_feat=hstack((tfidf_test,df2))
print(train_feat.shape)
print(test_feat.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def gridcv(train_feat,test_feat,y_train):
    """Tune a random forest's ``max_depth`` with 5-fold CV and predict the test set.

    Args:
        train_feat: Training feature matrix.
        test_feat: Test feature matrix with the same columns as ``train_feat``.
        y_train: Training labels aligned with the rows of ``train_feat``.

    Returns:
        Predicted labels for ``test_feat`` from the best estimator found by
        the grid search (selected on cross-validated accuracy).
    """
    parameters = {'max_depth': list(range(1, 11))}
    search = GridSearchCV(RandomForestClassifier(), param_grid=parameters, n_jobs=-1,scoring='accuracy',cv=5)
    search.fit(train_feat, y_train)
    print('Best Estimator:      ',search.best_estimator_)
    print('Optimal parameters:  ',search.best_params_)
    print('optimal score:       ',search.best_score_*(100))
    print('-------------------------------------------------')

    # GridSearchCV refits the best configuration on the full training data by
    # default (refit=True), so best_estimator_ is ready to predict without a
    # second, redundant fit.
    return search.best_estimator_.predict(test_feat)

In [None]:
# Use the combined features (TF-IDF + word count) built in the previous cell, so the
# number_words column actually reaches the model instead of being computed and dropped.
pred_y=gridcv(train_feat,test_feat,y_train)

In [None]:
test['predicted_y']=pred_y

In [None]:
test['predicted_y'].value_counts()