**Importing Libraries and Packages**

In [2]:
import numpy as np                              # arithmetic on numerical data
import pandas as pd                             # arithmentic on Tabular  data
import seaborn as sns                           # Pictorial Visuaization Library(dataframe graphs)
import matplotlib.pyplot as plt                 # Pictorial Visuaization Library(static graphs)
from wordcloud import WordCloud, STOPWORDS      # Text Visualization
from sklearn.metrics import classification_report     # To get classification report(Acc,Prec,Rec,F1 score)
from sklearn.metrics import confusion_matrix          # To get performance of classifiaction model

**Reading Dataset**

In [3]:
# Load the primary training CSV into a DataFrame (path is relative to the notebook).
df_train=pd.read_csv("../input/train/Train_Sir.csv")

In [4]:
df_train.head()  # preview the first rows of the freshly loaded frame

**Getting Size of Dataset**

In [5]:
df_train.shape  # (number of rows, number of columns)

**Checking Null values**

In [6]:
# Per-column count of missing values.
df_train.isna().sum()

**Removing unnecessary columns from the dataset**

In [7]:
# Discard the columns that are not needed: task_2/task_3 first, then whichever
# column is left in first position of the frame.
df_train.drop(columns=['task_2', 'task_3'], inplace=True)
leading_column = df_train.columns[0]
df_train.drop(columns=[leading_column], inplace=True)

In [8]:
df_train.columns  # columns remaining after the drops above

In [9]:
df_train.head()  # preview the frame after removing the unused columns

In [10]:
df_train['task_1'].unique()  # distinct labels in task_1; these become the target classes

In [11]:
# Give the label column a self-explanatory name: task_1 -> class.
df_train = df_train.rename(columns={'task_1': 'class'})

In [12]:
# Encode the target as integers:
#   1 -> hate/offensive (HOF)
#   0 -> not hate/offensive (NOT)
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df_train['class']`: in-place mutation of a
# column selection is deprecated and silently does nothing under pandas
# copy-on-write. The 0/1 identity entries make the cell safe to re-run.
label_codes = {'HOF': 1, 'NOT': 0, 1: 1, 0: 0}
df_train['class'] = df_train['class'].map(label_codes)

# Fail fast on labels outside the mapping (they become NaN) rather than
# feeding them to the model later.
assert df_train['class'].notna().all(), "unexpected label found in 'class' column"
df_train['class'] = df_train['class'].astype(int)

In [13]:
df_train.head()  # confirm the class column now holds 0/1 codes

In [14]:
# Load a second CSV into df_train1.
# NOTE(review): in the cells visible here df_train1 is only inspected; it is never
# merged into df_train or used for training — confirm whether that is intended.
df_train1=pd.read_csv("../input/hatespeech/train.csv")

In [15]:
df_train1.head()  # preview the second dataset

In [16]:
# The id column is not needed; remove it from the second dataset.
df_train1.drop(columns=['id'], inplace=True)

In [17]:
# Rename the second dataset's columns to the names used in df_train:
# 'tweet' -> 'text' and 'label' -> 'class'.
df_train1.rename(columns = {'tweet':'text','label':'class'}, inplace = True)

In [18]:
df_train1.head()  # preview the second dataset after the drop and rename

**Checking Class Imbalance**

In [19]:
df_train['class'].value_counts()  # rows per class (0 and 1) in the primary dataset

In [20]:
df_train1['class'].value_counts()  # rows per class in the second dataset

In [21]:
# Bar chart of rows per class to visualise any class imbalance.
# The column must be passed as the keyword `x`: recent seaborn releases no longer
# accept it as a positional argument.
sns.countplot(x='class', data=df_train)

**Data Preprocessing**

1. Importing the libraries used for preprocessing
2. Downloading the English stopword list from the NLTK library




In [22]:
import re                                       # regular expressions used by clean_text
import string                                   # string.punctuation

import nltk                                     # text-processing toolkit
from nltk.corpus import stopwords               # stopword corpus reader
from nltk.stem import WordNetLemmatizer         # WordNet-based lemmatizer class

# clean_text() needs more than the stopword list: nltk.word_tokenize requires the
# "punkt" tokenizer models ("punkt_tab" on newer NLTK releases) and
# WordNetLemmatizer requires the "wordnet" corpus ("omw-1.4" on some releases).
# Without them a fresh environment raises LookupError on the first clean_text() call.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

stopword=set(stopwords.words('english'))        # English stopwords only, as a set for fast lookup

**Cleaning the data**

In [23]:
def clean_text(text):
    """Normalise one piece of raw text and return it as a single cleaned string.

    Steps, in order: lower-case; strip ``[...]`` spans, URLs, ``<...>`` tags,
    punctuation and any token containing a digit; drop English stopwords;
    tokenize with NLTK; lemmatize each token; re-join with single spaces.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned, space-separated string.
    """
    text = str(text).lower()
    # Raw-string patterns avoid the invalid-escape warnings ("\[", "\S", "\w").
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Split on any whitespace. Newlines were previously deleted outright, which
    # fused the words on either side of a line break into one token; and
    # splitting on ' ' alone let tab/newline-separated stopwords through.
    kept_words = [word for word in text.split() if word not in stopword]
    tokens = nltk.word_tokenize(" ".join(kept_words))
    # One lemmatizer per call instead of a new instance for every token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

In [24]:
# Run every entry of the text column through clean_text.
df_train['text'] = df_train['text'].map(clean_text)

In [25]:
df_train.head()  # preview the cleaned text

**Data Visualisation**

In [26]:
def make_wordcloud(df_train):
    """Draw a word cloud built from every entry of ``df_train.text``.

    Args:
        df_train: Object exposing a ``text`` attribute that iterates over the
            documents (each is converted with ``str()`` and lower-cased).
    """
    # Join whole documents with spaces. The earlier `" ".join(val)` was applied
    # to a single string, which inserts a space between every character and
    # produced a cloud of individual letters.
    comment_words = " ".join(str(val).lower() for val in df_train.text)
    wordcloud = WordCloud(width = 2000, height = 1000,random_state=1,stopwords=STOPWORDS).generate(comment_words)
    # Create the figure before drawing so the size and axis("off") apply to the
    # word cloud itself rather than to a new empty figure. The 2:1 size matches
    # the 2000x1000 image.
    plt.figure(figsize=[8,4])
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [27]:
make_wordcloud(df_train)  # word cloud of the cleaned text column

In [28]:
# Model input is the cleaned text; the target is the 0/1 class column.
x, y = df_train['text'], df_train['class']


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# The fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(len(features), len(labels))

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-n-grams counts (unigrams up to 5-grams, English stop words removed),
# with the vocabulary learned from the training split only.
count = CountVectorizer(
    ngram_range=(1, 5),
    stop_words='english',
)
x_train_vectorizer = count.fit_transform(x_train)

In [31]:
# Encode the test split with the vocabulary fitted on the training split.
x_test_vectorizer=count.transform(x_test)
# Show the sparse matrix summary instead of calling .toarray(): densifying a
# 1- to 5-gram count matrix allocates rows x vocabulary cells and can exhaust memory.
x_train_vectorizer

In [32]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw n-gram counts with TF-IDF; IDF statistics come from the
# training split and are then applied unchanged to the test split.
# The previous mid-cell `x_train_tfidf.toarray()` was removed: its result was
# discarded, so it only cost a large dense allocation.
tfidf = TfidfTransformer()
x_train_tfidf = tfidf.fit_transform(x_train_vectorizer)
x_test_tfidf = tfidf.transform(x_test_vectorizer)

In [33]:
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding,SpatialDropout1D
from tensorflow.keras.optimizers import Adam
#from keras.layers.core import Dense, Dropout, Activation
#from tensorflow.keras.optimizers import SGD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.models import Sequential

In [34]:
# Sequence encoding for the LSTM.
max_words = 50000   # passed as Tokenizer num_words and as the Embedding input size below
max_len = 300       # every sequence is padded/truncated to this length
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)                      # vocabulary is built from the training split only
sequences = tokenizer.texts_to_sequences(x_train)    # texts -> lists of integer word indices
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)  # fixed-width matrix for training

In [73]:
# Binary classifier: embedding -> LSTM -> small dense head with dropout between
# layers, ending in a single sigmoid unit.
model = Sequential([
    Embedding(max_words, 100, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(8, activation='relu'),
    Dropout(0.2),
    Dense(4, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
#from keras.callbacks import LearningRateScheduler

#scheduler = LearningRateScheduler(schedule, verbose=0)

In [81]:
from keras.callbacks import EarlyStopping,ModelCheckpoint

# Stop training once validation loss has failed to improve for 5 consecutive
# epochs, and ask Keras to put the best-epoch weights back on the model rather
# than leaving the weights from the last epoch run.
stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    restore_best_weights=True)

# Persist the weights of the best epoch (lowest validation loss).
# `filepath` must name a file: './' is a directory, and with
# save_weights_only=True Keras 3 requires the name to end in '.weights.h5'
# (older versions also accept this name and write HDF5).
checkpoint= ModelCheckpoint(
    filepath='best_model.weights.h5',
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [104]:
# Train on the padded training sequences; the last 20% of them are held out by
# Keras as the validation set that drives both callbacks.
# NOTE(review): with epochs=5 and an EarlyStopping patience of 5, early stopping
# cannot trigger in this run — raise epochs if it is meant to take effect.
history = model.fit(
    sequences_matrix,
    y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.2,
    callbacks=[stop, checkpoint],
    verbose=1,
)

In [105]:
# Encode the test texts with the tokenizer fitted on the training split and pad
# them to the same max_len as the training matrix.
test_sequences = tokenizer.texts_to_sequences(x_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [106]:
# Evaluate on the held-out test split. Because the model is compiled with
# metrics=['accuracy'], evaluate() returns [loss, accuracy] — a list, so it must
# be unpacked before formatting (multiplying the list by 100 would just repeat it).
accr = model.evaluate(test_sequences_matrix,y_test)
test_loss, test_accuracy = accr
print(f"Test loss: {test_loss:.4f}")
print(f"Test accuracy: {test_accuracy * 100:.2f}%")

In [100]:
#print("Accuracy =", (accr * 100), "%")

**Plotting Graph for Training Accuracy and Validation Accuracy**

In [110]:
# Training vs. validation accuracy per epoch.
# `accuracy` must be read from the history before it is used to size the epoch
# axis; the previous order raised NameError on a fresh kernel.
accuracy= history.history['accuracy']
val_acc= history.history['val_accuracy']
epochs=range(1,len(accuracy)+1)
plt.plot(epochs, accuracy, 'y', label='Training Accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation Accuracy')
plt.title('Training and validation Accuracy\n\n')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


**Plotting graph for Training and validation loss**

In [108]:
# Training vs. validation loss per epoch (epochs numbered from 1).
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)

for curve, line_format, curve_label in (
    (loss, 'y', 'Training Loss'),
    (val_loss, 'r', 'Validation Loss'),
):
    plt.plot(epochs, curve, line_format, label=curve_label)

plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

**Plotting Graph for Training Accuracy and Training Loss**

In [112]:
# Training accuracy and training loss on one chart, per epoch.
accuracy= history.history['accuracy']
loss= history.history['loss']
epochs = range(1, len(accuracy) + 1)
plt.plot(epochs, accuracy, 'y', label='Training Accuracy')
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.title('Training Accuracy and Training Loss\n\n')
plt.xlabel('Epochs')
# Both curves share this axis, so label it for both rather than "Accuracy" alone.
plt.ylabel('Accuracy / Loss')
plt.legend()
plt.show()

In [113]:
# Validation accuracy and validation loss on one chart, per epoch.
val_acc= history.history['val_accuracy']
val_loss= history.history['val_loss']
epochs = range(1, len(val_loss) + 1)
plt.plot(epochs, val_acc, 'y', label='Validation Accuracy')
plt.plot(epochs, val_loss, 'r', label='Validation Loss')
plt.title('Validation Accuracy and Validation Loss\n\n')
plt.xlabel('Epochs')
# Both curves share this axis, so label it for both rather than "Accuracy" alone.
plt.ylabel('Accuracy / Loss')
plt.legend()
plt.show()

In [54]:
# Predicted probability of class 1 for every test sequence (output of the
# model's final sigmoid unit).
lstm_prediction=model.predict(test_sequences_matrix)
# NOTE(review): this prints the full prediction array; consider showing a slice.
print("The probabilities are : ", lstm_prediction, sep='\n')

In [99]:
#Gets labels based on probability 1 if p>= 0.5 else 0
#for each in lstm_prediction:
    #if each[0] >=0.5:
   #     each[0] = 1
  #  else:
 #       each[0] = 0
#lstm_prediction = lstm_prediction.astype('int32') 
#print("\nThe labels are - ", lstm_prediction, sep='\n')

In [98]:
# Calculates accuracy on Test data
#print("\nThe accuracy of the model is ", accr(y_test, lstm_prediction))
#print("\nThe accuracy and other metrics are \n", classification_report(y_test, lstm_prediction, y=[0, 1]),sep='\n')

In [90]:
# Turn each predicted probability into a hard label: below 0.5 -> 0, otherwise 1.
res = [0 if prediction[0] < 0.5 else 1 for prediction in lstm_prediction]

In [91]:
# Confusion matrix of true test labels against the thresholded predictions.
print(confusion_matrix(y_test,res))

In [92]:
# Text report of classification metrics for the thresholded test predictions.
print(classification_report(y_test,res))

In [97]:
# Confusion matrix as an annotated heat map.
fig,ax = plt.subplots(figsize=(6, 6))
conf_matrix=confusion_matrix(y_test,res)
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=1)
# Write each cell's count at its centre (x is the column index, y the row index).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i,s=conf_matrix[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Model Predictions', fontsize=15)
# Rows are the true labels (first argument to confusion_matrix), not predictions.
plt.ylabel('Actual Labels', fontsize=15)
plt.title('Confusion Matrix\n\n\n', fontsize=20)
plt.show()

In [None]:
# Save the trained model to an HDF5 (.h5) file in the working directory.
model.save("Hate_&_Offensive_Detection_model.h5")