# AICrowd - Hate Speech Detection from tweets [Link](https://www.aicrowd.com/challenges/htspc-hate-speech-classification)

1. Load the data & Load word2vec model
2. Pre-process the data \\
    a. Basic tokenizing, stopwords removal \\
    b. For each tweet, get sentiment analysis \\
    c. For each tweet, do POS tagging and pick the adjectives & adverbs. \\
3. Once we have the sentiment analysis and adjective words for each tweet: \\
    a. Load "n" similar words for the adjective(s). \\
    b. Replace the adjective with the similar word to create more sentences. \\
    c. The label for the sentences will be the same as their parent sentences. \\
    d. Check the sentiment of the new sentences, this will be label_sent. \\
    e. In case of conflict between parent and the label_sent, give priority to label_sent.
4. Now that we have increased the dataset size, we proceed to perform classification using ML Models.

## Google Drive & Imports 

In [0]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

%cd /gdrive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [0]:
%matplotlib inline

!pip install vaderSentiment textstat
!pip install textblob

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/44/a3/1218a3b5651dbcba1699101c84e5c84c36cbba360d9dbf29f2ff18482982/vaderSentiment-3.3.1-py2.py3-none-any.whl (125kB)
[K     |████████████████████████████████| 133kB 2.7MB/s 
[?25hCollecting textstat
[?25l  Downloading https://files.pythonhosted.org/packages/60/af/0623a6e3adbcfda0be827664eacab5e02cd0a08d36f00013cb53784917a9/textstat-0.6.2-py3-none-any.whl (102kB)
[K     |████████████████████████████████| 102kB 5.2MB/s 
[?25hCollecting pyphen
[?25l  Downloading https://files.pythonhosted.org/packages/15/82/08a3629dce8d1f3d91db843bb36d4d7db6b6269d5067259613a0d5c8a9db/Pyphen-0.9.5-py2.py3-none-any.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 8.2MB/s 
[?25hInstalling collected packages: vaderSentiment, pyphen, textstat
Successfully installed pyphen-0.9.5 textstat-0.6.2 vaderSentiment-3.3.1


In [0]:
import seaborn as sn
import pandas as pd
import numpy as np
import re
import itertools
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textblob import TextBlob

from time import time
from collections import Counter, defaultdict
from bs4 import BeautifulSoup

from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

from gensim.models import Word2Vec, KeyedVectors

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.layers import Input, LSTM, Embedding, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, TimeDistributed, InputLayer, GlobalMaxPooling1D
from keras.models import Model, Sequential
from keras.callbacks.callbacks import Callback, EarlyStopping

  import pandas.util.testing as tm


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Using TensorFlow backend.


## 1. Load the Data & word2vec model

In [0]:
train_data = pd.read_csv("/gdrive/My Drive/Dataset/AICrowd/HateSpeechDetection/1fe720be-90e4-4e06-9b52-9de93e0ea937_train.csv")
test_data = pd.read_csv("/gdrive/My Drive/Dataset/AICrowd/HateSpeechDetection/f6eb0bd7-6063-4e50-baa0-111feda638fb_test.csv")

In [0]:
word2vec_model_path = "/gdrive/My Drive/Dataset/GoogleNews-vectors-negative300.bin.gz"

w = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
w.most_similar(positive=["fuck"])

  if np.issubdtype(vec.dtype, np.int):


[('fucking', 0.8137822151184082),
 ('f_*_ck', 0.801154613494873),
 ('f_**_k', 0.7815893888473511),
 ('shit', 0.7604621648788452),
 ('fucked', 0.7501130104064941),
 ('fuckin', 0.7309141755104065),
 ('f_***', 0.7172753810882568),
 ('f_ck', 0.7121477127075195),
 ('f_---', 0.7099311351776123),
 ('Fuck', 0.7066987752914429)]

## 2. Pre-process the data

### Basic Pre-processing

In [0]:
def clean_text(line):
    pat1 = r'@[A-Za-z0-9]+'
    pat2 = r'https?://[A-Za-z0-9./]+'
    combined_pat = r'|'.join((pat1, pat2))
    
    soup = BeautifulSoup(line, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    split = lower_case.split(" ")
    return " ".join(list(filter(None, split))).strip()

def get_indices(x): return np.argmax(x)

train_data["clean_text"] = train_data.text.apply(lambda x: clean_text(x))
train_data.head()

Unnamed: 0,text,labels,clean_text
0,@realDonaldTrump This is one of the worst time...,0,this is one of the worst times to be american ...
1,How about the crowd in Oval in today's #AUSvIN...,1,how about the crowd in oval in today s ausvind...
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0,biden his son hunter took advantage of their p...
3,#etsy shop: Benedict Donald so called presiden...,1,etsy shop benedict donald so called president ...
4,@realDonaldTrump Good build a wall around Arka...,0,good build a wall around arkansas fucktrump fu...


### Sentiment Analysis

In [0]:
def get_sentiment_1(text):
    analysis = TextBlob(text)
    val = analysis.sentiment.polarity
    if val > 0:
        return 1
    elif val < 0:
        return 0
    else:
        return -1

train_data["sentiment_1"] = train_data["clean_text"].apply(lambda x: get_sentiment_1(x))
train_data.head()

Unnamed: 0,text,labels,clean_text,sentiment_1
0,@realDonaldTrump This is one of the worst time...,0,this is one of the worst times to be american ...,0
1,How about the crowd in Oval in today's #AUSvIN...,1,how about the crowd in oval in today s ausvind...,-1
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0,biden his son hunter took advantage of their p...,0
3,#etsy shop: Benedict Donald so called presiden...,1,etsy shop benedict donald so called president ...,-1
4,@realDonaldTrump Good build a wall around Arka...,0,good build a wall around arkansas fucktrump fu...,1


In [0]:
sentiment_analyzer = VS()

def get_sentiment_2(text, sentiment_analyzer):
    sentiment = sentiment_analyzer.polarity_scores(text)
    negative, neutral, positive = sentiment["neg"], sentiment["neu"], sentiment["pos"]
    if negative > neutral and negative > positive:
        return 0
    elif positive > negative and positive > neutral:
        return 1
    else:
        return -1 


# for index, row in train_data.head(20).iterrows():
#     text, act_label, clean_text = row[0], row[1], row[2]
#     sentiment = sentiment_analyzer.polarity_scores(text)
#     clean_sentiment = sentiment_analyzer.polarity_scores(clean_text)
#     print(index, act_label, [sentiment["neg"], sentiment["pos"]], [clean_sentiment["neg"], clean_sentiment["pos"]])

train_data["sentiment_2"] = train_data["clean_text"].apply(lambda x: get_sentiment_2(x, sentiment_analyzer))
train_data.head()

Unnamed: 0,text,labels,clean_text,sentiment_1,sentiment_2
0,@realDonaldTrump This is one of the worst time...,0,this is one of the worst times to be american ...,0,-1
1,How about the crowd in Oval in today's #AUSvIN...,1,how about the crowd in oval in today s ausvind...,-1,-1
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0,biden his son hunter took advantage of their p...,0,-1
3,#etsy shop: Benedict Donald so called presiden...,1,etsy shop benedict donald so called president ...,-1,-1
4,@realDonaldTrump Good build a wall around Arka...,0,good build a wall around arkansas fucktrump fu...,1,-1


### POS Tagging

In [0]:
def get_pos_tags(txt, stopwords, target_forms):
    tokenized = sent_tokenize(txt) 
    words = list()
    for i in tokenized:
        words_list = nltk.word_tokenize(i) 
        words_list = [w for w in words_list if not w in stopwords] 
        tagged = nltk.pos_tag(words_list) 
        words += [word for word, pos_tag in tagged if pos_tag in target_forms]
    return words

In [0]:
target_forms = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS"]

stopwords = nltk.corpus.stopwords.words("english")
other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

train_data["target_words"] = train_data["clean_text"].apply(lambda x: get_pos_tags(x, stopwords, target_forms))
train_data.head()

Unnamed: 0,text,labels,clean_text,sentiment_1,sentiment_2,target_words
0,@realDonaldTrump This is one of the worst time...,0,this is one of the worst times to be american ...,0,-1,"[worst, american, serious, sure, wish, happy, ..."
1,How about the crowd in Oval in today's #AUSvIN...,1,how about the crowd in oval in today s ausvind...,-1,-1,[indian]
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0,biden his son hunter took advantage of their p...,0,-1,"[biden, ukraine, dead]"
3,#etsy shop: Benedict Donald so called presiden...,1,etsy shop benedict donald so called president ...,-1,-1,"[short, streetwear]"
4,@realDonaldTrump Good build a wall around Arka...,0,good build a wall around arkansas fucktrump fu...,1,-1,[good]


## 3. Data Augmentation

In [0]:
def get_top_k_words(model, word, k):
    try:
        similar = model.most_similar(positive=[word], topn=k)
    except KeyError as e:
        similar = []
    return similar

def replace_and_create(text, words, model, k=2):
    try:
        augmented_text = list()
        for word in words:
            similar_words = get_top_k_words(model, word, k)
            if len(similar_words) != 0:
                for each_word in similar_words:
                    augmented_text.append(text.replace(word, each_word[0]))
    except Exception as e:
        print(e, words, similar_words)
    return augmented_text

In [0]:
top_k = 2
start_time = 0
column = list()

print("Total no of rows : {}".format(train_data.shape[0]))

for index, row in train_data.iterrows():
    if index % 100 == 0:
        print(index, time() - start_time)
        start_time = time()
    clean_text, target_words = row[2], row[5]
    v = replace_and_create(text=clean_text, words=target_words, model=w, k=top_k)
    column.append(v)

train_data["augmented_texts"] = column
train_data.to_csv("/gdrive/My Drive/Dataset/AICrowd/HateSpeechDetection/Augmented_data.csv", index=False)
train_data.head()

Total no of rows : 5266
0 1588249769.7477138


  if np.issubdtype(vec.dtype, np.int):


100 63.0937922000885
200 61.92446231842041
300 69.8088641166687
400 69.11589050292969
500 76.36979961395264
600 57.246421575546265
700 62.71471452713013
800 53.33625388145447
900 69.03457260131836
1000 63.53877139091492
1100 75.59937620162964
1200 76.38743925094604
1300 66.55491161346436
1400 73.54179501533508
1500 72.98031759262085
1600 55.562992572784424
1700 69.28310656547546
1800 65.42160844802856
1900 66.79487609863281
2000 64.0601601600647
2100 65.84169673919678
2200 61.25098180770874
2300 78.31003427505493
2400 65.72362446784973
2500 57.83689641952515
2600 55.99736022949219
2700 60.053537368774414
2800 65.96400260925293
2900 65.80471229553223
3000 71.54007625579834
3100 75.94544100761414
3200 68.97627902030945
3300 62.50282025337219
3400 68.41259670257568
3500 66.74055171012878
3600 56.97658395767212
3700 65.18144488334656
3800 63.735482692718506
3900 73.65312361717224
4000 63.76209211349487
4100 72.79196238517761
4200 67.43505477905273
4300 76.05166125297546
4400 58.26291179656

Unnamed: 0,text,labels,clean_text,sentiment_1,sentiment_2,target_words,augmented_texts
0,@realDonaldTrump This is one of the worst time...,0,this is one of the worst times to be american ...,0,-1,"[worst, american, serious, sure, wish, happy, ...",[this is one of the Worst times to be american...
1,How about the crowd in Oval in today's #AUSvIN...,1,how about the crowd in oval in today s ausvind...,-1,-1,[indian],[how about the crowd in oval in today s ausvin...
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0,biden his son hunter took advantage of their p...,0,-1,"[biden, ukraine, dead]",[john_mccain his son hunter took advantage of ...
3,#etsy shop: Benedict Donald so called presiden...,1,etsy shop benedict donald so called president ...,-1,-1,"[short, streetwear]",[etsy shop benedict donald so called president...
4,@realDonaldTrump Good build a wall around Arka...,0,good build a wall around arkansas fucktrump fu...,1,-1,[good],[great build a wall around arkansas fucktrump ...


In [0]:
train_data.head(20)

Unnamed: 0,text,labels,clean_text,sentiment_1,sentiment_2,target_words,augmented_texts
0,@realDonaldTrump This is one of the worst time...,0,this is one of the worst times to be american ...,0,-1,"[worst, american, serious, sure, wish, happy, ...",[this is one of the Worst times to be american...
1,How about the crowd in Oval in today's #AUSvIN...,1,how about the crowd in oval in today s ausvind...,-1,-1,[indian],[how about the crowd in oval in today s ausvin...
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0,biden his son hunter took advantage of their p...,0,-1,"[biden, ukraine, dead]",[john_mccain his son hunter took advantage of ...
3,#etsy shop: Benedict Donald so called presiden...,1,etsy shop benedict donald so called president ...,-1,-1,"[short, streetwear]",[etsy shop benedict donald so called president...
4,@realDonaldTrump Good build a wall around Arka...,0,good build a wall around arkansas fucktrump fu...,1,-1,[good],[great build a wall around arkansas fucktrump ...
5,Meanwhile ....Dhoni's Reply To ICC ...... #...,1,meanwhile dhoni s reply to icc dhonikeeptheglo...,-1,-1,"[meanwhile, dhoni, icc]",[Meanwhile dhoni s reply to icc dhonikeepthegl...
6,@MeredthSalenger Anything to get a war to dist...,1,anything to get a war to distract fucktrump fu...,-1,-1,[distract],[anything to get a war to divert_attention fuc...
7,Why the FUCK did Doris mention demar lmfaooooo...,0,why the fuck did doris mention demar lmfaooooo...,0,0,"[fuck, doris]",[why the fucking did doris mention demar lmfao...
8,@KimKardashian #trump2020 #fucktrump Maybe yo...,0,trump fucktrump maybe you can hire the ex cons...,-1,-1,"[maybe, ex, dog, porn]",[trump fucktrump probably you can hire the ex ...
9,@matthewamiller Because there are no consequen...,0,because there are no consequences to individua...,1,-1,"[individual, basically, free, open, foreign, t...",[because there are no consequences to individu...


In [0]:
def most_frequent(List): 
    return max(set(List), key = List.count) 

def get_label(label, sent_1, sent_2):
    if sent_1 == -1 and sent_2 == -1:
        return label
    elif sent_1 in [0, 1]:
        if sent_2 in [0, 1]:
            return most_frequent([label, sent_1, sent_2])
        else:
            return label
    elif sent_2 in [0, 1]:
        if sent_1 in [0, 1]:
            return most_frequent([label, sent_1, sent_2])
        else:
            return label

In [0]:
final_data = defaultdict()
final_labels = defaultdict()

data_counter, label_counter = 0, 0

for index, row in train_data.iterrows():
    label, sent_1, sent_2, aug_texts = row[1], row[3], row[4], row[6]
    label = get_label(label, sent_1, sent_2)
    for at in aug_texts:
        final_data[data_counter] = at
        data_counter += 1
    for _ in range(len(aug_texts)):
        final_labels[label_counter] = label
        label_counter += 1


data_df = pd.DataFrame({"data": final_data, "label": final_labels})

## 4. Classification

In [0]:
MAX_NB_WORDS = 10000
MAX_SEQUENCE_LENGTH = 200
VALIDATION_SPLIT = 0.33
EMBEDDING_DIM = 200

EMBEDDING_PATH = "/gdrive/My Drive/Dataset/WordEmbeddings/glove.twitter.27B.200d.txt"

In [0]:
embeddings_index = {}
f = open(EMBEDDING_PATH)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [0]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(data_df["data"])

sequences = tokenizer.texts_to_sequences(data_df["data"])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(data_df["label"]))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]

x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]



Found 18543 unique tokens.
Shape of data tensor: (27420, 200)
Shape of label tensor: (27420, 2)


In [0]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [0]:
def mlp_model():
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Dense(100, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(10, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Flatten())

    model.add(Dense(1000, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(500, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(100, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(10, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation="softmax"))

    print(model.summary())

    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

    return model

In [0]:
print("Model used : {}".format("MLP Only"))

model = mlp_model()

cbacks = EarlyStopping(monitor='val_loss', patience=3)

model.fit(x_train, y_train, 
          validation_data=(x_val, y_val),
          epochs=100, 
          callbacks = [cbacks],
          batch_size=256)

loss, acc = model.evaluate(x_val, y_val)
print({"loss": loss, "acc": acc})

y_pred = list(map(get_indices, model.predict(x_val)))
y_true = list(map(get_indices, y_val))

print("F1 Score : {}".format(f1_score(y_true=y_true, y_pred=y_pred)))
print(classification_report(y_true=y_true, y_pred=y_pred))

Model used : MLP Only
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 200)          3708800   
_________________________________________________________________
dense_1 (Dense)              (None, 200, 100)          20100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 100)          0         
_________________________________________________________________
dense_2 (Dense)              (None, 200, 10)           1010      
_________________________________________________________________
dropout_2 (Dropout)          (None, 200, 10)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2000)              0         
_________________________________________________________________
dense_3 (Dense)              (No

In [0]:
def lstm_model():
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(LSTM(30, activation="relu", return_sequences=True, recurrent_dropout=0.3))
    model.add(Dropout(0.3))
    model.add(LSTM(30, activation="relu", return_sequences=True, recurrent_dropout=0.3))
    model.add(Dropout(0.3))
    model.add(LSTM(30, activation="relu", return_sequences=True, recurrent_dropout=0.3))
    model.add(Dropout(0.3))
    model.add(Flatten())
    model.add(Dense(1000, activation="relu"))
    model.add(Dense(100, activation="relu"))
    model.add(Dense(10, activation="relu"))
    model.add(Dense(2, activation="softmax"))
    print(model.summary())
    
    return model

In [0]:
cbacks = EarlyStopping(monitor='val_loss', patience=3)

print("Model used : {}".format("LSTM Only"))
model = lstm_model()

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.fit(x_train, y_train, 
          validation_data=(x_val, y_val),
          epochs=50, 
          callbacks = [cbacks],
          batch_size=256)

loss, acc = model.evaluate(x_val, y_val)
print({"loss": loss, "acc": acc})

y_pred = list(map(get_indices, model.predict(x_val)))
y_true = list(map(get_indices, y_val))

print("F1 Score : {}".format(f1_score(y_true=y_true, y_pred=y_pred)))
print(classification_report(y_true=y_true, y_pred=y_pred))

Model used : LSTM Only
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 200)          3708800   
_________________________________________________________________
lstm_1 (LSTM)                (None, 200, 30)           27720     
_________________________________________________________________
dropout_7 (Dropout)          (None, 200, 30)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 200, 30)           7320      
_________________________________________________________________
dropout_8 (Dropout)          (None, 200, 30)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 200, 30)           7320      
_________________________________________________________________
dropout_9 (Dropout)          (N

In [0]:
def one_dimensional_cnn():
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    
    model.add(Conv1D(128, 4, activation='relu'))
    model.add(Conv1D(128, 4, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.4))
    model.add(Conv1D(128, 4, activation='relu'))
    model.add(Conv1D(128, 4, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.4))
    model.add(Conv1D(64, 4, activation='relu'))
    model.add(Conv1D(64, 4, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.4))
    model.add(Conv1D(64, 4, activation='relu'))
    model.add(Conv1D(64, 4, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.4))

    model.add(Flatten())

    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.6))
    model.add(Dense(10, activation='relu'))
    model.add(Dropout(0.6))
    model.add(Dense(2, activation='softmax'))
    
    print(model.summary())
    
    return model

In [0]:
cbacks = EarlyStopping(monitor='val_loss', patience=3)

print("Model used : {}".format("CNN Only"))
model = one_dimensional_cnn()

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.fit(x_train, y_train, 
          validation_data=(x_val, y_val),
          epochs=50,
          callbacks = [cbacks],
          batch_size=64)

loss, acc = model.evaluate(x_val, y_val)
print({"loss": loss, "acc": acc})

y_pred = list(map(get_indices, model.predict(x_val)))
y_true = list(map(get_indices, y_val))

print("F1 Score : {}".format(f1_score(y_true=y_true, y_pred=y_pred)))
print(classification_report(y_true=y_true, y_pred=y_pred))

In [0]:
cm = confusion_matrix(y_true=y_true, y_pred=y_pred)

labels = [0, 1]
title='Confusion matrix'
print(cm)

plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(labels))
plt.xticks(tick_marks, labels, rotation=45)
plt.yticks(tick_marks, labels)
fmt = 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], fmt),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()

In [0]:
def cnn_lstm():
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(Conv1D(128, 4, activation='relu'))
    model.add(Conv1D(128, 4, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.4))
    model.add(Conv1D(128, 4, activation='relu'))
    model.add(Conv1D(128, 4, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.4))
    model.add(Conv1D(64, 4, activation='relu'))
    model.add(Conv1D(64, 4, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.4))

    model.add(LSTM(30, activation="relu", return_sequences=True, recurrent_dropout=0.3))
    model.add(Dropout(0.3))
    model.add(LSTM(30, activation="relu", return_sequences=True, recurrent_dropout=0.3))
    model.add(Dropout(0.3))
    model.add(LSTM(30, activation="relu", return_sequences=True, recurrent_dropout=0.3))
    model.add(Dropout(0.3))

    model.add(Flatten())

    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(10, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation='softmax'))

    print(model.summary())
    
    return model

In [0]:
cbacks = EarlyStopping(monitor='val_loss', patience=3)

print("Model used : {}".format("CNN + LSTM Only"))
model = cnn_lstm()

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.fit(x_train, y_train, 
          validation_data=(x_val, y_val),
          epochs=50, 
          callbacks = [cbacks],
          batch_size=256)

loss, acc = model.evaluate(x_val, y_val)
print({"loss": loss, "acc": acc})

y_pred = list(map(get_indices, model.predict(x_val)))
y_true = list(map(get_indices, y_val))

print("F1 Score : {}".format(f1_score(y_true=y_true, y_pred=y_pred)))
print(classification_report(y_true=y_true, y_pred=y_pred))

# Testing

In [0]:
def clean_text(line):
    pat1 = r'@[A-Za-z0-9]+'
    pat2 = r'https?://[A-Za-z0-9./]+'
    combined_pat = r'|'.join((pat1, pat2))
    
    soup = BeautifulSoup(line, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    split = lower_case.split(" ")
    return " ".join(list(filter(None, split))).strip()

def get_indices(x): return np.argmax(x)

test_data["clean_text"] = train_data["text"].apply(lambda x: clean_text(x))
test_data.head()

In [0]:
tokenizer.fit_on_texts(test_data["clean_text"])
sequences = tokenizer.texts_to_sequences(test_data["clean_text"])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)

In [0]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [0]:
preds = model.predict(data)

y_pred = list(map(get_indices, preds))
pd.DataFrame({"label": y_pred}).to_csv("/gdrive/My Drive/Dataset/AICrowd/HateSpeechDetection/submission_satwik.csv", index=False)