In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup # Text Cleaning
import re, string # Regular Expressions, String
from nltk.corpus import stopwords # stopwords
from nltk.stem.porter import PorterStemmer # for word stemming
from nltk.stem import WordNetLemmatizer # for word lemmatization
from keras.preprocessing.sequence import pad_sequences
import unicodedata
import html
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline


import multiprocessing

In [17]:
train = pd.read_csv("./input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
test = pd.read_csv('./input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')
all_data = pd.read_csv('./input/jigsaw-unintended-bias-in-toxicity-classification/all_data.csv')
sub = pd.read_csv('./input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv')

# Preprocessing

In [18]:
# Creating a cleaning function

# set of stopwords to be removed from text
stop = set(stopwords.words('english'))

# update stopwords to have punctuation too
stop.update(list(string.punctuation))

def clean_text(text):
    
    # Remove unwanted html characters
    re1 = re.compile(r'  +')
    x1 = text.lower().replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
    'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
    '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
    ' @-@ ', '-').replace('\\', ' \\ ')
    text = re1.sub(' ', html.unescape(x1))
    
    # remove non-ascii characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    #     # strip html
    #     soup = BeautifulSoup(text, 'html.parser')
    #     text = soup.get_text()
    
    # remove between square brackets
    text = re.sub('\[[^]]*\]', '', text)
    
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # remove twitter tags
    text = text.replace("@", "")
    
    # remove hashtags
    text = text.replace("#", "")
    
    # remove all non-alphabetic characters
    text = re.sub(r'[^a-zA-Z ]', '', text)
    
    # remove stopwords from text
    final_text = []
    for word in text.split():
        if word.strip().lower() not in stop:
            final_text.append(word.strip().lower())
    
    text = " ".join(final_text)
    
    # lemmatize words
    lemmatizer = WordNetLemmatizer()    
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    text = " ".join([lemmatizer.lemmatize(word, pos = 'v') for word in text.split()])
    
    # replace all numbers with "num"
    text = re.sub("\d", "num", text)
    
    return text.lower()

In [19]:
train.shape

(1804874, 45)

In [20]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1804874 entries, 0 to 1804873
Data columns (total 45 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   id                                   int64  
 1   target                               float64
 2   comment_text                         object 
 3   severe_toxicity                      float64
 4   obscene                              float64
 5   identity_attack                      float64
 6   insult                               float64
 7   threat                               float64
 8   asian                                float64
 9   atheist                              float64
 10  bisexual                             float64
 11  black                                float64
 12  buddhist                             float64
 13  christian                            float64
 14  female                               float64
 15  heterosexual                    

In [21]:
train

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.000000,Thank you!! This would make my life a lot less...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.000000,This is such an urgent design problem; kudos t...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.000000,Is this something I'll be able to install on m...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.000000,0.021277,0.872340,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1804869,6333967,0.000000,"Maybe the tax on ""things"" would be collected w...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,399385,approved,0,0,0,0,0,0.0,0,4
1804870,6333969,0.000000,What do you call people who STILL think the di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,399528,approved,0,0,0,0,0,0.0,0,4
1804871,6333982,0.000000,"thank you ,,,right or wrong,,, i am following ...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,399457,approved,0,0,0,0,0,0.0,0,4
1804872,6334009,0.621212,Anyone who is quoted as having the following e...,0.030303,0.030303,0.045455,0.621212,0.0,,,...,399519,approved,0,0,0,0,0,0.0,0,66


In [4]:
train = train.dropna()

In [22]:
train.shape

(1804874, 45)

In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1804874 entries, 0 to 1804873
Data columns (total 45 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   id                                   int64  
 1   target                               float64
 2   comment_text                         object 
 3   severe_toxicity                      float64
 4   obscene                              float64
 5   identity_attack                      float64
 6   insult                               float64
 7   threat                               float64
 8   asian                                float64
 9   atheist                              float64
 10  bisexual                             float64
 11  black                                float64
 12  buddhist                             float64
 13  christian                            float64
 14  female                               float64
 15  heterosexual                    

In [24]:
train['comment_text'] = train['comment_text'].apply(clean_text)
test['comment_text'] = test['comment_text'].apply(clean_text)

In [25]:
train['target'] = np.where(train['target']>0.5,1.0,0.0)

In [26]:
X = train.drop('target', axis=1)
Y = train.target

In [27]:
##### Vocabulary size
voc_size = 100000

In [28]:
train.comment_text

0          cool like would want mother read really great ...
1          thank would make life lot le anxietyinducing k...
2                urgent design problem kudos take impressive
3                    something ill able install site release
4                                       haha guy bunch loser
                                 ...                        
1804869    maybe tax thing would collect product import r...
1804870         call people still think divine role creation
1804871                      thank right wrong follow advice
1804872    anyone quote follow exchange even apocryphal w...
1804873    student define ebd legally disable eligible sp...
Name: comment_text, Length: 1804874, dtype: object

In [29]:
corpus = []
corpus = train.comment_text.copy()

In [30]:
corpus

0          cool like would want mother read really great ...
1          thank would make life lot le anxietyinducing k...
2                urgent design problem kudos take impressive
3                    something ill able install site release
4                                       haha guy bunch loser
                                 ...                        
1804869    maybe tax thing would collect product import r...
1804870         call people still think divine role creation
1804871                      thank right wrong follow advice
1804872    anyone quote follow exchange even apocryphal w...
1804873    student define ebd legally disable eligible sp...
Name: comment_text, Length: 1804874, dtype: object

In [36]:
from tensorflow.keras.preprocessing.text import one_hot

In [38]:
onehot_repr = [one_hot(words, voc_size) for words in corpus]
onehot_repr

[[35070, 72683, 60615, 74689, 87444, 92940, 17150, 17104, 92480, 86475, 61809],
 [12547,
  60615,
  50906,
  66266,
  51245,
  30082,
  48186,
  24209,
  1498,
  35769,
  70652,
  59072,
  12244],
 [33133, 27876, 19876, 66048, 18108, 2994],
 [89409, 42659, 18959, 36727, 84903, 37505],
 [91534, 46395, 17072, 63209],
 [35709, 1434, 71203],
 [42180, 93280],
 [91212],
 [55268,
  56262,
  50433,
  54413,
  89207,
  78067,
  85683,
  29512,
  21129,
  35112,
  53913,
  88196],
 [17104, 26939, 6627, 87649, 35429, 58950, 65491],
 [97261, 96378, 17104],
 [17104, 58931, 18520, 57307, 28888, 73611, 38560, 5686, 71713, 39709],
 [56262, 72683, 33450, 85683, 40929],
 [80778, 46395, 75171, 68614, 49036, 12803, 28962, 50906, 46522],
 [58931,
  59072,
  80778,
  6214,
  1046,
  18677,
  33283,
  46395,
  48802,
  81227,
  17150,
  63971,
  30510,
  85519,
  85375,
  63971,
  81855,
  67170,
  94699,
  62771,
  62042],
 [18901,
  1498,
  74689,
  63328,
  56989,
  55890,
  81855,
  30857,
  77876,
  186

In [42]:
sent_length = 300
embedded_matrix = pad_sequences(onehot_repr, padding = "pre", maxlen = sent_length)

In [43]:
print(embedded_matrix)

[[    0     0     0 ... 92480 86475 61809]
 [    0     0     0 ... 70652 59072 12244]
 [    0     0     0 ... 66048 18108  2994]
 ...
 [    0     0     0 ...  8143 19952  8807]
 [    0     0     0 ... 20257 76325  3515]
 [    0     0     0 ...  5366 91124 19876]]


In [49]:
from keras.models import Sequential
from keras.layers import Bidirectional


## Creating model
embedding_vector_features = 40
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Bidirectional(LSTM(100)))
model1.add(Dropout(0.3))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 40)           4000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               112800    
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 4,113,001
Trainable params: 4,113,001
Non-trainable params: 0
_________________________________________________________________
None


In [52]:
len(embedded_matrix),Y.shape

(1804874, (1804874,))

In [53]:
X_final=np.array(embedded_matrix)
Y_final=np.array(Y)

In [54]:
X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y_final, test_size=0.2)

# Modeling

In [57]:
### Finally Training
model1.fit(X_train, Y_train, validation_data=(X_test, Y_test),  epochs=3, batch_size=1024)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x15906d299d0>

In [58]:
model1.save('lstm_toxicity_classifier.h5')

In [59]:
test_corpus = test.comment_text.copy()

In [60]:
test_onehot = [one_hot(words, voc_size) for words in test_corpus]

In [61]:
test_matrix = pad_sequences(test_onehot, padding = "pre", maxlen = sent_length)

In [62]:
preds = model1.predict(test_matrix)
preds

array([[7.2246790e-03],
       [3.0517280e-03],
       [3.0362308e-03],
       ...,
       [8.5994887e-01],
       [7.6095462e-03],
       [4.7856569e-04]], dtype=float32)

In [63]:
sub['prediction'] = preds

In [64]:
sub.to_csv('submission.csv',index=False)

In [65]:
sub

Unnamed: 0,id,prediction
0,7097320,0.007225
1,7097321,0.003052
2,7097322,0.003036
3,7097323,0.000418
4,7097324,0.000254
...,...,...
97315,7194635,0.000584
97316,7194636,0.001913
97317,7194637,0.859949
97318,7194638,0.007610
