In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.
import time
import gc
import random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
import torch.nn as nn
from torch.utils import data
import torch.nn.functional as F
import copy

['fasttext-crawl-300d-2m', 'jigsaw-unintended-bias-in-toxicity-classification', 'glove840b300dtxt']


Using TensorFlow backend.


In [2]:
mode='run'
valid=True
k_fold=True
n_fold=5

In [3]:
crawl_emb_path='../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
glove_emb_path='../input/glove840b300dtxt/glove.840B.300d.txt'

In [4]:
num_models=1
lstm_units=128
dense_hidden_units=4*lstm_units
max_len=220

In [5]:
def get_coeffs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
def load_embeddings(path):
    f=open(path)
    return(dict(get_coeffs(*line.strip().split(' ')) for line in tqdm(f)))
def build_matrix(word_index,path):
    unknown_words=[]
    embedding_index=load_embeddings(path)
    embedding_matrix=np.zeros((len(word_index)+1,300))
    for word,i in word_index.items():
           try:
                embedding_matrix[i]=embedding_index[word]
           except KeyError:
                unknown_words.append(word)
    return(embedding_matrix, unknown_words)
           

In [6]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [7]:
# disable progress bars when submitting
def is_interactive():
    return 'SHLVL' not in os.environ

if not is_interactive():
    def nop(it, *a, **k):
        return it

    tqdm = nop

In [8]:
train=pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test=pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')


In [9]:
train.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:45.222647+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:47.601894+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-29 10:50:48.488476+00,2,,2006,rejected,0,0,0,1,0,0.0,4,47


In [10]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot",
                       "'cause": "because", "could've": "could have", "couldn't": "could not",
                       "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                       "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", 
                       "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                       "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
                       "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                       "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                       "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
                       "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", 
                       "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", 
                       "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                       "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
                       "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                       "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", "should've": "should have",
                       "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                       "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have",
                       "that's": "that is", "there'd": "there would", "there'd've": "there would have", 
                       "there's": "there is", "here's": "here is","they'd": "they would", 
                       "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
                       "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
                       "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
                       "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", 
                       "what'll": "what will", "what'll've": "what will have", "what're": "what are",  
                       "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have",
                       "where'd": "where did", "where's": "where is", "where've": "where have", 
                       "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
                       "won't": "will not", "won't've": "will not have", "would've": "would have",
                       "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", 
                       "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
                       "y'all've": "you all have","you'd": "you would", "you'd've": "you would have",
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are",
                       "you've": "you have" }

In [11]:
def clean_contractions(text,mapping):
    specials = ["’", "‘", "´", "`"]
    for char in specials:
        text=text.replace(char,"‘")
    text=text.lower()
    text=text.replace(" i "," I ")
    text=' '.join([mapping[t] if t in mapping.keys() else t for t in text.split(' ')])
    return text

In [12]:
train['comment_text']

0          This is so cool. It's like, 'would you want yo...
1          Thank you!! This would make my life a lot less...
2          This is such an urgent design problem; kudos t...
3          Is this something I'll be able to install on m...
4                       haha you guys are a bunch of losers.
5                                       ur a sh*tty comment.
6                                hahahahahahahahhha suck it.
7                                        FFFFUUUUUUUUUUUUUUU
8          The ranchers seem motivated by mostly by greed...
9          It was a great show. Not a combo I'd of expect...
10                                   Wow, that sounds great.
11         This is a great story. Man. I wonder if the pe...
12            This seems like a step in the right direction.
13         It's ridiculous that these guys are being call...
14         This story gets more ridiculous by the hour! A...
15         I agree; I don't want to grant them the legiti...
16         Interesting. 

In [13]:
x_train=train['comment_text'].astype(str).apply(lambda x: clean_contractions(x,contraction_mapping))

In [14]:
x_train

0          this is so cool. it is like, 'would you want y...
1          thank you!! this would make my life a lot less...
2          this is such an urgent design problem; kudos t...
3          is this something i will be able to install on...
4                       haha you guys are a bunch of losers.
5                                       ur a sh*tty comment.
6                                hahahahahahahahhha suck it.
7                                        ffffuuuuuuuuuuuuuuu
8          the ranchers seem motivated by mostly by greed...
9          it was a great show. not a combo i would of ex...
10                                   wow, that sounds great.
11         this is a great story. man. I wonder if the pe...
12            this seems like a step in the right direction.
13         it is ridiculous that these guys are being cal...
14         this story gets more ridiculous by the hour! a...
15         i agree; I do not want to grant them the legit...
16         interesting. 

In [15]:
x_test=test['comment_text'].astype(str).apply(lambda x: clean_contractions(x,contraction_mapping))

In [16]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'

In [17]:
def clean_punct(text):
    for sym in punct:
            text=text.replace(sym,' ')
    return text

In [18]:
x_train=x_train.astype(str).apply(lambda x: clean_punct(x))
x_test=x_test.astype(str).apply(lambda x: clean_punct(x))
y_train=np.where(train['target']>=0.5,1,0)
# y_train=train['target'].values
y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack',
                     'insult', 'threat']]
num_aux_targets=len(y_aux_train)


In [19]:
y_aux_train=y_aux_train.values

In [20]:
tokenizer=text.Tokenizer(num_words=100000)
tokenizer.fit_on_texts(list(x_train)+list(x_test))

In [21]:
x_train=tokenizer.texts_to_sequences(x_train)
x_test=tokenizer.texts_to_sequences(x_test)

In [22]:
x_train=sequence.pad_sequences(x_train, maxlen=max_len)
x_test=sequence.pad_sequences(x_test,maxlen=max_len)

In [23]:
len(x_train)

1804874

In [24]:
crawl_matrix, crawl_unk_words=build_matrix(tokenizer.word_index,crawl_emb_path)
glove_matrix, glove_unk_words=build_matrix(tokenizer.word_index,glove_emb_path)


In [25]:
embedding_matrix=np.concatenate([crawl_matrix,glove_matrix],axis=-1)
del crawl_matrix
del glove_matrix
gc.collect()


0

In [26]:
max_features=len(tokenizer.word_index)+1
max_features

327573

In [27]:
embedding_matrix.shape

(327573, 600)

In [28]:
del contraction_mapping

gc.collect()

0

In [29]:
# x_train_torch = torch.tensor(x_train, dtype=torch.long).cuda()
# x_test_torch = torch.tensor(x_test, dtype=torch.long).cuda()
# y_train_torch = torch.tensor(np.hstack([y_train[:, np.newaxis], y_aux_train]), dtype=torch.float32).cuda()

In [30]:
val_split=0.97
if (mode=='run'):
    len_train=len(x_train)
if (mode=='test'):
    len_train=int(len(x_train)*0.2)
indices=np.random.permutation(len_train)    
if (valid & (not k_fold)):    
    x_train1=x_train[indices][:int(len_train*val_split)]
    y_train1=y_train[indices][:int(len_train*val_split)]
    y_aux_train1=y_aux_train[indices][:int(len_train*val_split)]
    x_val=x_train[indices][int(len_train*val_split):]
    y_val=y_train[indices][int(len_train*val_split):]
    y_aux_val=y_aux_train[indices][int(len_train*val_split):]
else:
    if (not valid):
        x_train1=x_train[indices]
        y_train1=y_train[indices] 
        y_aux_train1=y_aux_train[indices]
    if (valid & k_fold):
        x_val=x_train[indices][int(len_train*0.2*(n_fold-1)): int(len_train*0.2*(n_fold))]
        y_val=y_train[indices][int(len_train*0.2*(n_fold-1)): int(len_train*0.2*(n_fold))]
        y_aux_val=y_aux_train[indices][int(len_train*0.2*(n_fold-1)):int(len_train*0.2*(n_fold))]
        x_train1=np.concatenate((x_train[indices][:int(len_train*0.2*(n_fold-1))],
                                x_train[indices][int(len_train*0.2*(n_fold)):]),axis=0)
        y_train1=np.concatenate((y_train[indices][:int(len_train*0.2*(n_fold-1))],
                                y_train[indices][int(len_train*0.2*(n_fold)):]),axis=0) 
        y_aux_train1=np.concatenate((y_aux_train[indices][:int(len_train*0.2*(n_fold-1))],
                                 y_aux_train[indices][int(len_train*0.2*(n_fold)):]),axis=0) 

if (mode=='run'):
    del x_train,y_train,y_aux_train
    gc.collect()
   

In [31]:
# train_data1=train_df.iloc[indices]
# val_data=train_data1[int(m*0.2):int(m*0.4)]
# train_data=train_data[:int(m*0.8)]
# train_data=pd.concat([train_data1[:int(m*0.2)],train_data1[int(m*0.4):]],axis=0)


In [32]:
x_train_tensor=torch.tensor(x_train1, dtype=torch.long).cuda()
del x_train1
y_train_tensor=torch.tensor(np.hstack([np.expand_dims(y_train1,1),y_aux_train1]), 
                            dtype=torch.long).cuda()
del y_train1
if (valid):
    x_val_tensor=torch.tensor(x_val, dtype=torch.long).cuda()
    del x_val
    y_val_tensor=torch.tensor(np.hstack([np.expand_dims(y_val,1),y_aux_val]), 
                                dtype=torch.long).cuda()
    del y_val
x_test_tensor=torch.tensor(x_test,dtype=torch.long).cuda()
del x_test
gc.collect()

0

In [33]:
train_dataset=data.TensorDataset(x_train_tensor,y_train_tensor)
test_dataset=data.TensorDataset(x_test_tensor)
if (valid):
    val_dataset=data.TensorDataset(x_val_tensor,y_val_tensor)

In [34]:
test_dataloader=data.DataLoader(test_dataset,batch_size=512,shuffle=False)

In [35]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [36]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x

In [37]:
class NeuralNet(nn.Module):
    def __init__(self,embedding_matrix,num_aux_targets):
        super().__init__()
        embed_size=embedding_matrix.shape[1]
        self.embedding=nn.Embedding(max_features,embed_size)
        self.embedding.weight=nn.Parameter(torch.tensor(embedding_matrix,
                                                        dtype=torch.float32))
        self.embedding.weight.requires_grad=False
        self.embedding_dropout = SpatialDropout(0.5)
        self.lstm1=nn.LSTM(embed_size,lstm_units,batch_first=True,bidirectional=True)
        self.lstm2=nn.LSTM(lstm_units*2,lstm_units,batch_first=True,bidirectional=True)
        self.linear1 = nn.Linear(dense_hidden_units, dense_hidden_units)
        self.linear2 = nn.Linear(dense_hidden_units, dense_hidden_units)
        self.output=nn.Linear(dense_hidden_units,1)
        self.aux_output=nn.Linear(dense_hidden_units,num_aux_targets)
    def forward(self,x):
        embedding=self.embedding(x)
        embedding = self.embedding_dropout(embedding)
        lstm1,_=self.lstm1(embedding)
        lstm2,_=self.lstm2(lstm1)
        avg_pool=torch.mean(lstm2,1)
        max_pool,_=torch.max(lstm2,1)
        conc=torch.cat((max_pool,avg_pool),1)
        h_conc_linear1  = F.relu(self.linear1(conc))
        h_conc_linear2  = F.relu(self.linear2(conc))
        
        hidden = conc + h_conc_linear1 + h_conc_linear2
        
        result = self.output(hidden)
        aux_result = self.aux_output(hidden)
        out = torch.cat([result, aux_result], 1)
      
 #         result=self.output(conc)
#         aux_result=self.aux_output(conc)
#         out=torch.cat([result,aux_result],1)
        return out
        


In [38]:
def train_model(model,train,val, loss_fn,optimizer,scheduler,num_epochs=5):
    best_val_loss=1e+10
    train_dataloader=data.DataLoader(train,batch_size=512,shuffle=True)
    if (valid==True):
        val_dataloader=data.DataLoader(val,batch_size=512,shuffle=True)
        dataloaders={'train':train_dataloader,'val':val_dataloader}
    for epoch in range(num_epochs):
        start_time = time.time()
#         print('Epoch {} '.format(epoch))
        if (valid):
            for phase in ['train', 'val']:
                if (phase=='train'):
                    model.train()
                else:
                    model.eval()
                avg_loss=0

                if (phase=='train'):
                    scheduler.step()
                for data1 in tqdm(dataloaders[phase], disable=False):
                    x_train = data1[:-1]
                    y_train = data1[-1]
                    y_hat_train=model(*x_train).float()
                    y_train=y_train.float()

                    loss=loss_fn(y_hat_train, y_train)

                    if (phase=='train'):
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                    avg_loss += loss.item() / len(dataloaders[phase])

                if ((phase=='val') and (avg_loss<best_val_loss)):
                    best_model_wts=copy.deepcopy(model.state_dict())
                    best_val_loss=avg_loss
#                 if (phase=='train'):
                print('Epoch {} {} loss: {:.4f}'.format(epoch,phase,avg_loss))
#                 else:
#                     print('Val loss: {:.4f} \n'.format(avg_loss))
        elif (not valid):
            model.train()
            scheduler.step()
            avg_loss=0
            for data1 in tqdm(train_dataloader, disable=False):
                x_train = data1[:-1]
                y_train = data1[-1]
                y_hat_train=model(*x_train).float()
                y_train=y_train.float()

                loss=loss_fn(y_hat_train, y_train)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_dataloader)

            if (avg_loss<best_val_loss):
                best_model_wts=copy.deepcopy(model.state_dict())
                best_val_loss=avg_loss

            print('Epoch {} {} loss: {:.4f}'.format(epoch,'train',avg_loss))
                    
        elapsed_time = time.time() - start_time
        print('Time: {:.2f} \n'.format(elapsed_time))
    model.load_state_dict(best_model_wts)
    return model
                    


In [39]:
def model_predict(model,test_loader,output_dim):
    preds=[]
    test_preds = np.zeros((len(test), output_dim))
    model.eval()
    batch_size=test_loader.batch_size
    for i, x_batch in enumerate(test_loader):
            
            y_pred = sigmoid(model(*x_batch).detach().cpu().numpy())

            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred
    return test_preds

In [40]:
model=NeuralNet(embedding_matrix,y_aux_train1.shape[-1])
model=model.cuda()
loss=nn.BCEWithLogitsLoss(reduction='mean')
optimizer=torch.optim.Adam(model.parameters(),lr=0.001)
scheduler=torch.optim.lr_scheduler.LambdaLR(optimizer,lambda epoch: 0.6**epoch)
val_input=None
if (valid):
    val_input=val_dataset
all_test_preds=[]    
for model_idx in range(num_models):
    print('Model ', model_idx)
    seed_everything(1234 + model_idx-1)    
    model=train_model(model,train_dataset,val_input,loss,optimizer,scheduler,num_epochs=4)
    test_preds = model_predict(model,test_dataloader,output_dim= y_train_tensor.shape[-1])
    all_test_preds.append(test_preds)


Model  0
Epoch 0 train loss: 0.0253
Epoch 0 val loss: 0.0215
Time: 517.70 

Epoch 1 train loss: 0.0211
Epoch 1 val loss: 0.0209
Time: 517.71 

Epoch 2 train loss: 0.0203
Epoch 2 val loss: 0.0206
Time: 520.09 

Epoch 3 train loss: 0.0199
Epoch 3 val loss: 0.0202
Time: 517.34 



In [41]:
submission = pd.DataFrame.from_dict({
    'id': test['id'],
    'prediction': np.mean(all_test_preds, axis=0)[:, 0]
})

submission.to_csv('submission'+str(n_fold)+'.csv', index=False)