## Import Packages

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.utils.data
from keras.preprocessing import text, sequence
from sklearn.metrics import roc_auc_score
from torch import nn, optim
from torch.autograd import Variable
from torchvision import datasets,transforms
from torch.utils.data import Dataset, DataLoader


from sklearn.model_selection import GridSearchCV

Using TensorFlow backend.


## Load Data


In [2]:
df_train = pd.read_csv('Dataset/train.csv')
df_test = pd.read_csv('Dataset/test.csv')
        

In [3]:
columnlist=df_train.columns

In [4]:
columnlist

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [5]:
sentence_train=df_train["comment_text"].fillna("CVxTz").values
classes=columnlist[2:]
label_train=df_train[classes]
sentence_test=df_test["comment_text"].fillna("CVxTz").values

In [6]:
label_train=label_train.values

## Tokenize Word

In [7]:
tokenize=text.Tokenizer(num_words=100000)

In [8]:
tokenize.fit_on_texts(list(sentence_train))

In [9]:
word_index=tokenize.word_index

In [10]:
sentence_train1=tokenize.texts_to_sequences(sentence_train)

In [11]:
sentence_test1=tokenize.texts_to_sequences(sentence_test)

In [12]:
Input_train=sequence.pad_sequences(sentence_train1,20)

In [13]:
Input_test=sequence.pad_sequences(sentence_test1,20)

## Custom Dataset and Dataloader

In [14]:
class CustomDataset(Dataset):
    def __init__(self,X,Y=None,Transform=None):
        self.X=X
        self.Y=Y
        self.Transform=Transform
        if self.Transform is not None:
            self.X=self.Transform(self.X)
            if self.Y is not None:
                self.Y=self.Transform(self.Y)
        self.X=torch.from_numpy(self.X).long()
        if self.Y is not None:
            self.Y=torch.from_numpy(self.Y).float()
    def __getitem__(self,idx):
        data=self.X[idx]
        if self.Y is not None:
            label=self.Y[idx]
            return data,label
        return data
    def __len__(self):
        return len(self.X)
        

In [15]:
Train_set=CustomDataset(Input_train,label_train)

In [16]:
Test_set=CustomDataset(Input_test)

In [17]:
Train=torch.utils.data.DataLoader(Train_set,batch_size=1000)

In [18]:
Test=torch.utils.data.DataLoader(Test_set,batch_size=1000)

## Word Embedding 

In [19]:
def Pretrained_embedding(emb_file):
    embedding_index={}
    with open(emb_file,encoding="utf8") as f:
        for line in f:
            values=line.split()
            word=values[0]
            vector=np.asarray(values[1:],dtype="float32")
            embedding_index[word]=vector
    return embedding_index

In [20]:
embedding_index= Pretrained_embedding("Glove/glove.6B.100d.txt")

In [21]:
def embedding(word_index,embedding_ind):
    embedding_matrix=np.zeros((len(word_index)+1,100))
    for word,i in word_index.items():
        embedding_vector=embedding_ind.get(word)
        if embedding_vector is not None:
            embedding_matrix[i]=embedding_vector
    return embedding_matrix
    

In [22]:
embedding_matrix=embedding(word_index,embedding_index)

## Build Model

In [23]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding=nn.Embedding(num_embeddings=100000,embedding_dim=100)
        self.embedding.weight.data=torch.Tensor(embedding_matrix)
        self.lstm=nn.LSTM(100,50,1,batch_first=True,bidirectional=True)
        self.hidden=(Variable(torch.zeros(2,1,50)),Variable(torch.zeros(2,1,50)))
        self.maxpool=nn.MaxPool1d(20)
        self.fc=nn.Linear(100,6)
        self.sigmoid=nn.Sigmoid()
    def forward(self,x):
        x=self.embedding(x)
        x,self.hidden=self.lstm(x)
        x=self.maxpool(x)
        x = x.view(x.size(0), -1)
        x=self.fc(x)
        x=self.sigmoid(x)
        return x
        

In [24]:
model=Net()

## Train Model

In [25]:
def train():
    learnin1g_rate = 1e-4
    optimizer = optim.Adam(model.parameters(), lr=learnin1g_rate)

    model.train()
    for batch_idx, (data, target) in enumerate(Train):
        data, target = Variable(data), Variable(target)
        y_pred = model(data)
        loss = F.binary_cross_entropy(y_pred, target)
        print(batch_idx,loss)
        model.zero_grad()
        loss.backward()
        optimizer.step()

In [26]:
train()

0 tensor(0.7066, grad_fn=<BinaryCrossEntropyBackward>)
1 tensor(0.7051, grad_fn=<BinaryCrossEntropyBackward>)
2 tensor(0.7019, grad_fn=<BinaryCrossEntropyBackward>)
3 tensor(0.7018, grad_fn=<BinaryCrossEntropyBackward>)
4 tensor(0.6993, grad_fn=<BinaryCrossEntropyBackward>)
5 tensor(0.6978, grad_fn=<BinaryCrossEntropyBackward>)
6 tensor(0.6964, grad_fn=<BinaryCrossEntropyBackward>)
7 tensor(0.6946, grad_fn=<BinaryCrossEntropyBackward>)
8 tensor(0.6927, grad_fn=<BinaryCrossEntropyBackward>)
9 tensor(0.6911, grad_fn=<BinaryCrossEntropyBackward>)
10 tensor(0.6892, grad_fn=<BinaryCrossEntropyBackward>)
11 tensor(0.6880, grad_fn=<BinaryCrossEntropyBackward>)
12 tensor(0.6861, grad_fn=<BinaryCrossEntropyBackward>)
13 tensor(0.6844, grad_fn=<BinaryCrossEntropyBackward>)
14 tensor(0.6815, grad_fn=<BinaryCrossEntropyBackward>)
15 tensor(0.6802, grad_fn=<BinaryCrossEntropyBackward>)
16 tensor(0.6786, grad_fn=<BinaryCrossEntropyBackward>)
17 tensor(0.6776, grad_fn=<BinaryCrossEntropyBackward>)
18

146 tensor(0.3487, grad_fn=<BinaryCrossEntropyBackward>)
147 tensor(0.3391, grad_fn=<BinaryCrossEntropyBackward>)
148 tensor(0.3396, grad_fn=<BinaryCrossEntropyBackward>)
149 tensor(0.3358, grad_fn=<BinaryCrossEntropyBackward>)
150 tensor(0.3314, grad_fn=<BinaryCrossEntropyBackward>)
151 tensor(0.3296, grad_fn=<BinaryCrossEntropyBackward>)
152 tensor(0.3245, grad_fn=<BinaryCrossEntropyBackward>)
153 tensor(0.3228, grad_fn=<BinaryCrossEntropyBackward>)
154 tensor(0.3207, grad_fn=<BinaryCrossEntropyBackward>)
155 tensor(0.3216, grad_fn=<BinaryCrossEntropyBackward>)
156 tensor(0.3155, grad_fn=<BinaryCrossEntropyBackward>)
157 tensor(0.3243, grad_fn=<BinaryCrossEntropyBackward>)
158 tensor(0.3118, grad_fn=<BinaryCrossEntropyBackward>)
159 tensor(0.3109, grad_fn=<BinaryCrossEntropyBackward>)


In [27]:
model.eval()

Net(
  (embedding): Embedding(100000, 100)
  (lstm): LSTM(100, 50, batch_first=True, bidirectional=True)
  (maxpool): MaxPool1d(kernel_size=20, stride=20, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=100, out_features=6, bias=True)
  (sigmoid): Sigmoid()
)

In [28]:
for value in model.parameters():
    print(value)

Parameter containing:
tensor([[-2.1995e-02, -1.5313e-03,  2.4060e-02,  ..., -1.7251e-02,
         -1.8546e-02,  1.0394e-02],
        [-5.4732e-02, -2.2900e-01,  7.4086e-01,  ..., -1.4866e-01,
          8.3834e-01,  2.8605e-01],
        [-2.0581e-01,  6.6463e-02,  1.9934e-01,  ..., -3.9948e-01,
          4.8838e-01, -1.4728e-01],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-7.5740e-02,  4.2109e-01,  7.7687e-01,  ...,  9.5977e-02,
          1.6282e+00, -1.0819e-01],
        [ 6.8997e-02, -3.1269e-01, -2.4092e-01,  ..., -6.3050e-02,
          5.2090e-01,  4.5936e-01]], requires_grad=True)
Parameter containing:
tensor([[-0.0662, -0.0279, -0.0690,  ...,  0.0654,  0.0175, -0.1101],
        [ 0.0688, -0.1073, -0.0432,  ...,  0.0892,  0.0414, -0.0525],
        [ 0.0817,  0.0610,  0.0551,  ...,  0.0032,  0.1122, -0.0752],
        ...,
        [-0.0657,  0.0717,  0.0007,  ...,  0.0483, -0.0576, -0.0959],
        [ 0

## Test Model

In [29]:
preds=[]
        

In [30]:
for data in Test:
        data = Variable(data, volatile=True)
        output = model(data)
        result=output.data
        result=(result>.16).float()
        preds.append(result.numpy())
        

  


In [31]:
y_test=np.concatenate(preds,axis=0)

In [186]:
sample_submission = pd.read_csv("Dataset/sample_submission.csv")
sample_submission[classes] = y_test
sample_submission.to_csv("submission.csv", index=False)