In [1]:
import torch
import torch.nn as nn
from torch.functional import F
class CNN(nn.Module):
    """Convolutional text classifier.

    Token ids are embedded, passed through one convolution per filter size
    (each spanning the full embedding width), ReLU-activated, max-pooled over
    the sequence axis, concatenated, regularised with dropout and projected to
    ``output_dim`` raw scores (no final activation is applied here).

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        n_filters: output channels of every convolution.
        filter_sizes: iterable of n-gram heights, one convolution per entry.
        output_dim: number of output scores per example.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token; its embedding row receives no
            gradient and it is used to right-pad inputs that are too short.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        # pad_idx was previously accepted but ignored. Passing it on does not
        # change the parameter names, so existing checkpoints still load.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        # Plain Python attribute (not a parameter or buffer): the shortest
        # sequence every convolution can still slide over.
        self.min_sent_len = max(filter_sizes, default = 0)
        
    def forward(self, text):
        """Return raw scores of shape [batch size, output_dim].

        Args:
            text: integer tensor of token ids shaped [sent len, batch size].
        """
        
        #text = [sent len, batch size]
        
        text = text.permute(1, 0)
                
        #text = [batch size, sent len]
        
        # A sequence shorter than the tallest filter would make Conv2d raise,
        # so right-pad such batches with the padding token.
        shortfall = self.min_sent_len - text.shape[1]
        if shortfall > 0:
            fill = self.embedding.padding_idx
            text = F.pad(text, (0, shortfall), value = 0 if fill is None else fill)
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)


In [2]:
# Hyperparameters of the classifier. They have to agree with the checkpoint
# that is loaded into this model in the next cell.
INPUT_DIM = 20002          # rows of the embedding table
EMBEDDING_DIM = 100        # width of each embedding vector
N_FILTERS = 100            # output channels per convolution
FILTER_SIZES = [2, 3, 4]   # one convolution per n-gram height
OUTPUT_DIM = 6             # one score per label column written to the submission
DROPOUT = 0.5
PAD_IDX = 1

model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [3]:
import torch
# Restore the trained weights. map_location='cpu' makes the load succeed even
# when the checkpoint was saved from a GPU and this machine has none; the model
# was constructed on the CPU, so that is where the tensors belong at this point.
model.load_state_dict(torch.load('tut2-model.pt', map_location='cpu'))

#test_loss, test_acc = evaluate(model, test_iterator, criterion)
# Display the loaded embedding table as a quick sanity check.
model.embedding.weight.data

tensor([[ 0.0819,  0.0175,  0.1646,  ..., -0.1461,  0.0274, -0.1393],
        [-0.1278, -0.1142,  0.0214,  ..., -0.0455, -0.1260,  0.1340],
        [-0.0189, -0.2308,  0.6949,  ..., -0.1919,  0.7575,  0.1874],
        ...,
        [-1.0605, -0.3805, -0.4359,  ..., -0.3696, -0.0038, -0.2723],
        [-0.8599,  0.3470,  0.8638,  ..., -0.1740, -0.2555,  0.0840],
        [ 0.3404,  0.0319,  0.0420,  ...,  0.4027,  0.3248, -1.5861]])

In [4]:
# Show (vocabulary size, embedding width) of the loaded embedding table.
model.embedding.weight.data.size()

torch.Size([20002, 100])

In [5]:
import numpy as np 
import pandas as pd 
import torch
import torchtext
from torchtext import data
import spacy
import os
import re
# Load spaCy's English pipeline. The result is not bound to a name, so it only
# appears as this cell's output.
# NOTE(review): presumably a check that the tokenizer model needed by the
# pickled field is installed — confirm.
spacy.load('en')

<spacy.lang.en.English at 0x1221cbcc0>

In [6]:
import pickle

# Restore the text field that was pickled to disk. The context manager
# guarantees the file handle is closed once the field has been read.
# NOTE(review): unpickling can execute arbitrary code — only load a file that
# was produced by a trusted run of the training notebook.
with open('./custom_embeddings/train_data_field', 'rb') as field_file:
    TEXT = pickle.load(field_file)



In [7]:
# Display the vocabulary object attached to the unpickled field, confirming
# that the field carries a `vocab` attribute.
TEXT.vocab

<torchtext.vocab.Vocab at 0x121f50fd0>

In [8]:

# Read the JSON key "comment_text" into an example attribute of the same name,
# processed by the restored TEXT field.
dataFields = dict(comment_text=("comment_text", TEXT))

test_data = data.TabularDataset(
    path='./data/test.json',
    format='json',
    fields=dataFields,
    skip_header=False,
)

# Number of test examples.
len(test_data)

153164

In [9]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Sorting and shuffling are both disabled so batches come out in dataset order;
# the predictions are later written positionally into the rows of test.csv.
test_iterator = torchtext.data.Iterator(
    dataset=test_data,
    batch_size=64,
    device=device,
    sort=False,
    sort_within_batch=False,
    repeat=False,
    shuffle=False,
)

In [10]:
# This bare expression is not the last line of the cell, so nothing is
# displayed for it.
TEXT.unk_token

# Print the dimensions of the vocabulary's vector table.
print(TEXT.vocab.vectors.size())

torch.Size([20002, 100])


In [11]:
# Look up the vector stored in the vocabulary for the token 'mofuckas'.
# NOTE(review): the values shown below match the first row of
# model.embedding.weight displayed earlier, so this token presumably resolves
# to index 0 — confirm whether that is the unknown-token slot.
TEXT.vocab.vectors[TEXT.vocab.stoi['mofuckas']]

tensor([ 0.0819,  0.0175,  0.1646, -0.1562, -0.1353,  0.0808,  0.1197, -0.0839,
         0.0643,  0.0129, -0.0114,  0.0017,  0.0051, -0.0195,  0.0776,  0.0170,
         0.0598, -0.0418,  0.1261, -0.0057,  0.1347,  0.1081, -0.0690,  0.0587,
         0.0897,  0.0450,  0.0819,  0.0252, -0.0180,  0.0771, -0.0226,  0.1343,
        -0.0390,  0.1087, -0.0346, -0.0487, -0.0366,  0.0294, -0.1057,  0.0217,
        -0.0896, -0.0056, -0.0024, -0.0583,  0.0304, -0.0205, -0.0783, -0.0511,
         0.1128, -0.0443, -0.0816, -0.0763, -0.0602,  0.0656, -0.3388, -0.0691,
         0.0897,  0.0461,  0.1059,  0.2282, -0.0929, -0.0346, -0.1973,  0.0348,
         0.3062,  0.0781,  0.1258,  0.0681,  0.1165, -0.1356, -0.1326,  0.1248,
        -0.0889, -0.0454,  0.0026,  0.0952,  0.0121,  0.0905, -0.1676,  0.1223,
         0.0909, -0.1292,  0.0008, -0.0408, -0.2441,  0.1631,  0.0282, -0.0344,
         0.0164, -0.1831,  0.1841,  0.2541, -0.0341,  0.2432, -0.0120, -0.1292,
        -0.0995, -0.1461,  0.0274, -0.13

In [12]:
# Score the whole test set and collect per-label probabilities.
#
# The iterator places every batch on `device`, so the model has to live there
# as well; otherwise the forward pass fails as soon as CUDA is available.
model = model.to(device)
model.eval()  # disable dropout for inference

pred_batches = []
with torch.no_grad():
    for batch in test_iterator:
        logits = model(batch.comment_text)  # [batch size, output dim]
        # Independent sigmoid per label; copy to host memory before converting,
        # because .numpy() is only defined for CPU tensors.
        pred_batches.append(torch.sigmoid(logits).cpu().numpy())

# One row per test example, one column per label.
myPreds = np.vstack(pred_batches)

In [13]:
# Attach one probability column per label to the test frame. Column k of
# myPreds is written to the k-th label, and rows are matched purely by position.
testDF = pd.read_csv("./data/test.csv")
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
for column_index, label in enumerate(label_names):
    testDF[label] = myPreds[:, column_index]

In [14]:
# Write the submission file: every column except the raw comment text, without
# the frame's index.
submission = testDF.drop(columns="comment_text")
submission.to_csv("submission_convolutional_test_2.csv", index=False)