In [37]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from nltk.tokenize import RegexpTokenizer
from torch.nn.utils.rnn import pad_sequence
tpath = '../input/VT-data/train_data.csv'


class Vocab:
    """Word-level vocabulary mapping lower-cased alphabetic tokens to integer ids.

    Attributes:
        stoi: dict mapping token string -> integer id.
        itos: dict mapping integer id -> token string (inverse of ``stoi``).
        tokenizer: ``RegexpTokenizer`` that extracts runs of ASCII letters.
        counter: next free id; after ``create_vocab`` it equals the vocabulary size.

    Ids 0-3 are reserved for ``<PAD>``, ``<EOS>``, ``<SOS>`` and ``<UNK>``.

    NOTE: ids produced by this class differ from those produced by the earlier
    implementation (which re-assigned a new id each time a capitalised word was
    seen), so checkpoints trained against the old ids need to be retrained.
    """

    def __init__(self):
        self.stoi = dict()
        self.itos = dict()
        # 'A-Z', not 'A-z': the latter range also matches the ASCII punctuation
        # characters that sit between 'Z' and 'a' ([ \ ] ^ _ `).
        self.tokenizer = RegexpTokenizer('[a-zA-Z]+')
        for idx, special in enumerate(('<PAD>', '<EOS>', '<SOS>', '<UNK>')):
            self.stoi[special] = idx
            self.itos[idx] = special
        self.counter = 4

    def create_vocab(self, alldata):
        """Add every previously unseen token found in ``alldata`` to the vocabulary.

        Args:
            alldata: iterable of strings; each one is tokenized with
                ``self.tokenizer`` and lower-cased before lookup.
        """
        for lines in alldata:
            for word in self.tokenizer.tokenize(lines):
                # Lower-case BEFORE the membership test so that "The" and "the"
                # share one id instead of allocating a fresh id per occurrence.
                word = word.lower()
                if word not in self.stoi:
                    self.stoi[word] = self.counter
                    self.itos[self.counter] = word
                    self.counter = self.counter + 1


class DataGen(Dataset):
    """Dataset over a CSV with ``input`` (text) and ``labels`` columns.

    The vocabulary is always built from the ``input`` column of ``train_path``,
    independent of which CSV ``path`` points at.
    """

    def __init__(self, path, train_path = tpath):
        super().__init__()
        self.data = pd.read_csv(path)
        self.vocab = Vocab()
        train_sentences = pd.read_csv(train_path).input.values
        self.vocab.create_vocab(train_sentences)

    def __len__(self):
        # Number of rows in the loaded CSV.
        return len(self.data.index)

    def __getitem__(self, item):
        """Return ``(token_ids, label)`` for row ``item`` as two float tensors.

        The token ids are wrapped as ``<SOS> ... <EOS>``; tokens that are not in
        the vocabulary are replaced by the ``<UNK>`` id.
        """
        row = self.data.iloc[item]
        lookup = self.vocab.stoi
        unk_id = lookup['<UNK>']

        ids = [lookup['<SOS>']]
        for token in self.vocab.tokenizer.tokenize(row['input']):
            ids.append(lookup.get(token.lower(), unk_id))
        ids.append(lookup['<EOS>'])

        return torch.Tensor(ids), torch.Tensor([row['labels']])


class MyCollate:
    """Collate function that pads variable-length sequences within a batch.

    Each batch item is ``(sequence, label)``; calling the instance returns
    ``(labels, padded_sequences)`` — note the order is swapped relative to the items.
    """

    def __init__(self, pad_idx):
        # Value used to fill the tail of sequences shorter than the longest one.
        self.pad_idx = pad_idx

    def __call__(self, batch):
        label_parts = []
        sequences = []
        for sample in batch:
            sequences.append(sample[0])
            label_parts.append(sample[1])

        # Labels are 1-element tensors; joining them yields one 1-D tensor.
        labels = torch.cat(label_parts, dim=0)
        padded = pad_sequence(
            sequences,
            batch_first=True,
            padding_value=self.pad_idx,
        )
        return labels, padded

In [38]:
# Paths of the training and validation CSV files used below.
# `tpath` repeats the assignment already made in the first cell.
tpath = '../input/VT-data/train_data.csv'
cpath = '../input/VT-data/val_data.csv'

In [74]:
import torch
import torch.nn as nn


class Gramodel(nn.Module):
    """Bidirectional-LSTM sequence classifier that returns two-class logits.

    Args:
        num_embeds: number of rows in the embedding table (vocabulary size).
        embed_dim: embedding width; also used as the LSTM hidden size per direction.

    The head returns raw (unnormalised) logits. ``nn.CrossEntropyLoss`` applies
    log-softmax internally, so a ``Softmax`` layer in the head would normalise
    twice and flatten the gradients. Arg-max predictions are unaffected by the
    missing softmax, and the state-dict keys are unchanged because ``Softmax``
    has no parameters.
    """

    def __init__(self, num_embeds, embed_dim):
        super(Gramodel, self).__init__()
        self.embed = nn.Embedding(num_embeddings=num_embeds, embedding_dim=embed_dim)
        self.LSTM = nn.LSTM(input_size=embed_dim, hidden_size=embed_dim, num_layers=1, bidirectional=True,
                            batch_first=True)
        # 2 * embed_dim: forward and backward LSTM outputs are concatenated.
        self.lin = nn.Sequential(nn.Linear(2*embed_dim, 64),
                                 nn.ReLU(),
                                 nn.Linear(64, 2))
        self.drop = nn.Dropout(p=0.4)

    def forward(self, x):
        """Classify a batch of token-id sequences.

        Args:
            x: tensor of token ids indexed as (batch, time); any numeric dtype is
                accepted because it is cast to ``long`` before the embedding lookup.

        Returns:
            Tensor of shape (batch, 2) holding unnormalised class scores.
        """
        # Dropout is not in-place: its result must be assigned, otherwise the
        # embeddings reach the LSTM without any regularisation.
        emb = self.drop(self.embed(x.long()))
        output, _ = self.LSTM(emb)
        output = self.drop(output)
        # Use the representation at the last time step of every sequence.
        return self.lin(output[:, -1, :])


In [75]:
from tqdm import tqdm
from sklearn.metrics import f1_score 
from sklearn.metrics import classification_report

def accuracy(model, loader):
    """Evaluate ``model`` over ``loader`` and return ``(accuracy, macro_f1)``.

    The model is moved to CUDA when available and left in eval mode; callers that
    continue training must switch it back with ``model.train()``.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        loader: iterable yielding ``(labels, inputs)`` batches.

    Returns:
        Tuple of the fraction of correct predictions and the macro-averaged F1
        taken from sklearn's ``classification_report``.
    """
    run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(run_device)
    model.eval()

    hits, total = 0, 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch_labels, batch_inputs in tqdm(loader):
            batch_labels = batch_labels.to(run_device)
            batch_inputs = batch_inputs.to(run_device, dtype=torch.float)
            scores = model(batch_inputs)
            # Index of the highest-scoring class for each sample.
            chosen = torch.max(scores, dim=1)[1]
            hits += int((chosen == batch_labels).sum())
            total += chosen.size(0)
            all_preds.extend(chosen.tolist())
            all_labels.extend(batch_labels.tolist())

    summary = classification_report(all_labels, all_preds, output_dict=True)
    macro_f1 = summary['macro avg']['f1-score']
    return (float(hits) / float(total), macro_f1)

In [84]:
import torch
import torch.optim as optim

def train():
    """Train a ``Gramodel`` on ``tpath`` and validate on ``cpath`` after every epoch.

    A checkpoint named ``chechpoint<epoch>.pth`` is written whenever the
    validation macro-F1 exceeds the best value seen so far (starting threshold
    0.4). The file-name spelling is kept as-is because a later cell loads it.

    Returns:
        The model as of the final epoch (not necessarily the best checkpoint),
        left in training mode.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    vc = DataGen(path=tpath)
    vcv = DataGen(path=cpath)
    # Both datasets build their vocabulary from the training CSV, so one collate
    # function with the shared <PAD> id serves both loaders.
    collate = MyCollate(pad_idx=vc.vocab.stoi['<PAD>'])
    dataload = DataLoader(vc, batch_size=128, shuffle=True, collate_fn=collate)
    # Evaluation metrics do not depend on sample order; no need to shuffle.
    dataloadcv = DataLoader(vcv, batch_size=128, shuffle=False, collate_fn=collate)

    lent = vc.vocab.counter
    mod = Gramodel(num_embeds=lent,embed_dim= 256).to(device)
    crit = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=mod.parameters(), lr=3e-4)
    epochs = 70
    maxvalf1 = 0.4
    mod.train()
    for epoch in range(epochs):
        print('EPOCH {}'.format(epoch))
        losses = 0.0
        for label, data in tqdm(dataload):
            label = label.to(device)
            data = data.to(device)
            prediction = mod(data)
            loss = crit(prediction, label.long())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # .item() detaches the value; summing the tensors themselves keeps
            # autograd metadata alive and prints as a tensor repr.
            losses += loss.item()

        reportv = accuracy(mod, dataloadcv)
        valf1 = reportv[1]
        if valf1 > maxvalf1:
            print('---------------Saving CheckPOINT---------------')
            path = 'chechpoint' + str(epoch) + '.pth'
            torch.save(mod.state_dict(), path)
            maxvalf1 = valf1

        reportt = accuracy(mod, dataload)

        # Each `loss` is already a per-batch mean, so average over the number of
        # batches rather than the number of samples.
        print('Train Loss {}'.format(losses/len(dataload)) )
        print("Train Accu {}  F1 {} ".format(reportt[0], reportt[1]))
        print("Val Accu {}  F1 {} ".format(reportv[0], reportv[1]))
        # accuracy() switches the model to eval mode; restore training mode.
        mod.train()
    return mod

In [85]:
# Run the full training loop. `model` is the network as returned after the last
# epoch, which is not necessarily the best-scoring checkpoint saved on disk.
model = train()

In [86]:
# `lent` only ever existed as a local inside train(), so recompute the vocabulary
# size here; otherwise this cell raises NameError on a fresh kernel.
lent = DataGen(path=tpath).vocab.counter
model = Gramodel(num_embeds=lent, embed_dim=256)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine; the
# evaluation cells move the model to the active device afterwards.
# NOTE(review): epoch 13 is hard-coded — confirm it is the checkpoint you want.
model.load_state_dict(torch.load('./chechpoint13.pth', map_location='cpu'))

In [87]:
from sklearn.metrics import classification_report

# Classification report on the TRAINING split.
# `dataload` was a local inside train(); rebuild the dataset and loader here so
# the cell runs on a fresh kernel.
vc = DataGen(path=tpath)
pad = vc.vocab.stoi['<PAD>']
dataload = DataLoader(vc, batch_size=128, shuffle=False, collate_fn=MyCollate(pad_idx=pad))

# Despite their names these lists collect predictions and true labels.
num_correct = []  # predicted class per sample
num_samples = []  # ground-truth label per sample
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()
with torch.no_grad():
    for label, data in tqdm(dataload):
        label = label.to(device)
        data = data.to(device, dtype = torch.float)
        pre = model(data)
        _, pre = torch.max(pre, dim=1)
        num_correct += pre.tolist()
        num_samples += label.tolist()

report = classification_report(num_samples, num_correct)
print(report)

In [88]:
# Classification report on the VALIDATION split.
vcv = DataGen(path=cpath)
# Take the pad id from `vcv` itself: `vc` is not defined at notebook scope, and
# both datasets build the same vocabulary from the training CSV.
pad = vcv.vocab.stoi['<PAD>']
# The loader must wrap `vcv`; wrapping the training dataset here would report
# training metrics under a validation label.
dataloadcv = DataLoader(vcv, batch_size=128, shuffle=False, collate_fn=MyCollate(pad_idx=pad))

# Despite their names these lists collect predictions and true labels.
num_correct = []  # predicted class per sample
num_samples = []  # ground-truth label per sample
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()
with torch.no_grad():
    for label, data in tqdm(dataloadcv):
        label = label.to(device)
        data = data.to(device, dtype = torch.float)
        pre = model(data)
        _, pre = torch.max(pre, dim=1)
        num_correct += pre.tolist()
        num_samples += label.tolist()

report = classification_report(num_samples, num_correct)
print(report)

In [89]:
def prediction(mod, testpath, dataset):
    """Predict a class index for every row of the CSV at ``testpath``.

    Args:
        mod: model mapping a (1, seq_len) tensor of token ids to per-class scores.
        testpath: path of a CSV file with an ``input`` text column.
        dataset: object exposing ``vocab.tokenizer`` and ``vocab.stoi`` (for
            example a ``DataGen`` instance) used to encode the text.

    Returns:
        List of predicted class indices, one per row, in file order.
    """
    test = pd.read_csv(testpath)
    stoi = dataset.vocab.stoi
    unk_id = stoi['<UNK>']
    answer = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    mod.to(device)
    mod.eval()
    with torch.no_grad():
        for text in test['input']:
            tokens = dataset.vocab.tokenizer.tokenize(text)
            # Map out-of-vocabulary words to <UNK>, as DataGen.__getitem__ does,
            # instead of silently dropping them from the sequence.
            word = [stoi.get(wd.lower(), unk_id) for wd in tokens]
            word.append(stoi['<EOS>'])
            # NOTE(review): a single trailing <PAD> is kept from the original code;
            # batched evaluation pads only the shorter sequences — confirm intent.
            word.append(stoi['<PAD>'])
            word = [stoi['<SOS>']] + word
            vector = torch.Tensor(word).to(device)
            pred = mod(vector.unsqueeze(0))
            _, pre = pred.max(1)
            answer.append(pre.item())
    return answer


In [90]:
# `vc` was only a local inside train(); rebuild the training dataset so the
# vocabulary used to encode the test set exists on a fresh kernel.
vc = DataGen(path=tpath)
ans = prediction(model, '../input/VT-data/test_data.csv', vc)

In [91]:
# Load the test CSV into a DataFrame so the predictions can be attached to it.
testpath = '../input/VT-data/test_data.csv'
df = pd.read_csv(testpath)

In [92]:
# Attach predictions as a new column; `ans` must have one entry per row of `df`.
df['predicted_label'] = ans

In [93]:
# Display the test frame together with the new `predicted_label` column.
df

In [94]:
# Spot-check: show the input text of the 141st row predicted as class 1.
# Raises IndexError when fewer than 141 rows carry that prediction.
df.loc[df['predicted_label'] == 1 ].iloc[140]['input']

In [95]:
# Write the test frame with predictions to disk. `index` is left at its default,
# so the DataFrame index is written as the first (unnamed) column.
# NOTE(review): confirm the submission format expects that index column.
df.to_csv('saksham_arora_submission.csv')