In [67]:
from torchtext import data
import numpy as np
import pandas as pd
import spacy
import torch
from joblib import Memory
import re
import nltk
import logging
import torch.nn as nn
import torch.nn.functional as F
import torch.nn as nn

memory=Memory(cachedir='cache/',verbose=1)
logger=logging.getLogger('quoara_dataset')

import sys


contraction_dict = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot",
    "'cause": "because", "could've": "could have", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
    "how's": "how is", "I'd": "I would", "I'd've": "I would have",
    "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
    "I've": "I have", "i'd": "i would", "i'd've": "i would have",
    "i'll": "i will",  "i'll've": "i will have", "i'm": "i am",
    "i've": "i have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
    "it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is", "should've": "should have",
    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "so's": "so as", "this's": "this is", "that'd": "that would",
    "that'd've": "that would have", "that's": "that is", "there'd": "there would",
    "there'd've": "there would have", "there's": "there is", "here's": "here is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have"
}

punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'+'\u200b'+'\ufeff'+ 'करना'+'है'
punct_dict = {
    "‘": "'",    "₹": "e",      "´": "'", "°": "",         "€": "e",
    "™": "tm",   "√": " sqrt ", "×": "x", "²": "2",        "—": "-",
    "–": "-",    "’": "'",      "_": "-", "`": "'",        '“': '"',
    '”': '"',    '“': '"',      "£": "e", '∞': 'infinity', 'θ': 'theta',
    '÷': '/',    'α': 'alpha',  '•': '.', 'à': 'a',        '−': '-',
    'β': 'beta', '∅': ' ', '³': '3', 'π': 'pi','करना':' ','है':' ','\ufeff':' '
}


def tokenizer(comment,max_char=10):
    comment=re.sub(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;.]"," ",str(comment).lower())
    comment=re.sub(r'\?+',' ',comment)
    comment=re.sub(r'\!+',' ',comment)
    comment=re.sub(r'\~+',' ',comment)
    comment=re.sub(r'\,+',' ',comment)
    comment=re.sub(r"[ ]+"," ",comment)
    if (len(comment))>max_char:
        comment=comment[:max_char]
    nlp=spacy.load('en_core_web_sm')
    tokenizer=[x.text for x in nlp.tokenizer(comment) if x.text!=' ']
    
    stopwords=nltk.corpus.stopwords.words('english')
    for index,word in enumerate(tokenizer):
        if word in stopwords:
            tokenizer.pop(index)
        if word in punct_dict:
            tokenizer[index]=punct_dict[word]
    
    return tokenizer


def prepare_csv():
    df_train=pd.read_csv('/home/dseo40/nlp/quora/train2.csv')
    df_train['qid']=df_train['qid'].replace('\n',' ')
    train_data,val_data=train_test_split(df_train,test_size=0.2)
    train_data.to_csv('cache/dataset_train.csv',index=False)
    val_data.to_csv('cache/dataset_val.csv',index=False)
    test_data=pd.read_csv('/home/dseo40/nlp/quora/test2.csv')
    test_data['qid']=test_data['qid'].replace('\n',' ')
    test_data.to_csv('cache/dataset_test.csv',index=False)
    
'''
allowed pretrained vectors are ['charngram.100d', 'fasttext.en.300d', 'fasttext.simple.300d', 
'glove.42B.300d', 'glove.840B.300d','glove.twitter.27B.25d', 'glove.twitter.27B.50d',
'glove.twitter.27B.100d', 'glove.twitter.27B.200d','glove.6B.50d', 'glove.6B.100d',
'glove.6B.200d', 'glove.6B.300d']
'''
@memory.cache
def read_files(fix_length=10,lower=False,vectors=None):
    if vectors is not None:
        #pretain vectors only support all lower case
        lower=True
    logger.debug('Prepare csv files')
    prepare_csv()
    comment=data.Field(
            sequential=True,
            fix_length=fix_length,
            tokenize=tokenizer,
            pad_first=True,
            batch_first=True,
            lower=lower)
    label=data.Field(
          sequential=False,
          use_vocab=False)
    data_fields=[('qid',None),('question_text',comment),('target',label)]
    logger.debug('reading train csv file')
    trn,vld=data.TabularDataset.splits(path='cache/',train='dataset_train.csv',validation='dataset_val.csv',
                                 format='csv',skip_header=True,fields=data_fields)
    test=data.TabularDataset(path='cache/dataset_test.csv',format='csv',skip_header=True,fields=data_fields)
    comment.build_vocab(
            trn,
            max_size=10,
            min_freq=2,
            vectors=vectors)
    return trn,vld,test
 


You provided "cachedir='cache/'", use "location='cache/'" instead.
  


In [68]:
from torchtext.data import BucketIterator
import tqdm
from sklearn.model_selection import train_test_split
trn,vld,test=read_files()



________________________________________________________________________________
[Memory] Calling __main__--home-dseo40-__ipython-input__.read_files...
read_files()
______________________________________________________read_files - 13.2s, 0.2min


In [110]:
train_iter,val_iter=BucketIterator.splits((trn,vld)
                                          ,batch_size=3
                                          ,sort_key=lambda x:len(x.question_text)
                                          ,sort_within_batch=False
                                          ,repeat=False)

In [111]:
#dl must be an itertor
#x_var be the name of input_feature
#y_var be the name of output

class BatchWrapper:
    def __init__(self,dl,x_var,y_var):
        self.dl=dl
        self.x_var=x_var
        self.y_var=y_var
    def __iter__(self):
        for batch in self.dl:
            x=getattr(batch,self.x_var)
            if type(self.y_var)==list and len(self.y_var)>0:
                y=torch.cat([getattr(batch,feat).unsqueeze(1) for feat in self.y_var],dim=1).float()
            elif self.y_var==None:
                y=torch.zeros(1)
            else: 
                y=getattr(batch,self.y_var)
            yield(x,y)
    def __len__(self):
        return len(self.dl)

In [112]:
train_dl = BatchWrapper(train_iter, 'question_text','target')
valid_dl=BatchWrapper(val_iter,'question_text','target')

In [113]:
import torch.nn as nn
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(GRU,self).__init__()
        self.n_layers=n_layers
        self.hidden_dim=hidden_dim
        self.embed=nn.Embedding(n_vocab,embed_dim)
        self.dropout=nn.Dropout(dropout_p)
        self.gru=nn.GRU(embed_dim,hidden_dim,num_layers=n_layers,batch_first=True)
        self.out=nn.Linear(hidden_dim,n_classes)
    
    def __init__state(self,batch_size):
        weight=next(self.parameters()).data
        return weight.new(self.n_layers,batch_size,self.hidden_dim).zero_()
    
    def forward(self,x):
        x=self.embed(x)
        h_0=self.__init__state(batch_size=x.size(0))
        x,_=self.gru(x,h_0)
        h_t=x[:,-1,:]
        self.dropout(h_t)
        logit=self.out(h_t)
        return logit

In [114]:
import tqdm
import torch.optim as optim
import torch.nn.functional as F
model = GRU(n_layers=1,hidden_dim=4,n_vocab=10,embed_dim=128,n_classes=2,dropout_p=0.5)
opt=optim.Adam(model.parameters(),lr=1e-2)

In [158]:
import sys
import time
start=time.time()
EPOCH=3
for epoch in range(1,EPOCH+1):
    running_loss=0

    model.train()
    for x,y in train_dl:
        opt.zero_grad()
        pred=model(x)
        #In our case, each iteration returns b*c matrix where b represents the number of batch while c refers to the number of classes
        #Since the function we are dealing with cross entropy, we should pick the maxium value for each instance ie) logit.max(1) 
        #tensor.max(axis=) or tensor.min(axis=) returns values and indices of the chosen operation. Note that 0 is for indicies
        #whiel 1 for indices
        #Now,we need to compare our predicted classes with the actual ones. Before the comparison, make sure that the shapes of 
        #both must match to each other. logit.max(1)[1].view(y.size())
        loss=F.cross_entropy(pred,y)
        loss.backward()
        opt.step()
        running_loss+=loss.data*x.size(0)
    epoch_loss=running_loss/len(trn)
 
    running_corrects=0
    #Evaluation begins
    val_loss=0
    corrects=0
    model.eval()
    for x,y in valid_dl:
        pred=model(x)
        loss=F.cross_entropy(pred,y)
        val_loss+=loss.data*x.size(0)
        running_corrects+=(pred.max(1)[1].view(y.size()).data==y.data).sum()
    val_loss/=len(vld)
    epoch_accuracy=running_corrects/len(trn)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))
    print('Corrects: {}'.format(epoch_accuracy))
    print(f'time taken:{time.time()-start}')
    print('-'*100)

Epoch: 1, Training Loss: 0.0216, Validation Loss: 0.0204
Corrects: 0
time taken:0.03238248825073242
----------------------------------------------------------------------------------------------------
Epoch: 2, Training Loss: 0.0198, Validation Loss: 0.0188
Corrects: 0
time taken:0.05772686004638672
----------------------------------------------------------------------------------------------------
Epoch: 3, Training Loss: 0.0184, Validation Loss: 0.0175
Corrects: 0
time taken:0.07532286643981934
----------------------------------------------------------------------------------------------------


In [153]:
x=torch.FloatTensor([[1,2,3],[4,5,6]])

In [154]:
x.min(axis=1)

torch.return_types.min(
values=tensor([1., 4.]),
indices=tensor([0, 0]))