In [1]:
import torchtext
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split
import re
from cleantext import clean
import os
from nltk.tokenize import TweetTokenizer

In [2]:
# Preprocess the data 

# Tokenizer instances are created once at module level and shared by the helpers below.
# NOTE(review): tweet_tokenizer is not referenced by any other visible cell — confirm
# whether it is still needed.
tweet_tokenizer = TweetTokenizer()
# torchtext's 'basic_english' tokenizer; read_data() uses it to split each cleaned line.
tokenizer = get_tokenizer('basic_english')

def replace_dates(text):
    """Replace common date spellings in ``text`` with a ' <DATE> ' placeholder.

    Three formats are handled, applied in this order:
      1. numeric dates such as ``12/05/2020`` or ``1/2/20``
      2. month-first dates such as ``January 5, 2020`` (the comma is optional)
      3. day-first dates such as ``05 March 2021``

    Args:
        text: input string.

    Returns:
        A copy of ``text`` in which every matched date is replaced by
        ' <DATE> ' (with the surrounding spaces).
    """
    text = re.sub(r'\d{1,2}/\d{1,2}/\d{2,4}', ' <DATE> ', text)
    # The year used to be written `\d {4}` (one digit followed by four spaces),
    # so this format never matched. The word length is capped at 9 rather than
    # 8 so that "September" is consumed whole instead of leaving a stray "S".
    text = re.sub(r'[A-Za-z]{2,9}\s\d{1,2},?\s\d{4}', ' <DATE> ', text)
    text = re.sub(r'\d{2} [A-Z][a-z]{2,8} \d{4}', ' <DATE> ', text)
    return text

def replace_concurrent_punctuation(text):
    """Collapse each run of two or more punctuation characters into one space.

    The punctuation set is the ASCII punctuation characters except the
    backtick, plus the left single quotation mark. A lone punctuation
    character is left untouched.
    """
    punctuation_run = r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_‘{|}~]{2,}'
    collapsed = re.sub(punctuation_run, ' ', text)
    return collapsed

def replace_hash_tags(text):
    """Swap every hashtag for the ' <HASHTAG> ' placeholder.

    A hashtag is a '#' followed by word characters that sits either at the
    very start of the string or right after a whitespace character; that
    leading whitespace character is consumed by the replacement.
    """
    hashtag = re.compile(r'(?:\s|^)#\w+')
    return hashtag.sub(' <HASHTAG> ', text)

def remove_special_characters(text):
    """Blank out everything that is not alphanumeric, whitespace or basic punctuation.

    Characters that are kept: ASCII letters, digits, whitespace and the
    punctuation marks . , ! ? ' " : ; — every other character is replaced
    by a single space (one space per removed character).
    """
    disallowed = re.compile(r'[^A-Za-z0-9\s.,!?\'":;]')
    return disallowed.sub(' ', text)

def remove_extra_spaces(text):
    """Squeeze each run of two or more whitespace characters into one space.

    A single whitespace character (including a lone tab or newline) is left
    as it is.
    """
    whitespace_run = re.compile(r'\s\s+')
    return whitespace_run.sub(' ', text)

def replace_hyphenated_words(text):
    """Replace the hyphen inside hyphenated words with a space.

    Only a hyphen that has a word character directly on both sides is
    replaced; leading, trailing, doubled and free-standing hyphens are kept.

    Args:
        text: input string.

    Returns:
        The string with every intra-word hyphen turned into a space, e.g.
        "state-of-the-art" -> "state of the art".
    """
    # Lookarounds do not consume the neighbouring word characters. The previous
    # pattern `(\w+)-(\w+)` consumed both words, so in chains only every other
    # hyphen was replaced ("state-of-the-art" -> "state of-the art").
    return re.sub(r'(?<=\w)-(?=\w)', ' ', text)

def read_data(filename, n_lines):
    """Read, normalise and tokenise up to ``n_lines`` lines from ``filename``.

    Each line goes through, in order: stripping, removal of literal '<' and
    '>', date / hyphen / hashtag replacement, a first ``clean`` pass (emoji,
    URLs, e-mails, phone numbers, currency symbols, lower-casing), removal of
    special characters, a second ``clean`` pass (numbers, digits,
    punctuation), wrapping in "<BEGIN>" / "<END>" markers, whitespace
    squeezing and tokenisation with the module-level ``tokenizer``.

    Args:
        filename: path of the raw text file, one sentence per line.
        n_lines: maximum number of lines to read from the start of the file.

    Returns:
        A list of token lists, one per line that still has content after
        cleaning. Fewer than ``n_lines`` entries are returned when the file is
        shorter or when lines clean down to nothing.
    """
    lines = []
    # Read as UTF-8 explicitly: the first clean() pass strips emoji, so the raw
    # file is expected to hold non-ASCII text that the locale default encoding
    # may fail to decode.
    with open(filename, 'r', encoding='utf-8') as f:
        for _ in range(n_lines):
            raw = f.readline()
            if not raw:
                # readline() returns '' at end of file. Stop here instead of
                # emitting one marker-only sequence per missing line.
                break
            line = raw.strip()
            # Remove '<' and '>' from the text before the bracketed
            # placeholders are inserted.
            line = re.sub(r'<|>', ' ', line)
            line = replace_dates(line)
            line = replace_hyphenated_words(line)
            line = replace_hash_tags(line)
            line = clean(line, no_emoji=True,
                         no_urls=True,
                         no_emails=True,
                         no_phone_numbers=True,
                         no_currency_symbols=True,
                         replace_with_url=" <URL> ",
                         replace_with_email=" <EMAIL> ",
                         replace_with_phone_number=" <PHONE> ",
                         replace_with_currency_symbol=" <CURRENCY> ",
                         lower=True)
            line = remove_special_characters(line)
            #line = replace_concurrent_punctuation(line)
            line = clean(line,no_numbers=True,no_digits=True,no_punct=True, replace_with_number=" <NUMBER> ",replace_with_digit=" ",replace_with_punct="")
            line = "<BEGIN> " + line + " <END>"
            line = remove_extra_spaces(line)
            tokens = tokenizer(line)
            # The two sentence markers are always present, so a line needs more
            # than two tokens to carry any content. The former `> 1` test could
            # never reject an empty line.
            if len(tokens) > 2:
                lines.append(tokens)
    return lines


def save_data(filename, lines):
    """Write tokenised lines to ``filename``, one sentence per row.

    Every element of ``lines`` is an iterable of string tokens; the tokens are
    joined with single spaces, the result is stripped of surrounding
    whitespace and terminated with a newline. An existing file is overwritten.
    """
    with open(filename, 'w') as out_file:
        out_file.writelines(
            ' '.join(tokens).strip() + '\n' for tokens in lines
        )




In [280]:
# if not os.path.exists('./processed_data'):
#     os.mkdir('processed_data')

# data = read_data('data/alternate/L3Cube-HingCorpus_roman/R11_final_data/concatenated_train_final_shuffled.txt',20000)
# train,valid = train_test_split(data, test_size=0.3, random_state=42)
# valid,test=train_test_split(valid, test_size=0.5, random_state=42)
# #print(train[1:100])
# save_data('processed_data/train.txt', train)
# save_data('processed_data/valid.txt', valid)
# save_data('processed_data/test.txt', test)

In [None]:
#---------------------------------------------------------------------

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [5]:
class L3CubeDataset(Dataset):
    """N-gram next-token dataset built from a pre-tokenised text file.

    The file is expected to hold one sentence per line with tokens separated
    by single spaces. Every sentence is left-padded with "<begin>" tokens and
    cut into sliding windows of ``ngram`` tokens; each window yields an input
    sequence (all but the last token) and a target sequence (all but the first
    token), both encoded as vocabulary indices.

    Attributes:
        vocab: dict mapping token -> integer index ("<unk>" is index 0 when the
            vocabulary is built here).
        ind2vocab: inverse mapping, index -> token.
        n: window length in tokens.
        x, y: LongTensors holding the encoded input and target windows.
    """

    def __init__(self,filename,vocab=None,ngram=5):
        """Load ``filename`` and build the encoded n-gram tensors.

        Args:
            filename: path of the tokenised text file.
            vocab: optional token -> index dict to reuse (e.g. the training
                vocabulary for a validation split). Built from the file when
                omitted.
            ngram: window length in tokens.
        """
        data = self.read_data(filename)
        if vocab is None:
            self.vocab, self.ind2vocab = self.build_vocab(data)
        else:
            self.vocab = vocab
            self.ind2vocab = {index: word for word, index in vocab.items()}
        self.n = ngram
        self.x,self.y = self.__create_dataset(data)

    def get_vocab(self):
        """Return the token -> index dictionary used by this dataset."""
        return self.vocab

    def read_data(self,filename):
        """Return the file content as a list of token lists (split on single spaces)."""
        with open(filename, 'r') as f:
            # Iterate the handle directly instead of materialising readlines().
            return [line.strip().split(' ') for line in f]

    def build_vocab(self,data):
        """Build the vocabulary from tokenised sentences.

        Args:
            data: iterable of token lists.

        Returns:
            (vocab_dict, ind2word): "<unk>" is given index 0, the remaining
            distinct tokens receive indices 1..N in sorted order; ``ind2word``
            is the inverse mapping.
        """
        word_set = {word for line in data for word in line}
        vocab_dict = {"<unk>":0}
        for i,word in enumerate(sorted(word_set)):
            vocab_dict[word]=i+1
        ind2word = {index: word for word, index in vocab_dict.items()}
        return vocab_dict, ind2word

    def get_ngram(self, tokens):
        """Return every window of ``self.n`` consecutive tokens of one sentence.

        The sentence is first left-padded with ``self.n - 2`` "<begin>" tokens.

        Args:
            tokens: list of string tokens.

        Returns:
            A list of token windows. The list is empty when ``tokens`` is empty
            or when the padded sentence is shorter than ``self.n``.
        """
        n = self.n
        if len(tokens) == 0:
            # Return an empty list rather than None: the caller passes the
            # result straight to list.extend(), which rejects None.
            return []
        padded = ["<begin>"] * (n - 2) + tokens
        return [padded[i:i+n] for i in range(len(padded)-n+1)]

    def __get_seq(self, tokens):
        """Encode tokens as vocabulary indices, mapping unknown tokens to "<unk>"."""
        return [
            self.vocab[word] if word in self.vocab else self.vocab["<unk>"]
            for word in tokens
        ]

    def __create_dataset(self, data):
        """Turn tokenised sentences into (input, target) LongTensors.

        For every window the input is the window without its last token and
        the target is the window without its first token.
        """
        x = []
        y = []
        for line in data:
            for ngram in self.get_ngram(line):
                x.append(self.__get_seq(ngram[:-1]))
                y.append(self.__get_seq(ngram[1:]))
        return torch.LongTensor(x),torch.LongTensor(y)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def get_dataloader(self, batch_size,shuffle=True):
        """Wrap the dataset in a DataLoader; the trailing partial batch is dropped."""
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle,drop_last=True)
    

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader


class LinceDataset(Dataset):
    """N-gram next-token dataset over the Hinglish side of a parallel TSV file.

    Each line of the file holds an English sentence and, after a tab, its
    Hinglish counterpart; tokens are separated by single spaces. Only the
    Hinglish sentences are used: they are left-padded with "<begin>" tokens,
    cut into sliding windows of ``ngram`` tokens and encoded as vocabulary
    indices (input = window without last token, target = window without first
    token).

    Attributes:
        vocab_h: dict mapping Hinglish token -> integer index.
        ind2vocab_h: inverse mapping, index -> token.
        n: window length in tokens.
        x, y: LongTensors holding the encoded input and target windows.
    """

    def __init__(self, filename, vocab_english=None, vocab_hinglish=None, ngram=5):
        """Load ``filename`` and build the encoded Hinglish n-gram tensors.

        Args:
            filename: path of the tab-separated parallel file.
            vocab_english: accepted for interface compatibility; not used.
            vocab_hinglish: optional token -> index dict to reuse (e.g. the
                training vocabulary). Built from the file when omitted.
            ngram: window length in tokens.
        """
        # The English column is parsed by read_data but not used here.
        _, data_hinglish = self.read_data(filename)
        if vocab_hinglish is None:
            self.vocab_h, self.ind2vocab_h = self.build_vocab(data_hinglish)
        else:
            self.vocab_h = vocab_hinglish
            self.ind2vocab_h = {index: word for word, index in vocab_hinglish.items()}
        self.n = ngram
        self.x, self.y = self.__create_dataset(data_hinglish)

    def get_vocab(self):
        """Return the Hinglish token -> index dictionary."""
        return self.vocab_h

    def read_data(self, filename):
        """Parse the parallel file into two aligned lists of token lists.

        Returns:
            (english, hinglish): one token list per input line in each. A line
            without a tab-separated second column contributes ``['']`` to the
            Hinglish list.
        """
        english = []
        hinglish = []
        with open(filename, 'r') as f:
            for line in f:
                columns = line.strip().split('\t')
                # Test for the missing column explicitly instead of using a
                # bare `except:`, which would also hide unrelated errors.
                h = columns[1] if len(columns) > 1 else ""
                english.append(columns[0].strip().split(' '))
                hinglish.append(h.strip().split(' '))
        return english, hinglish

    def build_vocab(self, data):
        """Build the vocabulary from tokenised sentences.

        Args:
            data: iterable of token lists.

        Returns:
            (vocab_dict, ind2word): "<unk>" is given index 0, the remaining
            distinct tokens receive indices 1..N in sorted order; ``ind2word``
            is the inverse mapping.
        """
        word_set = {word for line in data for word in line}
        vocab_dict = {"<unk>": 0}
        for i, word in enumerate(sorted(word_set)):
            vocab_dict[word] = i+1
        ind2word = {index: word for word, index in vocab_dict.items()}
        return vocab_dict, ind2word

    def get_ngram(self, tokens):
        """Return every window of ``self.n`` consecutive tokens of one sentence.

        The sentence is first left-padded with ``self.n - 2`` "<begin>" tokens.

        Args:
            tokens: list of string tokens.

        Returns:
            A list of token windows. The list is empty when ``tokens`` is empty
            or when the padded sentence is shorter than ``self.n``.
        """
        n = self.n
        if len(tokens) == 0:
            # Return an empty list rather than None: the caller passes the
            # result straight to list.extend(), which rejects None.
            return []
        padded = ["<begin>"] * (n - 2) + tokens
        return [padded[i:i+n] for i in range(len(padded)-n+1)]

    def __get_seq(self, tokens):
        """Encode tokens as vocabulary indices, mapping unknown tokens to "<unk>"."""
        return [
            self.vocab_h[word] if word in self.vocab_h else self.vocab_h["<unk>"]
            for word in tokens
        ]

    def __create_dataset(self, data):
        """Turn tokenised sentences into (input, target) LongTensors.

        For every window the input is the window without its last token and
        the target is the window without its first token.
        """
        x = []
        y = []
        for line in data:
            for ngram in self.get_ngram(line):
                x.append(self.__get_seq(ngram[:-1]))
                y.append(self.__get_seq(ngram[1:]))
        return torch.LongTensor(x), torch.LongTensor(y)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def get_dataloader(self, batch_size, shuffle=True):
        """Wrap the dataset in a DataLoader; the trailing partial batch is dropped."""
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle, drop_last=True)

In [9]:
# Build the training split first so its vocabulary can be reused for validation;
# validation tokens missing from that vocabulary are encoded as "<unk>".
train_dataset = LinceDataset('processed_data/lince/train.txt')
validation_dataset = LinceDataset('processed_data/lince/valid.txt',vocab_hinglish=train_dataset.get_vocab())

In [282]:
# train_dataset = L3CubeDataset('processed_data/train.txt')
# validation_dataset = L3CubeDataset('processed_data/valid.txt',vocab=train_dataset.get_vocab())

In [10]:
import json
# Persist the training vocabulary. The context manager closes (and flushes) the
# file; passing a bare open() to json.dump left the handle open.
with open('vocab.json','w') as vocab_file:
    json.dump(train_dataset.get_vocab(), vocab_file)
len(train_dataset.get_vocab())

9237

In [15]:
# Preview the vocabulary: print the first ten entries longer than ten characters,
# in the dictionary's iteration order.
vocab_dict = train_dataset.get_vocab()
long_words = [word for word in vocab_dict if len(word) > 10]
for word in long_words[:10]:
    print(word)



aakhirakaar
aalochanaatmak
aanandadaayak
aashcharyachakit
aashcharyajanak
aatmavishvaas
aavashyakata
acceidentally
accidentally
accomplished


In [16]:
# Show one (input, target) pair: the target is the same window shifted one token ahead.
print(train_dataset.x[2],train_dataset.y[2])

tensor([   1,    1,  947, 8904]) tensor([   1,  947, 8904, 8052])


In [20]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm
import numpy as np
import os

In [30]:
class GramNet(nn.Module):
    """LSTM language model: embedding -> stacked LSTM -> linear layer over the vocabulary.

    The model predicts, for every position of an input window, a score for
    each vocabulary entry. Training and evaluation helpers are bundled with
    the module; they expect dataset objects exposing
    ``get_dataloader(batch_size, shuffle=...)`` that yield ``(x, y)`` batches
    of index tensors.

    NOTE(review): the hidden state is carried from one batch to the next in
    both training and evaluation even though batches hold unrelated windows —
    confirm this is intended.
    """

    def __init__(self,vocab_size, n_hidden=256, n_layers=4,embedding_dim=200, dropout=None, lr=0.001,model_save_path='.',device='cuda'):
        """Create the layers and remember the training configuration.

        Args:
            vocab_size: number of vocabulary entries (embedding rows and
                output units).
            n_hidden: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            embedding_dim: embedding vector size.
            dropout: dropout between LSTM layers; ``None`` uses the LSTM default.
            lr: learning rate used by ``run_training``.
            model_save_path: directory in which ``run_training`` saves weights.
            device: device the model is moved to for training and evaluation.
        """
        super().__init__()
        self.dropout = dropout
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        self.model_save_path = model_save_path
        self.device = device
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if dropout is not None:
            self.rnn = nn.LSTM(embedding_dim, n_hidden, n_layers, dropout=dropout,batch_first=True)
        else:
            self.rnn = nn.LSTM(embedding_dim, n_hidden, n_layers,batch_first=True)
            # Only used below so the file name reads "_0_" instead of "_None_".
            dropout = 0
        self.fc = nn.Linear(n_hidden, vocab_size)
        self.model_name = 'GramNet_'+str(n_hidden)+'_'+str(n_layers)+'_'+str(dropout)+'_'+str(lr)+'.pt'

    def forward(self, x, hidden):
        """Score every vocabulary entry for every position of ``x``.

        Args:
            x: LongTensor of token indices, batch first.
            hidden: LSTM state tuple ``(h, c)``.

        Returns:
            (out, hidden): ``out`` has one row per input position (batch and
            sequence dimensions flattened) and ``vocab_size`` columns;
            ``hidden`` is the updated LSTM state.
        """
        embedded = self.embedding(x)
        out, hidden = self.rnn(embedded, hidden)
        # Flatten (batch, seq) so each position becomes one row for the linear layer.
        out = out.reshape(-1, self.n_hidden)
        out = self.fc(out)
        return out, hidden

    def __init_hidden(self, batch_size):
        """Return a zero ``(h, c)`` state for ``batch_size`` sequences.

        The tensors are created with the dtype and device of the model's own
        parameters, so they match wherever the model currently lives.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

    def accuracy(self,true, pred):
        """Return the fraction of positions where ``pred`` equals ``true``.

        Returns 0.0 for empty input instead of dividing by zero.
        """
        true = np.asarray(true)
        pred = np.asarray(pred)
        if true.size == 0:
            return 0.0
        return float(np.mean(true == pred))

    def run_training(self,train_dataset,valid_dataset, epochs=10, batch_size=32, clip = 1,print_every=1):
        """Train with Adam and cross-entropy, then save the weights.

        Args:
            train_dataset: dataset providing the training batches.
            valid_dataset: dataset evaluated alongside the training set.
            epochs: number of passes over the training data.
            batch_size: training batch size.
            clip: maximum gradient norm.
            print_every: report loss and accuracies every this many epochs.

        Raises:
            ValueError: if the training loader yields no batch.

        Nothing is trained (a message is printed) when ``self.device`` is 'cpu'.
        """
        device = self.device
        if str(device) == 'cpu':
            print("Training only supported in GPU environment")
            return
        torch.cuda.empty_cache()
        self.to(device)
        train_loader = train_dataset.get_dataloader(batch_size)
        if len(train_loader) == 0:
            # drop_last loaders yield nothing when the dataset is smaller than
            # one batch; fail clearly instead of hitting an undefined `loss`.
            raise ValueError(
                "train_dataset yields no complete batch of size {}".format(batch_size))
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        criterion = nn.CrossEntropyLoss()
        self.train()
        for epoch in range(epochs):
            hidden = self.__init_hidden(batch_size)
            for x, y in train_loader:
                # Cut the graph so backprop stops at the batch boundary.
                hidden = tuple(each.detach() for each in hidden)
                x, y = x.to(device), y.to(device)
                output, hidden = self(x, hidden)
                loss = criterion(output, y.view(-1))
                optimizer.zero_grad()
                loss.backward()
                clip_grad_norm_(self.parameters(), clip)
                optimizer.step()
            # Reporting is per epoch; the check previously used the leaked
            # batch index of the inner loop instead of the epoch counter.
            if epoch % print_every == 0:
                acc,_ = self.evaluate(train_dataset)
                acc2,_ = self.evaluate(valid_dataset)
                self.train()
                print("Epoch: {}/{}".format(epoch+1, epochs),
                      "Loss: {}".format(loss.item()),
                      "Training Accuracy: {}".format(acc),
                      "Validation Accuracy: {}".format(acc2))
        self.save(os.path.join(self.model_save_path,self.model_name))

    def evaluate(self, dataset, batch_size=32):
        """Compute token-level accuracy of the model on ``dataset``.

        Args:
            dataset: dataset exposing ``get_dataloader``.
            batch_size: evaluation batch size.

        Returns:
            (accuracy, preds): the fraction of correctly predicted positions
            and the flat list of predicted indices, in dataset order.
        """
        device = self.device
        self.to(device)
        self.eval()
        # No shuffling, so `preds` lines up with the dataset's own order.
        loader = dataset.get_dataloader(batch_size, shuffle=False)
        hidden = self.__init_hidden(batch_size)
        preds = []
        trues = []
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device)
                output, hidden = self(x, hidden)
                preds.extend(output.argmax(dim=1).cpu().numpy())
                trues.extend(y.view(-1).numpy())
        accuracy = self.accuracy(trues, preds)
        return accuracy, preds

    def save(self,filename):
        """Write the model's state dict to ``filename``."""
        torch.save(self.state_dict(), filename)

    def load(self,filename):
        """Load a state dict from ``filename``, mapping tensors onto ``self.device``."""
        # map_location lets a checkpoint saved on GPU be restored on CPU.
        self.load_state_dict(torch.load(filename, map_location=self.device))

In [33]:
# The embedding and output layers are sized from the training vocabulary.
vocab_size =len(train_dataset.get_vocab())
# Positional arguments: n_hidden=512, n_layers=3, embedding_dim=200, dropout=0.2.
net = GramNet(vocab_size,512,3,200,0.2)
print(net)

GramNet(
  (embedding): Embedding(9237, 200)
  (rnn): LSTM(200, 512, num_layers=3, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=512, out_features=9237, bias=True)
)


In [34]:
# Train for 5 epochs with batches of 64. run_training also evaluates on both
# splits and saves the weights under net.model_save_path when it finishes.
net.run_training(train_dataset,validation_dataset, epochs=5, batch_size=64, clip = 1,print_every=1)

Epoch: 1/5 Loss: 5.076810836791992 Training Accuracy: 0.21709235252140818 Validation Accuracy: 0.19979408914728683
Epoch: 2/5 Loss: 4.5573906898498535 Training Accuracy: 0.26259217411988584 Validation Accuracy: 0.216953326873385
Epoch: 3/5 Loss: 4.188779354095459 Training Accuracy: 0.30789624960355216 Validation Accuracy: 0.21816456718346253
Epoch: 4/5 Loss: 3.718381643295288 Training Accuracy: 0.34990733032032983 Validation Accuracy: 0.2217377260981912
Epoch: 5/5 Loss: 3.673319101333618 Training Accuracy: 0.38852878211227404 Validation Accuracy: 0.2197593669250646


In [None]:
# torch.save does not create missing directories, and nothing earlier in the
# notebook creates ./saved_models, so make sure it exists first.
os.makedirs('./saved_models', exist_ok=True)
torch.save(net.state_dict(), './saved_models/model_1.pt')

In [None]:
# The checkpoint was written from `net`, a GramNet built with
# (n_hidden=512, n_layers=3, embedding_dim=200, dropout=0.2). The same class
# and sizes are needed for the state dict to fit. `WordLSTM` is not defined
# in the visible cells of this notebook.
net2 = GramNet(len(train_dataset.get_vocab()), 512, 3, 200, 0.2)
# map_location='cpu' lets the weights be restored on a machine without a GPU.
net2.load_state_dict(torch.load('./saved_models/model_1.pt', map_location='cpu'))

In [259]:
import math

In [None]:
def get_perp(prob,n):
    """Return ``1 / exp(prob / n)``.

    Presumably ``prob`` is the natural-log probability of a sequence (such as
    the value returned by ``get_prob``) and ``n`` its length, which makes the
    result the perplexity — confirm against the caller.
    """
    mean_log_prob = (1/n) * prob
    return 1 / math.exp(mean_log_prob)

In [258]:
def predict_prob(net, tkn, trg, h=None):
    """Feed one token to ``net`` and return the probability assigned to ``trg``.

    Args:
        net: model called as ``net(inputs, h)`` and returning ``(out, h)``
            where ``out`` holds one row of vocabulary scores.
        tkn: input token (string); unknown tokens fall back to 'UNK'.
        trg: target token (string); unknown tokens fall back to 'UNK'.
        h: recurrent state from the previous step, or ``None`` to pass no
            state to the model.

    Returns:
        (h, prob): the updated recurrent state and the softmax probability of
        ``trg``.

    NOTE(review): ``token2int`` is a global that is not defined in the visible
    cells, and the dataset vocabularies use "<unk>" rather than 'UNK' —
    confirm the mapping before use.
    """
    token_id = token2int[tkn] if tkn in token2int else token2int['UNK']

    # Single-token batch of shape (1, 1), placed on the model's own device
    # rather than unconditionally on the GPU.
    device = next(net.parameters()).device
    inputs = torch.from_numpy(np.array([[token_id]])).to(device)

    # Detach the hidden state from its history. The default h=None used to
    # crash here because None is not iterable.
    if h is not None:
        h = tuple(each.detach() for each in h)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        out, h = net(inputs, h)
        p = F.softmax(out, dim=1)

    # Flatten the single row of probabilities to a 1-D array over the vocabulary.
    p = p.cpu().numpy()
    p = p.reshape(p.shape[1],)

    target_id = token2int[trg] if trg in token2int else token2int['UNK']
    prob = p[target_id]

    return h, prob
# function to calculate perplexity


In [None]:
def get_prob(net, x, y):
    """Return the natural-log probability of a token sequence under ``net``.

    The first token is scored from unigram counts
    (``word_counter[gram] / vocab_size``, or ``ind / vocab_size`` for a token
    without a count); every later step uses the model probability of
    ``y[i]`` given ``x[i]`` and the running hidden state.

    Args:
        net: trained model; it is moved to the GPU and put in eval mode.
        x: sequence of input tokens.
        y: sequence of target tokens aligned with ``x``.

    Returns:
        The sum of the per-step log probabilities (0.0 for an empty ``x``).

    NOTE(review): ``word_counter``, ``ind`` and ``net.init_hidden`` are not
    defined in the visible cells (GramNet only has a private
    ``__init_hidden``) — confirm where they come from.
    """
    # push to GPU
    net.cuda()

    net.eval()

    # batch size is 1
    h = net.init_hidden(1)

    # Accumulate in log space: multiplying the raw probabilities underflows to
    # 0 on longer sequences, after which math.log raises a ValueError.
    log_prob = 0.0

    for i, gram in enumerate(x):
        h, prob = predict_prob(net, gram, y[i], h)
        if i == 0:
            if gram in word_counter:
                step_prob = word_counter[gram]/vocab_size
            else:
                step_prob = ind/vocab_size
        else:
            step_prob = prob
        log_prob += math.log(step_prob)

    return log_prob