In [146]:
import torchtext
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split
import re
from cleantext import clean
import os
from nltk.tokenize import TweetTokenizer

In [175]:
# Preprocess the data 

tweet_tokenizer = TweetTokenizer()
tokenizer = get_tokenizer('basic_english')

def replace_dates(text):
        date_format_a = re.sub(r'\d{1,2}/\d{1,2}/\d{2,4}', ' <DATE> ', text)
        date_format_b = re.sub(
            r'[A-Za-z]{2,8}\s\d{1,2},?\s\d {4}', ' <DATE> ', date_format_a)
        date_format_c = re.sub(
            r'\d{2} [A-Z][a-z]{2,8} \d{4}', ' <DATE> ', date_format_b)
        return date_format_c

def replace_concurrent_punctuation(text):
    # replace concurrent punctuation with single punctuation
    return re.sub(r'(!|"|\#|\$|%|&|\'|\(|\)|\*|\+|,|-|\.|\/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|‘|\{|\||\}|~){2,}', r' ', text)

def replace_hash_tags(text):
        return re.sub(r'(\s|^)#(\w+)', ' <HASHTAG> ', text)

def remove_special_characters(text):
        # remove special characters other than punctuation
        return re.sub(r'[^A-Za-z0-9\s\.\,\!\?\'\"\:\;]', ' ', text)

def remove_extra_spaces(text):
        return re.sub(r'\s{2,}', ' ', text)

def replace_hyphenated_words(text):
        # replace hyphenated words with words seperated by space
        return re.sub(r'(\w+)-(\w+)', r'\1 \2', text)

def read_data(filename, n_lines):
    with open(filename, 'r') as f:
        lines = []
        for _ in range(n_lines):
            line = f.readline().strip()
            line = re.sub(r'<|>', ' ', line)
            line = replace_dates(line)
            line = replace_hyphenated_words(line)
            line = replace_hash_tags(line)
            # remove < and > from the text
            line = clean(line, no_emoji=True,
                         no_urls=True,
                         no_emails=True,
                         no_phone_numbers=True,
                         no_currency_symbols=True,           
                         replace_with_url=" <URL> ",
                         replace_with_email=" <EMAIL> ",
                         replace_with_phone_number=" <PHONE> ",
                         replace_with_currency_symbol=" <CURRENCY> ",
                         lower=True)
            line = remove_special_characters(line)
            #line = replace_concurrent_punctuation(line)
            line = clean(line,no_numbers=True,no_digits=True,no_punct=True, replace_with_number=" <NUMBER> ",replace_with_digit=" ",replace_with_punct="")
            line = "<BEGIN> " + line + " <END>"
            line = remove_extra_spaces(line)
            tokens=tokenizer(line)
            if len(tokens)>1:
                lines.append(tokens)
    return lines


def save_data(filename, lines):
    # Save the data to a file
    with open(filename, 'w')as f:
        for line in lines:
            line = ' '.join(line)
            f.write(line.strip()+'\n')




In [176]:
if not os.path.exists('./processed_data'):
    os.mkdir('processed_data')

data = read_data('data/alternate/L3Cube-HingCorpus_roman/R11_final_data/concatenated_train_final_shuffled.txt',50000)
train,valid = train_test_split(data, test_size=0.3, random_state=42)
valid,test=train_test_split(valid, test_size=0.5, random_state=42)
#print(train[1:100])
save_data('processed_data/train.txt', train)
save_data('processed_data/valid.txt', valid)
save_data('processed_data/test.txt', test)

In [None]:
#---------------------------------------------------------------------

In [177]:
import torch
from torch.utils.data import Dataset, DataLoader

In [185]:
class L3CubeDataset(Dataset):
    def __init__(self,filename,vocab=None,ngram=5):
        data = self.read_data(filename)
        if vocab is None:
            self.vocab, self.ind2vocab = self.build_vocab(data)
        else:
            self.vocab = vocab
            self.ind2vocab = {v:k for k,v in vocab.items()}
        self.n = ngram
        self.x,self.y = self.__create_dataset(data)
        
    def get_vocab(self):
        return self.vocab

    def read_data(self,filename):
        lines = []
        with open(filename, 'r') as f:
            for line in f.readlines():
                lines.append(line.strip().split(' '))
        return lines

    def build_vocab(self,data):
        word_set = set()
        for line in data:
            for word in line:
                if word not in word_set:
                    word_set.add(word)
        # sort the vocab
        word_list = sorted(list(word_set))
        vocab_dict = {"<unk>":0}
        for i,word in enumerate(word_list):
            vocab_dict[word]=i+1
        ind2word = {v:k for k,v in vocab_dict.items()}
        return vocab_dict, ind2word
    
    def get_ngram(self, tokens):
        n =self.n
        ngram = []
        if len(tokens) == 0:
            return None
        tokens = ["<begin>" for _ in range(n-2)] + tokens
        for i in range(len(tokens)-n+1):
            ngram.append(tokens[i:i+n])
        return ngram
    
    def __get_seq(self, tokens):
        vec= []
        for word in tokens:
            if word in self.vocab:
                vec.append(self.vocab[word])
            else:
                vec.append(self.vocab["<unk>"])
        return vec

    def __create_dataset(self, data):
        x = []
        y= []
        ngrams = []
        for line in data:
            ngrams.extend(self.get_ngram(line))
        
        for ngram in ngrams:
            x_tokens = ngram[:-1]
            y_tokens = ngram[1:]
            x.append(self.__get_seq(x_tokens))
            y.append(self.__get_seq(y_tokens))
        return x,y

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
    
    def get_dataloader(self, batch_size,shuffle=True):
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle)
    

In [186]:
train_dataset = L3CubeDataset('processed_data/train.txt')

In [187]:
import json
json.dump(train_dataset.get_vocab(),open('vocab.json','w'))
len(train_dataset.get_vocab())

38097

In [188]:
vocab_dict = train_dataset.get_vocab()
for word in vocab_dict:
    if len(word) > 10:
        print(word)



aajtakabpnews
aanasuspend
aapdisappointing
aarahaplease
aatishbaaji
aayegaseekh
aayengethen
aayushsalman
abandonando
abhicrackers
absolutelyplease
academician
accelerated
acchaisafai
accidentally
accommodated
accommodations
accompanied
accompanies
accompanying
accomplished
accomplishment
accomplishments
accordingly
accordinglythanks
accountability
accountable
accountants
achanakmaine
achhabengaluru
achievement
achievements
acknowledge
acknowledgement
acquaintance
acquaintances
adamyankahit
additionally
adhikariyon
adhiteliyadaa
adjustments
administered
administration
administrations
administrative
administrator
advancement
advancements
adventurous
advertisement
advertisements
advertising
advertisments
advicekabhi
aerodynamic
affiliations
afghanistan
afghanistaniyon
afraidcause
afternooncloudy
afternoondry
afternoonmostly
afternoonovercast
afternoonpartly
aggressively
agribusiness
agriculture
ahmedabadthen
akunmakasih
alaipayuthey
alcohlicasestarn
alhamdulillah
alhamdulillahpakistan
all

In [193]:
print(train_dataset.x[2],train_dataset.y[2])

[1, 1, 10964, 12919] [1, 10964, 12919, 18606]
