In [16]:
from collections import Counter
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
import fuzzywuzzy
from fuzzywuzzy import fuzz
from  nltk import word_tokenize
import torch.optim as optim
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
import io
import utils

In [None]:
!pip install fuzzywuzzy

In [None]:
!nltk.download('stopwords')

In [None]:
cuda = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

torch.manual_seed(42)

In [None]:
cuda

In [7]:
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [8]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [None]:
def get_processed_tokens_for_english_tweets():
    f = "english/agr_en_train.csv"
    # preprocessing english tweets.
    #ingesting english csv file
    df = pd.read_csv(f,names = ['source','comment','annotation'],encoding='UTF-8')
    df['comment'] = df.comment.str.strip()   # removing spaces
    comments = np.asarray(df['comment'])    # dividing the dataframe into comments and tags and converting to array
    tags = np.asarray(df['annotation'])
    print((len(comments)))
    print(len(tags))
    stop_words = set(stopwords.words('english'))  #english stop words list
    processed_tokens = []
    for comment in comments:
    # comment = "Also see ....hw ur RSS activist caught in Burkha .... throwing beef in d holy temples...https://www.google.co.in/amp/www.india.com/news/india/burkha-clad-rss-activist-caught-throwing-beef-at-temple-pictures-go-viral-on-facebook-593154/amp/,NAGfacebook_corpus_msr_403402,On the death of 2 jawans in LOC CROSS FIRING"
        comment = comment.lower()   #lower casing each tweets
        Digit_REMOVAL = re.sub(r'[0-9]+', '',comment) #removal of numbers 
        URL_REMOVAL = re.sub(r"http\S+", "", Digit_REMOVAL) # removal of URLS
        tokenizer = nltk.RegexpTokenizer(r"\w+")   # removal of punctuation and tokenizing
        new_words = tokenizer.tokenize(URL_REMOVAL)
        sentence = []
        for word in new_words:
            if word not in stop_words:           #checking for stop words on each sentence
                sentence.append(word)
        processed_tokens.append(sentence)
    return processed_tokens, tags

In [None]:
#-----------------For hinglish dataset

def get_processed_hindi_tokens():
    Hindi_text  = "agr_hi_fb_test.csv"
    df1 = pd.read_csv(Hindi_text,names = ['source','comment','annotation'],encoding='UTF-8')
    df1['comment'] = df1.comment.str.strip()   # removing spaces
    hindi_comments = np.asarray(df1['comment'])    # dividing the dataframe into comments and tags and converting to array
    hindi_tags = np.asarray(df1['annotation']) 
    processed_Hindi_tokens = []
    for comment in hindi_comments:
    #   comment = "Also see ....hw ur RSS activist caught in Burkha .... throwing beef in d holy temples...https://www.google.co.in/amp/www.india.com/news/india/burkha-clad-rss-activist-caught-throwing-beef-at-temple-pictures-go-viral-on-facebook-593154/amp/,NAGfacebook_corpus_msr_403402,On the death of 2 jawans in LOC CROSS FIRING"   #lower casing each tweets
        Digit_REMOVAL = re.sub(r'[0-9]+', '',str(comment)) #removal of numbers 
        URL_REMOVAL = re.sub(r"http\S+", "", Digit_REMOVAL) # removal of URLS
        Emoji_removal = remove_emoji(URL_REMOVAL)
        Emoji_removal = Emoji_removal.lower() 
        if (isEnglish(Emoji_removal) == True):
            Emoji_removal = re.sub(r'[^\w\s]','',Emoji_removal)# removal of punctuation and tokenizing
        processed_Hindi_tokens.append(word_tokenize(Emoji_removal))
    return processed_Hindi_tokens,hindi_tags


In [None]:
#-----------Transliteration and translation
def get_transliteration_Hinglish_Hindi_dict():
    transliteration_dict = "transliterations.hi-en.csv"
    t_dict = pd.read_csv(transliteration_dict,names = ['Hinglish','Hindi'],encoding='UTF-8',sep='\t')
    t_dict['Hinglish'] = t_dict['Hinglish'].str.strip()
    t_dict['Hindi'] = t_dict['Hindi'].str.strip()
    t_dict = np.asarray(t_dict)
    return t_dict

#--------------profanity dictionary
def get_profanity_dict():
    profanity_dict = "ProfanityText.txt"
    P_dict = pd.read_csv(profanity_dict,names = ['Hinglish','English'],encoding='UTF-8',sep='\t')
    P_dict['Hinglish'] = P_dict['Hinglish'].str.strip()
    P_dict['English'] = P_dict['English'].str.strip()
    P_dict = np.asarray(P_dict)
    return P_dict

In [None]:
#------------------Translation of hindi text back to english-------
def translate_hindi_to_english():
    Hindi_dict = "Hindi_English_dict.csv"
    H_dict = pd.read_csv(Hindi_dict,names = ['Hindi','English'],encoding='UTF-8')

    HE_dict_F = "HE_dictionary_functions.csv"
    H_dict_F = pd.read_csv(HE_dict_F,names = ['Hindi','English'],encoding='UTF-8')
    H_dict_F['Hindi'] = H_dict_F['Hindi'].str.strip()
    H_dict_F['English'] = H_dict_F['English'].str.strip()

    H_hindi_F = np.asarray(H_dict_F['Hindi'])
    H_english_F = np.asarray(H_dict_F['English'])

    H_dict['Hindi'] = H_dict['Hindi'].str.strip()
    H_dict['English'] = H_dict['English'].str.strip()

    H_hindi = np.asarray(H_dict['Hindi'])
    H_english = np.asarray(H_dict['English'])
    
    HE_dict = dict(zip(H_hindi,H_english))
    H_dict_F = dict(zip(H_hindi_F,H_english_F))

    EH_dict = {v:k for k, v in HE_dict.items()}
    EH_dict_F = {v:k for k, v in H_dict_F.items()}
    
    return HE_dict, H_dict_F, EH_dict, EH_dict_F

In [None]:
# Convert
def get_translated_hindi_english(HE_dict, H_dict_F):
    HE_dict = dict(zip(H_hindi,H_english))
    H_dict_F = dict(zip(H_hindi_F,H_english_F))
    for i in range(0,len(processed_Hindi_tokens)):
        #print(i)
        for j in range (0,len(processed_Hindi_tokens[i])):
            Str = processed_Hindi_tokens[i][j]
            if(Str in HE_dict):
                processed_Hindi_tokens[i][j] = HE_dict[Str]
            elif(Str in H_dict_F):
                processed_Hindi_tokens[i][j] = H_dict_F[Str]
    return processed_Hindi_tokens

In [None]:
def get_token_translations(processed_tokens, processed_Hindi_tokens, EH_dict, P_dict, t_dict):
    for i in range(0,len(processed_Hindi_tokens)):
        for j in range (0,len(processed_Hindi_tokens[i])):
            flag = 0
            Str1 = (processed_Hindi_tokens[i][j])
            max_ratio = 60
            max_ratio_P = 75   #needs to be adjusted
            if (Str1 in EH_dict): # check whether the values exists in english dictionary or not.
                continue
            for l in range(0,len(P_dict)):
                Str2 = P_dict[l][0]
                Ratiostr1 = fuzz.ratio(Str1,Str2)
                if (Ratiostr1 >= max_ratio_P):
                    max_ratio_P = Ratiostr1
                    flag = 1
                    processed_Hindi_tokens[i][j] = P_dict[l][1] 
                    break
            for p in EH_dict_F:
                Ratiostr1 = fuzz.ratio(Str1,str(p))
                if(Ratiostr1 >= 98):
                    flag = 1
                    break
            if (flag == 1):
                continue
            else:
                for k in range(0,len(t_dict)):
                    Str2 = t_dict[k][0]
                    Ratiostr1 = fuzz.ratio(Str1,Str2)
                    if (Ratiostr1 > max_ratio):
                        max_ratio = Ratiostr1
                        processed_Hindi_tokens[i][j] = t_dict[k][1]
    return processed_Hindi_tokens

In [None]:
def translate_hindi_back_to_English(processed_Hindi_tokens, HE_dict, H_dict_F):
    for i in range(0,len(processed_Hindi_tokens)):
        for j in range (0,len(processed_Hindi_tokens[i])):
            Str = processed_Hindi_tokens[i][j]
            if(Str in HE_dict):
                processed_Hindi_tokens[i][j] = HE_dict[Str]
            elif(Str in H_dict_F):
                processed_Hindi_tokens[i][j] = H_dict_F[Str]
    return processed_Hindi_tokens

In [17]:
def load_vectors(fname):
    path = str(fname)
    fin = io.open(path, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
    return data

In [None]:
pip install git

In [20]:
%%capture
!pip install fasttext
import fasttext
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')
ft.get_dimension()
fasttext.util.reduce_model(ft, 200)
ft.get_dimension()

In [None]:
import nltk
nltk.download('punkt')

In [None]:
#processed_tokens, tags = get_processed_tokens_for_english_tweets()
processed_Hindi_tokens = get_processed_hindi_tokens()
print(1)
t_dict = get_transliteration_Hinglish_Hindi_dict()
P_dict = get_profanity_dict()
print(3)
HE_dict, H_dict_F, EH_dict, EH_dict_F = translate_hindi_to_english()
processed_Hindi_tokens = get_translated_hindi_english(HE_dict, H_dict_F)
print(5)
processed_Hindi_tokens = get_token_translations(processed_tokens, processed_Hindi_tokens, EH_dict, P_dict, t_dict)

In [None]:
_, tags = get_processed_hindi_tokens()

In [None]:

pd.DataFrame(processed_Hindi_tokens).to_csv("ProcessedHinditokensfinal.csv")

In [None]:
processed_Hindi_tokens = pd.read_csv("ProcessedHinditokensfinal.csv",sep=",")
#processed_Hindi_tokens = translate_hindi_back_to_English(processed_Hindi_tokens, HE_dict, H_dict_F)
processed_Hindi_tokens = np.asarray(processed_Hindi_tokens)
lis = []
for i in processed_Hindi_tokens:
    lisT = []
    for j in i:
        if (str(j) == 'nan'):
            continue
        else:
            lisT.append(j)
    lis.append(lisT)
processed_Hindi_tokens = lis

In [None]:
Hindi_dict = "Hindi_English_dict.csv"
H_dict = pd.read_csv(Hindi_dict,names = ['Hindi','English'],encoding='UTF-8')

HE_dict_F = "HE_dictionary_functions.csv"
H_dict_F = pd.read_csv(HE_dict_F,names = ['Hindi','English'],encoding='UTF-8')
H_dict_F['Hindi'] = H_dict_F['Hindi'].str.strip()
H_dict_F['English'] = H_dict_F['English'].str.strip()
H_hindi_F = np.asarray(H_dict_F['Hindi'])
H_english_F = np.asarray(H_dict_F['English'])


H_dict['Hindi'] = H_dict['Hindi'].str.strip()
H_dict['English'] = H_dict['English'].str.strip()
H_hindi = np.asarray(H_dict['Hindi'])
H_english = np.asarray(H_dict['English'])

HE_dict = dict(zip(H_hindi,H_english))
H_dict_F = dict(zip(H_hindi_F,H_english_F))

EH_dict = {v:k for k, v in HE_dict.items()}
EH_dict_F = {v:k for k, v in H_dict_F.items()}

for i in range(0,len(processed_Hindi_tokens)):
    print(i)
    for j in range (0,len(processed_Hindi_tokens[i])):
        Str = processed_Hindi_tokens[i][j]
        if(Str in HE_dict):
            processed_Hindi_tokens[i][j] = HE_dict[Str]
        elif(Str in H_dict_F):
            processed_Hindi_tokens[i][j] = H_dict_F[Str]
            


In [None]:
print(len(processed_Hindi_tokens))
print((hindi_tags[0]))

In [None]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def prepare_sequence_tags(seq, to_ix):
    idxs = to_ix[seq]
    idxs = torch.tensor(idxs, dtype=torch.long)
    idxs = idxs.view(1)
    return idxs

In [39]:
def sentence_to_padded_sentence(sentence, word_to_ix):
    
    # map sentences to vocab
    sentence =  [[word_to_ix[word] for word in sent] for sent in sentence]
    # sentence now looks like:  
    # [[1, 2, 3, 4, 5, 6, 7], [8, 8], [7, 9]]
    sentence_lengths = [len(sent) for sent in sentence]
    pad_token = word_to_ix['<PAD>']
    longest_sent = max(sentence_lengths)
    batch_size = len(sentence)
    padded_sentence = np.ones((batch_size, longest_sent)) * pad_token
    for i, x_len in enumerate(sentence_lengths):
        sequence = sentence[i]
        padded_sentence[i, 0:x_len] = sequence[:x_len]
  
    return padded_sentence

In [18]:
word_to_ix = {}
ix_to_word = {}
tag_to_ix = {}
ix_to_tag = {}
word_to_ix = {"<PAD>":0}

In [None]:
training_data = utils.substitute_with_UNK(processed_tokens,1)
#training_data = utils.substitute_with_UNK(processed_tokens,word_to_ix)
# testing_data = utils.substitute_with_UNK_for_TEST(processed_tokens,word_to_ix)
# print(len(training_data))

In [21]:
english_embeddings_fasttext = ft.get_words()

In [None]:
'UNK' in english_embeddings_fasttext

In [None]:
english_embeddings_fasttext = [word.lower() for word in english_embeddings_fasttext if not word in stopwords.words('english')]
english_embeddings_fasttext = [word.lower() for word in english_embeddings_fasttext if word.isalpha()]                              

In [22]:
data_vector = dict()
word_vector_list = []

In [23]:
#For PAD
dummy_list = []
dummy_list = np.zeros(200, dtype = float)
word_vector_list.append(dummy_list)

for i, word in enumerate(english_embeddings_fasttext):
    word_to_ix[word] = len(word_to_ix)
    word_vector_list.append(list(ft.get_word_vector(word)))
#for tag in tags:
#    if tag not in tag_to_ix:
#        tag_to_ix[tag] = len(tag_to_ix)
#        ix_to_tag[tag_to_ix[tag]] = tag

# word_to_ix["UNK"] = len(word_to_ix)

#For UNK
# word_vector_list.append(dummy_list)
word_vector_list = np.asarray(word_vector_list)

In [45]:
for tag in hindi_tags:
    if tag not in tag_to_ix:
        tag_to_ix[tag] = len(tag_to_ix)
        ix_to_tag[tag_to_ix[tag]] = tag

# word_to_ix["UNK"] = len(word_to_ix)


In [46]:
tag_to_ix

{'OAG': 0, 'CAG': 1, 'NAG': 2}

In [None]:
del ft

In [None]:
# for sent in training_data:
#     for word in sent:
#         if word not in word_to_ix:
#             word_to_ix[word] = len(word_to_ix)
#             ix_to_word[word_to_ix[word]] = word
# for tag in tags:
#     if tag not in tag_to_ix:
#         tag_to_ix[tag] = len(tag_to_ix)
#         ix_to_tag[tag_to_ix[tag]] = tag

# sentence= []
# for sent in training_data:
#      sentence.append(sent[:50])

In [None]:
#training_data = utils.substitute_with_UNK_for_TEST(processed_Hindi_tokens,word_to_ix)
testing_data = utils.substitute_with_UNK_for_TEST(processed_Hindi_tokens,word_to_ix)

In [None]:
test_sentence = []
for sent in testing_data:
     test_sentence.append(sent[:50])

In [None]:
sentence= []
for sent in training_data:
     sentence.append(sent[:50])
padded_sentence = sentence_to_padded_sentence(sentence, word_to_ix)
test_padded_sentence = sentence_to_padded_sentence(test_sentence, word_to_ix)

In [None]:
test_padded_sentence = sentence_to_padded_sentence(test_sentence, word_to_ix)

In [None]:
test_padded_sentence[0]

In [24]:
class MIMCT(nn.Module):   
    def __init__(self,input_channel,vocab_size,word_to_ix,output_channel,embedding_dim,hidden_dim,kernel_size,feature_linear, word_vector_list):
        super(MIMCT, self).__init__()
        
#         self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
#         self.word_embeddings.weight.data.copy_(torch.from_numpy(word_vector_list))
        self.word_embeddings = nn.Embedding.from_pretrained(torch.from_numpy(word_vector_list))
        
        self.CNN_Layers = nn.Sequential( 
            nn.Conv1d(input_channel, output_channel,kernel_size[0], stride=1),
            nn.Conv1d(input_channel, output_channel, kernel_size[1], stride=1),
            nn.Conv1d(input_channel, output_channel, kernel_size[2], stride=1),
            nn.Flatten(),nn.Dropout(p=0.25),
            nn.Linear(feature_linear, 64),
            nn.ReLU(),
            nn.Linear(64, 3),
            nn.Softmax()
            )
        
        
        #create LSTM.
        #self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim,3)
        self.dropout = nn.Dropout(p=0.20)
        self.softmax = nn.Softmax()
        self.sigmoid = nn.Sigmoid()
        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=1)
        self.linear = nn.Linear(50+1,3)
    def forward(self,x):
      #  y = self.LSTM_Layers(x)
#         print(x)
        embeds = self.word_embeddings(x)
#         embeds = embeds.type(torch.DoubleTensor)
        embeds= embeds.float()
        embeds_cnn = embeds.view(1,embeds.size(0),embeds.size(1))
        cnn_output = self.CNN_Layers(embeds_cnn)
        
        lstm_out, _ = self.lstm(embeds.view(len(x), 1, -1))
        lstm_out= self.dropout(lstm_out)
        tag_space = self.hidden2tag(lstm_out.view(len(x), -1))
        
        lstm_output = self.sigmoid(tag_space)
        #concat the outputs the compile layer with categorical cross-entropy the loss function,
        lstm_output = lstm_output.view(lstm_output.size(0),-1)
        cnn_output = cnn_output.view(cnn_output.size(0),-1)
        X = torch.cat((lstm_output,cnn_output))
        X = X.view(1,X.size(0),X.size(1))
        X = self.maxpool(X)
        X = self.linear(X.view(X.size(2), -1))
        X = self.softmax(X)
#         print(X)
        return X

In [25]:
batch_size = 1
input_channel = 10 #vocab size
vocab_size = len(word_to_ix) 
embedding_dim = 200 
output_channel = 10
kernel_size = [20,15,10]
Feature_layer1 = embedding_dim - kernel_size[0] + 1
Feature_layer2 = Feature_layer1 - kernel_size[1] + 1
Feature_layer3 = Feature_layer2 - kernel_size[2] + 1
feature_linear = Feature_layer3 * input_channel

In [28]:
batch_size = 1
input_channel = 50 #vocab size
vocab_size = len(word_to_ix) 
embedding_dim = 200 
output_channel = 50
kernel_size = [20,15,10]
Feature_layer1 = embedding_dim - kernel_size[0] + 1
Feature_layer2 = Feature_layer1 - kernel_size[1] + 1
Feature_layer3 = Feature_layer2 - kernel_size[2] + 1
feature_linear = Feature_layer3 * input_channel

#Parameters for LSTM
hidden_dim = 128
dropout = 0.25, 
#recurrent_dropout = 0.3

model = MIMCT(input_channel,vocab_size,word_to_ix,output_channel,embedding_dim,hidden_dim,kernel_size,feature_linear, word_vector_list)
loss_function = nn.CrossEntropyLoss()
#Adam Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)
original_data = training_data
training_data = padded_sentence
 sentence1 = training_data[0]

print(len(tags))

In [None]:
len(training_data)

In [None]:
word_to_ix["UNK"]

In [None]:
# %%capture cap --no-stderr

for epoch in range(10):  # running for 20 epoch
    train_sentence_losses = []
    dev_sentence_losses = []
    print("Epoch:", epoch)
    print("\nTraining Phase:")
    #for sentence in training_data:
    for index, sentence in enumerate(training_data):
        model.zero_grad()
        targets = tags[index]
        sentence_in = torch.tensor(sentence, dtype=torch.long)
        targets = prepare_sequence_tags(targets, tag_to_ix)
        tag_scores = model(sentence_in)
        loss = loss_function(tag_scores.cpu(), targets.cpu())
        train_sentence_losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if(index == 6000):
            print(index)
    final_train_loss = sum(train_sentence_losses)/len(train_sentence_losses)
    print("Train Loss:", final_train_loss)
    
# with open('epoch_cell_output.txt', 'w') as f:
#     f.write(cap.stdout)

In [None]:
torch.save(model.state_dict(), "./model_v3_Hinglish.pth")

In [29]:
model.load_state_dict(torch.load("./model_v3_Hinglish.pth"))

<All keys matched successfully>

In [None]:
epoch

In [None]:
processed_Hindi_tokens

import csv

with open("hindi_tokens_translated_to_English_list.csv", "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(processed_Hindi_tokens)

In [None]:
def get_processed_testing_tokens_for_english_tweets():
    f = "agr_en_fb_test.csv"
    # preprocessing english tweets.
    #ingesting english csv file
    df = pd.read_csv(f,names = ['source','comment','annotation'],encoding='UTF-8')
    df['comment'] = df.comment.str.strip()   # removing spaces
    comments = np.asarray(df['comment'])    # dividing the dataframe into comments and tags and converting to array
    tags = np.asarray(df['annotation'])
    print((len(comments)))
    print(len(tags))
    stop_words = set(stopwords.words('english'))  #english stop words list
    processed_tokens = []
    for comment in comments:
    # comment = "Also see ....hw ur RSS activist caught in Burkha .... throwing beef in d holy temples...https://www.google.co.in/amp/www.india.com/news/india/burkha-clad-rss-activist-caught-throwing-beef-at-temple-pictures-go-viral-on-facebook-593154/amp/,NAGfacebook_corpus_msr_403402,On the death of 2 jawans in LOC CROSS FIRING"
        comment = comment.lower()   #lower casing each tweets
        Digit_REMOVAL = re.sub(r'[0-9]+', '',comment) #removal of numbers 
        URL_REMOVAL = re.sub(r"http\S+", "", Digit_REMOVAL) # removal of URLS
        tokenizer = nltk.RegexpTokenizer(r"\w+")   # removal of punctuation and tokenizing
        new_words = tokenizer.tokenize(URL_REMOVAL)
        sentence = []
        for word in new_words:
            if word not in stop_words:           #checking for stop words on each sentence
                sentence.append(word)
        processed_tokens.append(sentence)
    return processed_tokens

In [None]:
processed_tokens = get_processed_testing_tokens_for_english_tweets()

In [36]:
testing_data = utils.substitute_with_UNK_for_TEST(processed_Hindi_tokens,word_to_ix)

In [37]:
test_sentence = []
for sent in testing_data:
     test_sentence.append(sent[:50])

In [40]:
test_padded_sentence = sentence_to_padded_sentence(test_sentence, word_to_ix)

In [47]:
testing_data = test_padded_sentence
with torch.no_grad():
	# this will be the file to write the outputs
    with open("mymodel_output_Hinglish.txt", 'w',encoding='UTF-8') as op:
        for instance in testing_data:
            # Convert the test sentence into a word ID tensor
            test_sentence_in=torch.tensor(instance, dtype=torch.long)

            tag_scores = model(test_sentence_in)

            # Find the tag with the highest probability in each position
            outputs = [int(np.argmax(ts)) for ts in tag_scores.detach().numpy()]
            # Prepare the output to be written in the same format as the test file (word|tag)
            formatted_output = ix_to_tag[outputs[0]]
            # Write the output
            op.write(formatted_output + '\n')
            
            print(outputs)
        print(len(test_data))



[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[0]
[0]
[1]
[1]
[1]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[2]
[0]
[0]
[0]
[1]
[1]
[0]
[0]
[1]
[1]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[1]
[0]
[0]
[0]
[1]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[1]
[1]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[1]
[1]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[1]
[0]
[0]
[1]
[1]
[0]
[0]


NameError: name 'test_data' is not defined

In [30]:
Hindi_dict = "Hindi_English_dict.csv"
H_dict = pd.read_csv(Hindi_dict,names = ['Hindi','English'],encoding='UTF-8')

HE_dict_F = "HE_dictionary_functions.csv"
H_dict_F = pd.read_csv(HE_dict_F,names = ['Hindi','English'],encoding='UTF-8')
H_dict_F['Hindi'] = H_dict_F['Hindi'].str.strip()
H_dict_F['English'] = H_dict_F['English'].str.strip()
H_hindi_F = np.asarray(H_dict_F['Hindi'])
H_english_F = np.asarray(H_dict_F['English'])


H_dict['Hindi'] = H_dict['Hindi'].str.strip()
H_dict['English'] = H_dict['English'].str.strip()
H_hindi = np.asarray(H_dict['Hindi'])
H_english = np.asarray(H_dict['English'])

HE_dict = dict(zip(H_hindi,H_english))
H_dict_F = dict(zip(H_hindi_F,H_english_F))

EH_dict = {v:k for k, v in HE_dict.items()}
EH_dict_F = {v:k for k, v in H_dict_F.items()}

In [43]:
Hindi_text  = "hindi/agr_hi_train.csv"
df1 = pd.read_csv(Hindi_text,names = ['source','comment','annotation'],encoding='UTF-8')
df1['comment'] = df1.comment.str.strip()   # removing spaces
hindi_comments = np.asarray(df1['comment'])    # dividing the dataframe into comments and tags and converting to array
hindi_tags = np.asarray(df1['annotation'])
print((hindi_comments[1])) 
processed_Hindi_tokens = []
for comment in hindi_comments:
#    comment = "Also see ....hw ur RSS activist caught in Burkha .... throwing beef in d holy temples...https://www.google.co.in/amp/www.india.com/news/india/burkha-clad-rss-activist-caught-throwing-beef-at-temple-pictures-go-viral-on-facebook-593154/amp/,NAGfacebook_corpus_msr_403402,On the death of 2 jawans in LOC CROSS FIRING"
#    comment = comment.lower()   #lower casing each tweets
    Digit_REMOVAL = re.sub(r'[0-9]+', '',str(comment)) #removal of numbers 
    URL_REMOVAL = re.sub(r"http\S+", "", Digit_REMOVAL) # removal of URLS
    Emoji_removal = remove_emoji(URL_REMOVAL)
    Emoji_removal = Emoji_removal.lower()
    if (isEnglish(Emoji_removal) == True):
        Emoji_removal = re.sub(r'[^\w\s]','',Emoji_removal)# removal of punctuation and tokenizing
    processed_Hindi_tokens.append(word_tokenize(Emoji_removal))
processed_Hindi_tokens[0]
processed_Hindi_tokens[11]
processed_Hindi_tokens[6]
processed_Hindi_tokens[11]


Bhai 60sal pehle desh me kya tha pehle pta kro desh waise ka waise kaise hai


['विदिशा', 'से', 'बीजेपी', 'की', 'सुषमा', 'स्वराज', 'आगे']

In [44]:
hindi_tags

array(['OAG', 'CAG', 'CAG', ..., 'OAG', 'NAG', 'CAG'], dtype=object)

In [31]:



#-----------Transliteration and translation
transliteration_dict = "transliterations.hi-en.csv"
t_dict = pd.read_csv(transliteration_dict,names = ['Hinglish','Hindi'],encoding='UTF-8',sep='\t')
t_dict['Hinglish'] = t_dict['Hinglish'].str.strip()
t_dict['Hindi'] = t_dict['Hindi'].str.strip()
t_dict = np.asarray(t_dict)

#--------------profanity dictionary
profanity_dict = "ProfanityText.txt"
P_dict = pd.read_csv(profanity_dict,names = ['Hinglish','English'],encoding='UTF-8',sep='\t')
P_dict['Hinglish'] = P_dict['Hinglish'].str.strip()
P_dict['English'] = P_dict['English'].str.strip()
P_dict = np.asarray(P_dict)


print(t_dict)
processed_Hindi_tokens[4]
for i in range(0,len(processed_Hindi_tokens)):
    print(i)
    for j in range (0,len(processed_Hindi_tokens[i])):
        flag = 0
        Str1 = (processed_Hindi_tokens[i][j])
        max_ratio = 60
        max_ratio_P = 75   #needs to be adjusted
        if (Str1 in EH_dict): # check whether the values exists in english dictionary or not.
            continue;
        for l in range(0,len(P_dict)):
            Str2 = P_dict[l][0]
            Ratiostr1 = fuzz.ratio(Str1,Str2)
            if (Ratiostr1 >= max_ratio_P):
                print(Ratiostr1)
                max_ratio_P = Ratiostr1
                processed_Hindi_tokens[i][j] = P_dict[l][1]
                flag = 1 
                print(flag)
                break;
        for p in EH_dict_F:
            Ratiostr1 = fuzz.ratio(Str1,str(p))
            if(Ratiostr1 >= 98):
                flag = 1
                break;
        if (flag == 1):
            continue;
        
        else:
            for k in range(0,len(t_dict)):
                Str2 = t_dict[k][0]
                Ratiostr1 = fuzz.ratio(Str1,Str2)
                if (Ratiostr1 > max_ratio):
                    max_ratio = Ratiostr1
                    processed_Hindi_tokens[i][j] = t_dict[k][1]
processed_Hindi_tokens[0]
processed_Hindi_tokens[1]
(processed_tokens[12]) 

Badnam Gujat kutte ki olaad madrchod Pakistani kutto suar ki olaad madrchod Pakistani kutto suar ki olaad
[['hajagiree' 'हजगिरी']
 ['chekaanv' 'चेकॉव']
 ['spinagaarn' 'स्पिनगार्न']
 ...
 ['bar' 'वार']
 ['leonard' 'लियोनार्ड']
 ['gurudwar' 'गुरूद्वारा']]
0
1
80
1
84
1
80
1
84
1
80
1
2
77
1
3
89
1
89
1
77
1
75
1
80
1
75
1
75
1
4
5
6
80
1
80
1
80
1
7
75
1
80
1
8
82
1
82
1
100
1
80
1
75
1
9
10
80
1
75
1
11
12
13
80
1
89
1
14
80
1
15
16
77
1
75
1
17
18
75
1
75
1
75
1
75
1
19
89
1
20
21
75
1
22
75
1
80
1
23
24
25
80
1
77
1
26
27
75
1
75
1
75
1
28
75
1
29
89
1
80
1
75
1
77
1
77
1
75
1
30
75
1
80
1
31
32
75
1
77
1
80
1
33
34
86
1
35
75
1
75
1
80
1
89
1
36
75
1
86
1
37
89
1
38
39
77
1
75
1
77
1
77
1
75
1
80
1
75
1
75
1
77
1
83
1
80
1
80
1
80
1
75
1
80
1
75
1
75
1
75
1
80
1
75
1
80
1
77
1
80
1
75
1
75
1
75
1
77
1
89
1
75
1
75
1
75
1
40
75
1
75
1
75
1
75
1
41
42
43
44
80
1
45
77
1
46
86
1
47
48
80
1
75
1
75
1
80
1
75
1
80
1
75
1
80
1
49
77
1
50
51
52
80
1
53
54
83
1
55
75
1
75
1
56
57
75
1
75
1
5

75
1
675
676
75
1
75
1
677
89
1
75
1
75
1
75
1
678
679
680
100
1
100
1
80
1
80
1
75
1
681
80
1
80
1
91
1
682
89
1
683
77
1
684
685
686
687
75
1
688
689
75
1
75
1
75
1
75
1
690
75
1
77
1
77
1
75
1
691
75
1
80
1
692
80
1
75
1
75
1
75
1
75
1
80
1
80
1
89
1
75
1
80
1
75
1
693
694
75
1
80
1
75
1
695
89
1
80
1
75
1
696
697
698
75
1
699
700
701
89
1
89
1
92
1
702
703
89
1
704
77
1
92
1
75
1
705
706
75
1
75
1
707
77
1
100
1
708
86
1
80
1
86
1
75
1
75
1
86
1
89
1
709
75
1
80
1
75
1
80
1
75
1
91
1
75
1
75
1
100
1
75
1
75
1
75
1
75
1
710
75
1
80
1
80
1
91
1
80
1
711
80
1
712
76
1
80
1
75
1
89
1
75
1
713
89
1
80
1
75
1
80
1
77
1
75
1
100
1
89
1
714
715
75
1
716
80
1
86
1
717
718
719
720
77
1
75
1
75
1
721
89
1
100
1
75
1
75
1
89
1
89
1
78
1
722
75
1
723
75
1
80
1
75
1
89
1
724
83
1
75
1
725
77
1
726
80
1
80
1
727
728
75
1
75
1
83
1
729
730
75
1
731
732
80
1
80
1
83
1
100
1
733
75
1
734
735
77
1
80
1
736
737
738
75
1
100
1
80
1
77
1
75
1
739
77
1
77
1
77
1
740
86
1
741
77
1
83
1
742
743
744
745
75


NameError: name 'processed_tokens' is not defined

In [None]:






Hindi_dict = "Hindi_English_dict.csv"
H_dict = pd.read_csv(Hindi_dict,names = ['Hindi','English'],encoding='UTF-8')

HE_dict_F = "HE_dictionary_functions.csv"
H_dict_F = pd.read_csv(HE_dict_F,names = ['Hindi','English'],encoding='UTF-8')
H_dict_F['Hindi'] = H_dict_F['Hindi'].str.strip()
H_dict_F['English'] = H_dict_F['English'].str.strip()
H_hindi_F = np.asarray(H_dict_F['Hindi'])
H_english_F = np.asarray(H_dict_F['English'])


H_dict['Hindi'] = H_dict['Hindi'].str.strip()
H_dict['English'] = H_dict['English'].str.strip()
H_hindi = np.asarray(H_dict['Hindi'])
H_english = np.asarray(H_dict['English'])

HE_dict = dict(zip(H_hindi,H_english))
H_dict_F = dict(zip(H_hindi_F,H_english_F))

EH_dict = {v:k for k, v in HE_dict.items()}
EH_dict_F = {v:k for k, v in H_dict_F.items()}


for i in range(0,len(processed_Hindi_tokens)):
    print(i)
    for j in range (0,len(processed_Hindi_tokens[i])):
        Str = processed_Hindi_tokens[i][j]
        if(Str in HE_dict):
            processed_Hindi_tokens[i][j] = HE_dict[Str]
        elif(Str in H_dict_F):
            processed_Hindi_tokens[i][j] = H_dict_F[Str]
            

In [35]:
processed_Hindi_tokens[3]

['क्या',
 'donkey',
 'है',
 'जी',
 'जब',
 'की',
 'पाम',
 'इब्न',
 'breed',
 'है',
 'to',
 'कितना',
 'pressure',
 'राहत',
 'है',
 'की',
 'वही',
 'animal',
 'है',
 'कच्छ',
 'न',
 'कच्छ',
 'होगा',
 'bug',
 'वार',
 'जी',
 'बस',
 'आप',
 'log',
 'और',
 'support',
 'करे',
 'लोग',
 'को',
 'बाटा',
 'आप',
 'खुर्द',
 'idiot',
 'लईक',
 'लोग',
 'रहे',
 'है',
 'क्या',
 'मिलोंगा',
 'कश्मीर',
 'पाक',
 'वाले',
 'का',
 'हो',
 'जाएगा',
 'to',
 'आप',
 'को',
 'stinking',
 'है',
 'न',
 'son-in-law',
 'hind']