p1.py

# -*- coding: utf-8 -*-
"""CS689-Project-Final.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/16EgSoX3bi0UcjPhcl-97i4RJgDif4vDz

# **Morphological Analysis of Hindi Text**

**Dependencies:** `numpy, conllu, FastText, pandas, Keras, pyconll, sys, collections`

Authors :
Abhinav Kuruma 22111401,
Sanket Kale 22111052, 
Saqeeb 22111053

## POS tag with HMM and Viterbi Algorithm
"""

from google.colab import drive
drive.mount('/content/drive')

!pip install conllu

import sys
import math
import codecs
from io import open
from decimal import *
from conllu import parse
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt

"""### Getting the data in correct format"""

def getTrainData(path="/content/drive/MyDrive/NLPassn1/hi_hdtb-ud-train.conllu"):
    """Load a CoNLL-U treebank as lists of ``form/xpos`` strings.

    Args:
        path: CoNLL-U file to read. Defaults to the Hindi HDTB training file
            used throughout this script.

    Returns:
        One list per sentence; each element is ``"<form>/<xpos>"`` for a token.
    """
    # The treebank is Devanagari text: decode it explicitly as UTF-8 instead
    # of relying on the platform's default encoding.
    with open(path, 'r', encoding="utf-8") as file:
        parsed_sentences = parse(file.read())

    return [
        [token['form'] + '/' + token['xpos'] for token in sent]
        for sent in parsed_sentences
    ]

# Parse the training treebank once; the counting loops below read train_data.
train_data = getTrainData()

# Containers for the tag inventory, per-tag frequencies and the vocabulary.
tag_list = set()
tag_count = {}
word_set = set()

# Print one sentence (the 300th from the end) to inspect the "form/tag" format.
print(train_data[-300])

"""### Calculating Trainsition Probability"""

# Start from empty containers so the pass can be re-run safely.
tag_list.clear()
word_set.clear()
tag_count.clear()
transition_dict = {}
for tagged_sentence in train_data:
    # Every sentence begins in the artificial "start" state.
    prev_tag = "start"
    for token in tagged_sentence:
        # Split on the LAST "/" so that forms containing "/" stay intact.
        surface, _, pos = token.rpartition("/")
        word_set.add(surface.lower())
        tag_list.add(pos)
        # Frequency of each tag.
        tag_count[pos] = tag_count.get(pos, 0) + 1
        # Frequency of each (previous tag, tag) pair.
        bigram = prev_tag + "~tag~" + pos
        transition_dict[bigram] = transition_dict.get(bigram, 0) + 1
        prev_tag = pos
print(transition_dict)

# Total number of transitions leaving each tag, gathered in a single pass so
# that the normalisation below is linear in the number of observed bigrams.
outgoing_total = {}
for key, count in transition_dict.items():
    source = key.split("~tag~")[0]
    outgoing_total[source] = outgoing_total.get(source, 0) + count

# P(tagB | tagA) = count(tagA -> tagB) / count(tagA -> any tag)
prob_dict = {
    key: Decimal(count) / outgoing_total[key.split("~tag~")[0]]
    for key, count in transition_dict.items()
}
print(prob_dict)

# Give every tag combination that was never observed a small, non-zero
# probability so that the decoder can still score it.
for tag in tag_list:
    # Only tags never seen at the start of a sentence need a fallback. The
    # lookup has to use the same "start~tag~<tag>" key format as prob_dict;
    # a key without the "~tag~" separator never matches, which would replace
    # every observed start probability with the minimum value.
    if ("start" + "~tag~" + tag) not in prob_dict:
        prob_dict[("start" + "~tag~" + tag)] = Decimal(1) / Decimal(len(word_set) + tag_count[tag])
for tag1 in tag_list:
    for tag2 in tag_list:
        # Unseen tag1 -> tag2 transition: assign the minimum probability.
        if (tag1 + "~tag~" + tag2) not in prob_dict:
            prob_dict[(tag1 + "~tag~" + tag2)] = Decimal(1) / Decimal(len(word_set) + tag_count[tag1])
print(prob_dict)

# Total of 992 POS tag sequence (32 POS with start symbol * 31 POS tag)
len(prob_dict)

"""### Calculating Emission Probability """

# Frequency of every (lower-cased word, tag) pair in the training data,
# keyed as "<word>/<tag>".
count_word = {}
for tagged_sentence in train_data:
    for token in tagged_sentence:
        # Split on the last "/" only; the form itself may contain "/".
        surface, _, pos = token.rpartition("/")
        pair = surface.lower() + "/" + pos
        count_word[pair] = count_word.get(pair, 0) + 1
print(count_word)

# P(word | tag) = count(word, tag) / count(tag); the tag is the part of the
# key after the last "/".
emission_prob_dict = {
    pair: Decimal(freq) / tag_count[pair.split("/")[-1]]
    for pair, freq in count_word.items()
}

print(emission_prob_dict)

"""### Implementing Viterbi Algorithm"""

# Bare expression: it only evaluates the names (failing early with NameError
# if one of them has not been defined yet) and has no other effect.
tag_list, prob_dict, emission_prob_dict, tag_count, word_set
# Module-level aliases for the transition and emission tables.
transition_prob = prob_dict
emission_prob = emission_prob_dict

def viterbi_algorithm(sentence, tag_list, transition_prob, emission_prob,tag_count, word_set):
    """Return the most probable tag sequence for *sentence* under a bigram HMM.

    Args:
        sentence: Whitespace-separated words; they are lower-cased for lookup.
        tag_list: Iterable of all tags.
        transition_prob: Mapping ``"<prev>~tag~<tag>"`` -> probability, with
            ``"start"`` as the pseudo-tag before the first word.
        emission_prob: Mapping ``"<word>/<tag>"`` -> probability.
        tag_count: Mapping tag -> training frequency (used for unknown words).
        word_set: Set of lower-cased words seen in training.

    Returns:
        The tags joined by single spaces, one per word ("" for an empty
        sentence).

    Raises:
        KeyError: If a needed ``"<prev>~tag~<tag>"`` entry is missing from
            *transition_prob*.
    """
    word_list = [token.lower() for token in sentence.split()]
    if not word_list:
        return ""
    vocab_size = len(word_set)

    # --- first word: start transition times emission ---
    first = word_list[0]
    first_known = first in word_set
    current_prob = {}
    for tag in tag_list:
        tp = Decimal(transition_prob.get("start~tag~" + tag, 0))
        if not first_known:
            # Unknown first word: the score is the start transition alone.
            current_prob[tag] = tp
        elif first + "/" + tag in emission_prob:
            # Known word: only tags that actually emitted it stay candidates.
            current_prob[tag] = tp * Decimal(emission_prob[first + "/" + tag])

    # --- remaining words: keep, per tag, the best predecessor ---
    # backpointers[k][tag] is the best tag of the previous word given that
    # word k+1 carries `tag`. Explicit dictionaries are used because writes
    # through locals() are not guaranteed to persist inside a function.
    backpointers = []
    for current_word in word_list[1:]:
        previous_prob, current_prob = current_prob, {}
        pointers = {}
        known = current_word in word_set
        for tag in tag_list:
            if known:
                pair = current_word + "/" + tag
                if pair not in emission_prob:
                    continue
                em = Decimal(emission_prob[pair])
            else:
                # Unknown word: small tag-dependent emission probability.
                em = Decimal(1) / (tag_count[tag] + vocab_size)
            best_prob, best_prev = max(
                (
                    Decimal(previous_prob[prev])
                    * Decimal(transition_prob[prev + "~tag~" + tag])
                    * em,
                    prev,
                )
                for prev in previous_prob
            )
            current_prob[tag] = best_prob
            pointers[tag] = best_prev
        backpointers.append(pointers)

    # --- backtrack from the best final tag along its own back-pointers ---
    path = [max(current_prob, key=current_prob.get)]
    for pointers in reversed(backpointers):
        path.append(pointers[path[-1]])
    path.reverse()
    return " ".join(path)

"""### Testing """

# Names under which printPOS and predSentPOS read the trained probability tables.
transition_model = prob_dict
emission_model = emission_prob_dict

def printPOS(sentence1):
    """Tag *sentence1* with the Viterbi decoder and print ``word->tag`` pairs on one line."""
    decoded = viterbi_algorithm(sentence1, tag_list, transition_model, emission_model, tag_count, word_set)
    tokens = sentence1.split()
    labels = decoded.split(" ")
    if not tokens:
        # Nothing to print for a sentence without words.
        return
    # Index explicitly so that a too-short tag list still raises IndexError.
    pairs = [tokens[idx] + "->" + labels[idx] for idx in range(len(tokens))]
    print(" ".join(pairs))

sentence = '''यहाँ लगने वाला तीन दिन का इज्तिमा पूरे देश के लोगों को आमंत्रित करता है ।'''
printPOS(sentence)

test_data = []
# Decode the Devanagari treebank explicitly as UTF-8 rather than with the
# platform's default encoding.
with open("/content/drive/MyDrive/NLPassn1/hi_hdtb-ud-dev.conllu", 'r', encoding="utf-8") as file:
    temp = file.read()
parsesentence = parse(temp)

def predSentPOS(strSent):
    """Return the decoded tags for the whitespace-separated words of *strSent*.

    Returns:
        A list of tags (the first ``len(words)`` decoded tags), or ``None``
        when *strSent* contains no words.
    """
    decoded = viterbi_algorithm(strSent, tag_list, transition_model, emission_model, tag_count, word_set)
    n_words = len(strSent.split())
    labels = decoded.split(" ")
    if n_words == 0:
        return None
    # Index explicitly so that a too-short tag list still raises IndexError.
    return " ".join([labels[idx] for idx in range(n_words)]).split()
  
def getTestPred():
    """Collect gold and predicted tags for every sentence in ``parsesentence``.

    Returns:
        ``(ytest, ypred)``: flat lists holding the gold ``xpos`` tags and the
        tags returned by ``predSentPOS``, in sentence order.
    """
    gold_tags, predicted_tags = [], []
    for sent in parsesentence:
        gold_tags += [token['xpos'] for token in sent]
        # The decoder takes a plain space-separated sentence; each form is
        # followed by a space, so the string ends with a trailing space.
        joined = "".join(token['form'] + " " for token in sent)
        predicted_tags.extend(predSentPOS(joined))
    return gold_tags, predicted_tags
# Decode every sentence of the parsed dev file: ytest holds the gold tags,
# ypred the predicted ones.
ytest, ypred = getTestPred()

# Token-level accuracy and macro-averaged precision, both scaled to percent.
print(accuracy_score(ytest, ypred)*100)
print(precision_score(ytest, ypred,average='macro')*100)

"""## Analysis of Gender Case Number in Hindi

### Setting up Dependencies
"""

!pip install pyconll

import pyconll
import pandas as pd
from collections import defaultdict

corpus = pyconll.load_from_file("/content/drive/MyDrive/NLPassn1/hi_hdtb-ud-train.conllu")

# Number of distinct surface forms in the training treebank.
s = set()
for sentence in corpus:
    s.update(token.form for token in sentence)
print(len(s))

word_gender = {}
word_number = {}
word_case = {}

# For each feature, remember the value attached to the FIRST occurrence of
# every surface form; later occurrences of the same form are ignored.
for sentence in corpus:
    for token in sentence:
        for feature, table in (
            ('Gender', word_gender),
            ('Number', word_number),
            ('Case', word_case),
        ):
            if feature in token.feats and token.form not in table:
                table[token.form] = list(token.feats[feature])[0]

print(len(word_gender))
print(len(word_number))
print(len(word_case))

"""### Rules based approach for Gender, Number, Case"""

test_gender = {}
test_number = {}
test_case = {}

# Word endings treated as feminine / plural markers. They are matched with
# str.endswith, so multi-character endings such as "ों" and "ें" really take
# part in the test (comparing them with a single last character never matches),
# and an empty form simply matches nothing.
FEMININE_ENDINGS = ("ी", "ि", "ियाँ", "ियां")
PLURAL_ENDINGS = ("ियाँ", "ियां", "ियों", "ो", "े", "ों", "ें", "ओं", "ाओं", "नों", "यो")

for sentence in corpus:
    for token in sentence:
        form = token.form
        # The rules depend on the form only, so one decision per form suffices.
        if 'Gender' in token.feats and form not in test_gender:
            test_gender[form] = 'Fem' if form.endswith(FEMININE_ENDINGS) else 'Masc'
        if 'Number' in token.feats and form not in test_number:
            test_number[form] = 'Plur' if form.endswith(PLURAL_ENDINGS) else 'Sing'
        # No rule for Case: the annotated value is copied.
        if 'Case' in token.feats and form not in test_case:
            test_case[form] = list(token.feats['Case'])[0]

print(len(test_gender))
print(len(test_number))
print(test_number)

"""Calculating Accuracy"""

# Share of forms whose rule-based gender equals the annotated gender.
total = len(test_gender)
correct = sum(1 for form in word_gender if word_gender[form] == test_gender[form])
accuracy = correct / total
print("Accuracy of Gender: ", accuracy * 100)

# Same measurement for number.
total = len(test_number)
correct = sum(1 for form in word_number if word_number[form] == test_number[form])
accuracy = correct / total
print("Accuracy of Number: ", accuracy * 100)

"""### Deep Learning Based Prediction of Gender, Number, Case

**Dependencies:** `FastText, Keras, `
"""

!git clone https://github.com/facebookresearch/fastText.git

!cd fastText

!pip install fastText

import fasttext.util
# Download the Hindi model; if_exists='ignore' presumably skips the download
# when the file is already present — confirm against the fastText docs.
fasttext.util.download_model('hi', if_exists='ignore') 
model = fasttext.load_model('cc.hi.300.bin')

import numpy as np
from fasttext.FastText import _FastText

# NOTE(review): this rebinds `model`, so the same .bin file is loaded a second
# time; presumably only one of the two loads is needed — confirm.
model = _FastText(model_path='cc.hi.300.bin')

"""**Deep Learning Model for Number Prediction**"""

# X = FastText embedding vector of each word
# y = Number label of each word (same order as X)
y = list(word_number.values())
X = [np.array(model.get_word_vector(form)) for form in word_number]

# Encode the labels: Sing -> 0, every other value (Plur) -> 1
Y = np.array([0 if label == 'Sing' else 1 for label in y])

# Stack the per-word vectors into one array
X = np.array(X)
print(X.shape,Y.shape)

# train_test_split is imported here, ahead of its first use, so that the
# script also runs top to bottom outside a notebook session.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

# Hold out 20% of the words for testing, keeping the label ratio (stratify).
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=0)

# One hidden layer over the 300-dimensional input, sigmoid output for the
# binary Sing/Plur label.
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=300))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(X_train, Y_train, epochs=40, batch_size=10, validation_split=0.1)

test_loss,test_acc = model.evaluate(X_test, Y_test)
print("Loss: ",test_loss)
print("Accuracy: ",test_acc*100)

"""**Deep Learning Model for Gender prediction**"""

# `model` was rebound to the Keras network above, so load the embeddings again.
model = _FastText(model_path='cc.hi.300.bin')

# X = FastText embedding vector of each word
# y = Gender label of each word (same order as X)
y = list(word_gender.values())
X = np.array([np.array(model.get_word_vector(form)) for form in word_gender])

# Encode the labels: Masc -> 0, every other value (Fem) -> 1
Y = np.array([0 if label == 'Masc' else 1 for label in y])
print(X.shape,Y.shape)

# Hold out 20% of the words for testing, keeping the label ratio (stratify).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=1
)

# One hidden layer over the 300-dimensional input, sigmoid output for the
# binary Masc/Fem label.
model = Sequential([
    Dense(128, activation='relu', input_dim=300),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(X_train, Y_train, epochs=40, batch_size=10, validation_split=0.1)

test_loss, test_acc = model.evaluate(X_test, Y_test)
print("Loss: ",test_loss)
print("Accuracy: ",test_acc)

"""**Deep Learning Model for Case Prediction**"""

# Number of distinct Case labels found in the data
aaa = set(word_case.values())
num_classes = len(aaa)
print(num_classes)

# `model` was rebound to a Keras network above, so load the embeddings again.
model = _FastText(model_path='cc.hi.300.bin')

# Fixed integer id for every Case label the classifier is trained on.
case_ids = {'Erg': 0, 'Nom': 1, 'Ine': 2, 'Ins': 3, 'Dat': 4, 'Gen': 5, 'Acc': 6}

# X = FastText embedding vector of each word
# y = Case label of each word
# Y = integer id of that label
# A word whose label has no id is left out of all three lists; otherwise Y
# would end up shorter than X and the rows would no longer line up.
X = []
y = []
Y = []
for key, label in word_case.items():
    if label not in case_ids:
        continue
    X.append(np.array(model.get_word_vector(key)))
    y.append(label)
    Y.append(case_ids[label])

X = np.array(X)
Y = np.array(Y)
X.shape,Y.shape

# Hold out 20% of the words for testing, keeping the label ratio (stratify).
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=1)

# One-hot encode the integer labels
Y_train = to_categorical(Y_train, num_classes)
Y_test = to_categorical(Y_test, num_classes)
print(Y_train.shape,Y_test.shape)

model = Sequential()
model.add(Dense(128, activation='relu', input_dim=300))
# One softmax unit per class; sized from num_classes so that the output width
# always equals the width of the one-hot targets built above.
model.add(Dense(num_classes, activation='softmax'))
# The classes are mutually exclusive and the targets one-hot, which calls for
# categorical cross-entropy (binary cross-entropy scores each unit separately).
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
hist = model.fit(X_train, Y_train, epochs=30, batch_size=32, validation_split=0.1)

test_loss,test_acc = model.evaluate(X_test, Y_test)
print("Loss: ",test_loss)
print("Accuracy: ", test_acc)

"""## Rule based Lemmatizer """

import sys
import conllu
from conllu import parse

# Read the training treebank. The with-block closes the handle, and the handle
# gets its own name so the imported `conllu` module is not shadowed.
with open('/content/drive/MyDrive/NLPassn1/hi_hdtb-ud-train.conllu','r', encoding="utf-8") as treebank_file:
    annotations = treebank_file.read()
sentences = parse(annotations)

# Only the first 100000 characters of the raw text are read; undecodable
# bytes are dropped (errors='ignore').
with open("/content/drive/MyDrive/NLPassn1/output.txt","r", encoding="utf-8", errors = 'ignore') as text_file:
    text = text_file.read(100000)

# Characters to strip from the raw text: Latin letters, digits, punctuation
# and a few invisible/whitespace characters.
english=['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r',u'।',"'",'"','“','”','`','s','t','u','v','w','x','y','z','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','1','2','3','4','5','6','7','8','9','0','!','@','#','$','%','^','&','*','(',')','_','+','=','-','{','}','[',']',',','.','?',':','"',';','\n','\u200c','/','\xa0','...']
# DATA CLEANING: delete every listed character in a single pass. Removing
# characters while indexing over the original length would run past the end
# of the shortened string. Only single characters can be deleted this way;
# the multi-character '...' entry is already covered by '.'.
noise_table = str.maketrans('', '', ''.join(ch for ch in english if len(ch) == 1))
text = text.translate(noise_table)
#print(text)

# Parallel lists of every token's surface form and lemma, in corpus order.
word = [token['form'] for sentence in sentences for token in sentence]
lemma = [token['lemma'] for sentence in sentences for token in sentence]
# form -> lemma; when a form occurs several times its last lemma wins.
word_lemma_dict = dict(zip(word, lemma))
#print(word_lemma_dict)

def _read_lines(path):
    """Return the lines of the UTF-8 text file at *path*; the file is closed afterwards."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read().split('\n')


# rules.txt: space-separated fields; field 0 maps to field 2.
rules = _read_lines("/content/drive/MyDrive/NLPassn1/rules.txt")
for i in rules:
    # Reading stops at the first empty line.
    if i == '':
        break
    rule = i.split(' ')
    word_lemma_dict[rule[0]] = rule[2]
    lemma.append(rule[2])

# rules2.txt: space-separated fields; field 0 maps to field 3.
rules2 = _read_lines("/content/drive/MyDrive/NLPassn1/rules2.txt")
for i in rules2:
    if i == '':
        break
    rule = i.split(' ')
    word_lemma_dict[rule[0]] = rule[3]
    lemma.append(rule[3])

# lemma.txt: colon-separated; the first field maps to the last field.
rules3 = _read_lines("/content/drive/MyDrive/NLPassn1/lemma.txt")
for i in rules3:
    # Skip blank lines (such as the one after a trailing newline) so that no
    # empty form/lemma entry is added.
    if i == '':
        continue
    rule = i.split(':')
    word_lemma_dict[rule[0]] = rule[-1]
    lemma.append(rule[-1])

# Sets give constant-time membership tests in the lemmatizer.
word = set(word)
lemma = set(lemma)

# Suffixes tried by the lemmatizer, in this order. Every entry must be followed
# by a comma: two adjacent string literals are silently joined into one.
suffixes =["ो", "े", "ू", "ु", "ी", "ि", "ा","तृ","ान","ैत","ने","ाऊ","ाव","कर", "ाओ", "िए", "ाई", "ाए", "नी", "ना", "ते", "ीं", "ती","ता", "ाँ", "ां", "ों", "ें","ीय", "ति","या", "पन", "पा","ित","ीन","लु","यत","वट","लू", "ेरा","त्व","नीय","ौनी","ौवल","ौती","ौता","ापा","वास","हास","काल","पान","न्त","ौना","सार","पोश","नाक","ियल","ैया", "ौटी","ावा","ाहट","िया","हार", "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "वान", "बीन","ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं","कला","िमा","कार","गार", "दान","खोर", "ावास","कलाप","हारा","तव्य","वैया", "वाला", "ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां","त्वा","तव्य","कल्प","िष्ठ","जादा","क्कड़", "ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां", "अक्कड़","तव्य:","निष्ठ","ो","े","ू","ु","ी","ि","ा","कर","ाओ","िए","ाई","ाए","ने","नी","ना","ते","ीं","ती","ता","ाँ","ां","ों","ें","ाकर","ाइए","ाईं","ाया","ेगी","ेगा","ोगी","ोगे","ाने","ाना","ाते","ाती","ाता","तीं","ाओं","ाएं","ुओं","ुएं","ुआं","ाएगी","ाएगा","ाओगी","ाओगे","एंगी","ेंगी","एंगे","ेंगे","ूंगी","ूंगा","ातीं","नाओं","नाएं","ताओं","ताएं","ियाँ","ियों","ियां","ाएंगी","ाएंगे","ाऊंगी","ाऊंगा","ाइयाँ","ाइयों","ाइयां"]

def _lemma_by_suffix(form, known_lemmas, suffix_list):
    """Guess the lemma of an unseen *form* by stripping or replacing a suffix."""
    # Candidate stems, one per matching suffix, in suffix-list order.
    stems = [
        form[:len(form) - len(suffix)]
        for suffix in suffix_list
        if form.endswith(suffix)
    ]

    # 1) A stem that is itself a known lemma.
    for stem in stems:
        if stem and stem in known_lemmas:
            return stem
    # 2) A stem that becomes a known lemma once another suffix is attached.
    for stem in stems:
        for suffix in suffix_list:
            if stem + suffix in known_lemmas:
                return stem + suffix
    # 3) Fallback: the stem left by the first matching suffix, or the form
    #    itself when no suffix matches (or stripping would leave nothing).
    if stems and stems[0]:
        return stems[0]
    return form


def words_lemmas(words, known_words=None, known_lemmas=None,
                 lemma_lookup=None, suffix_list=None):
    """Map every form in *words* to a lemma.

    Lookup order for each form: (1) the form->lemma dictionary when the form
    is a known word, (2) the form itself when it is a known lemma, (3) suffix
    stripping / replacement against the known lemmas.

    Args:
        words: Iterable of surface forms.
        known_words: Container of forms that have a dictionary entry;
            defaults to the module-level ``word``.
        known_lemmas: Container of known lemmas; defaults to the module-level
            ``lemma``.
        lemma_lookup: Mapping form -> lemma; defaults to the module-level
            ``word_lemma_dict``.
        suffix_list: Suffixes to try, in order; defaults to the module-level
            ``suffixes``.

    Returns:
        Dict mapping each distinct form to its lemma.
    """
    # The module-level tables are read only when no override is passed.
    if known_words is None:
        known_words = word
    if known_lemmas is None:
        known_lemmas = lemma
    if lemma_lookup is None:
        lemma_lookup = word_lemma_dict
    if suffix_list is None:
        suffix_list = suffixes

    result = {}
    for form in words:
        if form in known_words:
            result[form] = lemma_lookup[form]
        elif form in known_lemmas:
            result[form] = form
        else:
            result[form] = _lemma_by_suffix(form, known_lemmas, suffix_list)
    return result

# Read the test treebank; the with-block closes the handle, and the handle has
# its own name so the imported `conllu` module is not shadowed.
with open('/content/drive/MyDrive/NLPassn1/hi_hdtb-ud-test.conllu','r', encoding="utf-8") as test_file:
    annotations = test_file.read()
sentences = parse(annotations)

# Gold forms and lemmas of the test set, in corpus order.
word2 = [token['form'] for sentence in sentences for token in sentence]
lemma2 = [token['lemma'] for sentence in sentences for token in sentence]
# form -> gold lemma; when a form occurs several times its last lemma wins.
word2_lemma2_dict = dict(zip(word2, lemma2))

# Predicted lemma for every test form.
store = words_lemmas(word2)
print(len(store.keys()),len(word2_lemma2_dict.keys()))

# Share of distinct test forms whose predicted lemma equals the gold lemma.
total = len(word2_lemma2_dict)
correct = sum(1 for form, gold in word2_lemma2_dict.items() if store[form] == gold)
print("Accuracy:", correct / total)