In [1]:
!pip install scikit-learn



In [2]:
! pip install pyiwn

Collecting pyiwn
  Downloading pyiwn-0.0.5-py3-none-any.whl (12 kB)
Installing collected packages: pyiwn
Successfully installed pyiwn-0.0.5


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

This is the portion where we are doing sentiment analysis based on the hindi sentiwordnet. It tests against a data set of positive and negative movie review data set which contains around 500 positive hindi reviews and negative reviews. And then at the end their scores are showcased

In [56]:
# This module is written to do a Resource Based Semantic analyasis using hindi sentiwordnet.
import pandas as pd
import codecs
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, f1_score
import re

data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/HindiSentiWordnet.txt", delimiter=' ')

fields = ['POS_TAG', 'ID', 'POS', 'NEG', 'LIST_OF_WORDS']

#Creating a dictionary which contain a tuple for every word. Tuple contains a list of synonyms,
# positive score and negative score for that word.
words_dict = {}
for i in data.index:
    words = data[fields[4]][i].split(',')
    for word in words:
        words_dict[word] = (data[fields[0]][i], data[fields[2]][i], data[fields[3]][i])
print(len(words_dict))
#This function determines sentiment of text.
def sentiment(text):
    words = word_tokenize(text)
    votes = []
    pos_polarity = 0
    neg_polarity = 0
    #adverbs, nouns, adjective, verb are only used
    allowed_words = ['a','v','r','n']
    for word in words:
        if word in words_dict:
            #if word in dictionary, it picks up the positive and negative score of the word
            pos_tag, pos, neg = words_dict[word]
            # print(word, pos_tag, pos, neg)
            if pos_tag in allowed_words:
                if pos > neg:
                    pos_polarity += pos
                    votes.append(1)
                elif neg > pos:
                    neg_polarity += neg
                    votes.append(0)
    #calculating the no. of positive and negative words in total in a review to give class labels
    pos_votes = votes.count(1)
    neg_votes = votes.count(0)
    if pos_votes > neg_votes:
        return 1
    elif neg_votes > pos_votes:
        return 0
    else:
        if pos_polarity < neg_polarity:
            return 0
        else:
            return 1


pred_y = []
actual_y = []
# to calculate accuracy
pos_reviews = codecs.open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/pos_hindi.txt", "r", encoding='utf-8', errors='ignore').read()
for line in pos_reviews.split('$'):
    data = line.strip('\n')
    if data:
        pred_y.append(sentiment(data))
        actual_y.append(1)
#print(accuracy_score(actual_y, pred_y) * 100)
# print(len(actual_y))
neg_reviews = codecs.open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/neg_hindi.txt", "r", encoding='utf-8', errors='ignore').read()
for line in neg_reviews.split('$'):
    data=line.strip('\n')
    if data:
        pred_y.append(sentiment(data))
        actual_y.append(0)
# print(len(actual_y))
print(accuracy_score(actual_y, pred_y) * 100)
print('F-measure:  ',f1_score(actual_y,pred_y))


#print(sentiment("मैं इस उत्पाद से बहुत खुश हूँ  यह आराम दायक और सुन्दर है  यह खरीदने लायक है "))

9805
53.5140562248996
F-measure:   0.5270684371807968


In this approach we are trying to create hindi language lexicons from manual seeding process where we have take 50 odd hindi words and have associated sentiment values to each word. It's either 0 for +ve and 1 for -ve and 2 for neutral. Then we are using pyiwn: A Python-based API to access Indian Language WordNets to access the hindi wordnets and assigning the same sentiments to its synonyms as well. If it's already present then we are adding on top of those words and then normalizing it so that sum of the values ranges between 0 and 1

In [None]:
import pyiwn
initial_seed_list = [
                     ("बहादुर",0),("उपयोगी",0),("प्रसन्न",0),("अच्छा",0),("मुलायम",0),("ईमानदार",0),("अद्भुत",0),("प्रतिभाशाली",0),("बुद्धिमान",0),("प्यार",0),("दयालु",0),("बलवान",0),("ताज़ा",0),("आकर्षक",0),("गजब",0),("उत्तम",0),
                     ("पुराना",2),("निजी",2),("तटस्थ",2),("गाँव",2),("लंबा",2),("सरल",2),("भारतीय",2),("अधिक",2),("पिछला",2),("विदेशी",2),("काली",2),("उदारवादी",2),("साहसिक",2),("सामान्य",2),("खोज",2),
                     ("धोखा",1),("दोष",1),("उल्लू",1),("अजीब",1),("शोर",1),("दुखी",1),("निष्प्राण",1),("उदास",1),("मायावी",1),("परेशान",1),("दुर्लभ",1),("बेताब",1),("उथला",1),("अविश्वसनीय",1),("हिंसा",1),("शैतान",1)
                     ]
temporary_seed_list = initial_seed_list.copy()
final_seed_list = {}
wordnet = pyiwn.IndoWordNet()
while temporary_seed_list :
  word, polarity = temporary_seed_list.pop(0)
  if word not in final_seed_list :
    final_seed_list[word] = [0, 0, 0]
    final_seed_list[word][polarity] += 1
#Get the synonyms from the wordnet for word
  synonym_set = []
  for w in wordnet.synsets(word) :#, pos=pyiwn.PosTag.ADJECTIVE) :
    synonym_set += w.lemma_names()[:10]
  synonym_set = list(set(synonym_set))
  for synonym in synonym_set :
    if synonym != word :
      if synonym not in final_seed_list :
        temporary_seed_list.append((synonym, polarity))
        final_seed_list[synonym] = [0, 0, 0]
      final_seed_list[synonym] = [(final_seed_list[word][0] + final_seed_list[synonym][0]), (final_seed_list[word][1] + final_seed_list[synonym][1]), (final_seed_list[word][2] + final_seed_list[synonym][2])]
for word in final_seed_list.keys() :
  total = sum(final_seed_list[word])
  final_seed_list[word] = [final_seed_list[word][i] / total for i in range(3)]

print("Total number of entries in the newely formed hindi senti-wordnet : ", len(final_seed_list.keys()))

[██████████████████████████████████████████████████]
Total number of entries in the newely formed hindi senti-wordnet :  36268


In [None]:
def sentiment_hindi(text):
    words = word_tokenize(text)
    votes = []
    pos_polarity = 0
    neg_polarity = 0
    for word in words:
        if word in final_seed_list:
            #if word in dictionary, it picks up the positive and negative score of the word
            pos, neg, obj = final_seed_list[word]
            # print(word, pos_tag, pos, neg)
            if float(pos) > float(neg):
                pos_polarity += pos
                votes.append(1)
            elif float(neg) > float(pos):
                neg_polarity += neg
                votes.append(0)
    #calculating the no. of positive and negative words in total in a review to give class labels
    pos_votes = votes.count(1)
    neg_votes = votes.count(0)
    if pos_votes > neg_votes:
        return 1
    elif neg_votes > pos_votes:
        return 0
    else:
        if pos_polarity < neg_polarity:
            return 0
        else:
            return 1


Here,we are validating the prediction ability with the new sentiwordnet that got generated by the manual seeding process.

In [None]:
pred_y = []
actual_y = []
# to calculate accuracy
pos_reviews = codecs.open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/pos_hindi.txt", "r", encoding='utf-8', errors='ignore').read()
for line in pos_reviews.split('$'):
    data = line.strip('\n')
    if data:
        pred_y.append(sentiment_hindi(data))
        actual_y.append(1)
#print(accuracy_score(actual_y, pred_y) * 100)
neg_reviews = codecs.open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/neg_hindi.txt", "r", encoding='utf-8', errors='ignore').read()
for line in neg_reviews.split('$'):
    data=line.strip('\n')
    if data:
        pred_y.append(sentiment_hindi(data))
        actual_y.append(0)
print(accuracy_score(actual_y, pred_y) * 100)
print('F-measure:  ',f1_score(actual_y,pred_y))

51.00401606425703
F-measure:   0.672922252010724


Here, we are trying to leverage the large sentiwordnet for English language for hindi sentiment analysis. We will translate entire set of positive and negative movie reviews in English and try to predict it's sentiment. This methodology can be implemented even if we are trying to do it for single sentence.

From here I will try to translate the entire hindi movie sentences in english

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M",src_lang="hin_Deva")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [None]:
import codecs
import re
pos_reviews = codecs.open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/pos_hindi.txt", "r", encoding='utf-8', errors='ignore').read()
hindi_review_lines = pos_reviews.split('$')
refined_hindi_lines = []
refined_sentences = [re.sub(r'[a-zA-Z\n]', '', sentence) for sentence in hindi_review_lines]
refined_sentences = [sentence for sentence in refined_sentences if sentence.strip()]
Reviews_in_English = []
for line in refined_sentences:
  inputs = tokenizer(line, return_tensors="pt")
  translated_tokens = model.generate(
    **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=200
  )
  temp = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
  print(temp)
  Reviews_in_English.append(temp)





the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder` this attribute will be removed in `transformers` v4.38


Mickey Virus is a film entirely by Manish Paul and the scenes and situations were created in the film according to his image.
Manish has done his duty well.
He has expressed the fear, fears, struggles, determination and victory of Shahid in a very clear way.
They seem natural in every sense of the word.
The casting of the film is tremendous.
The roles of Shahid's brothers, mother and wife have also been chosen for suitable actors.
The courtroom scenes of the film are real.
Akshay Kumar's unbridled and unconcerned style fascinates
He has a deep confidence in front of the camera.
They also have some very sharp dialogues.
They show their talent even in limited scenes.
Faraj Haider has humorously portrayed the generations of tension and warfare that followed the partition between India and Pakistan with a sense of peace and security.
The idea of Farrah Haider is commendable
The movie songs.
One of the great attractions of the film is the combination of Chulbul (Rishi Kapoor) and Bulbul (Ni

In [None]:
# Open the file in write mode
with open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/Eng_pos_hindi.txt", "w", encoding="utf-8") as file:
    # Write each item in the list to the file
    for item in Reviews_in_English:
        file.write(item + '\n')

In [None]:
len(Reviews_in_English)

502

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M",src_lang="hin_Deva")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [None]:
import codecs
import re
pos_reviews = codecs.open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/neg_hindi.txt", "r", encoding='utf-8', errors='ignore').read()
hindi_review_lines = pos_reviews.split('$')
refined_hindi_lines = []
refined_sentences = [re.sub(r'[a-zA-Z\n]', '', sentence) for sentence in hindi_review_lines]
refined_sentences = [sentence for sentence in refined_sentences if sentence.strip()]
Reviews_in_English = []
for line in refined_sentences:
  inputs = tokenizer(line, return_tensors="pt")
  translated_tokens = model.generate(
    **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=200
  )
  temp = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
  print(temp)
  Reviews_in_English.append(temp)





Not only did the film set the expectations, but it also made me regret that the good stuff had gone away.
He has a lot of acting, but a good actor can get bored if the character is stereotypical.
Eli Abram is hired to add glamour to the film.
The other characters are very common.
The context and circumstances have changed, but there has been little change in thinking and understanding.
There may also be technical quality deficiencies.
There is laughter in the film, but laughter is dominated by violence.
It's better not to see this movie for children.
He is bruised, bruised, flogged, bruised, and his head is swollen.
The biggest problem with this film is the idea of the script.
The entertainment cycle is broken when you watch a movie.
The movie is disappointing after all.
The song of Shadha starring Kamal Haasan and Sridevi would have given the audience a sense of relief if it had been in the film.
Somewhere reason and reason are ignored.
The film doesn 't even fit in with the songs and

In [None]:
# Open the file in write mode
with open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/Eng_neg_hindi.txt", "w", encoding="utf-8") as file:
    # Write each item in the list to the file
    for item in Reviews_in_English:
        file.write(item + '\n')

Here we will define a new function to compare the sentiments of new english sentences

In [30]:
import csv
English_Sentiment_Dict={}
sign =0
with open('/content/drive/MyDrive/Colab Notebooks/CS689 Project/Trying to Create new words/English_sentiwordnet_word_sentiment.csv', newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
      if sign==1: #Just to avoid the first row
        English_Sentiment_Dict[row[0]]=(float(row[1]),float(row[2]),float(row[3]))
      sign =1


In [14]:
def sentiment_english(text):
    words = word_tokenize(text)
    votes = []
    pos_polarity = 0
    neg_polarity = 0
    for word in words:
        if word in English_Sentiment_Dict:
            #if word in dictionary, it picks up the positive and negative score of the word
            pos, neg, obj = English_Sentiment_Dict[word]
            # print(word, pos_tag, pos, neg)
            if float(pos) > float(neg):
                pos_polarity += pos
                votes.append(1)
            elif float(neg) > float(pos):
                neg_polarity += neg
                votes.append(0)
    #calculating the no. of positive and negative words in total in a review to give class labels
    pos_votes = votes.count(1)
    neg_votes = votes.count(0)
    if pos_votes > neg_votes:
        return 1
    elif neg_votes > pos_votes:
        return 0
    else:
        if pos_polarity < neg_polarity:
            return 0
        else:
            return 1


In [54]:
import codecs
from sklearn.metrics import accuracy_score, f1_score
pred_y = []
actual_y = []
# to calculate accuracy
pos_reviews = codecs.open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/Eng_pos_hindi.txt", "r", encoding='utf-8', errors='ignore').read()
for line in pos_reviews.split('\n'):
    data = line.strip('\n')
    if data:
        pred_y.append(sentiment_english(data))
        actual_y.append(1)
#print(accuracy_score(actual_y, pred_y) * 100)
neg_reviews = codecs.open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/Eng_neg_hindi.txt", "r", encoding='utf-8', errors='ignore').read()
for line in neg_reviews.split('\n'):
    data=line.strip('\n')
    if data:
        pred_y.append(sentiment_english(data))
        actual_y.append(0)
print(accuracy_score(actual_y, pred_y) * 100)
print('F-measure:  ',f1_score(actual_y,pred_y))

63.417085427135675
F-measure:   0.6856649395509499


Another Method of calculating the sentiments of the sentences

In [51]:
import nltk
nltk.download('sentiwordnet')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def get_sentiment(word):
    sentiment = {'pos': 0, 'neg': 0}
    synsets = swn.senti_synsets(word)
    for syn in synsets:
        sentiment['pos'] += syn.pos_score()
        sentiment['neg'] += syn.neg_score()
    return sentiment

def analyze_sentiment(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text.lower())
    tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    sentiment = {'pos': 0, 'neg': 0}
    for token in tokens:
        token_sentiment = get_sentiment(token)
        sentiment['pos'] += token_sentiment['pos']
        sentiment['neg'] += token_sentiment['neg']

    total = sentiment['pos'] + sentiment['neg']
    if total == 0:
        return 0 # Neutral
    else:
        if(((sentiment['pos'] - sentiment['neg']) / total)>0):
          return 1
        else:
          return 0

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
import codecs
from sklearn.metrics import accuracy_score, f1_score
pred_y = []
actual_y = []
with open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/Eng_pos_hindi.txt", "r", encoding='utf-8', errors='ignore') as pos_reviews:
  for line in pos_reviews:
      data = line.strip('\n')
      if data:
          pred_y.append(analyze_sentiment(data))
          actual_y.append(1)
# #print(accuracy_score(actual_y, pred_y) * 100)
with open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/Eng_neg_hindi.txt", "r", encoding='utf-8', errors='ignore') as neg_reviews:
  for line in neg_reviews:
      data=line.strip('\n')
      if data:
          pred_y.append(analyze_sentiment(data))
          actual_y.append(0)
print(accuracy_score(actual_y, pred_y) * 100)
print('F-measure:  ',f1_score(actual_y,pred_y))

64.82412060301507
F-measure:   0.7078464106844742


Now, Trying it out with online English to Hindi Dictionary Sabdanjali

In [13]:
import re
count =0
with open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/shabdanjali.utf8","r", encoding='utf-8', errors='ignore') as file:
  for line in file:
    count+=1
    if(count==100):
      break
    print(line)

"a","Art","1.एक"

I bought a pen.



"aback","Adv","1.पीछे/हतप्रभ"

I was somewhat taken aback by his rudeness.



"abacus","N","1.गिनतारा"

Japanese prefer to use abacus for calculations.



"abandon","V","1.छोड़_देना"

A baby abandoned by its parents was found here.



"abandoned","Adj","1.छोड़ा_हुआ"

Abandoned children are kept in orphanages.



"abandonment","N","1.परित्याग"

They have a fear of abandonment in old age.



"abase","V","1.अवमानित_करना"

A policeman is abasing a good businessman.



"abashed","Adj","1.लज्जित"

His teacher's criticism left him feeling rather abashed.



"abashedly","Adv","1.लज्जित_रूप_से"

He was looking about abashedly among his literary contemporaries.



"abate","V","1.कम_होना"

Students' interest in studies seems to have abated.



"abatement","N","1.कमी"

There seems to be an abatement of interest in studies among students.



"abattoir","N","1.वधशाला"

The animals were taken to an abattoir for slaughter.



"abbess","N","1.मठाध्यक्षा"

Mother Teres

In [17]:
allowed_postags = ['"Adj"', '"Adv"']
translated = {}
with open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/shabdanjali.utf8","r", encoding='utf-8', errors='ignore') as file:
  for line in file:
    if len(line) == 0:
        break
    elif line.startswith('"'):
        words = line.split(",")
        # print(words)#Sample ['"undergarment"', '"N"', '"1.अन्दर_के_कपडे"\n']
        prevword = words[0][1:(len(words[0])-1)]
        # print(prevword) #output :- undergarment
        prev_pos_tag = words[1]
        # print(prev_pos_tag) #output :-"N"
        if (words[1] in allowed_postags):
          word_h = re.split('"|/|\d+.',words[2])
          #  print(word_h) #Sample output ['', '', 'चंचल_व्यवहार', '\n']
          for i in word_h:
            if len(i) != 0 and i != "\n":
              word_hindi = re.split('\{|\[',i)
              word_hindi_1 = word_hindi[0].split("_") #"चंचल" "व्यवहार"
              s_word = " ".join(word_hindi_1) #चंचल व्यवहार
              if s_word != " ":
                if s_word in translated:
                  translated[s_word].append(prevword)
                else:
                  translated[s_word] = [prevword]
    elif line[0] == "-" and (prev_pos_tag in allowed_postags): #--"2.क्षणिक_विराम"
      word_h = re.split('"|/|\d+.',line[2:])
      for i in word_h:
        if len(i) != 0 and i != "\n":
          word_hindi = re.split('\{|\[',i)
          # print(word_hindi) #Example ['इधर_उधर']
          word_hindi_1 = word_hindi[0].split("_") # "चंचल" "व्यवहार"
          s_word = " ".join(word_hindi_1) # चंचल व्यवहार
          if s_word != " ":
            if s_word in translated:
              translated[s_word].append(prevword)
            else:
              translated[s_word] = [prevword]

In [21]:
translated

{'पीछे': ['aback', 'after'],
 'हतप्रभ': ['aback'],
 'छोड़ा हुआ': ['abandoned'],
 'लज्जित': ['abashed', 'ashamed'],
 'लज्जित रूप से': ['abashedly'],
 'उदर सम्बन्धी': ['abdominal'],
 'उदर सम्बन्धी रूप से': ['abdominally'],
 'असामान्य': ['aberrant',
  'abnormal',
  'fancy',
  'peculiar',
  'uncommon',
  'unusual'],
 'असामान्य रूप से': ['aberrantly', 'abnormally', 'extraordinarily'],
 'घृणित': ['abhorrent',
  'abominable',
  'accursed',
  'contemptible',
  'despicable',
  'hateful',
  'heinous',
  'repellent',
  'vile'],
 'घृणित रूप से': ['abhorrently', 'abominablely', 'accursedly'],
 'स्थाई': ['abiding'],
 'अति': ['abject', 'abjectly'],
 'बहुत ज्यादा': ['abject', 'dearly', 'numerous', 'plentifully'],
 'बहुत ज्यादा रूप से': ['abjectly'],
 'जला हुआ': ['ablaze'],
 'समर्थ': ['able', 'capable', 'competent', 'strong'],
 'चतुर': ['able', 'astute', 'clever', 'devious'],
 'समर्थ रूप से': ['ably'],
 'सवार': ['aboard'],
 'आदिम': ['aboriginal', 'primal', 'pristine'],
 'आदिम रूप से': ['aboriginally'],


In [22]:
import nltk
nltk.download('sentiwordnet')
nltk.download('wordnet')
from nltk.corpus import sentiwordnet as swn
import numpy as np

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [24]:
import csv
English_Sentiment_Dict={}
sign =0
with open('/content/drive/MyDrive/Colab Notebooks/CS689 Project/Trying to Create new words/English_sentiwordnet_word_sentiment.csv', newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
      if sign==1: #Just to avoid the first row
        English_Sentiment_Dict[row[0]]=(float(row[1]),float(row[2]),float(row[3]))
      sign =1

In [33]:
hindi_senti_wordnet_sabdanjali_dict = {}
for hindi_word in translated:
  pos_score =0
  neg_score=0
  obj_score =0
  for words in translated[hindi_word]:
    # print(hindi_word,words)
    try:
      pos_score += English_Sentiment_Dict[words][0]
      neg_score += English_Sentiment_Dict[words][1]
      obj_score += English_Sentiment_Dict[words][2]
      total = pos_score + neg_score + obj_score
      if total != 0:
        pos_score= pos_score/total
        neg_score= neg_score/total
        obj_score= obj_score/total
    except:
      continue
  hindi_senti_wordnet_sabdanjali_dict[hindi_word] = (pos_score,neg_score,obj_score)

In [34]:
len(hindi_senti_wordnet_sabdanjali_dict)

6157

In [35]:
hindi_senti_wordnet_sabdanjali_dict["गोपनीय"]

(0.2422180628902879, 0.15235138068089782, 0.6054305564288143)

In [37]:
stopwords =[]
with open('/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/hindi_stopwords.txt',"r", encoding='utf-8', errors='ignore') as file:
  for lines in file:
    lines = lines.strip()
    stopwords.append(lines)

In [39]:
"ये" in stopwords

True

In [41]:
def sentiment_hindi_sabdanjali(text):
    words = word_tokenize(text)
    votes = []
    pos_polarity = 0
    neg_polarity = 0
    for word in words:
      if word not in stopwords:
        if word in hindi_senti_wordnet_sabdanjali_dict:
            #if word in dictionary, it picks up the positive and negative score of the word
            pos, neg, obj = hindi_senti_wordnet_sabdanjali_dict[word]
            # print(word, pos_tag, pos, neg)
            if float(pos) > float(neg):
                pos_polarity += pos
                votes.append(1)
            elif float(neg) > float(pos):
                neg_polarity += neg
                votes.append(0)
    #calculating the no. of positive and negative words in total in a review to give class labels
    pos_votes = votes.count(1)
    neg_votes = votes.count(0)
    if pos_votes > neg_votes:
        return 1
    elif neg_votes > pos_votes:
        return 0
    else:
        if pos_polarity < neg_polarity:
            return 0
        else:
            return 1


In [44]:
import codecs
from sklearn.metrics import accuracy_score, f1_score
from nltk.tokenize import word_tokenize
pred_y = []
actual_y = []
# to calculate accuracy
pos_reviews = codecs.open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/pos_hindi.txt", "r", encoding='utf-8', errors='ignore').read()
for line in pos_reviews.split('$'):
    data = line.strip('\n')
    if data:
        pred_y.append(sentiment_hindi_sabdanjali(data))
        actual_y.append(1)
#print(accuracy_score(actual_y, pred_y) * 100)
neg_reviews = codecs.open("/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/neg_hindi.txt", "r", encoding='utf-8', errors='ignore').read()
for line in neg_reviews.split('$'):
    data=line.strip('\n')
    if data:
        pred_y.append(sentiment_hindi_sabdanjali(data))
        actual_y.append(0)
print(accuracy_score(actual_y, pred_y) * 100)
print('F-measure:  ',f1_score(actual_y,pred_y))

58.333333333333336
F-measure:   0.6872645064054257


In [46]:
import csv
with open('/content/drive/MyDrive/Colab Notebooks/CS689 Project/ResourceBasedClassifier Github/Sabdanjali_Created_Hindi_Sentiments.csv', mode='w', newline='') as file:
  writer = csv.writer(file)
  writer.writerow(['Word', 'Pos_Score', 'Neg_Score', 'Obj_Score'])
  for word, values in hindi_senti_wordnet_sabdanjali_dict.items():
    writer.writerow([word] + list(values))