# Importing packages

In [1]:
import os
import io
import sklearn
import nltk
import re
import string
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# DATA PRE PROCESSING

In [2]:
df_cleaned = pd.read_csv("lyrics_1051_cleaned.csv", index_col=[0], encoding='utf-8')

In [3]:
df_cleaned = df_cleaned.reset_index(drop=True)

In [4]:
df_cleaned = df_cleaned.drop('X', 1)

  df_cleaned = df_cleaned.drop('X', 1)


In [5]:
temp = df_cleaned

In [6]:
temp["lyrics"]=temp["lyrics"].apply(str)

In [7]:
temp = temp.dropna().reset_index(drop=True)

In [8]:
temp.shape

(1051, 6)

## Cleaning the Hindi lyrics

In [9]:
temp["hin_cleaned"] = temp["lyrics"].map(lambda x: re.sub(r'\\xa\d', '', x))
temp["hin_cleaned"] = temp["hin_cleaned"].map(lambda x: re.sub(r'[\.]+', '', x))
temp["hin_cleaned"] = temp["hin_cleaned"].replace(r'\s*x\s*[0-9]', ' ', regex=True)
temp["hin_cleaned"] = temp["hin_cleaned"].replace(r'\s+', ' ', regex=True)
temp["hin_cleaned"] = temp["hin_cleaned"].replace(r'[,]+', ' ', regex=True)
temp["hin_cleaned"] = temp["hin_cleaned"].replace(r'[?]+', ' ', regex=True)
temp["hin_cleaned"] = temp["hin_cleaned"].replace(r'\s+x\s+[०१२३४५६७८९]', ' ', regex=True)
temp["hin_cleaned"] = temp["hin_cleaned"].replace(r'[)()]', ' ', regex=True)
temp["hin_cleaned"] = temp["hin_cleaned"].apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))
temp["hin_cleaned"] = temp["hin_cleaned"].map(lambda x: re.sub(r'\d*', '', x))
temp["hin_cleaned"] = temp["hin_cleaned"].replace(r'\t+', '', regex=True)
temp["hin_cleaned"] = temp["hin_cleaned"].replace(r'[’]+', '', regex=True)
temp["hin_cleaned"] = temp["hin_cleaned"].replace(r'[“]+', '', regex=True) 
temp["hin_cleaned"] = temp["hin_cleaned"].replace(r'[”]+', '', regex=True)
temp["hin_cleaned"] = temp["hin_cleaned"].replace(r'[A-Za-z]+', '', regex=True)

## Cleaning translated english lyrics

In [10]:
temp["eng_cleaned"]=temp["eng_cleaned"].apply(str)

In [11]:
temp["eng_cleaned"] = temp["eng_cleaned"].map(lambda x: re.sub(r'\\xa\d', '', x))
temp["eng_cleaned"] = temp["eng_cleaned"].map(lambda x: re.sub(r'[\.]+', '.', x))
temp["eng_cleaned"] = temp["eng_cleaned"].replace(r'\s*X\s*[0-9]', ' ', regex=True)
temp["eng_cleaned"] = temp["eng_cleaned"].replace(r'\s*x\s*[0-9]', ' ', regex=True)
temp["eng_cleaned"] = temp["eng_cleaned"].replace(r'\s+', ' ', regex=True)
temp["eng_cleaned"] = temp["eng_cleaned"].apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))
temp["eng_cleaned"] = temp["eng_cleaned"].map(lambda x: re.sub(r'\d*', '', x))
temp["eng_cleaned"] = temp["eng_cleaned"].replace(r'\t+', '', regex=True)
temp["eng_cleaned"] = temp["eng_cleaned"].replace(r'[’]+', '', regex=True)
temp["eng_cleaned"] = temp["eng_cleaned"].replace(r'[“]+', '', regex=True) 
temp["eng_cleaned"] = temp["eng_cleaned"].replace(r'[”]+', '', regex=True)
temp["eng_cleaned"] = temp["eng_cleaned"].replace(r'\\n', '', regex=True)
temp["eng_cleaned"] = temp["eng_cleaned"].replace(r'[)()]', '', regex=True)
temp["eng_cleaned"] = temp["eng_cleaned"].replace(r'ॐ', '', regex=True)

In [12]:
temp.drop(temp[temp['Title'] == "tu-jo-mila-reprise-hindi"].index, inplace=True)    # manually found out and removed the duplicate

In [13]:
temp.drop(temp[temp['Title'] == "ramleela-movie"].index, inplace=True)    # manually found out that it did not contain song lyrics but movie dialogs and hence removed this row

## Write final dataset to csv for using in other models

In [14]:
temp.to_csv("lyrics_1051_cleaned.csv", encoding="utf-8")

# Hindi SentiwordNet approach

The development of the below model is referenced from the github source https://github.com/shubham721/Sentiment-Analysis-On-Hindi-Reviews

## Removing hindi stop words

In [15]:
stopwords_hindi = open('final_stopwords.txt','r').read().splitlines()

In [19]:
def remove_stopwords(text):
  '''
  this method removes the stop words in a given text using the stopwords_hindi specified.
  '''
  return " ".join([word for word in text.split() if word not in stopwords_hindi])

In [20]:
temp['hin_cleaned'] = temp.hin_cleaned.apply(remove_stopwords)

In [21]:
temp['hin_cleaned']

0       लोगो दुनिया हर कदम इंसा गलत सही समझ के कहु गलत...
1       छल्ली दे सीना दाग दे सारी गोलियाँ नी अज्ज जी क...
2       पिया तू काहें रूठा रे निगोड़ा जग झूठा रे पिया त...
3       आरम्भ प्रचण्ड बोले मस्तकों के झुंड ज़ंग घड़ी ग...
4       सर उठेगा धड़ कटेगा छिपेगा बचेगा पथ पथ घाट मौत क...
                              ...                        
1046    दिन मोहब्बत ओढ़ दिन गली मोड़ तेरी हथेली परलिखूं ...
1047    ज़िन्दगी सफ़र कैसा सफ़र समझा नहीं जाना नहीं ज़...
1048    ज़िन्दगी के सफ़र गुज़र मकाम वो नहीं आते वो नही...
1049    ज़िन्दगी मौत ना बन जाए संभालो यारों खो चैनओअमन...
1050    गुनगुनाती हवाएं गुनगुनाती गगन गा सार आलम ज़ूबी...
Name: hin_cleaned, Length: 1050, dtype: object

### Load the Hindi-SentiWordNet and implement Sentiment Analysis

In [22]:
data = pd.read_csv("HindiSentiWordnet.txt", delimiter=' ')

In [23]:
fields = ['POS_TAG', 'ID', 'POS', 'NEG', 'LIST_OF_WORDS']

In [24]:
words_dict = {}
for i in data.index:
    words = data[fields[4]][i].split(',')
    for word in words:
        words_dict[word] = (data[fields[0]][i], data[fields[2]][i], data[fields[3]][i])

In [25]:
words_dict

{'अनौपचारिक': ('a', 0.0, 0.0),
 'मृत': ('a', 0.0, 0.75),
 'परवर्ती': ('a', 0.125, 0.0),
 'अच्छा': ('a', 0.75, 0.0),
 'बढ़िया': ('a', 0.625, 0.0),
 'सौभाग्यशाली': ('a', 0.875, 0.0),
 'खुशकिस्मत': ('a', 0.875, 0.0),
 'खुशनसीब': ('a', 0.875, 0.0),
 'तक़दीर_वाला': ('a', 0.875, 0.0),
 'नसीब_वाला': ('a', 0.875, 0.0),
 'भाग्यवान': ('a', 0.875, 0.0),
 'भाग्यशाली': ('a', 0.875, 0.0),
 'ख़ुशक़िस्मत': ('a', 0.875, 0.0),
 'ख़ुशनसीब': ('a', 0.875, 0.0),
 'दुर्भाग्यशाली': ('a', 0.0, 1.0),
 'अभागा': ('a', 0.0, 1.0),
 'बदनसीब': ('a', 0.0, 1.0),
 'भाग्यहीन': ('a', 0.0, 1.0),
 'मनहूस': ('a', 0.0, 1.0),
 'बदकिस्मत': ('a', 0.0, 1.0),
 'मंदभाग्य': ('a', 0.0, 1.0),
 'बदक़िस्मत': ('a', 0.0, 1.0),
 'दईमारा': ('a', 0.0, 1.0),
 'कमबख्त': ('a', 0.0, 1.0),
 'कमबख़्त': ('a', 0.0, 1.0),
 'अधन्य': ('a', 0.0, 1.0),
 'अभागी': ('a', 0.0, 1.0),
 'आवासहीन': ('a', 0.0, 0.875),
 'आश्रयहीन': ('a', 0.125, 0.375),
 'गृहहीन': ('a', 0.0, 0.875),
 'गृहविहीन': ('a', 0.0, 0.875),
 'बेघर': ('a', 0.0, 0.875),
 'बेघरबार': ('a', 0.0, 

In [26]:
#function to find the sentiment of the input text parameter
def sentiment(text):
    words = word_tokenize(text)
    votes = []
    pos_polarity = 0
    neg_polarity = 0
    allowed_words = ['a','v','r','n']
    for word in words:
        if word in words_dict:
            #if word in dictionary, it picks up the positive and negative score of the word
            pos_tag, pos, neg = words_dict[word]
            if pos_tag in allowed_words:
                if pos > neg:
                    pos_polarity += pos
                    votes.append(1)
                elif neg > pos:
                    neg_polarity += neg
                    votes.append(0)
    #calculating the no. of positive and negative words in total
    pos_votes = votes.count(1)
    neg_votes = votes.count(0)
    if pos_votes > neg_votes:
        return 1
    elif neg_votes > pos_votes:
        return -1
    else:
        if pos_polarity < neg_polarity:
            return -1
        elif pos_polarity > neg_polarity:
            return 1
        else:
            return 1

In [27]:
temp["HSWN_sentiment"] = temp["hin_cleaned"].apply(lambda x: sentiment(x))

In [28]:
temp['HSWN_sentiment'].value_counts()

 1    732
-1    318
Name: HSWN_sentiment, dtype: int64

In [29]:
temp['polarity'].value_counts()

 1    587
-1    463
Name: polarity, dtype: int64

# Finding accuracy

In [30]:
accuracy = accuracy_score(temp["HSWN_sentiment"], temp["polarity"])

In [31]:
accuracy

0.5361904761904762