# Hindi Text Analysis 

In [1]:
# Import necessary libraries
import re
import string

import numpy as np
import pandas as pd

In [2]:
# Load the songs workbook into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_excel('../data/top_decade_songs_translated.xlsx')
# Trailing expression: the notebook displays the (rows, columns) tuple as the cell output.
df.shape

(70, 26)

### Define the constants 

In [3]:
# Preview the first rows of the DataFrame to inspect the columns and lyric fields.
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,key,acousticness,danceability,...,valence,time_signature,mode,playlist_id,year,decade,hindi_lyrics,english_lyrics,language,english_translated_lyrics
0,59HjNZgoziKgAwGOhrKRPJ,Tere Mere Sapne Ab Ek Rang Hain,Guide,S. D. Burman,12/6/1965,0.000261,47,3,0.978,0.246,...,0.552,3,1,37i9dQZF1DXa1eCiO3E6Rr,1965,1960,तेरे मेरे सपने\nअब्ब एक रंग हैं\nजहां भी ले जा...,Tere mere sapne\nAbb ek rang hain\nJaha bhee l...,hi,Yours is my dream\nAbb is a color\nWherever yo...
1,1P278K5LuPJOatR1wBUywC,Aane Se Uske Aaye Bahar,Jeene Ki Raah,Laxmikant Pyarelal,1/1/1969,0.000248,51,6,0.886,0.37,...,0.668,4,0,3dKv6gpADy34FI6rcP7DAT,1969,1960,आने से उस के आये बहार\nजाने से उस के जाए बहार\...,Aane se us ke aaye bahar\nJaane se us ke jaaye...,hi,"From his coming, he came out.\nGo out of it.\n..."
2,7ukboFFuDuxKWRdxahmth7,Beqarar Karke Hamen Yun Na Jaiye,Bees Saal Baad,Hemant Kumar,1/1/1962,0.00019,51,10,0.971,0.653,...,0.757,4,0,3dKv6gpADy34FI6rcP7DAT,1962,1960,बेक़रार करके हमें यूँ न जाइये\nआपको हमारी कसम ल...,Beqaraar karake hame yun na jaaiye\nAapako ham...,hi,Don't let us go like that.\nI swear to you bac...
3,6xCnMMPkIIhn3QyGJgd5xd,Ae Mere Zohra Jabeen,Waqt,Ravi,7/28/1965,0.000235,52,6,0.984,0.441,...,0.514,3,0,3dKv6gpADy34FI6rcP7DAT,1965,1960,ऐ मेरी जोहरा जबीं\nतुझे मालुम नहीं\nतू अभी तक ...,Ai meri zoharaa zabi\nTujhe maalum nahi\nTu ab...,hi,O my zohra jabeen\nYou don't know\nYou're stil...
4,1AlatlIkROgLvG6pgBBnAz,Roop Tera Mastana,Aradhana,S. D. Burman,9/27/1969,0.000225,55,8,0.666,0.449,...,0.543,4,0,3dKv6gpADy34FI6rcP7DAT,1969,1960,रूप तेरा मस्ताना\nरूप तेरा मस्ताना\nभूल कोई हम...,"Rup teraa mastaanaa, pyaar meraa divaanaa\nRup...",hi,form your mastana\nform your mastana\nLet no o...


In [None]:
# Text cleaning constants and function

# Hindi stop words (includes common spelling variants).
_STOPWORDS_HI = [
    'तुम', 'मेरी', 'मुझे', 'क्योंकि', 'हम', 'प्रति', 'अबकी', 'आगे', 'माननीय', 'शहर',
    'बताएं', 'कौनसी', 'क्लिक', 'किसकी', 'बड़े', 'मैं', 'and', 'रही', 'आज', 'लें',
    'आपके', 'मिलकर', 'सब', 'मेरे', 'जी', 'श्री', 'वैसा', 'आपका', 'अंदर', 'अत',
    'अपना', 'अपनी', 'अपने', 'अभी', 'आदि', 'आप', 'इत्यादि', 'इन', 'इनका', 'इन्हीं',
    'इन्हें', 'इन्हों', 'इस', 'इसका', 'इसकी', 'इसके', 'इसमें', 'इसी', 'इसे', 'उन',
    'उनका', 'उनकी', 'उनके', 'उनको', 'उन्हीं', 'उन्हें', 'उन्हों', 'उस', 'उसके', 'उसी',
    'उसे', 'एक', 'एवं', 'एस', 'ऐसे', 'और', 'कई', 'कर', 'करता', 'करते',
    'करना', 'करने', 'करें', 'कहते', 'कहा', 'का', 'काफ़ी', 'कि', 'कितना', 'किन्हें',
    'किन्हों', 'किया', 'किर', 'किस', 'किसी', 'किसे', 'की', 'कुछ', 'कुल', 'के',
    'को', 'कोई', 'कौन', 'कौनसा', 'गया', 'घर', 'जब', 'जहाँ', 'जा', 'जितना',
    'जिन', 'जिन्हें', 'जिन्हों', 'जिस', 'जिसे', 'जीधर', 'जैसा', 'जैसे', 'जो', 'तक',
    'तब', 'तरह', 'तिन', 'तिन्हें', 'तिन्हों', 'तिस', 'तिसे', 'तो', 'था', 'थी',
    'थे', 'दबारा', 'दिया', 'दुसरा', 'दूसरे', 'दो', 'द्वारा', 'न', 'नहीं', 'ना',
    'निहायत', 'नीचे', 'ने', 'पर', 'पर', 'पहले', 'पूरा', 'पे', 'फिर', 'बनी',
    'बही', 'बहुत', 'बाद', 'बाला', 'बिलकुल', 'भी', 'भीतर', 'मगर', 'मानो', 'मे',
    'में', 'यदि', 'यह', 'यहाँ', 'यही', 'या', 'यिह', 'ये', 'रखें', 'रहा',
    'रहे', 'ऱ्वासा', 'लिए', 'लिये', 'लेकिन', 'व', 'वर्ग', 'वह', 'वह', 'वहाँ',
    'वहीं', 'वाले', 'वुह', 'वे', 'वग़ैरह', 'संग', 'सकता', 'सकते', 'सबसे', 'सभी',
    'साथ', 'साबुत', 'साभ', 'सारा', 'से', 'सो', 'ही', 'हुआ', 'हुई', 'हुए',
    'है', 'हैं', 'हो', 'होता', 'होती', 'होते', 'होना', 'होने', 'अपनि', 'जेसे',
    'होति', 'सभि', 'तिंहों', 'इंहों', 'दवारा', 'इसि', 'किंहें', 'थि', 'उंहों', 'ओर',
    'जिंहें', 'वहिं', 'अभि', 'बनि', 'हि', 'उंहिं', 'उंहें', 'हें', 'वगेरह', 'एसे',
    'रवासा', 'कोन', 'निचे', 'काफि', 'उसि', 'पुरा', 'भितर', 'हे', 'बहि', 'वहां',
    'कोइ', 'यहां', 'जिंहों', 'तिंहें', 'किसि', 'कइ', 'यहि', 'इंहिं', 'जिधर', 'इंहें',
    'अदि', 'इतयादि', 'हुइ', 'कोनसा', 'इसकि', 'दुसरे', 'जहां', 'अप', 'किंहों', 'उनकि',
    'भि', 'वरग', 'हुअ', 'जेसा', 'नहिं',
]

# English stop words (lower-case; matching is case-sensitive).
_STOPWORDS_EN = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
    "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
    'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
    'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
    'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
    'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o',
    're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
    'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
    "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
]

# Stand-alone punctuation / residue tokens to drop. The backslash entry is
# escaped here; written as '\' it swallowed the closing quote and made the
# whole cell a syntax error.
_PUNCTUATIONS = [
    'nn', 'n', '।', '/', '`', '+', '\\', '"', '?', '▁(', '$', '@', '[', '_',
    "'", '!', ',', ':', '^', '|', ']', '=', '%', '&', '.', ')', '(', '#',
    '*', '', ';', '-', '}', '|', '"',
]

# A frozenset gives O(1) membership tests and is built once, not on every call.
_TO_BE_REMOVED = frozenset(_STOPWORDS_HI + _PUNCTUATIONS + _STOPWORDS_EN)

# string.punctuation is ASCII-only, so the Devanagari danda and double danda
# are handled explicitly: they become spaces (they separate sentences), while
# ASCII punctuation is deleted outright.
_PUNCT_TABLE = str.maketrans('।॥', '  ', string.punctuation)


def clean_hindi_text(text):
    """
    Clean Hindi (Devanagari) text for analysis.

    Steps, in order:
      1. remove digits (``\\d`` matches any Unicode decimal digit),
      2. turn newlines and tabs into spaces so adjacent lines are not
         glued into a single word,
      3. strip ASCII punctuation and the danda / double danda,
      4. drop stop words and stray punctuation tokens,
      5. lemmatise the remaining tokens with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN for a missing cell)
        yields an empty string instead of raising.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Missing cells come through as float NaN / None; there is nothing to clean.
    if not isinstance(text, str):
        return ''

    # Remove numbers
    text = re.sub(r'\d', '', text)

    # Replace newlines and tabs with a space (an empty replacement would
    # merge the last word of one line with the first word of the next).
    text = re.sub(r'[\n\t]+', ' ', text).strip()

    # Remove punctuation
    text = text.translate(_PUNCT_TABLE)

    # Remove stop words and lemmatise the data.
    # NOTE(review): WordNetLemmatizer is not imported in the visible cells; it
    # is presumably nltk.stem.WordNetLemmatizer (which needs the WordNet
    # corpus downloaded) - confirm it is imported before this cell runs.
    # NOTE(review): WordNet is an English resource, so Devanagari tokens are
    # presumably returned unchanged - verify this step is wanted.
    lemmatizer = WordNetLemmatizer()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in _TO_BE_REMOVED
    ]

    return ' '.join(tokens)
    