# Lyric Data Analysis

Decade-wise segregation and text analysis of the song lyric data.

In [1]:
# Import the necessary modules
from pathlib import Path

import numpy as np
import pandas as pd

In [10]:
# Read the translated top-songs workbook into `df`, then show its (rows, columns) shape
df = pd.read_excel(io='../data/top_decade_songs_translated.xlsx')
df.shape

(70, 26)

In [11]:
# Preview the first rows of the loaded song data
df.head()

Unnamed: 0,track_id,name,album,artist,release_date,length,popularity,key,acousticness,danceability,...,valence,time_signature,mode,playlist_id,year,decade,hindi_lyrics,english_lyrics,language,english_translated_lyrics
0,59HjNZgoziKgAwGOhrKRPJ,Tere Mere Sapne Ab Ek Rang Hain,Guide,S. D. Burman,12/6/1965,0.000261,47,3,0.978,0.246,...,0.552,3,1,37i9dQZF1DXa1eCiO3E6Rr,1965,1960,तेरे मेरे सपने\nअब्ब एक रंग हैं\nजहां भी ले जा...,Tere mere sapne\nAbb ek rang hain\nJaha bhee l...,hi,Yours is my dream\nAbb is a color\nWherever yo...
1,1P278K5LuPJOatR1wBUywC,Aane Se Uske Aaye Bahar,Jeene Ki Raah,Laxmikant Pyarelal,1/1/1969,0.000248,51,6,0.886,0.37,...,0.668,4,0,3dKv6gpADy34FI6rcP7DAT,1969,1960,आने से उस के आये बहार\nजाने से उस के जाए बहार\...,Aane se us ke aaye bahar\nJaane se us ke jaaye...,hi,"From his coming, he came out.\nGo out of it.\n..."
2,7ukboFFuDuxKWRdxahmth7,Beqarar Karke Hamen Yun Na Jaiye,Bees Saal Baad,Hemant Kumar,1/1/1962,0.00019,51,10,0.971,0.653,...,0.757,4,0,3dKv6gpADy34FI6rcP7DAT,1962,1960,बेक़रार करके हमें यूँ न जाइये\nआपको हमारी कसम ल...,Beqaraar karake hame yun na jaaiye\nAapako ham...,hi,Don't let us go like that.\nI swear to you bac...
3,6xCnMMPkIIhn3QyGJgd5xd,Ae Mere Zohra Jabeen,Waqt,Ravi,7/28/1965,0.000235,52,6,0.984,0.441,...,0.514,3,0,3dKv6gpADy34FI6rcP7DAT,1965,1960,ऐ मेरी जोहरा जबीं\nतुझे मालुम नहीं\nतू अभी तक ...,Ai meri zoharaa zabi\nTujhe maalum nahi\nTu ab...,hi,O my zohra jabeen\nYou don't know\nYou're stil...
4,1AlatlIkROgLvG6pgBBnAz,Roop Tera Mastana,Aradhana,S. D. Burman,9/27/1969,0.000225,55,8,0.666,0.449,...,0.543,4,0,3dKv6gpADy34FI6rcP7DAT,1969,1960,रूप तेरा मस्ताना\nरूप तेरा मस्ताना\nभूल कोई हम...,"Rup teraa mastaanaa, pyaar meraa divaanaa\nRup...",hi,form your mastana\nform your mastana\nLet no o...


In [12]:
# Add a per-song word count: the number of whitespace-separated tokens in the
# translated lyrics.
# The vectorised .str accessor is used instead of apply(len(x.split())) so that a
# song with missing lyrics yields NaN (and is skipped by later means) rather than
# raising AttributeError on a float NaN.
df['word_count'] = (
    df['english_translated_lyrics']
    .str.split()
    .str.len()
)

In [13]:
# Average number of words per song in each decade
(
    df.groupby('decade')['word_count']
    .agg('mean')
)

decade
1960    193.5
1970    191.9
1980    209.8
1990    250.9
2000    264.2
2010    189.7
2020    235.5
Name: word_count, dtype: float64

In [14]:
# Merge every song's translated lyrics into one space-separated string per decade.
# NOTE: a later cell reassigns `decade_lyrics` using the `english_lyrics` column,
# so this translated version is only available until that cell runs.
decade_lyrics = pd.DataFrame(
    df.groupby('decade', as_index=False)['english_translated_lyrics'].apply(' '.join)
)
decade_lyrics

Unnamed: 0,decade,english_translated_lyrics
0,1960,Yours is my dream\nAbb is a color\nWherever yo...
1,1970,I'm not a shayar\nBut hey laughed.\nEver since...
2,1980,Janu my life\nI am your sacrifice\nJanu my lif...
3,1990,even if you say nothing\nI listened to...\ncho...
4,2000,Aa aa.. Come on... Aa aa.. Aa aa.. The second ...
5,2010,I am this Sahib ji\nLet's go all the g\nStill ...
6,2020,"When you walk the way, you turn the way\nWhen ..."


In [20]:
# Merge every song's romanized lyrics (`english_lyrics` column) into one
# space-separated string per decade. This reuses the name `decade_lyrics`,
# and it is this frame that the cells below save to disk.
decade_lyrics = pd.DataFrame(
    df.groupby('decade', as_index=False)['english_lyrics'].apply(' '.join)
)
decade_lyrics

Unnamed: 0,decade,english_lyrics
0,1960,Tere mere sapne\nAbb ek rang hain\nJaha bhee l...
1,1970,Main shaayar to nahin\nMagar ai hanseen\nJab s...
2,1980,Jaanu meri jaan\nMain tere qurbaan\nJaanu meri...
3,1990,"Chaahe tum kuch na kaho,\nMaine sun liya…ke sa..."
4,2000,Aa aa..aa aa…aa aa..aa aa.. aa..\n\n\nNa hai y...
5,2010,Mann yeh saahib ji\nJaane hai sab ji\nPhir bhi...
6,2020,Jab Chalte Chalte Raah Mude\nJab Jugnu Muthi K...


In [24]:
# Persist the per-decade lyrics as CSV; the row index is not written to the file
decade_lyrics.to_csv(
    path_or_buf='../data/hinglish_lyrics_by_decade.csv',
    index=False,
)

In [22]:
# Create one UTF-8 text file per decade containing that decade's combined lyrics
lyrics_dir = Path('../data/lyrics')
# open(..., 'w') does not create missing parent folders, so make sure the target exists
lyrics_dir.mkdir(parents=True, exist_ok=True)

# Pair the decade with the lyrics text positionally (column 0 is `decade`, column 1
# holds the joined lyrics) so the loop no longer mixes label-based `[i]` lookups
# with positional `.iloc[i, 1]`, which only agree when the index is 0..n-1.
for decade, lyrics in zip(decade_lyrics['decade'], decade_lyrics.iloc[:, 1]):
    with open(lyrics_dir / f"hinglish_lyrics_from_{decade}.txt", 'w', encoding="utf-8") as f:
        f.write(lyrics)

In [23]:
# Sanity check: number of songs in each decade
df['decade'].value_counts()

1960    10
1970    10
1980    10
1990    10
2000    10
2010    10
2020    10
Name: decade, dtype: int64

## Lyrics Text Analysis

In [None]:
# Text-cleaning helper (stub: the body below contains only a docstring)
def clean_data(text: str): 
    """Clean the text by removing stopwords.

    NOTE(review): no implementation is present yet — the function body is just
    this docstring, so calling it currently returns None.

    Args:
        text (str): A text to be cleaned. 
    """

In [None]:
# Display the per-decade lyrics frame built above
decade_lyrics