In [None]:
# Loading all the required libraries:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as py
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)


Reading the dataset and viewing top five rows:

In [None]:
# Read the Top 500 Songs CSV with an explicit ISO-8859-2 encoding,
# then preview the first five rows.
csv_path = '../input/500-greatest-songs-of-all-time/Top 500 Songs.csv'
df = pd.read_csv(csv_path, encoding='ISO-8859-2')
df.head()

So here we have data for the best songs of all time. The dataset contains 8 columns, and all of them hold categorical data. Judging by the name of the dataset, I expect it to have 500 rows. Still, let me confirm it:

In [None]:
# Display the (rows, columns) tuple of the loaded frame.
df.shape

Now, before exploring I'll check if there's any missing value in the dataset:

# Data Cleaning:

I'll first check which variables have missing values:

In [None]:
# Collect the names of the object-dtype columns that contain at least one
# missing value, then print the resulting list.
categorical_nan = []
for feature in df.columns:
    column = df[feature]
    has_missing = column.isna().sum() > 0
    if has_missing and column.dtypes == 'O':
        categorical_nan.append(feature)
print(categorical_nan)

Okay, so only the 'streak' column has missing values. I'm filling these missing values with "Empty":

In [None]:
# Replace every missing entry in the columns listed in categorical_nan with
# the placeholder string 'Empty'.
for column_name in categorical_nan:
    filled = df[column_name].fillna('Empty')
    df[column_name] = filled

But before moving forward, I need to verify if the missing values got filled or not:

In [None]:
# Per-column count of missing values still present in the columns listed in
# categorical_nan.
df[categorical_nan].isna().sum()

Okay now I'm good to explore the dataset:

# Data Exploration: 

This dataset looks very interesting. I'll try to explore it as much as I can:

I'll start with what this dataset is about. Plotting a pie-chart for the "greatest songs" using the "**title**" column:

In [None]:
# Pie chart of the 20 most frequent song titles.
# value_counts() already orders by descending frequency, so head(20) keeps the
# most frequent entries; sorting ascending first would instead select the 20
# LEAST frequent ones. No `y=` is passed because it is ignored for a Series pie.
color = plt.cm.RdPu(np.linspace(0, 1, 20))  # 20 evenly spaced shades, one per slice
top_titles = df["title"].value_counts().head(20)
ax = top_titles.plot.pie(colors=color, autopct="%0.1f%%")
ax.set_title("Greatest Songs Ever")
ax.axis("off")
plt.show()

Similarly, the greatest artists:

In [None]:
# Pie chart of the 20 artists with the most songs in the dataset.
# value_counts() already orders by descending frequency, so head(20) keeps the
# most frequent artists; sorting ascending first would instead select the 20
# LEAST frequent ones. No `y=` is passed because it is ignored for a Series pie.
color = plt.cm.YlGnBu(np.linspace(0, 1, 20))  # 20 evenly spaced shades, one per slice
top_artists = df["artist"].value_counts().head(20)
ax = top_artists.plot.pie(colors=color, autopct="%0.1f%%")
ax.set_title("Greatest Artists Ever")
ax.axis("off")
plt.show()

Greatest writers:

In [None]:
# Pie chart of the 20 most frequent entries in the "writers" column.
# value_counts() already orders by descending frequency, so head(20) keeps the
# most frequent writers; sorting ascending first would instead select the 20
# LEAST frequent ones. No `y=` is passed because it is ignored for a Series pie.
color = plt.cm.RdGy(np.linspace(0, 1, 20))  # 20 evenly spaced shades, one per slice
top_writers = df["writers"].value_counts().head(20)
ax = top_writers.plot.pie(colors=color, autopct="%0.1f%%")
ax.set_title("Greatest Song Writers Ever")
ax.axis("off")
plt.show()

Greatest streaks in the dataset:

In [None]:
# Pie chart of the 20 most frequent values in the "streak" column.
# value_counts() already orders by descending frequency, so head(20) keeps the
# most frequent streaks; sorting ascending first would instead select the 20
# LEAST frequent ones. No `y=` is passed because it is ignored for a Series pie.
# NOTE: if missing streaks were replaced by a placeholder string earlier, that
# placeholder is counted like any other value and may appear as a slice.
color = plt.cm.RdYlBu(np.linspace(0, 1, 20))  # 20 evenly spaced shades, one per slice
top_streaks = df["streak"].value_counts().head(20)
ax = top_streaks.plot.pie(colors=color, autopct="%0.1f%%")
ax.set_title("Greatest Streak")
ax.axis("off")
plt.show()

Greatest producers of all time:

In [None]:
# Pie chart of the 20 most frequent entries in the "producer" column.
# value_counts() already orders by descending frequency, so head(20) keeps the
# most frequent producers; sorting ascending first would instead select the 20
# LEAST frequent ones. No `y=` is passed because it is ignored for a Series pie.
color = plt.cm.OrRd(np.linspace(0, 1, 20))  # 20 evenly spaced shades, one per slice
top_producers = df["producer"].value_counts().head(20)
ax = top_producers.plot.pie(colors=color, autopct="%0.1f%%")
ax.set_title("Greatest Song Producer")
ax.axis("off")
plt.show()

By date released:

In [None]:
# Pie chart of the 20 most frequent values in the "released" column.
# value_counts() already orders by descending frequency, so head(20) keeps the
# most frequent release entries; sorting ascending first would instead select
# the 20 LEAST frequent ones. No `y=` is passed because it is ignored for a
# Series pie.
color = plt.cm.BuGn(np.linspace(0, 1, 20))  # 20 evenly spaced shades, one per slice
top_released = df["released"].value_counts().head(20)
ax = top_released.plot.pie(colors=color, autopct="%0.1f%%")
ax.set_title("Greatest songs of their period")
ax.axis("off")
plt.show()

Now, I'll draw a separate word cloud for each of the '**title**', '**artist**', '**writers**' & '**producer**' columns:

In [None]:
# Draw one word cloud per text column on a 2x2 grid.
text_cols = ['title', 'artist', 'writers', 'producer']

# A single WordCloud instance is reused for every column; '|' is added to the
# default stop words and the random state is fixed for a reproducible layout.
wc = WordCloud(stopwords=set(list(STOPWORDS) + ['|']), random_state=42)
fig, axes = plt.subplots(2, 2, figsize=(20, 12))
axes = axes.ravel()  # flatten the 2x2 grid so it can be paired with text_cols

for ax, c in zip(axes, text_cols):
    # Join the actual cell values of the column. str(df[c]) would only give the
    # truncated Series repr (a few head/tail rows plus "Name"/"dtype" metadata),
    # so most rows would be missing from the cloud.
    column_text = " ".join(df[c].astype(str))
    ax.imshow(wc.generate(column_text))
    ax.set_title(c.upper(), fontsize=24)
    ax.axis('off')

This looks good! Now I'll see what words are **most common** in the **artists** list using WordCloud:

In [None]:
# Word cloud built from every value of the "artist" column joined into one string.
text = " ".join(str(each) for each in df.artist)
# Create and generate a word cloud image:
wordcloud = WordCloud(max_words=200, colormap='GnBu', background_color="black").generate(text)
# Use exactly one figure: creating additional figures before or after drawing
# leaves an empty figure in the cell output.
plt.figure(figsize=(15, 10))
# Display the generated image:
plt.imshow(wordcloud, interpolation='Bilinear')
plt.axis("off")
plt.show()

I want to check Elton John's popularity, through the charts:

In [None]:
# Keep only the rows whose artist is exactly 'Elton John', renumber them from
# zero, and show up to the first 20.
is_elton = df['artist'].eq('Elton John')
elton = df.loc[is_elton].reset_index(drop=True)
elton.head(20)

So Elton John came 17 times on the streak. Now I'd like to check Jimi Hendrix's popularity too:

In [None]:
# Keep only the rows whose artist is exactly 'The Jimi Hendrix Experience',
# renumber them from zero, and show up to the first 20.
is_jimi = df['artist'].eq('The Jimi Hendrix Experience')
jimi = df.loc[is_jimi].reset_index(drop=True)
jimi.head(20)

Finally, let me check Elvis Presley's popularity before moving forward:

In [None]:
# Keep only the rows whose artist is exactly 'Elvis Presley', renumber them
# from zero, and show up to the first 20.
is_elvis = df['artist'].eq('Elvis Presley')
elvis = df.loc[is_elvis].reset_index(drop=True)
elvis.head(20)

Woah! Elvis was 11 times on the streak and not only that, he was 9 times No. 1 on the charts! He was/is truly the greatest!

Now let me check the "**most common writers**" in the dataset:

In [None]:
# Word cloud built from every value of the "writers" column joined into one string.
text = " ".join(str(each) for each in df.writers)
# Create and generate a word cloud image:
wordcloud = WordCloud(max_words=200, colormap='GnBu', background_color="black").generate(text)
# Use exactly one figure: creating additional figures before or after drawing
# leaves an empty figure in the cell output.
plt.figure(figsize=(15, 10))
# Display the generated image:
plt.imshow(wordcloud, interpolation='Bilinear')
plt.axis("off")
plt.show()

Likewise, the **most common artists** on the list are:

In [None]:
# Word cloud built from every value of the "artist" column joined into one
# string, drawn with the 'Set3' colormap.
text = " ".join(str(each) for each in df.artist)
# Create and generate a word cloud image:
wordcloud = WordCloud(max_words=200, colormap='Set3', background_color="black").generate(text)
# Use exactly one figure: creating additional figures before or after drawing
# leaves an empty figure in the cell output.
plt.figure(figsize=(15, 10))
# Display the generated image:
plt.imshow(wordcloud, interpolation='Bilinear')
plt.axis("off")
plt.show()

Now I'll see when & where the songs were released:

In [None]:
# Word cloud built from every value of the "released" column joined into one string.
text = " ".join(str(each) for each in df.released)
# Create and generate a word cloud image:
wordcloud = WordCloud(max_words=200, colormap='GnBu', background_color="black").generate(text)
# Use exactly one figure: creating additional figures before or after drawing
# leaves an empty figure in the cell output.
plt.figure(figsize=(15, 10))
# Display the generated image:
plt.imshow(wordcloud, interpolation='Bilinear')
plt.axis("off")
plt.show()

Let me check the longest streak ever:

In [None]:
# Keep only the rows whose streak value is exactly '30 weeks; No. 1', renumber
# them from zero, and show up to the first 10.
is_longest_streak = df['streak'].eq('30 weeks; No. 1')
longest = df.loc[is_longest_streak].reset_index(drop=True)
longest.head(10)

Awesome! Dr Dre is my favorite producer :D

So All Shook Up by Elvis and In Da Club by 50 Cent were loved for a really long time (for 30 weeks these 2 songs remained Number 1). Now that's something!

So, as I've got a lot of interesting information from this dataset I think I should stop here now. Thank you for your time. Regards.
* Rachit Shukla