In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import linear_model  # will be using for plotting trend line
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import date
from datetime import datetime

%matplotlib inline
from collections import Counter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, entry) for entry in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
# Load the track-level dataset; `data` keeps the raw frame, `df` is the working copy.
data = pd.read_csv("../input/spotify-dataset-19212020-160k-tracks/data_o.csv")
df = data.copy()
# Print the (rows, columns) shape and the column labels, then preview five random rows.
for overview in (df.shape, df.columns):
    print(overview)
df.sample(5)

In [None]:
# Inspect column dtypes and non-null counts to spot missing values before
# deciding which columns to drop.
df.info()

In [None]:
# Pearson correlation between the numeric features.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently inside corr() and raises instead.
corr = df.select_dtypes(include=['number', 'bool']).corr(method='pearson')
plt.figure(figsize=(14,8))
sns.heatmap(corr, annot=True)
plt.title('Correlation')
plt.show()


In [None]:
# Remove the features judged to have a weak relationship with popularity.
# Reassigning (instead of inplace=True) keeps the step explicit.
df = df.drop(columns=['valence', 'explicit', 'key',
                      'liveness', 'loudness', 'mode'])

# Re-plot the correlation matrix for the remaining features.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently inside corr() and raises instead.
corr = df.select_dtypes(include=['number', 'bool']).corr(method='pearson')
plt.figure(figsize=(14,8))
sns.heatmap(corr, annot=True)
plt.title('Correlation')
plt.show()

In [None]:
# Parse release_date into datetimes, order the tracks chronologically and
# use the date as the index.
df.release_date = pd.to_datetime(df.release_date)
# sort_values returns a new frame; without assigning it back the frame
# stays in its original order.
df = df.sort_values(by='release_date').set_index('release_date')
df.head(5)

In [None]:
# Show the last five rows of the date-indexed frame.
df.tail(5)

In [None]:
# Express track length in whole minutes (rounded) instead of milliseconds so
# it is easier to read. Vectorised instead of a per-row apply.
df['duration_min'] = (df['duration_ms'] / 60000).round().astype(int)
df = df.drop(columns='duration_ms')

# Per-year mean of every numeric feature. numeric_only=True skips any
# non-numeric columns, which pandas >= 2.0 refuses to average.
year_avg = df.groupby('year').mean(numeric_only=True)
year_avg.head(6)

In [None]:
# Summary statistics of the per-year feature averages.
year_avg.describe()

In [None]:
# tempo and popularity are drawn on their own chart because their value
# range is much larger than that of the other features.
plt.figure(figsize=(14,8))
plt.title('Change over the years for tempo and popularity')

tempo_pop = ['tempo','popularity']
for feature in tempo_pop:
    # label= ties each legend entry to its own line instead of relying on
    # the order in which the lines were drawn.
    ax = sns.lineplot(x='year', y=feature, data=year_avg, label=feature)

plt.ylabel("value")
plt.legend()
plt.show()

In [None]:
# Yearly average track duration (in minutes) on its own chart.
plt.figure(figsize=(14,8))
plt.title('Change over the years for duration')
sns.lineplot(x='year', y='duration_min', data=year_avg, label='duration_min')

plt.ylabel("value")
# The label is set on the line itself: passing the bare string
# 'duration_min' to plt.legend() is iterated character by character and
# labels the line just "d".
plt.legend()
plt.show()

In [None]:
# Audio features whose yearly averages are plotted together.
against_year = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'speechiness',
]

In [None]:
# Plot the yearly average of each remaining audio feature on one shared chart.
plt.figure(figsize=(14,8))
for feature in against_year:
    ax = sns.lineplot(data=year_avg, x='year', y=feature)

plt.title('Change over the years')
plt.ylabel("value")
plt.legend(against_year)

In [None]:
# Scatter every candidate feature against popularity, with a fitted
# linear trend line, to see which features move with popularity.
x_axis = ['acousticness', 'danceability',
        'energy','instrumentalness', 'speechiness', 'duration_min' , 'tempo']

# Size the grid from the feature list so no feature is left out. A
# hard-coded 2x3 grid only has room for six of the seven features.
n_cols = 3
n_rows = -(-len(x_axis) // n_cols)  # ceiling division
plt.figure(0, figsize=(24, 5 * n_rows))

# The target is the same for every panel, so build it once.
y = df["popularity"].values.reshape(-1,1)

for z, feature in enumerate(x_axis):
    i, j = divmod(z, n_cols)  # grid row / column of this panel
    # scikit-learn expects a 2-D feature array
    x = df[feature].values.reshape(-1,1)
    # linear model
    regr = linear_model.LinearRegression()
    regr.fit(x, y)
    # sub-plot
    ax = plt.subplot2grid((n_rows, n_cols), (i, j))
    ax.scatter(x, y, c='purple')
    # adding trend line
    ax.plot(x, regr.predict(x), color="red", linewidth=2, linestyle='--')
    # adding title
    ax.title.set_text(f'{feature} vs popularity')
plt.tight_layout()
plt.show()

# Genre Based Analysis

In [None]:
# Load the per-genre table and show its summary statistics.
genres_csv_path = "../input/spotify-dataset-19212020-160k-tracks/data_by_genres_o.csv"
genre_df = pd.read_csv(genres_csv_path)
genre_df.describe()

In [None]:
# Preview the first seven rows of the genre table.
genre_df.head(7)

In [None]:
# Rank genres from most to least popular and keep the first ten rows.
genres_by_popularity = genre_df.sort_values(by='popularity', ascending=False)
top_10 = genres_by_popularity.iloc[:10]

In [None]:
# Popularity of the ten most popular genres.
plt.figure(figsize=(16,8))
# x and y must be passed by keyword: seaborn >= 0.12 rejects them as
# positional arguments.
sns.lineplot(x='genres', y='popularity', color='maroon', data=top_10)
plt.title('Genre vs Popularity')
plt.xlabel('Genres', fontsize = 15)
plt.ylabel('Popularity', fontsize=15)
plt.show()

In [None]:
# Collect every individual word used across the genre names so the most
# repeated words can be counted afterwards.
# All names are first joined into one space-separated string, which is then
# split back into single words.
all_genre_names = " ".join(genre_df['genres'].tolist())
genre_names_text = all_genre_names.split(" ")

In [None]:
# Tally every genre word and keep the 30 most frequent as (word, count) rows.
column_names = ["word","count"]
top_word_counts = Counter(genre_names_text).most_common(30)
most_common_words_in_genres_df = pd.DataFrame(top_word_counts, columns=column_names)
most_common_words_in_genres_df  # the 30 most repeated words across genre names

In [None]:
# Horizontal bars showing how often each of the most common genre words occurs.
sns.barplot(x='count' , y ='word', data=most_common_words_in_genres_df.sort_values('count'))
plt.title('Most Common Names In Genres')
plt.ylabel('Name Of Main Genre')
plt.xlabel('Frequency')
# plt.show must be called; the bare attribute only echoes the function object.
plt.show()

In [None]:
# For each of the big genre words, find the ten most popular genres whose
# name contains that word and compare them in a 2x2 grid of bar charts.

def _top_genres_containing(keyword, n=10):
    """Return the n most popular rows of genre_df whose genre name contains keyword.

    Rows are sorted by ascending popularity, so the most popular genre ends
    up as the top bar of a horizontal bar chart.
    """
    # Literal substring match; na=False treats missing names as non-matching.
    mask = genre_df['genres'].str.contains(keyword, regex=False, na=False)
    return genre_df[mask].sort_values('popularity').tail(n)


pop_df = _top_genres_containing('pop')
indie_df = _top_genres_containing('indie')
rock_df = _top_genres_containing('rock')
metal_df = _top_genres_containing('metal')

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16,10))

panels = [
    (pop_df, 'pop', 'cyan'),
    (indie_df, 'indie', 'purple'),
    (rock_df, 'rock', 'blue'),
    (metal_df, 'metal', 'red'),
]
for ax, (subset, keyword, colour) in zip(axs.flat, panels):
    ax.barh(y=subset['genres'], width=subset['popularity'], color=colour)
    # Title each panel so the four charts can be told apart.
    ax.set_title(f"Most popular '{keyword}' genres")
    ax.set_xlabel('Popularity')

fig.tight_layout(pad=1)
# plt.show() instead of fig.show(): the latter warns on the non-GUI inline backend.
plt.show()
