In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files_here in os.walk('/kaggle/input'):
    for fname in files_here:
        full_path = os.path.join(folder, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import json
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from scipy import stats

## Data description as given by the author
### tracks.csv
Primary:
- id (Id of track generated by Spotify)
Numerical:
- acousticness (Ranges from 0 to 1)
- danceability (Ranges from 0 to 1)
- energy (Ranges from 0 to 1)
- duration_ms (Integer typically ranging from 200k to 300k)
- instrumentalness (Ranges from 0 to 1)
- valence (Ranges from 0 to 1)
- popularity (Ranges from 0 to 100)
- tempo (Float typically ranging from 50 to 150)
- liveness (Ranges from 0 to 1)
- loudness (Float typically ranging from -60 to 0)
- speechiness (Ranges from 0 to 1)
Dummy:
- mode (0 = Minor, 1 = Major)
- explicit (0 = No explicit content, 1 = Explicit content)
Categorical:
- key (All keys on octave encoded as values ranging from 0 to 11, starting on C as 0, C# as 1 and so on…)
- time_signature (The predicted time signature, most typically 4)
- artists (List of artists mentioned)
- id_artists (Ids of mentioned artists)
- release_date (Date of release mostly in yyyy-mm-dd format, however precision of date may vary)
- name (Name of the song)
### artists.csv
- id (Id of artist)
- name (Name of artist)
- followers (Total number of followers of artist)
- popularity (Popularity of given artist based on all his/her tracks)
- genres (Genres associated with this artist)

### dict_artists.json
{
"any": [
"first",
"second",
"third",
…,
"nth"
],
"blank": [],
"first": [
"any",
"third",
"second"
],
…
}

In [None]:
# Directory that holds the dataset files read by the cells below.
path = '/kaggle/input/spotify-dataset-19212020-160k-tracks/'

In [None]:
# Read artists.csv from the dataset directory and preview its first three rows.
artists = pd.read_csv(
    os.path.join(path, 'artists.csv')
)
artists.head(n=3)

In [None]:
# Read tracks.csv from the dataset directory and preview its first three rows.
tracks = pd.read_csv(
    os.path.join(path, 'tracks.csv')
)
tracks.head(n=3)

In [None]:
# Parse dict_artists.json into a Python object.
# The context manager closes the file handle as soon as parsing finishes
# (the bare open() call used before left it open for the kernel's lifetime).
with open(os.path.join(path, 'dict_artists.json')) as dict_artists:
    artists_json = json.load(dict_artists)

In [None]:
# Display the value stored under one key of the loaded JSON mapping
# (presumably an artist id — confirm against dict_artists.json).
artists_json['0DvvojCMIqsOT1Btiwvq1h']

In [None]:
def removepunc(string):
    """Return ``string`` with every '[', ']' and single-quote character removed.

    All other characters, including double quotes, commas and spaces, are
    left untouched.
    """
    for unwanted in ('[', ']', "'"):
        string = string.replace(unwanted, '')
    return string

In [None]:
# Overwrite id_artists with the removepunc-cleaned form of every entry.
id_artists_raw = tracks['id_artists']
tracks['id_artists'] = id_artists_raw.apply(removepunc)

In [None]:
# Preview the first rows of id_artists to check the result of the cleaning step.
tracks['id_artists'].head()

In [None]:
# Preview the first rows of the artists table.
artists.head()

In [None]:
# Share (in percent) of each distinct genres value among the non-null entries.
genre_counts = artists['genres'].value_counts()
genre_total = artists['genres'].count()
genre_counts / genre_total * 100

About 72.9% of the `genres` entries are empty, so we can drop this column or simply not use it.

In [None]:
# Drop the genres column. Rebinding the name (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run no longer raises
# KeyError because the column is already gone.
artists = artists.drop(columns='genres', errors='ignore')

In [None]:
# Percentage of tracks whose id_artists value does not appear in artists['id'].
track_has_no_artist = ~tracks['id_artists'].isin(artists['id'])
unmatched_track_count = tracks[track_has_no_artist]['artists'].count()
(unmatched_track_count / tracks['id_artists'].count()) * 100

Roughly 20% of the tracks have no matching artist. We can ignore those and join the rest to create a features DataFrame, although more columns still need to be checked.

In [None]:
# Percentage of artists whose id does not appear in tracks['id_artists'].
artist_has_no_track = ~artists['id'].isin(tracks['id_artists'])
unmatched_artist_count = artists[artist_has_no_track]['id'].count()
(unmatched_artist_count / artists['id'].count()) * 100

#### Checking correlation/trends between variables

In [None]:
# List the column labels of the tracks table.
tracks.columns

In [None]:
# Columns kept for the correlation/trend checks that follow.
feature_columns = [
    'popularity', 'duration_ms', 'explicit', 'release_date',
    'danceability', 'energy', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'time_signature',
]
tracks_sub = tracks[feature_columns]

In [None]:
# Correlate only numeric/boolean columns. tracks_sub also carries release_date,
# which is loaded without date parsing (presumably as text) and makes
# DataFrame.corr() raise on recent pandas versions. The matrix is computed once
# and reused for both the mask and the plot.
corr = tracks_sub.select_dtypes(include=['number', 'bool']).corr()
# Non-zero entries of the upper triangle (diagonal included) are hidden by the mask.
matrix = np.triu(corr)
sns.set(rc={'figure.figsize':(20,10)})
ax = sns.heatmap(corr, annot=True, fmt='.1g', cmap='coolwarm', mask=matrix)

* We see a strong positive correlation between loudness and energy, and a moderate positive correlation between valence and danceability.
* A strong negative correlation can be seen between acousticness and energy, and a moderate negative correlation between loudness and acousticness.

In [None]:
import matplotlib
# Raise Agg's path chunk size before drawing the full-dataset plot
# (NOTE(review): presumably needed to render this many points — confirm).
matplotlib.rcParams['agg.path.chunksize'] = 10000
# Pass the data by keyword: recent seaborn releases reject positional x/y arguments.
a = sns.regplot(x=tracks['loudness'], y=tracks['energy'])
a.set(ylim=(-1.5, 1.5))
a.set(xlim=(-70, 70))
plt.xlabel("Loudness")
plt.ylabel("Energy")

In [None]:
# Summary statistics of the tracks table, used to see the value range of each column.
tracks.describe()

All columns are normalized except: key, loudness, tempo and time_signature.

In [None]:
# Rescale each of these columns independently to the [0, 1] range.
# preprocessing.normalize (used before) scales every ROW to unit norm, which mixes
# the four features together instead of putting each column on a common scale.
scale_cols = ['loudness', 'key', 'tempo', 'time_signature']
tracks[scale_cols] = preprocessing.minmax_scale(tracks[scale_cols])

In [None]:
# Linear fit of danceability against popularity, drawn with the fitted equation in the legend.
fig, ax = plt.subplots()
slope, intercept, r_value, p_value, std_err = stats.linregress(tracks['popularity'], tracks['danceability'])
ax.set(xlim=[-5,100], ylim=[0,1])
# Data is passed by keyword (recent seaborn releases reject positional x/y) and the
# plot is bound to the axes created above instead of relying on the current axes.
sns.regplot(x=tracks['popularity'], y=tracks['danceability'], ax=ax,
            line_kws={'label':"y={0:.3f}x+{1:.1f}".format(slope,intercept)})
ax.legend()
plt.show()

Here the null hypothesis behind the p-value is that the slope of the fitted line is 0.

In [None]:
# Linear fit of acousticness against popularity; the legend shows the fitted
# equation and the p-value returned by linregress.
fig, ax = plt.subplots()
slope, intercept, r_value, p_value, std_err = stats.linregress(tracks['popularity'], tracks['acousticness'])
ax.set(xlim=[-5,100], ylim=[-0.5,1])
# Data is passed by keyword (recent seaborn releases reject positional x/y) and the
# plot is bound to the axes created above instead of relying on the current axes.
sns.regplot(x=tracks['popularity'], y=tracks['acousticness'], ax=ax,
            line_kws={'label':"y={0:.3f}x+{1:.1f} pvalue={2:.3f}".format(slope,intercept,p_value)})
ax.legend()
plt.show()