# Overview
* First, I visualize the data, since it is essentially time-series data (1921 to 2020) with approximately 169k songs.
* Second, I use a simple Random Forest Regressor to predict the popularity of a song from its available characteristics.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Spotify tracks dataset into `df`.
DATA_PATH = '/kaggle/input/spotify-dataset-19212020-160k-tracks/data_o.csv'
df = pd.read_csv(DATA_PATH)

# Display the rows ordered from least to most popular. The sorted frame is
# only shown as the cell output; `df` itself keeps its original row order.
df.sort_values(by='popularity')

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "Info: None" line after the report.
print('Info:')
df.info()

In [None]:
# Summary statistics of the dataset, rounded to two decimals for readability.
summary_stats = df.describe()
summary_stats.round(2)

In [None]:
# Show the column labels of the loaded dataset.
df.columns

In [None]:
plt.figure(figsize=(15, 15))  # creating the 'canvas'

# Correlate only numeric/boolean columns. Recent pandas versions no longer
# drop non-numeric columns (e.g. the string column "artists") silently in
# corr() and raise instead, so the selection is made explicit here.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), annot=True)

In [None]:
# Overall data distribution: draw the histograms of the dataset's columns
# on one large figure.
HIST_FIGSIZE = (20, 20)
df.hist(figsize=HIST_FIGSIZE)
plt.show()

# Some Key Time-Series Trends
* Acousticness and instrumentalness decrease over time (lowest in 2020)
* Danceability, energy, explicitness, loudness, and tempo increase over time
* Speechiness stays between 0.1 and 0.2 after 1960

In [None]:
# Plot the yearly mean of each characteristic, one figure per column, to
# expose long-term trends.
columns = ['acousticness','danceability', 'duration_ms', 'energy', 'explicit',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'popularity', 'speechiness', 'tempo', 'valence']

# Loop-invariant work is done once: the plot style and the per-year means.
sns.set_style("darkgrid")
yearly_means = df.groupby('year')[columns].mean()

for col in columns:
    y = yearly_means[col]
    x = y.index
    plt.figure(figsize=(16, 8))
    # x and y must be passed by keyword: recent seaborn releases reject
    # positional data vectors. `data=` is omitted because x and y are
    # already concrete vectors rather than column names.
    sns.lineplot(x=x, y=y)

In [None]:
# Bar chart of the 20 artists with the highest mean popularity.
plt.figure(figsize=(16, 8))
sns.set(style="whitegrid")
x = df.groupby("artists")["popularity"].mean().sort_values(ascending=False).head(20)
# Keyword arguments are required: recent seaborn releases reject positional
# x/y vectors.
ax = sns.barplot(x=x.index, y=x)
ax.set_title('Top Artists with Popularity by Mean ')
ax.set_ylabel('Popularity')
ax.set_xlabel('Artists')
plt.xticks(rotation = 90)  # artist labels are long; rotate so they do not overlap

In [None]:
# Bar chart of the 20 artists with the highest summed popularity.
plt.figure(figsize=(16, 8))
sns.set(style="whitegrid")
x = df.groupby("artists")["popularity"].sum().sort_values(ascending=False).head(20)
# Keyword arguments are required: recent seaborn releases reject positional
# x/y vectors.
ax = sns.barplot(x=x.index, y=x)
ax.set_title('Top Artists with Popularity by Sum')
ax.set_ylabel('Popularity')
ax.set_xlabel('Artists')
plt.xticks(rotation = 90)  # artist labels are long; rotate so they do not overlap

In [None]:
# Re-display the column labels before choosing the model's feature set.
df.columns

# Random Forest Simulation

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Random forest regression: predict `popularity` from the song characteristics.
feature_columns = [
    'valence', 'year', 'acousticness', 'danceability',
    'duration_ms', 'energy', 'explicit', 'instrumentalness', 'key',
    'liveness', 'loudness', 'mode',
    'speechiness', 'tempo',
]
y = df['popularity']
x = df[feature_columns]

# Hold out a validation split; the fixed seed keeps the split reproducible.
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state=1)

# Fit on the training split only (fit() returns the fitted estimator),
# then predict the held-out validation rows.
popularity_model = RandomForestRegressor(random_state=1).fit(train_x, train_y)
popularity_prediction = popularity_model.predict(val_x)

In [None]:
# Validation-set quality: R^2 and root mean squared error, shown as a tuple.
validation_r2 = r2_score(val_y, popularity_prediction)
validation_rmse = mean_squared_error(val_y, popularity_prediction) ** 0.5
validation_r2, validation_rmse

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training split. For a regressor the score is
# R^2, not classification accuracy, so the scoring is requested explicitly and
# the output is labelled accordingly.
cv_r2_scores = cross_val_score(estimator = popularity_model, X = train_x, y = train_y,
                               scoring = 'r2', cv = 5, n_jobs = -1)
accuracies = cv_r2_scores  # old name kept so any later cell that uses it still works
print("Cross-validated R^2 (mean): {:.3f}".format(cv_r2_scores.mean()))
print("Cross-validated R^2 (std): {:.3f}".format(cv_r2_scores.std()))

In [None]:
# Spot check: compare the model's predictions for the last five rows with
# their actual popularity values.
tail_predictions = popularity_model.predict(x.tail())
print(tail_predictions)
print('==============================================')
actual_tail = df['popularity'].tail()
print(actual_tail)

In [None]:
# Store the model's prediction for every song next to the actual popularity.
# NOTE: `x` contains the training rows as well, so these predictions look
# more accurate than the validation metrics suggest.
all_predictions = popularity_model.predict(x)
df['popularity_prediction'] = all_predictions
df

In [None]:
# Permutation importance on the validation split; prints the ten features
# with the largest mean importance, in ascending order of importance.
from sklearn.inspection import permutation_importance
print("Feature Importances: ")
pimp = permutation_importance(popularity_model, val_x, val_y, random_state = 1)

mean_importances = pimp.importances_mean
top_ten_positions = mean_importances.argsort()[-10:]
for position in top_ten_positions:
    print(x.columns[position], mean_importances[position])

# Conclusion

1. The RF model explains about 80% of the variance in popularity on the validation set (R² ≈ 0.8)
2. The root mean squared error is about 9 popularity points
3. Year, loudness, instrumentalness, duration, and acousticness are the five most important popularity predictors.