In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the dataset folder.
# The path is relative ('../input/...'), the same location pd.read_csv loads
# the CSV from; a leading '/' would resolve to '/input/...' from the filesystem
# root, and os.walk yields nothing (no error) for a directory that does not exist.
for dirname, _, filenames in os.walk('../input/top50spotify2019'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Step 1: Preparing, Categorizing, and Viewing Our Data**

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import warnings 
from matplotlib import pyplot as plt
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Read the CSV (decoded as latin1) and store the three text columns as categoricals.
top50 = pd.read_csv('../input/top50spotify2019/top50.csv', encoding='latin1').astype(
    {'Track.Name': 'category', 'Artist.Name': 'category', 'Genre': 'category'}
)

# Give the two awkwardly punctuated headers shorter names.
top50 = top50.rename(
    columns={"Loudness..dB..": "Loudness", "Acousticness..": "Acousticness"}
)


In [None]:
# Distinct genre values present in the table.
top50['Genre'].unique()

In [None]:
# Preview the first five rows.
top50.head(n=5)

**Step 2: Basic Visualization of Some Stats**

In [None]:
# Distribution of the tempo column.
# NOTE(review): seaborn deprecated distplot in 0.11 in favour of histplot/displot —
# consider migrating once the target seaborn version is confirmed.
sns.distplot(top50.loc[:, 'Beats.Per.Minute'])

In [None]:
# Distribution of the Energy column.
# NOTE(review): seaborn deprecated distplot in 0.11 in favour of histplot/displot.
sns.distplot(top50.loc[:, 'Energy'])

In [None]:
list1 = list()
mylabels = list()
for genre in top50.Genre.cat.categories:
    list1.append(top50[top50.Genre == genre].Danceability)
    mylabels.append(genre)
sns.set_style("whitegrid")
fig, ax = plt.subplots()
fig.set_size_inches(11.7,8.27)
h = plt.hist(list1, bins=30, stacked=True, rwidth=1, label=mylabels)
plt.title("Danceability By Genre",fontsize=35, color="DarkBlue", fontname="Console")
plt.ylabel("Number of Tracks", fontsize=35, color="Red")
plt.xlabel("Danceability", fontsize=35, color="Green")
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.legend(frameon=True,fancybox=True,shadow=True,framealpha=1,prop={'size':5})
plt.show()


In [None]:
list1 = list()
mylabels = list()
for genre in top50.Genre.cat.categories:
    list1.append(top50[top50.Genre == genre].Energy)
    mylabels.append(genre)
sns.set_style("whitegrid")
fig, ax = plt.subplots()
fig.set_size_inches(11.7,8.27)
h = plt.hist(list1, bins=30, stacked=True, rwidth=1, label=mylabels)
plt.title("Energy By Genre",fontsize=35, color="DarkBlue", fontname="Console")
plt.ylabel("Number of Tracks", fontsize=35, color="Red")
plt.xlabel("Energy", fontsize=35, color="Green")
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.legend(frameon=True,fancybox=True,shadow=True,framealpha=1,prop={'size':5})
plt.show()

In [None]:
list1 = list()
mylabels = list()
for genre in top50.Genre.cat.categories:
    list1.append(top50[top50.Genre == genre]['Length.'])
    mylabels.append(genre)
sns.set_style("whitegrid")
fig, ax = plt.subplots()
fig.set_size_inches(11.7,8.27)
h = plt.hist(list1, bins=30, stacked=True, rwidth=1, label=mylabels)
plt.title("Length By Genre",fontsize=35, color="DarkBlue", fontname="Console")
plt.ylabel("Number of Tracks", fontsize=35, color="Red")
plt.xlabel("Length", fontsize=35, color="Green")
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.legend(frameon=True,fancybox=True,shadow=True,framealpha=1,prop={'size':5})
plt.show()

In [None]:
list1 = list()
mylabels = list()
for genre in top50.Genre.cat.categories:
    list1.append(top50[top50.Genre == genre].Danceability)
    mylabels.append(genre)
sns.set_style("whitegrid")
fig, ax = plt.subplots()
fig.set_size_inches(11.7,8.27)
h = plt.hist(list1, bins=30, stacked=True, rwidth=1, label=mylabels)
plt.title("Danceability By Genre",fontsize=35, color="DarkBlue", fontname="Console")
plt.ylabel("Number of Tracks", fontsize=35, color="Red")
plt.xlabel("Danceability", fontsize=35, color="Green")
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.legend(frameon=True,fancybox=True,shadow=True,framealpha=1,prop={'size':5})
plt.show()

**Step 3: Determine possible relationships between Elements**

In [None]:
# Hex-binned joint distribution of Energy (x) against Popularity (y).
sns.jointplot(data=top50, x='Energy', y='Popularity', kind='hex')

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
fig, ax = plt.subplots(figsize=(15, 20))

# Correlate numeric columns only: the frame also holds categorical text columns
# (Track.Name, Artist.Name, Genre), and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0. Older pandas dropped them silently, so the matrix is
# the same there.
numeric_corr = top50.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, ax=ax)

**It appears that Loudness and Energy are the most strongly correlated pair of features in this data. We'll look at the specifics later, in Step 5.**

In [None]:
# Scatter of Loudness (x) against Energy (y) with a fitted regression line.
sns.lmplot(data=top50, x='Loudness', y='Energy')

**Step 4: Implement a multivariable regression to determine which factors are the most significant for the popularity of a song**

In [None]:
# Predictor columns for the popularity model.
feature_columns = [
    'Beats.Per.Minute', 'Energy', 'Danceability',
    'Loudness', 'Liveness', 'Valence.', 'Length.', 'Acousticness', 'Speechiness.',
]
X = top50[feature_columns]
# Target: the Popularity column.
y = top50['Popularity']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares model; fit_intercept=True is scikit-learn's default, spelled out here.
lm = LinearRegression(fit_intercept=True)

In [None]:
# Fit the model on the training split.
lm.fit(X=X_train, y=y_train)

In [None]:
# Show the fitted intercept term.
intercept = lm.intercept_
print(intercept)

In [None]:
# One row per predictor, holding its fitted coefficient.
coeff_df = pd.DataFrame(
    data=lm.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df

In [None]:
# Predicted target values for the held-out rows.
predictions = lm.predict(X=X_test)

In [None]:
# Actual values on the x-axis against predicted values on the y-axis.
plt.scatter(x=y_test, y=predictions)

In [None]:
# Distribution of the residuals (actual minus predicted) on the test split.
# NOTE(review): seaborn deprecated distplot in 0.11 in favour of histplot/displot.
residuals = y_test - predictions
sns.distplot(residuals, bins=50);

In [None]:
from statsmodels.api import OLS, add_constant

# statsmodels' OLS does not include an intercept unless the design matrix
# carries a constant column, so add one explicitly. Without it the summary
# describes a regression forced through the origin, which is not the model
# LinearRegression fits (it estimates an intercept by default).
OLS(y_train, add_constant(X_train)).fit().summary()

In [None]:
from sklearn import metrics

# Error metrics on the held-out rows; RMSE is the square root of the MSE.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

**Step 5: A closer look at the relationship between loudness and energy**

In [None]:
from statsmodels.formula.api import ols

# Formula-based OLS with Loudness as the response and Energy as the predictor.
# NOTE(review): the scikit-learn model that follows goes the other way
# (Energy predicted from Loudness) — confirm which direction is intended.
loudness_on_energy = ols('Loudness ~ Energy', data=top50)
results = loudness_on_energy.fit()
results.summary()

In [None]:
# Single predictor (kept as a one-column frame) and the Energy target.
X = top50.loc[:, ['Loudness']]
y = top50.loc[:, 'Energy']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
# Fresh least-squares model; fit_intercept=True is scikit-learn's default, spelled out here.
lm = LinearRegression(fit_intercept=True)

In [None]:
# Fit the model on the training split.
lm.fit(X=X_train, y=y_train)

In [None]:
# Show the fitted intercept term.
intercept = lm.intercept_
print(intercept)

In [None]:
# One row per predictor, holding its fitted coefficient.
coeff_df = pd.DataFrame(
    data=lm.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df

In [None]:
# Predicted target values for the held-out rows.
predictions = lm.predict(X=X_test)

In [None]:
# Actual values on the x-axis against predicted values on the y-axis.
plt.scatter(x=y_test, y=predictions)

In [None]:
# Distribution of the residuals (actual minus predicted) on the test split.
# NOTE(review): seaborn deprecated distplot in 0.11 in favour of histplot/displot.
residuals = y_test - predictions
sns.distplot(residuals, bins=50);

In [None]:
from sklearn import metrics

# Error metrics on the held-out rows; RMSE is the square root of the MSE.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
from statsmodels.api import OLS, add_constant

# statsmodels' OLS does not include an intercept unless the design matrix
# carries a constant column, so add one explicitly. Without it the summary
# describes a line forced through the origin, which is not the model
# LinearRegression fits (it estimates an intercept by default).
OLS(y_train, add_constant(X_train)).fit().summary()

**There does appear to be a slight linear relationship between Loudness and Energy**