# Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Load the CSV at the relative path below into `dataset`, the DataFrame that
# every later cell reads from and adds columns to.
dataset = pd.read_csv('../input/spotify-dataset-19212020-160k-tracks/data.csv')

# Data Wrangling

In [None]:
# Preview the first rows to see the available columns and sample values.
dataset.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

In [None]:
# Per-column dtype and non-null count.
dataset.info()

In [None]:
# Summary statistics; include='all' also covers the non-numeric columns.
dataset.describe(include = 'all')

In [None]:
# Count of missing values in each column.
dataset.isnull().sum()

# Plotting various columns of Dataset

In [None]:
# Distribution (histogram + KDE curve) of six audio features on a 2x3 grid.
hist_features = ['danceability', 'duration_ms', 'energy',
                 'instrumentalness', 'liveness', 'loudness']
plt.figure(figsize = (30, 10))
for position, feature in enumerate(hist_features, start = 1):
    plt.subplot(2, 3, position)
    # sns.distplot is deprecated; histplot(kde=True, stat='density') is the
    # replacement seaborn documents for it. bins=50 keeps the bin count
    # bounded for long-tailed columns such as duration_ms.
    sns.histplot(x = dataset[feature], kde = True, stat = 'density', bins = 50)
plt.show()

# Visualizing the basic statistics related to every column

In [None]:
# Horizontal box plots (median, quartiles, outliers) for six audio features.
# The 3x3 grid of the original layout is kept; only the first six slots are used.
box_features = ['danceability', 'duration_ms', 'energy',
                'liveness', 'loudness', 'instrumentalness']
plt.figure(figsize = (40, 20))
for position, feature in enumerate(box_features, start = 1):
    plt.subplot(3, 3, position)
    # Pass the column as x= explicitly: a bare positional Series is bound to
    # `data` by newer seaborn releases, which changes the plot orientation.
    sns.boxplot(x = dataset[feature])
plt.show()

In [None]:
# Distribution of the `popularity` column (histogram + KDE curve).
plt.figure(figsize = (10, 7))
# histplot replaces the deprecated sns.distplot.
sns.histplot(x = dataset.popularity, kde = True, stat = 'density', bins = 50)
plt.title("Dist plot of Popularity", fontdict = {'fontsize' : 15})
plt.xlabel('Popularity')
# A single show() is enough; the second call in the original did nothing.
plt.show()

# Top 20 artists by song count

In [None]:
from ast import literal_eval

# Each `artists` value is handled here as the text of a Python list of names
# (e.g. "['A', 'B']"), which is the shape the original slicing/splitting
# assumed. Parsing it as a literal yields clean names for multi-artist
# tracks; manual slicing left stray quote characters on them.
# Every track is counted (no de-duplication), so the result is a song count.
artist_names = [
    artist
    for raw_artists in dataset.artists
    for artist in literal_eval(raw_artists)
]
dff = pd.Series(artist_names).value_counts().head(20)

plt.figure(figsize = (12, 8))
# x=/y= keywords: newer seaborn releases no longer accept these positionally.
sns.barplot(x = dff.values, y = dff.index, alpha = 0.8)
plt.title("Top 20 Artists", fontdict = {'fontsize' : 15})
plt.ylabel("Artists")
plt.xlabel("Count")
plt.show()

In [None]:
def year(df):
    """Return the era label for a single release year.

    `df` is one year value (the function is applied element-wise to the
    'year' column). Eras are half-open on the left and closed on the right:
    (1920, 1945] -> "Post-Great War", (1945, 1970] -> "Retro",
    (1970, 1995] -> "Modern". Anything else, including years of 1920 or
    earlier, is labelled "Post-Modern".
    """
    era_bounds = (
        (1920, 1945, "Post-Great War"),
        (1945, 1970, "Retro"),
        (1970, 1995, "Modern"),
    )
    for after, up_to, label in era_bounds:
        if after < df <= up_to:
            return label
    return "Post-Modern"
# Tag every track with its era, then show the share of tracks per era.
dataset['era'] = dataset['year'].apply(year)
px.pie(data_frame = dataset, names = 'era', hole = 0.2, title = 'Eras of Music')

## Binning 'explicit' to classify songs as explicit or not

In [None]:
def func(df):
    """Map one `explicit` flag to text: 'Yes' when it equals 1, otherwise 'No'."""
    return 'Yes' if df == 1 else 'No'
# Store the Yes/No label per track and show the explicit vs non-explicit share.
dataset['isExplicit'] = dataset['explicit'].apply(func)
px.pie(data_frame = dataset, names = 'isExplicit', hole = 0.2, title = 'Explicit')

## Binning 'loudness' to classify songs by how loud they are

In [None]:
def loud(row, m=None, sd=None):
    """Label one track's loudness relative to the whole dataset.

    Parameters
    ----------
    row : a row (or any mapping) with a 'loudness' entry.
    m, sd : median and standard deviation of dataset['loudness'].
        Optional. When omitted they are computed from `dataset`, so a plain
        loud(row) call still works; passing them in avoids re-scanning the
        whole column for every single row.

    Returns
    -------
    One of "Extreme", "Very Loud", "Loud", "Soft", "Very Soft", "Mellow",
    ordered from the loudest band to the quietest. Band edges sit at
    m + 1.5*sd, m + sd, m + 0.5*sd, m - 0.5*sd and m - sd.
    """
    if m is None:
        m = dataset['loudness'].median()
    if sd is None:
        sd = dataset['loudness'].std()
    value = row['loudness']
    if value >= m + (1.5 * sd):
        return "Extreme"
    elif value >= m + sd:
        return "Very Loud"
    elif value >= m + (0.5 * sd):
        return "Loud"
    elif value >= m - (0.5 * sd):
        return "Soft"
    elif value >= m - sd:
        return "Very Soft"
    else:
        return "Mellow"

# Compute the column statistics once; DataFrame.apply forwards the extra
# keyword arguments to loud() for every row.
loudness_median = dataset['loudness'].median()
loudness_std = dataset['loudness'].std()
dataset['is_loud'] = dataset.apply(loud, axis=1, m=loudness_median, sd=loudness_std)
px.pie(data_frame = dataset, names = 'is_loud', hole = 0.2, title = 'IS LOUD')

## Binning 'energy' to classify songs by high/low energy

In [None]:
def energy(row, threshold=None):
    """Classify one track as "High" or "Low" energy.

    Parameters
    ----------
    row : a row (or any mapping) with an 'energy' entry.
    threshold : cut-off value. Optional; when omitted the mean of
        dataset['energy'] is used, so a plain energy(row) call still works.
        Passing it in avoids recomputing the column mean for every row.

    Returns "High" when the track's energy is at or above the threshold,
    otherwise "Low".
    """
    if threshold is None:
        threshold = dataset['energy'].mean()
    return "High" if row['energy'] >= threshold else "Low"

# Compute the mean once; DataFrame.apply forwards the keyword to energy().
energy_mean = dataset['energy'].mean()
dataset['en_type']=dataset.apply(energy, axis=1, threshold=energy_mean)
px.pie(names=dataset['en_type'],hole=0.2)

## Binning 'popularity' to classify songs on the basis of their popularity

In [None]:
def func(df):
    """Bucket one popularity score into a text label.

    Above 75 -> 'Very Popular'; above 50 -> 'Popular'; above 25 -> 'Average';
    everything else -> 'Not popular'. Note that this redefines the earlier
    `func` used for the explicit flag.
    """
    # Checked from the highest floor down, so the first match wins.
    floors = ((75, 'Very Popular'), (50, 'Popular'), (25, 'Average'))
    for floor, label in floors:
        if df > floor:
            return label
    return 'Not popular'
# Store the popularity bucket per track and show the share of each bucket.
dataset['isPopular'] = dataset['popularity'].apply(func)
px.pie(data_frame = dataset, names = 'isPopular', hole = 0.2, title = 'Popularity')

## Exploratory Data Analysis

In [None]:
# Danceability against loudness; marker size = liveness, colour = explicit flag.
# The x axis stays linear: the loudness binning earlier in this notebook
# treats the column as taking negative values, and a logarithmic axis
# cannot place those points.
fig = px.scatter(dataset, x="loudness", y="danceability", size="liveness", 
                 color="isExplicit", size_max=30)
fig.show()

In [None]:
# Danceability against loudness; marker size = popularity, colour = explicit flag.
# The x axis stays linear: the loudness binning earlier in this notebook
# treats the column as taking negative values, and a logarithmic axis
# cannot place those points.
fig = px.scatter(dataset, x="loudness", y="danceability", size="popularity", 
                 color="isExplicit", size_max=30)
fig.show()

In [None]:
# Mean popularity for every distinct `artists` value.
mean_popularity = dataset.groupby('artists')['popularity'].mean()
# Same layout the following cells expect: one unnamed data column holding the
# artist strings, with the mean popularity values as the index.
artist_pop = pd.DataFrame(list(mean_popularity.index),
                          index=mean_popularity.to_numpy())

In [None]:
# Name the single data column, which holds the artist strings.
artist_pop.columns=['Name']

In [None]:
# The frame was built with the mean popularity values as its index; copy
# them into an ordinary column so they can be sorted and plotted.
artist_pop['popularity']=artist_pop.index

In [None]:
# Highest mean popularity first (sorted in place).
artist_pop.sort_values(by='popularity',ascending=False,inplace=True)

In [None]:
# Bar chart of the ten `artists` values with the highest mean popularity.
px.bar(x=artist_pop['Name'].head(10),y=artist_pop['popularity'].head(10)).update_layout(yaxis_title_text='Popularity')

In [None]:
# Sort in place: from here on `dataset` is ordered by descending popularity,
# which the later head(n) cells rely on.
dataset.sort_values(by='popularity',ascending=False,inplace=True)
dff=dataset.head(10)
# Popularity of the ten top-ranked tracks, coloured by the explicit label.
# NOTE(review): px.histogram aggregates y per x value, so tracks that share a
# name would be combined into one bar.
px.histogram(x=dff.name,y=dff.popularity,color=dff.isExplicit,labels={'x':'Top 10 songs','y':'Popularity'})

In [None]:
# First 20 rows of `dataset` (the most popular tracks, given the in-place
# descending sort in an earlier cell): track name x loudness band x
# danceability, coloured by popularity.
dff=dataset.head(20)
px.scatter_3d(x=dff.name,y=dff.is_loud,z=dff.danceability,color=dff.popularity,labels={'x':'name','y':'type','z':'danceability','color':'popularity'})

In [None]:
# Same 20 rows as the previous 3-D plot, with the High/Low energy label on
# the y axis instead of the loudness band.
dff=dataset.head(20)
px.scatter_3d(x=dff.name,y=dff.en_type,z=dff.danceability,color=dff.popularity,labels={'x':'name','y':'type','z':'danceability','color':'popularity'})

In [None]:
# Animated danceability-vs-popularity scatter with one frame per release year.
# The frame is sorted by year and the column is referenced by name, so every
# point stays attached to its own row's year and frames play chronologically.
# (Passing a separately sorted Series is matched to rows by position, which
# assigns tracks to the wrong years.)
fig = px.scatter(dataset.sort_values('year'), x = "popularity", y = "danceability", animation_frame = "year", animation_group = "isPopular",
           size = "key", color = "isExplicit", hover_name = "isPopular",
           log_x=True, size_max=45, range_x=[1,101], range_y=[0,1])
fig.show()

## Encoding categorical variables

In [None]:
# One-hot encode the three label columns. drop_first=True omits the first
# category of each variable, so k categories become k-1 indicator columns.
# The new columns are named after the category labels themselves (e.g. the
# explicit indicator is simply called 'Yes').
# NOTE(review): the isPopular indicators are derived from `popularity`, so
# they carry the regression target if they are later used as features.
isPopular= pd.get_dummies(dataset['isPopular'], drop_first=True)
isExplicit= pd.get_dummies(dataset['isExplicit'], drop_first=True)
isLoud= pd.get_dummies(dataset['is_loud'], drop_first=True)
# Append the indicator columns to the right of the existing columns.
dataset = pd.concat([dataset, isPopular, isExplicit, isLoud], axis = 1)
dataset.head()

In [None]:
# Preview again (same output as the end of the previous cell).
dataset.head()

In [None]:
# Drop, in place, the derived label columns (isPopular, isExplicit, is_loud,
# era, en_type), the id/name/artists/release_date columns, and duration_ms.
# Presumably this leaves only numeric columns for the correlation and
# modelling cells - TODO confirm against the CSV schema.
dataset.drop(['isPopular', 'isExplicit', 'is_loud', 'era', 'id', 'name','en_type', 'artists', 'release_date', 'duration_ms'], 
             axis = 1, inplace = True)

In [None]:
# Columns that remain after the drop.
dataset.columns

In [None]:
# Annotated heatmap of the pairwise correlations between the remaining columns.
plt.figure(figsize=(20,15))
sns.heatmap(dataset.corr(),linecolor='white',linewidths=1,cmap='coolwarm',annot=True)
plt.show()

### Applying Regression

In [None]:
# Features = every column except the target and the indicator columns that
# were one-hot encoded from the popularity buckets. Those indicators are a
# direct function of `popularity`, so leaving them in X hands the answer to
# the models and inflates every score.
# errors='ignore' keeps this cell working if some of them are absent.
popularity_derived = ['Average', 'Not popular', 'Popular', 'Very Popular']
X= dataset.drop(columns=['popularity'] + popularity_derived, errors='ignore')
# Target kept as a one-column frame, the 2-D shape the scaler expects.
y= dataset[['popularity']]

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Separate scalers for features and target so each can be inverted on its own.
sc_X= StandardScaler()
sc_y= StandardScaler()
# Both X and y are replaced by standardized arrays; every error metric
# computed afterwards is therefore in scaled units of popularity.
# NOTE(review): the scalers are fitted on all rows before the train/test
# split, so test-set statistics influence the transform applied to the
# training data.
X=sc_X.fit_transform(X)
y=sc_y.fit_transform(y)

### Train test split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state=0 makes the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25, random_state=0)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares model with default settings, fitted on the training split.
regressor_lin=LinearRegression()
regressor_lin.fit(X_train,y_train)

### Predicting the values

In [None]:
# Predictions of the linear model for the held-out rows (scaled units).
y_pred_lin = regressor_lin.predict(X_test)

### Using various metrics to check the results

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Linear model report: R2 on the training split, then R2, MSE and MAE on the
# held-out split. The target was standardized earlier, so the errors are in
# scaled units.
lin_train_r2 = regressor_lin.score(X_train, y_train)
lin_test_r2 = r2_score(y_test, y_pred_lin)
lin_test_mse = mean_squared_error(y_test, y_pred_lin)
lin_test_mae = mean_absolute_error(y_test, y_pred_lin)
print("Training Score of Linear Regression is: {}\n".format(lin_train_r2))
print("R2 Score of Linear Regression is: {}\n".format(lin_test_r2))
print("Mean Squared Error of Linear Regression is: {}\n".format(lin_test_mse))
print("Mean Absolute Error of Linear Regression is: {}\n".format(lin_test_mae))

### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with default settings; random_state=0 makes the fit repeatable.
regressor_dt = DecisionTreeRegressor(random_state=0)
regressor_dt.fit(X_train, y_train)
y_pred_dt = regressor_dt.predict(X_test)

# Same four figures as for the linear model: training R2, then test R2, MSE, MAE.
dt_train_r2 = regressor_dt.score(X_train, y_train)
dt_test_r2 = r2_score(y_test, y_pred_dt)
dt_test_mse = mean_squared_error(y_test, y_pred_dt)
dt_test_mae = mean_absolute_error(y_test, y_pred_dt)
print("Training Score of Decision Tree Regressor is: {}\n".format(dt_train_r2))
print("R2 Score of Decision Tree Regressor is: {}\n".format(dt_test_r2))
print("Mean Squared Error of Decision Tree Regressor is: {}\n".format(dt_test_mse))
print("Mean Absolute Error of Decision Tree Regressor is: {}\n".format(dt_test_mae))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 300-tree forest; random_state=0 keeps results repeatable and n_jobs=-1
# builds the trees on all available cores (same model, shorter wall time).
regressor_rf=RandomForestRegressor(n_estimators=300, random_state=0, n_jobs=-1)
# The scaled target is a column vector; the forest expects a 1-D target and
# warns about the column-vector shape otherwise, so flatten it for fitting.
regressor_rf.fit(X_train,y_train.ravel())

y_pred_rf=regressor_rf.predict(X_test)

# Same four figures as for the other models. The labels previously read
# "rfear Regression", a search-and-replace leftover from the linear cell.
print("Training Score of Random Forest Regressor is: {}\n".format(regressor_rf.score(X_train, y_train)))
print("R2 Score of Random Forest Regressor is: {}\n".format(r2_score(y_test, y_pred_rf)))
print("Mean Squared Error of Random Forest Regressor is: {}\n".format(mean_squared_error(y_test, y_pred_rf)))
print("Mean Absolute Error of Random Forest Regressor is: {}\n".format(mean_absolute_error(y_test, y_pred_rf)))

### Linear Regression R2 score plotting

In [None]:
fig, ax = plt.subplots()
# Predicted-vs-actual cloud for the linear model. Small translucent markers
# keep the dense cloud readable.
ax.scatter(y_test, y_pred_lin, s=2, alpha=0.3)
# Dashed diagonal: where the points would lie if every prediction were exact.
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
# Least-squares trend of predictions on actual values. `y_test` itself is
# left untouched; the column-vector views get their own names.
actual = y_test.reshape(-1,1)
y_predicted = y_pred_lin.reshape(-1,1)
trend = LinearRegression().fit(actual, y_predicted)
# A straight line is fully determined by its two end points, so only those
# are drawn instead of one vertex per test row.
ends = np.array([[actual.min()], [actual.max()]])
ax.plot(ends, trend.predict(ends))
ax.set_title('R2: ' + str(r2_score(actual, y_predicted)))
plt.show()