In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (12,6)
import seaborn as sns
%matplotlib inline


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
#from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from matplotlib.colors import ListedColormap

In [None]:

# Load the songs dataset from a CSV file located relative to the working
# directory, then preview the first rows as the cell output.
df_spotify=pd.read_csv("spotify_songs_data.csv")
df_spotify.head()

In [None]:
# List every column name of the raw frame; `col` stays available as a plain
# Python list for later inspection.
col = list(df_spotify.columns)
print("Columns Names:", col, sep="\n")

In [None]:
# Summary statistics of the raw frame (pandas' default describe() output).
df_spotify.describe()

In [None]:
# (rows, columns) of the raw frame.
df_spotify.shape

In [None]:
#df_spotify.info()

In [None]:
# Start the working frame from the raw data. No column is removed in this
# cell; the unwanted columns are dropped in a later cell.
# NOTE(review): pd.DataFrame(df) wraps an existing frame; whether the data is
# shared with df_spotify depends on pandas' copy semantics - confirm if
# df_spotify must stay untouched by later in-place edits.

df_filtered_Spotify = pd.DataFrame(df_spotify)
df_filtered_Spotify.info()

In [None]:
# Remove the artists / id / name / release_date / year columns so that only
# the remaining columns are used for the analysis below.
# The frame is rebuilt from df_spotify (not from df_filtered_Spotify itself)
# so this cell can be re-run safely: dropping from the already-reduced frame
# would raise KeyError on a second execution because the labels are gone.
df_filtered_Spotify = df_spotify.drop(columns=['artists', 'id', 'name','release_date','year'])
df_filtered_Spotify.info()

In [None]:
import plotly.graph_objects as go

# Box plots of selected columns of df_filtered_Spotify, one trace per column.
fig = go.Figure()

# The first trace additionally draws every individual observation next to its
# box (boxpoints='all'), spread horizontally by `jitter` and shifted to the
# left of the box by `pointpos`.
fig.add_trace(go.Box(
    y=df_filtered_Spotify['acousticness'],
    name="acousticness",
    jitter=0.3,
    pointpos=-1.8,
    boxpoints='all', # represent all points
    marker_color='rgb(7,40,89)',
    line_color='rgb(7,40,89)'
))

# All remaining traces show the box only: boxpoints=False draws no sample
# points at all (not even outliers). Each column is listed once; the original
# cell added the 'popularity' trace twice, which duplicated that box.
box_only_features = [
    ('danceability', 'rgb(9,56,125)'),
    ('energy', 'rgb(9,56,125)'),
    ('instrumentalness', 'rgb(107,174,214)'),
    ('loudness', 'rgb(107,174,214)'),
    ('liveness', 'rgb(107,174,214)'),
    ('popularity', 'rgb(107,174,214)'),
    ('tempo', 'rgb(107,174,214)'),
]
for feature_name, feature_color in box_only_features:
    fig.add_trace(go.Box(
        y=df_filtered_Spotify[feature_name],
        name=feature_name,
        boxpoints=False, # no data points
        marker_color=feature_color,
        line_color=feature_color
    ))

fig.update_layout(title_text="Box Plot Styling Outliers")
fig.show()


In [None]:
# Pairwise correlation matrix of the columns kept in df_filtered_Spotify
# (DataFrame.corr() with its default settings). The result is reused by the
# heatmap cell.
df_spotify_corr = df_filtered_Spotify.corr()
print(df_spotify_corr)

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicitly created
# 15x10 inch Axes.
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(df_spotify_corr, annot=True, cmap='viridis', ax=ax)

In [None]:
# Target: the last column of df_filtered_Spotify.
# Features: every column except that last one. Slicing with [:, :] would also
# put the target itself into the feature matrix, letting the models read the
# answer from their own inputs (target leakage) and inflating every score.
x_values = df_filtered_Spotify.iloc[:, :-1].values
y_values = df_filtered_Spotify.iloc[:, -1].values

In [None]:
# Inspect the target vector (taken from the last column of df_filtered_Spotify).
print(y_values)

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_values,
    y_values,
    test_size=0.2,
    random_state=0,
)
# Compare the size of the training split with the size of the full matrix.
print(f"x_train:  {x_train.shape}")
print(f"x_values:  {x_values.shape}")

In [None]:
# Standardize the features before PCA. The scaler learns its statistics from
# the training split only and the same fitted scaler is then applied to both
# splits, so no information from the test rows enters the fit.
std_scaler = StandardScaler().fit(x_train)
x_train = std_scaler.transform(x_train)
x_test = std_scaler.transform(x_test)

In [None]:
# Inspect the training features as left by the scaling cell.
print(x_train)

In [None]:
# Inspect the test features as left by the scaling cell.
print(x_test)

In [None]:
# Do PCA (principal component analysis): project the features onto 2
# principal components. The projection is fitted on the training split only
# and then applied unchanged to the test split.
# NOTE: x_train / x_test are overwritten with the 2-column projection, so the
# arrays that existed before this cell are no longer available afterwards, and
# re-running this cell on its own would fit PCA on already-reduced data.
pca = PCA(n_components = 2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)


In [None]:
# Fraction of the total variance captured by each retained component.
print(pca.explained_variance_ratio_)

In [None]:
# Inspect the training features after the PCA projection.
print(x_train)

In [None]:
#TRAINING THE SUPPORT VECTOR MODEL(LINEAR) ON THE TRAINING DATASET
#svr = svm.SVR(kernel='linear')
#svr.fit(x_train, y_train)

In [None]:
#TRAINING THE LINEAR REGRESSION MODEL ON THE TRAINING DATASET
# Fits on whatever x_train / y_train currently hold (the PCA-projected
# features when the notebook is run top to bottom). n_jobs=-1 lets
# scikit-learn use all available processors.
lm = LinearRegression(n_jobs=-1)
lm.fit(x_train, y_train)

In [None]:
#CREATING THE CONFUSION MATRIX

# The regressor outputs continuous values, so predictions and ground truth are
# both binarised at the same cutoff before being compared as class labels.
cutoff = 0.7
y_pred = lm.predict(x_test)

# 1 where the value is strictly above the cutoff, 0 otherwise; both label
# arrays keep y_pred's dtype.
y_pred_classes = (y_pred > cutoff).astype(y_pred.dtype)
y_test_classes = (y_test > cutoff).astype(y_pred.dtype)

# Rows are the thresholded true labels, columns the thresholded predictions.
cm = confusion_matrix(y_test_classes, y_pred_classes)
print(cm)

In [None]:
# Per-class precision / recall / F1 for the thresholded labels, followed by
# the overall accuracy. The accuracy is kept in linearReg_valence_as because
# the summary table at the end of the notebook reads it.
# NOTE(review): the name suggests the target is the valence column; the code
# only shows that the target is the last column - confirm.
print(classification_report(y_test_classes, y_pred_classes))
linearReg_valence_as = accuracy_score(y_test_classes, y_pred_classes)
print(linearReg_valence_as)

# TRYING TO IMPROVE THE ACCURACY BY GIVING VARIANCE PERCENTAGE

In [None]:
#### TRYING TO IMPROVE THE ACCURACY BY GIVING VARIANCE PERCENTAGE

# Keep as many principal components as are needed to explain 95% of the
# variance.
#
# x_train / x_test were already reduced to 2 components by the earlier PCA
# cell, so fitting a second PCA on them could never keep more than those 2
# components and the experiment would not test what it claims. The split and
# the standardization are therefore rebuilt from x_values / y_values first
# (same test_size and random_state as before, so the rows in each split and
# y_train / y_test are unchanged). This also makes the cell safe to re-run.
x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size = 0.2, random_state = 0)

std_scaler = StandardScaler()
x_train = std_scaler.fit_transform(x_train)
x_test = std_scaler.transform(x_test)

# A float n_components in (0, 1) selects the number of components by the
# cumulative explained-variance ratio.
pca = PCA(n_components = 0.95)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

In [None]:
# Variance fraction per retained component; the number of entries is the
# number of components the 95%-variance PCA kept.
print(pca.explained_variance_ratio_)

In [None]:
#TRAINING THE LINEAR REGRESSION MODEL ON THE TRAINING DATASET
# Replaces the previous `lm` with a model fitted on the current x_train /
# y_train. n_jobs=-1 lets scikit-learn use all available processors.
lm = LinearRegression(n_jobs=-1)
lm.fit(x_train, y_train)

In [None]:
#CREATING THE CONFUSION MATRIX

# The regressor outputs continuous values, so predictions and ground truth are
# both binarised at the same cutoff before being compared as class labels.
cutoff = 0.7
y_pred = lm.predict(x_test)

# 1 where the value is strictly above the cutoff, 0 otherwise; both label
# arrays keep y_pred's dtype.
y_pred_classes = (y_pred > cutoff).astype(y_pred.dtype)
y_test_classes = (y_test > cutoff).astype(y_pred.dtype)

# Rows are the thresholded true labels, columns the thresholded predictions.
cm = confusion_matrix(y_test_classes, y_pred_classes)
print(cm)

In [None]:
# Classification report and accuracy for the thresholded labels of the
# 95%-variance experiment. The accuracy is kept in linearReg_withPercent_as
# because the summary table at the end of the notebook reads it.
print(classification_report(y_test_classes, y_pred_classes))
linearReg_withPercent_as = accuracy_score(y_test_classes, y_pred_classes)
print(linearReg_withPercent_as)

# TRYING TO IMPROVE ACCURACY SCORE BY USING ANOTHER COLUMN(ENERGY IN THIS CASE)

In [None]:
# Target: the column at position 3 of df_filtered_Spotify (energy, according
# to the original note: it shows good variability, as does the loudness
# column - confirm the position if the CSV column order ever changes).
y_values = df_filtered_Spotify.iloc[:, 3].values

# Features: every other column. The target column is removed from the feature
# matrix; leaving it in (as `iloc[:, :]` did) lets the models read the answer
# from their own inputs (target leakage) and inflates every score.
x_values = np.delete(df_filtered_Spotify.values, 3, axis=1)

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_values,
    y_values,
    test_size=0.2,
    random_state=0,
)
# Compare the size of the training split with the size of the full matrix.
print(f"x_train:  {x_train.shape}")
print(f"x_values:  {x_values.shape}")

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
std_scaler = StandardScaler().fit(x_train)
x_train = std_scaler.transform(x_train)
x_test = std_scaler.transform(x_test)

In [None]:
# Project the features onto 2 principal components; fitted on the training
# split only, then applied unchanged to the test split.
# NOTE: x_train / x_test are overwritten, so re-running this cell on its own
# would fit PCA on already-reduced data.
pca = PCA(n_components = 2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)


In [None]:
# Inspect the test features after the PCA projection.
print(x_test)

In [None]:
# Fraction of the total variance captured by each of the 2 components.
print(pca.explained_variance_ratio_)

In [None]:
# Principal axes of the fitted PCA: one row per retained component, one
# column per input feature.
print(pca.components_)

In [None]:
#COVARIANCE MATRIX
# Data covariance as estimated by the fitted PCA model (shown as the cell
# output).
pca.get_covariance()

In [None]:
#TRAINING THE LINEAR REGRESSION MODEL ON THE TRAINING DATASET
# Replaces the previous `lm` with a model fitted on the current x_train /
# y_train (the split whose target is the column at position 3).
# n_jobs=-1 lets scikit-learn use all available processors.
lm = LinearRegression(n_jobs=-1)
lm.fit(x_train, y_train)

In [None]:
# Coefficient of determination (R^2) of the linear model on the test split.
lm.score(x_test, y_test)

In [None]:
#CREATING THE CONFUSION MATRIX

# The regressor outputs continuous values, so predictions and ground truth are
# both binarised at the same cutoff before being compared as class labels.
cutoff = 0.7
y_pred = lm.predict(x_test)

# 1 where the value is strictly above the cutoff, 0 otherwise; both label
# arrays keep y_pred's dtype.
y_pred_classes = (y_pred > cutoff).astype(y_pred.dtype)
y_test_classes = (y_test > cutoff).astype(y_pred.dtype)

# Rows are the thresholded true labels, columns the thresholded predictions.
cm = confusion_matrix(y_test_classes, y_pred_classes)
print(cm)

In [None]:
# Classification report and accuracy for the thresholded labels of the
# experiment whose target is the column at position 3. The accuracy is kept in
# linearReg_energycol_as because the summary table at the end reads it.
print(classification_report(y_test_classes, y_pred_classes))
linearReg_energycol_as = accuracy_score(y_test_classes, y_pred_classes)
print(linearReg_energycol_as)

# TRYING TO IMPROVE ACCURACY SCORE USING RANDOM FOREST REGRESSOR

In [None]:
#USING RANDOM FOREST REGRESSOR
# Trains on whatever x_train / y_train currently hold; when the notebook is
# run top to bottom these are the PCA-projected features and target of the
# preceding experiment.

from sklearn.ensemble import RandomForestRegressor

# 20 trees; the fixed random_state makes the fitted forest reproducible.
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(x_train, y_train)
# Continuous predictions for the test split; thresholded in a later cell.
y_pred = regressor.predict(x_test)


In [None]:
# Coefficient of determination (R^2) of the random forest on the test split.
regressor.score(x_test, y_test)

In [None]:
# Binarise the continuous predictions already stored in y_pred and the ground
# truth at the same cutoff, then compare them as class labels.
cutoff = 0.7

# 1 where the value is strictly above the cutoff, 0 otherwise; both label
# arrays keep y_pred's dtype.
y_pred_classes = (y_pred > cutoff).astype(y_pred.dtype)
y_test_classes = (y_test > cutoff).astype(y_pred.dtype)

# Rows are the thresholded true labels, columns the thresholded predictions.
cm = confusion_matrix(y_test_classes, y_pred_classes)
print(cm)

In [None]:
# Classification report and accuracy for the thresholded random-forest
# predictions. The accuracy is kept in random_forest_as because the summary
# table at the end of the notebook reads it.
print(classification_report(y_test_classes, y_pred_classes))
random_forest_as = accuracy_score(y_test_classes, y_pred_classes)
print(random_forest_as)

# TRYING TO IMPROVE ACCURACY SCORE USING DECISION TREE REGRESSOR

In [None]:
# import the regressor 
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Decision tree regressor with a fixed random_state for reproducible fits.
DT_regressor = DecisionTreeRegressor(random_state = 0)

In [None]:
# Fit the regressor on the current training split (the same x_train / y_train
# the preceding models used when the notebook is run top to bottom).
DT_regressor.fit(x_train, y_train)

In [None]:
# Continuous predictions for the test split; overwrites the previous y_pred.
y_pred = DT_regressor.predict(x_test)

In [None]:
# Coefficient of determination (R^2) of the decision tree on the test split.
DT_regressor.score(x_test, y_test)

In [None]:
# Binarise the continuous predictions already stored in y_pred and the ground
# truth at the same cutoff, then compare them as class labels.
cutoff = 0.7

# 1 where the value is strictly above the cutoff, 0 otherwise; both label
# arrays keep y_pred's dtype.
y_pred_classes = (y_pred > cutoff).astype(y_pred.dtype)
y_test_classes = (y_test > cutoff).astype(y_pred.dtype)

# Rows are the thresholded true labels, columns the thresholded predictions.
cm = confusion_matrix(y_test_classes, y_pred_classes)
print(cm)

In [None]:
# Classification report and accuracy for the thresholded decision-tree
# predictions. The accuracy is kept in decisionTree_AS because the summary
# table at the end of the notebook reads it.
print(classification_report(y_test_classes, y_pred_classes))
decisionTree_AS = accuracy_score(y_test_classes, y_pred_classes)
print(decisionTree_AS)

In [None]:
# Summary table: one row per experiment, pairing a display label with the
# accuracy score stored by that experiment's cell
# (linearReg_valence_as, linearReg_withPercent_as, linearReg_energycol_as,
# random_forest_as, decisionTree_AS).

import plotly.graph_objects as go

# First table column: the experiment labels, in the order the experiments
# were run.
model_labels = [
    'LINEAR REGRESSION(VALENCE COLUMN)',
    'LINEAR REGRESSION(USING PERCENT VARIANCE)',
    'LINEAR REGRESSION(ENERGY COLUMN)',
    'RANDOM FOREST REGRESSOR',
    'DECISION TREE REGRESSOR',
]
# Second table column: the matching accuracy scores, in the same order.
accuracy_values = [
    linearReg_valence_as,
    linearReg_withPercent_as,
    linearReg_energycol_as,
    random_forest_as,
    decisionTree_AS,
]

summary_table = go.Table(
    header={'values': ['Model', 'Accuracy Scores']},
    cells={'values': [model_labels, accuracy_values]},
)
fig = go.Figure(data=[summary_table])
fig.show()
