In [103]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [104]:
#load final data set as spotify
spotify = pd.read_csv("data.csv")
spotify.head(5)

#calculate the popularity mean so we can make popularity a binary variable in following steps
popularity_mean = np.mean(spotify['popularity'])
popularity_mean

25.693380889849703

In [105]:
#Going to drop the name, relesase date, id since it is not significant to the dependant variable, and we have season in lieu of release date
spotify = spotify.drop(['name', 'artists', 'release_date', 'id'], axis = 1)
#dummy variable for season
season_dum = pd.get_dummies(spotify['Season'])
spotify['Season'] = season_dum

In [106]:
spotify.dtypes

Unnamed: 0            int64
acousticness        float64
danceability        float64
duration_ms           int64
energy              float64
explicit              int64
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
mode                  int64
popularity            int64
speechiness         float64
tempo               float64
valence             float64
year_x                int64
Collaboration         int64
Season                uint8
Name Length           int64
live                  int64
love                  int64
mix                   int64
no                    int64
op                    int64
remast                int64
version               int64
year_y                int64
dtype: object

In [107]:
#dependant variable of the model is to predict the popularity of the artist.
#we set popularity as a binary parameter. Popular = when the popularity of the artist is above the popularity_mean (25.69); Unpopular otherwise.

#popular = np.sum()
#unpopular = np.sum()

In [108]:
#modify dataframe to treat popularity as a binary variable
def popularity(c):
    if c['popularity'] > popularity_mean:
        return 1
    else:
        return 0


In [109]:
#set popularity as a binary column: 1 means popular 0 means unpopular
spotify['popularity'] = spotify.apply(popularity, axis = 1)

In [110]:
#split test and training data 30% test, 70% training set

from sklearn.model_selection import train_test_split

spotify_train, spotify_test = train_test_split(spotify, test_size=0.3, random_state=88)
spotify_train.shape, spotify_test.shape

((122072, 27), (52317, 27))

In [127]:
x_train = spotify_train.drop(['popularity'], axis =1)
y_train = spotify_train[['popularity']]

x_test = spotify_test.drop(['popularity'], axis =1)
y_test = spotify_test[['popularity']]

In [128]:
import statsmodels.api as smf
#building model and fitting data
log_reg = smf.Logit(np.asarray(y_train), np.asarray(x_train)).fit()

#visualize summary. Can drop columns x1, x4, x15 since coef are minimal
print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.513775
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:               122072
Model:                          Logit   Df Residuals:                   122046
Method:                           MLE   Df Model:                           25
Date:                Sun, 02 May 2021   Pseudo R-squ.:                  0.2588
Time:                        18:27:22   Log-Likelihood:                -62718.
converged:                       True   LL-Null:                       -84613.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1         -7.761e-06   1.43e-07    -54.269      0.000   -8.04e-06   -7.48e-06
x2            -2.1979      0.

In [129]:
#define y_hat as the predicted value of y for test dataset
y_hat = log_reg.predict(x_test)
prediction = list(map(round, y_hat))

from sklearn.metrics import (confusion_matrix,
                            accuracy_score)

#confusion matrix
cm = confusion_matrix(y_test, prediction)
print("Confusion matrixx : \n", cm)

Confusion matrixx : 
 [[19141  7019]
 [ 5768 20389]]


In [130]:
#accuracy of the model
print('Test Accuracy = ', accuracy_score(y_test, prediction))

Test Accuracy =  0.7555861383489114
