In [17]:
# Import the dependencies

from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine, text

In [18]:
# Relative path to the SQLite database file that holds the Billboard data.
# Being relative, it is resolved against the notebook's working directory.

database_path = Path("Billboard.db")

In [19]:
# Build a SQLAlchemy engine for the SQLite file and open a single connection
# that the query cell reuses.
# NOTE: nothing in the visible cells closes this connection.

database_url = "sqlite:///" + str(database_path)
engine = create_engine(database_url)
conn = engine.connect()

In [20]:
# Load every row of the Billboard_Data table into a DataFrame and preview it

billboard_query = "SELECT * FROM Billboard_Data"
df = pd.read_sql(billboard_query, conn)
df.head()

Unnamed: 0,SongID,Performer,Song,duration_ms,explicit,danceability,energy,key,loudness,mode,...,time_signature,genre,duration_m,Month,Day,Year,Overall Peak,Total Weeks on Chart,Total Instances,Top 20
0,......And Roses And Rosesandy Williams,Andy Williams,......And Roses And Roses,166106,False,0.154,0.185,5,-14.063,1,...,4,adult standards,2.768433,5,8,1965,36,7,1,0
1,...And Then There Were Drumssandy Nelson,Sandy Nelson,...And Then There Were Drums,172066,False,0.588,0.672,11,-17.278,0,...,4,rock,2.867767,10,6,1962,65,4,1,0
2,...Baby One More Timebritney Spears,Britney Spears,...Baby One More Time,211066,False,0.759,0.699,0,-5.745,0,...,4,pop,3.517767,1,30,1999,1,32,1,1
3,...Ready For It?Taylor Swift,Taylor Swift,...Ready For It?,208186,False,0.613,0.764,2,-6.509,1,...,4,pop,3.469767,9,23,2017,4,19,1,1
4,'65 Love Affairpaul Davis,Paul Davis,'65 Love Affair,219813,False,0.647,0.686,2,-4.247,0,...,4,rock,3.66355,5,22,1982,6,20,1,1


In [21]:
# Keep only the columns that are used as model features.
# NOTE(review): "Total Weeks on Chart" is a chart-performance figure rather
# than an audio attribute -- confirm it is known at prediction time, otherwise
# it may leak information about the "Top 20" target.

feature_columns = [
    "danceability",
    "valence",
    "speechiness",
    "loudness",
    "Total Weeks on Chart",
    "instrumentalness",
    "liveness",
    "energy",
]
new_df = df[feature_columns]

new_df

Unnamed: 0,danceability,valence,speechiness,loudness,Total Weeks on Chart,instrumentalness,liveness,energy
0,0.154,0.150,0.0315,-14.063,7,0.000267,0.1120,0.185
1,0.588,0.801,0.0361,-17.278,4,0.745000,0.1450,0.672
2,0.759,0.907,0.0307,-5.745,32,0.000131,0.4430,0.699
3,0.613,0.417,0.1360,-6.509,19,0.000000,0.1970,0.764
4,0.647,0.952,0.0274,-4.247,20,0.000006,0.1330,0.686
...,...,...,...,...,...,...,...,...
22622,0.393,0.927,0.0267,-5.986,7,0.000000,0.0479,0.594
22623,0.448,0.190,0.0319,-3.244,18,0.000000,0.1170,0.826
22624,0.852,0.627,0.4260,-7.673,1,0.000000,0.2630,0.438
22625,0.531,0.192,0.3230,-12.702,12,0.279000,0.0584,0.642


In [22]:
# Feature matrix: the reduced frame of selected columns
X = new_df

# Target vector: the 0/1 "Top 20" flag from the full frame
y = df["Top 20"]

In [23]:
# Preview the first rows of the target ("Top 20") to confirm it is a 0/1 label

y.head()

0    0
1    0
2    1
3    1
4    1
Name: Top 20, dtype: int64

In [24]:
# Preview the first rows of the feature matrix to confirm the selected columns

X.head()

Unnamed: 0,danceability,valence,speechiness,loudness,Total Weeks on Chart,instrumentalness,liveness,energy
0,0.154,0.15,0.0315,-14.063,7,0.000267,0.112,0.185
1,0.588,0.801,0.0361,-17.278,4,0.745,0.145,0.672
2,0.759,0.907,0.0307,-5.745,32,0.000131,0.443,0.699
3,0.613,0.417,0.136,-6.509,19,0.0,0.197,0.764
4,0.647,0.952,0.0274,-4.247,20,6e-06,0.133,0.686


In [25]:
# Split features and target into train and test sets.
# random_state=1 makes the shuffle reproducible; no test_size is passed,
# so scikit-learn's default split proportion applies.

train_test_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = train_test_parts

In [26]:
# Baseline classifier: logistic regression trained on the unmodified training split

lr_model = LogisticRegression(random_state=1, solver="lbfgs")

# fit() trains in place and returns the estimator itself, so fitted_model
# and lr_model refer to the same object

lr_model.fit(X_train, y_train)
fitted_model = lr_model

In [27]:
# Predict the "Top 20" label for every row of the test set with the fitted model

prediction = fitted_model.predict(X_test)
print(prediction)

[0 0 0 ... 0 1 0]


In [28]:
# Confusion matrix on the test set (rows: actual class, columns: predicted class)

s_matrix = confusion_matrix(y_true=y_test, y_pred=prediction)
print(s_matrix)

[[3618  433]
 [ 664  942]]


In [29]:
# Per-class precision, recall and F1 on the test set

s_report = classification_report(y_true=y_test, y_pred=prediction)
print(s_report)

              precision    recall  f1-score   support

           0       0.84      0.89      0.87      4051
           1       0.69      0.59      0.63      1606

    accuracy                           0.81      5657
   macro avg       0.77      0.74      0.75      5657
weighted avg       0.80      0.81      0.80      5657



Using Oversampling to test whether the accuracy improves

In [30]:
# Import the RandomOverSampler module from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
random_oversampler = RandomOverSampler(random_state=1)

# # Fit the original training data to the random_oversampler model
random_oversampler.fit(X_train, y_train)

In [31]:
# Create Logistic Regression model

lr_model = LogisticRegression(solver = "lbfgs", random_state = 1)

# Fit the model

fitted_model = lr_model.fit(X_train, y_train)

In [32]:
# Predict the "Top 20" label for the test set with the most recently fitted model

prediction = fitted_model.predict(X_test)
print(prediction)

[0 0 0 ... 0 1 0]


In [33]:
# Confusion matrix for the latest predictions (rows: actual class, columns: predicted class)

s_matrix = confusion_matrix(y_true=y_test, y_pred=prediction)
print(s_matrix)

[[3618  433]
 [ 664  942]]


In [34]:
# Per-class precision, recall and F1 for the latest predictions

s_report = classification_report(y_true=y_test, y_pred=prediction)
print(s_report)

              precision    recall  f1-score   support

           0       0.84      0.89      0.87      4051
           1       0.69      0.59      0.63      1606

    accuracy                           0.81      5657
   macro avg       0.77      0.74      0.75      5657
weighted avg       0.80      0.81      0.80      5657

