In [52]:
import time
import numpy as np
import pickle
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [53]:
df = pd.read_csv('../data/finalDataSet2018to2021.csv')
df.head()

Unnamed: 0,song,performer,chart_position,previous_position,peak,weeks_on_chart,hitTF,id,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Dance Monkey,Tones And I,11.0,19.0,11.0,8.0,1.0,2XU0oxnq2qxCpomAAuJY8K,0.824,0.588,...,-6.4,0.0,0.0924,0.692,0.000104,0.149,0.513,98.027,209438.0,4.0
1,Mine,Bazzi,56.0,0.0,56.0,1.0,1.0,7uzmGiiJyRfuViKKK3lVmR,0.71,0.789,...,-3.874,1.0,0.0722,0.0161,3e-06,0.451,0.717,142.929,131064.0,4.0
2,Final Fantasy,Drake,56.0,0.0,56.0,1.0,1.0,44Du2IM1bGY7dicmLfXbUs,0.5,0.449,...,-10.977,1.0,0.442,0.422,8e-06,0.115,0.104,144.206,219960.0,1.0
3,Hear Me Calling,Juice WRLD,56.0,0.0,56.0,1.0,1.0,13ZyrkCDmRz5xY3seuAWYk,0.699,0.687,...,-3.997,0.0,0.106,0.308,3.6e-05,0.121,0.499,88.932,189977.0,4.0
4,Liar,Camila Cabello,56.0,0.0,56.0,1.0,1.0,7LzouaWGFCy4tkXDOOnEyM,0.74,0.498,...,-6.684,0.0,0.0456,0.0169,0.00282,0.319,0.652,98.016,207039.0,4.0


In [54]:
X = df.loc[:, df.columns != 'hitTF'].select_dtypes(['int', 'float']).values
y = df.hitTF.values

In [55]:
X

array([[1.10000e+01, 1.90000e+01, 1.10000e+01, ..., 9.80270e+01,
        2.09438e+05, 4.00000e+00],
       [5.60000e+01, 0.00000e+00, 5.60000e+01, ..., 1.42929e+02,
        1.31064e+05, 4.00000e+00],
       [5.60000e+01, 0.00000e+00, 5.60000e+01, ..., 1.44206e+02,
        2.19960e+05, 1.00000e+00],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 9.89520e+01,
        1.67290e+05, 4.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 9.69790e+01,
        1.87720e+05, 4.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00098e+02,
        2.23450e+05, 4.00000e+00]])

In [56]:
def LogRegModel(X, y, size=None):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=size + 0.05)
    
    X_scaler = MinMaxScaler().fit(X_train)

    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    X_val_scaled = X_scaler.transform(X_val)
    
    LogReg = LogisticRegression()
    
    t = time.time()
    LogReg.fit(X_train_scaled, y_train)
    elapsed_time = time.time() - t
    print(elapsed_time)
    
    print(f"Training Data Score: {LogReg.score(X_train_scaled, y_train)}")
    print(f"Testing Data Score: {LogReg.score(X_test_scaled, y_test)}", '\n')
    
    predictions = LogReg.predict(X_val_scaled)
    
    print(classification_report(y_val, predictions, target_names=['not hit','hit']), '\n')
    
    pickle.dump(LogReg, open('../deployment/models/LogReg.sav', 'wb'))
    pickle.dump(X_scaler, open('../deployment/models/scaler.gz', 'wb'))
    return LogReg.predict(X_train_scaled), LogReg.predict(X_test_scaled), LogReg, X_scaler

In [67]:
s1, s2, model, scaler = LogRegModel(X, y, 0.2)

0.15213489532470703
Training Data Score: 0.9840785907859079
Testing Data Score: 0.9847715736040609 

              precision    recall  f1-score   support

     not hit       0.99      1.00      0.99       855
         hit       1.00      0.90      0.95       130

    accuracy                           0.99       985
   macro avg       0.99      0.95      0.97       985
weighted avg       0.99      0.99      0.99       985
 



In [74]:
drake_song = df.loc[:, df.columns != 'hitTF'].select_dtypes(['int', 'float']).values[3]

In [75]:
ds = scaler.transform([drake_song])

In [76]:
model.predict_proba(ds)

array([[0.00911754, 0.99088246]])

In [77]:
ds

array([[5.60000000e-01, 0.00000000e+00, 5.60000000e-01, 2.04081633e-02,
        7.13993871e-01, 6.86993740e-01, 6.36363636e-01, 9.04814024e-01,
        0.00000000e+00, 1.10994764e-01, 3.09546962e-01, 3.63000000e-05,
        1.03092784e-01, 5.06598985e-01, 4.31893236e-01, 4.98559425e-02,
        8.00000000e-01]])

In [78]:
model.coef_

array([[ 8.80994533,  3.10385754,  6.46755897,  3.72406671,  1.3569274 ,
        -1.48422635, -0.09288073,  0.49384728,  0.34910384,  1.1056262 ,
        -1.17512023, -1.52573595, -0.36707238, -0.48872859, -0.36756887,
        -0.39177874, -0.02068741]])

In [79]:
dance_monkey = df.loc[:, df.columns != 'hitTF'].select_dtypes(['int', 'float']).values[0]

In [80]:
dm = scaler.transform([dance_monkey])

In [81]:
model.predict_proba(dm)

array([[0.77852337, 0.22147663]])