## Baseline Model and Improved Versions

This notebook explores feature importance with a Decision Tree and uses Logistic Regression (LogReg) as a baseline model.
The baseline is then improved by limiting the features, tuning with GridSearchCV, scaling, and SMOTE oversampling. A Gradient Boosting classifier is tried last.

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, precision_score, log_loss, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


# Manual Train test split -- splits session ID in .70 of the session for train and .3 of the session for test.
#Returns a train dataframe and a test
from src.train_test_split_df import *

In [2]:
# Read ../data/tracks_session_clean.csv, using its first column as the DataFrame index.
df = pd.read_csv('../data/tracks_session_clean.csv', index_col=0)

In [None]:
# Print column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Columns removed from every frame before modelling.
drop_col = [
    'session_id',
    'session_length',
    'date',
    'track_id',
    'hist_user_behavior_reason_start',
    'beat_strength',
    'dyn_range_mean',
]


def drop(df):
    """Return a new frame without the columns listed in ``drop_col``.

    The input frame is left untouched. ``DataFrame.drop`` raises ``KeyError``
    when any listed column is absent, e.g. if the frame was already stripped.
    """
    return df.drop(columns=drop_col)

### Decision Tree

In [None]:
# Project helper from src.train_test_split_df; per the import-cell comment it
# splits by session (70% train / 30% test) and returns (train, test) frames.
# Note the mixed casing: `test` here, `Test` once the columns are dropped.
Train, test = train_test_split_df(df)

In [None]:
# Strip the drop_col columns from both splits (Test is built from lower-case `test`).
Train, Test = drop(Train), drop(test)

In [None]:
# Class balance of the target in the training split (fractions, not counts).
Train['skipped'].value_counts(normalize=True)

In [None]:
# Target is the `skipped` column; every other remaining column is a feature.
y_train, y_test = Train['skipped'], Test['skipped']
X_train = Train.drop(columns='skipped')
X_test = Test.drop(columns='skipped')

In [None]:
# Tree with scikit-learn defaults; used as the estimator for the grid search below.
dt = DecisionTreeClassifier()

In [None]:
# Search max_depth 2-10 and min_samples_split 2-4 using GridSearchCV's
# default cross-validation and the estimator's default score.
param_grid = dict(max_depth=range(2, 11), min_samples_split=range(2, 5))
gs = GridSearchCV(estimator=dt, param_grid=param_grid)
gs.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
gs.best_params_

In [None]:
# max_depth=8 is hard-coded -- presumably the depth reported by gs.best_params_
# above; re-check it whenever the grid or the data changes.
# random_state is fixed because the tree permutes features at every split, so
# without a seed the score and the importance ranking below can vary per run.
dt = DecisionTreeClassifier(max_depth=8, random_state=42)

In [None]:
# Fit on the training split and report mean accuracy on the held-out split
# (fit returns the estimator itself, so the calls can be chained).
dt.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# Rank the features by the fitted tree's importance and show the top 15.
features = list(X_train.columns)
important = dt.feature_importances_
important_features = pd.DataFrame({'Important': important}, index=features)
important_features.sort_values(by='Important', ascending=False).head(15)

In [None]:
# Hard class predictions of the depth-limited tree on the test features.
preds= dt.predict(X_test)

In [None]:
# Confusion matrix of the tree's test predictions (rows: true class, columns: predicted).
cm = confusion_matrix(y_true=y_test, y_pred=preds)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()

In [None]:
# Precision for the positive class: of the tracks predicted as skipped, the share that truly were.
precision_score(y_test, preds)

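Most important features for predicting skips (ranked by the decision tree's feature importance; importance does not by itself imply causation):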

- pause_before_play	
- catalog	
- hist_seekback	
- radio	
- editorial_playlist
- hist_shuffle
- hist_seekfwd
- session_position

# Logistic Regression

In [None]:
# Fresh session-level split for the logistic-regression baseline (same project
# helper as in the Decision Tree section; returns train and test frames).
Train, test = train_test_split_df(df)

In [None]:
# Strip the drop_col columns from both splits (Test is built from lower-case `test`).
Train, Test = drop(Train), drop(test)

In [None]:
# Single-feature baseline. scikit-learn expects a 2-D (n_samples, n_features)
# matrix, so the one column is reshaped to (-1, 1).
# .to_numpy() gives a plain ndarray; .array returns a pandas ExtensionArray,
# which pandas defines as a 1-D container, so reshaping it is not a safe route.
X_train = Train['pause_before_play'].to_numpy().reshape(-1, 1)
X_test = Test['pause_before_play'].to_numpy().reshape(-1, 1)
y_train = Train['skipped']
y_test = Test['skipped']

In [None]:
# Logistic regression with scikit-learn defaults; estimator for the grid search below.
logreg = LogisticRegression()

In [None]:
# L2-penalised models over five regularisation strengths and two solvers,
# evaluated with GridSearchCV's default cross-validation and scoring.
param_grid = dict(
    penalty=['l2'],
    C=[.1, 10, 100, 1000, 10000],
    solver=['liblinear', 'newton-cg'],
)
gs = GridSearchCV(estimator=logreg, param_grid=param_grid)

In [None]:
# Run the cross-validated search on the single-feature training data.
gs.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
gs.best_params_

In [None]:
# Hard-coded hyper-parameters -- presumably the ones reported by gs.best_params_
# above; confirm after re-running the search.
logreg = LogisticRegression(penalty='l2', C=.1, solver='liblinear')

In [None]:
# Fit the baseline on the single `pause_before_play` feature.
logreg.fit(X_train, y_train)

In [None]:
# Mean accuracy of the baseline on the held-out split.
logreg.score(X_test, y_test)

In [None]:
# Fitted coefficient(s) of the decision function, on the log-odds scale.
coef = logreg.coef_

In [None]:
# Exponentiating a log-odds coefficient gives the odds ratio for a one-unit
# increase in the feature.
odds = np.exp(coef)
# Displayed as a float array; the result of astype is not stored.
odds.astype(float)

If there is a pause before playing, the odds of the track being skipped are multiplied by about 1.76 (the exponentiated logistic-regression coefficient).

# Logistic Regression version 2

In [None]:
#Train, test = train_test_split_df(df)

In [None]:
#Train = drop(Train)
#Test = drop(test)

In [None]:
#X_train = Train[['pause_before_play', 'session_position', 'catalog', 'hist_shuffle', 'premium']]
#X_test = Test[['pause_before_play', 'session_position', 'catalog', 'hist_shuffle', 'premium']]
#y_train = Train['skipped']
#y_test = Test['skipped']

In [None]:
# Strip the drop_col columns from the full frame; the sections below select
# their features directly from df.
# NOTE(review): this rebinds df, so running the cell a second time without
# reloading the CSV raises KeyError (the columns are already gone).
df = drop(df)

In [None]:
# Reduced feature set for the improved model, plus the binary target.
X = df[['pause_before_play', 'session_position', 'catalog', 'hist_shuffle', 'premium']]
y = df['skipped']

# Seed the split so every score below is reproducible, matching the
# random_state=42 used for SMOTE and the estimator in this section.
# NOTE(review): this is a row-level random split, unlike the session-level
# train_test_split_df used earlier -- rows from one session can presumably end
# up in both train and test; confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Scale first, then oversample: SMOTE synthesises minority samples from nearest
# neighbours, so its distances should be computed on standardised features.
# Step names are unchanged, so the 'logreg__*' grid keys still apply.
scaled_pipeline = Pipeline([('scaler', StandardScaler()),
                            ('smote', SMOTE(random_state=42)),
                            ('logreg', LogisticRegression(random_state=42))])

In [None]:
# Fit the untuned pipeline on the training split.
scaled_pipeline.fit(X_train, y_train)

# Mean accuracy on the held-out split (the final estimator's default score).
scaled_pipeline.score(X_test, y_test)

In [None]:
# Hyper-parameter grid for the pipeline's 'logreg' step (keys follow the
# <step>__<parameter> convention): L2 penalty, six C values, three solvers.
grid = [
    dict(
        logreg__C=[0.1, 1, 10, 100, 1000, 10000],
        logreg__solver=['liblinear', 'newton-cg', 'lbfgs'],
        logreg__penalty=['l2'],
    )
]

In [None]:
# 3-fold search over `grid`, selecting the candidate with the best recall.
gridsearch = GridSearchCV(scaled_pipeline, grid, scoring='recall', cv=3)

In [None]:
# Run the search on the training split.
gridsearch.fit(X_train, y_train)

# Because scoring='recall' was passed, this is the test-set recall of the best
# pipeline, not its accuracy.
gridsearch.score(X_test, y_test)

In [None]:
# Pipeline with the best cross-validated recall.
gridsearch.best_estimator_

In [None]:
# Class predictions from the best pipeline found by the search.
y_pred = gridsearch.predict(X_test)

In [None]:
# Test-set recall computed directly; the same metric gridsearch.score reports.
recall_score(y_test, y_pred)

In [None]:
# Confusion matrix of the tuned pipeline's test predictions (rows: true class, columns: predicted).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()

# Gradient Boosting Classifier

In [None]:
# Same reduced feature set and target as the logistic-regression pipeline.
X = df[['pause_before_play', 'session_position', 'catalog', 'hist_shuffle', 'premium']]
y = df['skipped']

# Seed the split so the gradient-boosting scores are reproducible, matching the
# random_state=42 used for SMOTE and the estimator in this section.
# NOTE(review): row-level random split -- rows from one session can presumably
# end up in both train and test; confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Scale first, then oversample: SMOTE synthesises minority samples from nearest
# neighbours, so its distances should be computed on standardised features.
# Step names are unchanged, so the 'boost__*' grid keys still apply.
scaled_pipeline = Pipeline([('scaler', StandardScaler()),
                            ('smote', SMOTE(random_state=42)),
                            ('boost', GradientBoostingClassifier(random_state=42))])

In [None]:
# Fit the untuned gradient-boosting pipeline.
# NOTE(review): no visible cell scores this fit; GridSearchCV below clones and
# refits the pipeline, so this cell may be redundant.
scaled_pipeline.fit(X_train, y_train)

In [None]:
# Hyper-parameter grid for the pipeline's 'boost' step: five tree depths
# crossed with three learning rates.
grid = [
    dict(
        boost__max_depth=[2, 3, 4, 5, 6],
        boost__learning_rate=[.01, .1, 1],
    )
]

In [None]:
# 3-fold search over `grid`, selecting the candidate with the best recall.
gridsearch = GridSearchCV(scaled_pipeline, grid, scoring='recall', cv=3)

In [None]:
# Run the gradient-boosting search on the training split.
gridsearch.fit(X_train, y_train)

In [None]:
# Pipeline with the best cross-validated recall.
gridsearch.best_estimator_

In [None]:
# Because scoring='recall' was passed, this is the test-set recall of the best pipeline.
gridsearch.score(X_test, y_test)