In [None]:
import pickle


In [None]:
X = pickle.load( open( "save.p", "rb" ) )

In [None]:
X.shape

In [None]:
y = pickle.load( open( "save2.p", "rb" ) )
# favorite_color is now { "lion": "yellow", "kitty": "red" }

In [None]:
y.shape

In [None]:
X_train = pickle.load(open( "X_train.p", "rb" ) )

In [None]:
X_test = pickle.load(open( "X_test.p", "rb" ) )

In [None]:
y_train = pickle.load(open( "y_train.p", "rb" ) )

In [None]:
y_test = pickle.load(open( "y_test", "rb" ) )

In [None]:
X_val = pickle.load(open( "X_val", "rb" ) )

In [None]:
y_val = pickle.load(open( "y_val", "rb" ) )

In [None]:
y_val[57034]

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [None]:
################
## IMBALANCED ##
################

In [None]:
import statsmodels.api as sm

In [None]:
# For this first example, we'll employ statsmodels
lm_1 = sm.Logit(y_train,  # with statsmodels, `y` comes first
                sm.add_constant(X_train[['hour']]))  # and then `x`
lm_1 = lm_1.fit()

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
lm_1.summary()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lm_1 = LogisticRegression(solver='newton-cg',  # For comparison, use the same solver as statsmodels default
                          C=100000)  # No regularization

lm_1.fit(X_train[['hour']], y_train)

In [None]:
print('intercept: ', round(lm_1.intercept_[0], 4))
print('hour coef: ', round(lm_1.coef_[0][0], 4))

In [None]:
df_eval = X_test.copy()
df_eval['pred'] = lm_1.predict(X_test[['hour']])
#df_eval.loc[:, 'pred'] = df_eval['pred'].astype('category')
df_eval['correct_pred'] = df_eval['pred'] == y_test

In [None]:
df_eval.head()

In [None]:
df_eval.loc[df_eval['correct_pred']== False]

In [None]:
# Plot a sample of the evaluation rows: predicted class against hour,
# coloured by whether the prediction was correct.
# The sample is seeded so the figure is the same on every run, and its
# size is capped at the frame length so a small frame does not raise.
sns.stripplot(data=df_eval.sample(min(10000, len(df_eval)), random_state=0),
              x='hour',
              y='pred',
              hue='correct_pred',
              palette={False: '#f03b20', True: '#3182bd'});

In [None]:
lm_1.predict_proba(X_test[['hour']])[:5]

In [None]:
df_eval['probability_fraud'] = lm_1.predict_proba(X_test[['hour']])[:, 1]

In [None]:
g = sns.scatterplot(data=df_eval.sample(10000),
                x='TransactionDT',
                y='probability_fraud',
                hue='correct_pred',
                #y_jitter=0.2,
                #x_jitter=0.2,
                marker='|',
                s=50);

g.legend(loc='right', bbox_to_anchor=(1.25, 0.5), ncol=1)

In [None]:
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve
from sklearn.preprocessing import StandardScaler

In [None]:
# Confusion matrix for the single-feature model: true labels first,
# predictions second. The previous version read df_eval['in_sf'], a
# column that no cell in this notebook creates; the true labels for
# these rows are y_test.
confusion_matrix(y_test, df_eval['pred'])

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:

cm = confusion_matrix(y_test, df_eval['pred'])
cm_display = ConfusionMatrixDisplay(cm).plot()
# The notebook-wide 'whitegrid' style draws grid lines across the matrix
# cells. Calling plt.grid(False) before .plot() has no effect because the
# axes do not exist yet, so switch the grid off on the display's own axes.
cm_display.ax_.grid(False)

In [None]:
# ROC points for the single-feature model; tpr and thresholds are
# reused further down to look up a cut-off.
fpr, tpr, thresholds = roc_curve(y_true=y_test,
                                 y_score=df_eval['probability_fraud'])

In [None]:
def plot_roc(true, probas):
    """Plot the ROC curve for ``probas`` against ``true``, titled with the AUC.

    Parameters
    ----------
    true : array-like
        True labels, passed to ``roc_curve`` and ``roc_auc_score``.
    probas : array-like
        Scores for the positive class, passed to the same two functions.

    The curve is computed from the arguments. Previously the function drew
    the notebook-level ``fpr``/``tpr`` variables, so every call showed the
    first model's curve no matter which scores were passed in.
    """
    fpr, tpr, _ = roc_curve(true, probas)
    auc = roc_auc_score(true, probas)

    plt.plot(fpr, tpr, marker='o')
    plt.xlabel('1 - Specificity (FPR)')
    plt.ylabel('Sensitivity (TPR)')
    plt.title(f"Area Under the ROC Curve: {round(auc, 3)}")

In [None]:
plot_roc(y_test, df_eval['probability_fraud'])

In [None]:
mask = tpr > 0.9
thresholds[mask].max()

In [None]:
#############################
## Multiple Log Regression ##
#############################

In [None]:
sns.scatterplot(data=X_train.sample(50000),
                x='hour',
                y='TransactionAmt',
                hue=y_train,
               alpha = .4);

In [None]:
sns.scatterplot(data=X_train.sample(50000),
                x='card6_credit',
                y='TransactionAmt',
                hue=y_train,
               alpha = .4);

In [None]:
features = ['TransactionAmt', 'hour']

# Since we're using more than one feature, let's scale our features
scaler = StandardScaler()



In [None]:
X_train = scaler.fit_transform(X_train[features])
y_train = y_train

In [None]:
X_train[:4]

In [None]:
lm_2 = LogisticRegression()  # We'll also regularize our features

In [None]:
lm_2.fit(X_train, y_train)

In [None]:
X_test = scaler.transform(X_test[features])
preds = lm_2.predict(X_test)

In [None]:
confusion_matrix(y_test, 
                 preds)

In [None]:
plot_roc(y_test, lm_2.predict_proba(X_test)[:, 1])

In [None]:
#####################################
## Multi-Class Logistic Regression ##
#####################################

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn import datasets


In [None]:
lm_ovr = LogisticRegression(solver='newton-cg', multi_class='ovr')
lm_mn = LogisticRegression(solver='newton-cg', multi_class='multinomial')

In [None]:
print(X_train.shape)
print(y_train.shape)

In [None]:
lm_ovr.fit(X_train, y_train)
lm_mn.fit(X_train, y_train)

In [None]:
preds_ovr = lm_ovr.predict(X_test)
preds_mn = lm_mn.predict(X_test)

In [None]:
confusion_matrix(y_test, 
                 preds_ovr)

In [None]:
confusion_matrix(y_test, 
                 preds_mn)

In [None]:
preds_proba_ovr = lm_ovr.predict_proba(X_test)
preds_proba_mn = lm_mn.predict_proba(X_test)

In [None]:
def get_multiclass_aucs(labels, scores, name='One-vs-Rest', kind='ovr'):
    """Print the average ROC AUC and return the per-class ROC AUCs.

    Parameters
    ----------
    labels : array-like
        True class labels. 1-D input (a Series or flat array) is reshaped
        to a single column, because OneHotEncoder only accepts 2-D input.
    scores : array-like
        Scores passed straight to ``roc_auc_score``
        (presumably ``predict_proba`` output, one column per class — confirm).
    name : str
        Currently unused; kept so existing calls keep working.
    kind : str
        Forwarded to ``roc_auc_score`` as ``multi_class``.

    Returns
    -------
    dict
        Maps each one-hot column position to the AUC of that column.
    """
    labels_arr = np.asarray(labels)
    if labels_arr.ndim == 1:
        # One sample per row, a single label column.
        labels_arr = labels_arr.reshape(-1, 1)

    labels_ohe = OneHotEncoder().fit_transform(labels_arr).toarray()

    print(f'Average: {roc_auc_score(labels_ohe, scores, multi_class=kind)}')

    # average=None returns one score per one-hot column instead of a single mean.
    per_class = roc_auc_score(labels_ohe, scores, multi_class=kind, average=None)
    return {i: s for i, s in enumerate(per_class)}