# Code Snippets

## Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import metrics, tree
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score, precision_recall_curve, precision_score, recall_score

In [None]:
# Distribution of a single column, bucketed into 30 bins.
df['col_name'].plot(kind='hist', bins=30)

In [None]:
# Heatmap of the boolean null mask: one row per record, one column per feature.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Separate the label column from the remaining feature columns.
y = df['target']
X = df.drop(columns=['target'])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

In [None]:
# Logistic-regression classifier created with no arguments (library defaults).
logmodel = LogisticRegression()

In [None]:
# Fit the classifier on the training split only.
logmodel.fit(X_train, y_train)

In [None]:
# NOTE: these are predictions on the TRAINING features, so y_pred lines up
# with y_train (not with y_test).
y_pred = logmodel.predict(X_train)

In [4]:
# Training-set metrics. y_pred holds the predictions made on X_train, so it is
# compared against y_train. sklearn metrics take (y_true, y_pred) in that order.
print('\n TRAINING STATS:')
print('classification accuracy:', metrics.accuracy_score(y_train, y_pred))
# The original passed the function object and the two arrays to print();
# the function has to be called to obtain the matrix.
print('confusion matrix: \n', metrics.confusion_matrix(y_train, y_pred))

In [None]:
# Test-set metrics need predictions made on X_test. y_pred was computed on
# X_train, so scoring it against y_test mixes two different sets of rows.
predictions = logmodel.predict(X_test)

print('\n TESTING STATS:')
print('classification accuracy:', metrics.accuracy_score(y_test, predictions))
print('confusion matrix: \n', metrics.confusion_matrix(y_test, predictions))

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
# The predictions are computed here so the cell does not rely on a name
# defined elsewhere.
predictions = logmodel.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
# Compute the test-set confusion matrix once and reuse it for the plot.
cm = confusion_matrix(y_test, logmodel.predict(X_test))
ConfusionMatrixDisplay(confusion_matrix=cm).plot();

In [None]:
# Hold out 20% of the data as a final test set; it is not touched during the
# parameter search below. The seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state=0)

# KFold yields positional row indices, so index NumPy views of the training
# data. Indexing a DataFrame with X[idx] would select columns, not rows.
X_train_arr = np.asarray(X_train)
y_train_arr = np.asarray(y_train)

# create a 10-fold cross-validation splitter over the training portion
N_FOLDS = 10
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=0)

# search for the tree's max_depth along the following values:
C = np.arange(2, 20)
# acc[i, j] = validation accuracy of depth C[j] on fold i
acc = np.zeros((N_FOLDS, len(C)))

for i, (train_index, val_index) in enumerate(kf.split(X_train_arr)):
    # Fold-local names: the outer X_train / y_train split stays intact.
    X_fit, X_val = X_train_arr[train_index], X_train_arr[val_index]
    y_fit, y_val = y_train_arr[train_index], y_train_arr[val_index]
    for j, depth in enumerate(C):
        dt = tree.DecisionTreeClassifier(
            min_samples_leaf=1,
            max_depth=int(depth))
        dt.fit(X_fit, y_fit)
        y_val_pred = dt.predict(X_val)
        acc[i, j] = metrics.accuracy_score(y_val, y_val_pred)

mean_acc = np.mean(acc, axis=0)
best_index = np.argmax(mean_acc)
print('Mean accuracy:', mean_acc)
print('Selected model index:', best_index)
print('Selected max_depth:', C[best_index])
    
    
# source intro to data science (Igual & Seguí) p. 84

#### Extending to 2+ Features

From a theory standpoint, adding more features to our logistic regression model simply means learning a new coefficient in the linear combination for each feature when training -- from another perspective, it means finding a higher dimensional hyperplane that separates the data as well as possible.

Practically, with **sklearn logistic regression we should make sure to scale our features** prior to fitting, since regularization is applied by default (either scale, or set C very high to effectively switch regularization off). Scaling can be as simple as the snippet below.

source metis lecture on logistic regression

In [None]:
# Standardise the two selected features, then fit and score a default
# logistic regression on the scaled training data.
feature_cols = ['elevation', 'price_per_sqft']

std_scale = StandardScaler()
X_train = train_df[feature_cols]
X_train_scaled = std_scale.fit_transform(X_train)

lm3 = LogisticRegression().fit(X_train_scaled, y_train)

y_predict = lm3.predict(X_train_scaled)
# Last expression: training accuracy, shown as the cell output.
lm3.score(X_train_scaled, y_train)



In [None]:
# Fit with C=0.95 and report accuracy (as a percentage) on both splits.
logit = LogisticRegression(C=0.95).fit(X_train, label_train)

print("The score for logistic regression is")
train_pct = 100 * logit.score(X_train, label_train)
print("Training: {:6.2f}%".format(train_pct))
test_pct = 100 * logit.score(X_test, label_test)
print("Test set: {:6.2f}%".format(test_pct))

# source metis classification_error_metrics_solutions_revised

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split,
# drawn as an annotated heatmap with the class names on both axes and
# written to disk.
logit_confusion = confusion_matrix(label_test, logit.predict(X_test))
species_names = iris_dataset['target_names']

plt.figure(dpi=150)
sns.heatmap(
    logit_confusion,
    annot=True,
    square=True,
    cmap=plt.cm.Blues,
    xticklabels=species_names,
    yticklabels=species_names,
)

plt.title('Logistic regression confusion matrix')
plt.xlabel('Predicted species')
plt.ylabel('Actual species')

plt.savefig("confusion_matrix_logit_iris")

# source metis classification_error_metrics_solutions_revised

In [None]:
def make_confusion_matrix(model, threshold=0.5, X=None, y=None):
    """Plot a 2x2 confusion-matrix heatmap for ``model`` at a probability cutoff.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
        Column 1 of ``predict_proba`` is used as the positive-class probability.
    threshold : float, default 0.5
        A sample is labelled positive when its probability is >= ``threshold``
        (``model.predict`` does the same thing with a threshold of 0.5).
    X, y : optional
        Features and true labels to evaluate on. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used, which is what the
        function always did before these parameters existed.
    """
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    # Predict class 1 if probability of being in class 1 reaches the threshold
    y_predict = (model.predict_proba(X)[:, 1] >= threshold)
    fraud_confusion = confusion_matrix(y, y_predict)
    plt.figure(dpi=80)
    sns.heatmap(fraud_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='d',
           xticklabels=['legit', 'fraud'],
           yticklabels=['legit', 'fraud']);
    plt.xlabel('prediction')
    plt.ylabel('actual')
    
make_confusion_matrix(lm)


# source metis classification_error_metrics_solutions_revised

In [None]:
# Slider-driven view: redraw the confusion matrix for any cutoff between
# 0 and 1 (step 0.02).
from ipywidgets import interactive, FloatSlider


def _confusion_at(threshold):
    return make_confusion_matrix(lm, threshold)


interactive(_confusion_at, threshold=(0.0, 1.0, 0.02))

# source metis classification_error_metrics_solutions_revised

- **Precision:** The fraction of positive predictions you made that were correct. 
  High precision means that if your model predicted a positive case, you can believe it with high confidence. It doesn't tell us how many positive cases we missed (i.e. it doesn't tell us how sure we are about the cases we predicted to be negative).
- **Recall**: The fraction of positive cases you predicted correctly.
  High recall means that you are confident that you didn't miss any positive cases. 
 

## Big takeaways:

* Using the **same** logistic regression model, we can change the threshold to bias toward more precision (making the predicted positives more reliable) or more recall (increasing the fraction of positives found).
* Precision goes down as you decrease the threshold, while recall goes up. This is called the _precision-recall tradeoff_.
* Which is worse, low recall or low precision? Depends on the cost of making the different types of error.
* We just need the final predictions of the model to calculate precision and recall. We can get sklearn.metrics to calculate them for us

In [None]:
# Baseline: plain .predict(), i.e. the default 0.5 cutoff.
y_predict = lm.predict(X_test)
print("Default threshold:")
default_precision = precision_score(y_test, y_predict)
default_recall = recall_score(y_test, y_predict)
print("Precision: {:6.4f},   Recall: {:6.4f}".format(default_precision, default_recall))

In [None]:
# Same metrics with a much lower cutoff: positive when P(class 1) > 0.06.
positive_proba = lm.predict_proba(X_test)[:, 1]
y_predict = (positive_proba > 0.06)
print("Threshold of 0.06:")
low_precision = precision_score(y_test, y_predict)
low_recall = recall_score(y_test, y_predict)
print("Precision: {:6.4f},   Recall: {:6.4f}".format(low_precision, low_recall))

In [None]:
# We can also use the probabilities to make a curve showing us how precision
# and recall trade off as the threshold moves.

precision_curve, recall_curve, threshold_curve = precision_recall_curve(y_test, lm.predict_proba(X_test)[:,1] )

# precision_recall_curve returns one more precision/recall value than
# thresholds: element i belongs to threshold_curve[i], and the extra LAST
# element (precision 1, recall 0) has no threshold. Dropping the last element
# aligns the arrays; dropping the first one shifts every point by one threshold.
plt.figure(dpi=80)
plt.plot(threshold_curve, precision_curve[:-1], label='precision')
plt.plot(threshold_curve, recall_curve[:-1], label='recall')
plt.legend(loc='lower left')
plt.xlabel('Threshold (above this probability, label as fraud)');
plt.title('Precision and Recall Curves');

In [None]:
# Precision against recall. The two arrays are already aligned element by
# element, so the whole curve is plotted. Slicing off the first point would
# discard the lowest-threshold end of the curve for no reason.
plt.figure(dpi=80)
plt.plot(recall_curve, precision_curve, label='precision')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve");

In [None]:
# F1 score at the default cutoff used by .predict(), computed by sklearn.
y_predict = lm.predict(X_test)
f1_score(y_true=y_test, y_pred=y_predict)

In [None]:
# F1 score when a sample is labelled positive for P(class 1) > 0.06.
positive_proba = lm.predict_proba(X_test)[:, 1]
y_predict = (positive_proba > 0.06)
f1_score(y_true=y_test, y_pred=y_predict)

In [None]:
## The ROC curve

# A 50% threshold cutoff is not mandatory. Evaluating the model at many
# different thresholds and plotting every result on one chart gives the ROC
# curve: *true positive rate* on the y axis, *false positive rate* on the x axis.

# Precision            = TP / (TP + FP)
# Recall               = TP / P = true positive rate    (P = all actual positives)
# False positive rate  = FP / N = FP / (FP + TN)        (N = all actual negatives)
from sklearn.metrics import roc_auc_score, roc_curve

positive_proba = lm.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_proba)

In [None]:
# ROC curve, with the dashed diagonal drawn from (0, 0) to (1, 1) as a
# reference line, followed by the area under the curve.
axis_limits = [-0.05, 1.05]

plt.plot(fpr, tpr, lw=2)
plt.plot([0, 1], [0, 1], c='violet', ls='--')
plt.xlim(axis_limits)
plt.ylim(axis_limits)

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for fraud problem');

auc_value = roc_auc_score(y_test, lm.predict_proba(X_test)[:, 1])
print("ROC AUC score = ", auc_value)

In [None]:
# An ROC curve (receiver operating characteristic curve) is a graph showing the performance 
# of a classification model at all classification thresholds. This curve plots two parameters: 
# the True Positive Rate and the False Positive Rate.

#  source https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc


