
<a id = '4.0'></a>
<p style = "font-size : 35px; color : #34656d ;  text-align : center; background-color : #08f9f9; border-radius: 5px 5px;"><strong>Logistic Regression (Diabetes)</strong></p> 

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Default size (in inches) for every matplotlib/seaborn figure in the notebook.
plt.rcParams["figure.figsize"] = (10, 6)

# Suppress all Python warnings for cleaner cell output.
# NOTE: this also hides any convergence / failed-fit warnings raised by later cells.
warnings.filterwarnings("ignore")

# Render floats with three decimals in DataFrame / Series output.
pd.set_option("display.float_format", "{:.3f}".format)

In [None]:
# Load the dataset from `diabetes.csv` (path is relative to the notebook's working directory).
df=pd.read_csv("diabetes.csv")

In [None]:
df.head()

##### DiabetesPedigreeFunction: diabetes history in relatives
##### BMI: Body mass index

In [None]:
df.shape

## Exploratory Data Analysis and Visualization

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
df.Outcome.value_counts()

In [None]:
# Class balance of the target. The column is passed by keyword because, from
# seaborn 0.12 on, the only positional argument of the plotting functions is `data`.
sns.countplot(x="Outcome", data=df);

In [None]:
# Horizontal boxplot of Pregnancies to spot outliers. `x=` is explicit because
# seaborn 0.12+ reserves the positional slot for `data`.
sns.boxplot(x="Pregnancies", data=df);

In [None]:
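# NOTE(review): this outlier filter is disabled, so rows with Pregnancies >= 13 are kept
# and the next boxplot is identical to the previous one. Enable it or remove both cells.
#df=df[df.Pregnancies<13]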

In [None]:
# Pregnancies again; the filter in the preceding cell is commented out, so the data is unchanged.
# `x=` is explicit because seaborn 0.12+ reserves the positional slot for `data`.
sns.boxplot(x="Pregnancies", data=df);

In [None]:
# Horizontal boxplot of SkinThickness to spot outliers (keyword `x=` for seaborn 0.12+).
sns.boxplot(x="SkinThickness", data=df);

In [None]:
# Keep only the rows whose SkinThickness is below 70.
df = df.loc[df["SkinThickness"] < 70]

In [None]:
# SkinThickness after dropping rows with values of 70 or more (keyword `x=` for seaborn 0.12+).
sns.boxplot(x="SkinThickness", data=df);

In [None]:
# Horizontal boxplot of Insulin; no filtering is applied to this column afterwards.
# `x=` is explicit because seaborn 0.12+ reserves the positional slot for `data`.
sns.boxplot(x="Insulin", data=df);

In [None]:
# Horizontal boxplot of Glucose to spot outliers (keyword `x=` for seaborn 0.12+).
sns.boxplot(x="Glucose", data=df);

In [None]:
# Drop rows with a Glucose reading of zero or less
# (presumably zero encodes a missing measurement; verify against the data source).
df = df.loc[df["Glucose"] > 0]

In [None]:
# Glucose after removing the non-positive readings (keyword `x=` for seaborn 0.12+).
sns.boxplot(x="Glucose", data=df);

In [None]:
# Horizontal boxplot of BloodPressure to spot outliers (keyword `x=` for seaborn 0.12+).
sns.boxplot(x="BloodPressure", data=df);

In [None]:
# Keep only the rows whose BloodPressure is above 35.
df = df.loc[df["BloodPressure"] > 35]

In [None]:
# BloodPressure after dropping rows with values of 35 or less (keyword `x=` for seaborn 0.12+).
sns.boxplot(x="BloodPressure", data=df);

In [None]:
# Horizontal boxplot of BMI to spot outliers (keyword `x=` for seaborn 0.12+).
sns.boxplot(x="BMI", data=df);

In [None]:
# Drop rows with a BMI of zero or less
# (presumably zero encodes a missing measurement; verify against the data source).
df = df.loc[df["BMI"] > 0]

In [None]:
# BMI after removing the non-positive values (keyword `x=` for seaborn 0.12+).
sns.boxplot(x="BMI", data=df);

In [None]:
df.shape

In [None]:
df.Outcome.value_counts()

In [None]:
# One boxplot per feature, split by Outcome, laid out on a 3x3 grid.
features = [column for column in df.columns if column != "Outcome"]

plt.figure(figsize=(20, 20))
for index, feature in enumerate(features, start=1):
    plt.subplot(3, 3, index)
    sns.boxplot(x="Outcome", y=feature, data=df)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True);

In [None]:
# Correlation of every feature with Outcome (the target's self-correlation is dropped),
# sorted ascending and drawn as horizontal bars.
outcome_corr = df.corr()["Outcome"].drop("Outcome")
outcome_corr.sort_values().plot.barh();

In [None]:
sns.pairplot(df, hue = "Outcome");

## Train | Test Split and Scaling

In [None]:
# Feature matrix: every column except the target. Target vector: Outcome.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 20% of the rows for testing. `stratify=y` keeps the Outcome class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

In [None]:
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both splits so the test set does not influence the scaler.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Modelling

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
log_model=LogisticRegression()

In [None]:
# Fit the default-parameter logistic regression on the scaled training data.
log_model.fit(X_train_scaled, y_train)

In [None]:
# Predicted class labels for the scaled test set.
y_pred=log_model.predict(X_test_scaled)

In [None]:
# Per-class predicted probabilities for the scaled test set; column 1 is read as the
# class-1 probability in the next cell.
y_pred_proba = log_model.predict_proba(X_test_scaled)

In [None]:
# Unscaled test features and true labels side by side with the predicted label
# and the predicted probability of class 1.
test_data = pd.concat([X_test, y_test], axis=1).assign(
    pred=y_pred,
    pred_proba=y_pred_proba[:, 1],
)
# Seed the sample so the displayed rows are the same on every run.
test_data.sample(10, random_state=42)

## Model Performance on Classification Tasks

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print the confusion matrix and classification report for both splits.

    The test-set report is printed first, followed by a blank line and the
    training-set report.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : training features and true labels.
    X_test, y_test : test features and true labels.
    """
    # Both predictions are computed up front, before anything is printed.
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    sections = (
        ("Test_Set", y_test, test_predictions),
        ("Train_Set", y_train, train_predictions),
    )
    for position, (title, y_true, y_hat) in enumerate(sections):
        if position > 0:
            print()  # blank separator between the two reports
        print(title)
        print(confusion_matrix(y_true, y_hat))
        print(classification_report(y_true, y_hat))

In [None]:
# Test and train reports for the baseline logistic regression.
eval_metric(log_model, X_train_scaled, y_train, X_test_scaled, y_test)

## Cross Validate

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# 10-fold cross-validation of a fresh, unfitted model on the training data.
cv_metrics = ['precision', 'recall', 'f1', 'accuracy']
model = LogisticRegression()

scores = cross_validate(
    model,
    X_train_scaled,
    y_train,
    scoring=cv_metrics,
    cv=10,
)

# One row per fold, numbered 1..10.
df_scores = pd.DataFrame(scores, index=range(1, 11))
df_scores

In [None]:
# Fold-averaged scores; the first two entries are skipped positionally
# (cross_validate reports fit_time and score_time before the metric columns).
df_scores.mean().iloc[2:]

In [None]:
# Re-print the reports of `log_model` (same call as before the cross-validation)
# so they can be compared with the fold averages above.
eval_metric(log_model, X_train_scaled, y_train, X_test_scaled, y_test)

## Cross Validate for 0 class

In [None]:
import sklearn.metrics

# `sklearn.metrics.SCORERS` was deprecated in scikit-learn 1.1 and removed in 1.3.
# Use `get_scorer_names()` when the installed version provides it, and fall back
# to the legacy dictionary otherwise.
if hasattr(sklearn.metrics, "get_scorer_names"):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sorted(sklearn.metrics.SCORERS)
scorer_names

In [None]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)

# Scorers that treat class 0 (instead of the default class 1) as the positive label.
precision_0 = make_scorer(precision_score, pos_label=0)
recall_0 = make_scorer(recall_score, pos_label=0)
f1_0 = make_scorer(f1_score, pos_label=0)

In [None]:
# 10-fold cross-validation again, this time scored with the class-0 scorers.
class0_scoring = {
    "precision_0": precision_0,
    "recall_0": recall_0,
    "f1_0": f1_0,
}
model = LogisticRegression()

scores = cross_validate(
    model,
    X_train_scaled,
    y_train,
    scoring=class0_scoring,
    cv=10,
)

# One row per fold, numbered 1..10.
df_scores = pd.DataFrame(scores, index=range(1, 11))
df_scores

In [None]:
# Fold-averaged class-0 scores; the first two entries are skipped positionally
# (cross_validate reports fit_time and score_time before the metric columns).
df_scores.mean().iloc[2:]

In [None]:
# Reports of `log_model` once more (identical call to the earlier cells); the report
# lists class 0 as well, for comparison with the class-0 fold averages above.
eval_metric(log_model, X_train_scaled, y_train, X_test_scaled, y_test)

## GridSearchCV

In [None]:
import sklearn.metrics

# `sklearn.metrics.SCORERS` was deprecated in scikit-learn 1.1 and removed in 1.3.
# Use `get_scorer_names()` when the installed version provides it, and fall back
# to the legacy dictionary otherwise.
if hasattr(sklearn.metrics, "get_scorer_names"):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sorted(sklearn.metrics.SCORERS)
scorer_names

In [None]:
from sklearn.model_selection import GridSearchCV

model = LogisticRegression()

C = np.logspace(-1, 5, 20)
# The "balanced" mode uses the values of y to automatically adjust weights inversely
# proportional to class frequencies in the input data.
class_weight = ["balanced", None]

# Not every solver supports every penalty: "lbfgs" and "sag" only handle "l2".
# A single crossed grid would schedule l1+lbfgs and l1+sag fits that can only fail,
# so the grid is split per penalty and lists only the supported solvers.
l1_solvers = ["liblinear", "saga"]
l2_solvers = ["lbfgs", "liblinear", "sag", "saga"]

param_grid = [
    {"penalty": ["l1"], "solver": l1_solvers, "C": C, "class_weight": class_weight},
    {"penalty": ["l2"], "solver": l2_solvers, "C": C, "class_weight": class_weight},
]

# Candidates are ranked by recall of class 1 over 10 folds. A custom scorer such as
# make_scorer(f1_score, pos_label=0) can be passed instead to optimise for class 0.
grid_model = GridSearchCV(estimator=model,
                          param_grid=param_grid,
                          cv=10,
                          scoring="recall",
                          n_jobs=-1)

In [None]:
# Run the search on the scaled training data.
# NOTE(review): X_train_scaled was scaled with statistics from the whole training set,
# so each CV fold's validation part has already influenced the scaler.
grid_model.fit(X_train_scaled,y_train)

In [None]:
grid_model.best_params_

In [None]:
# Test and train reports for the grid search result. `refit` is left at its default,
# so predictions come from the best parameter combination refit on the training data.
eval_metric(grid_model, X_train_scaled, y_train, X_test_scaled, y_test)

## ROC (Receiver Operating Characteristic) Curve and AUC (Area Under the Curve)

https://towardsdatascience.com/calculating-and-setting-thresholds-to-optimise-logistic-regression-performance-c77e6d112d7e

In [None]:
from sklearn.metrics import roc_auc_score, auc, roc_curve, average_precision_score, precision_recall_curve

# `plot_roc_curve` and `plot_precision_recall_curve` were deprecated in scikit-learn 1.0
# and removed in 1.2. When they are missing, bind the same names to the
# `Display.from_estimator` constructors, which take the same leading
# (estimator, X, y) arguments, so the plotting cells run unchanged on old and new versions.
try:
    from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve
except ImportError:
    from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay

    plot_roc_curve = RocCurveDisplay.from_estimator
    plot_precision_recall_curve = PrecisionRecallDisplay.from_estimator

In [None]:
# ROC curve of the grid-searched model on the test set.
plot_roc_curve(grid_model, X_test_scaled, y_test);

In [None]:
# Precision-recall curve of the grid-searched model on the test set.
plot_precision_recall_curve(grid_model, X_test_scaled, y_test);

## Finding the Best Decision Threshold (Youden's J on the ROC curve, then maximum F1 on the precision-recall curve)

In [None]:
# ROC curve of the baseline model on the training set; the threshold is searched
# on training data only, keeping the test set untouched.
plot_roc_curve(log_model, X_train_scaled, y_train);

In [None]:
# From here on `y_pred_proba` holds TRAINING-set probabilities of the baseline model.
y_pred_proba = log_model.predict_proba(X_train_scaled)

# ROC AUC computed from the class-1 probability column.
train_class1_proba = y_pred_proba[:, 1]
roc_auc_score(y_train, train_class1_proba)

In [None]:
# ROC operating points (false-positive rate, true-positive rate) and the matching
# thresholds, computed from the class-1 column of `y_pred_proba`.
fp_rate, tp_rate, thresholds = roc_curve(y_train, y_pred_proba[:,1])

In [None]:
# Youden's J statistic (TPR - FPR): choose the threshold where it is largest.
youden_j = tp_rate - fp_rate
optimal_idx = youden_j.argmax()
optimal_threshold = thresholds[optimal_idx]
optimal_threshold

In [None]:
# Precision-recall curve of the baseline model on the training set.
plot_precision_recall_curve(log_model, X_train_scaled, y_train);

In [None]:
# Training-set probabilities of the baseline model (recomputed so the cell runs on its own).
y_pred_proba = log_model.predict_proba(X_train_scaled)

# Average precision computed from the class-1 probability column.
train_class1_proba = y_pred_proba[:, 1]
average_precision_score(y_train, train_class1_proba)

In [None]:
# Precision / recall pairs and the matching thresholds, computed from the class-1
# column of `y_pred_proba`. This rebinds `thresholds`, which previously held the ROC thresholds.
precisions, recalls, thresholds = precision_recall_curve(y_train, y_pred_proba[:,1])

In [None]:
# precision_recall_curve returns one more precision/recall value than thresholds:
# the final (precision=1, recall=0) point has no threshold, so it is excluded to
# keep the chosen index valid for `thresholds`.
candidate_precisions = precisions[:-1]
candidate_recalls = recalls[:-1]

# F1 per threshold. Where precision + recall == 0 the ratio is 0/0; define F1 as 0
# there instead of NaN, because np.argmax would otherwise select the NaN entry.
f1_denominator = candidate_precisions + candidate_recalls
f1_scores = np.divide(
    2 * candidate_precisions * candidate_recalls,
    f1_denominator,
    out=np.zeros_like(f1_denominator, dtype=float),
    where=f1_denominator > 0,
)

optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
optimal_threshold

In [None]:
grid_model.predict_proba(X_test_scaled)[:,1]

In [None]:
# Label a test row as class 1 when its predicted class-1 probability reaches the threshold.
# NOTE(review): `optimal_threshold` was derived from log_model's training-set
# probabilities but is applied to grid_model's probabilities here; confirm this is intended.
test_class1_proba = pd.Series(grid_model.predict_proba(X_test_scaled)[:, 1])
y_pred2 = test_class1_proba.ge(optimal_threshold).astype("int64")

In [None]:
# Test-set confusion matrix and classification report for the custom-threshold predictions.
print(confusion_matrix(y_test,y_pred2))
print(classification_report(y_test,y_pred2))

In [None]:
# Same custom threshold applied to the training rows, to compare against the test-set report.
# NOTE(review): the threshold comes from log_model's probabilities, the predictions from grid_model.
train_class1_proba = pd.Series(grid_model.predict_proba(X_train_scaled)[:, 1])
y_train_pred2 = train_class1_proba.ge(optimal_threshold).astype("int64")

print(confusion_matrix(y_train, y_train_pred2))
print(classification_report(y_train, y_train_pred2))

## Final Model and Model Deployment

In [None]:
# Fit a new scaler on ALL rows of X for the final model. This rebinds `scaler`, which
# until now was the scaler fitted on the training split only, so re-running earlier
# scaling cells after this point would use the full-data scaler.
scaler = StandardScaler().fit(X)

In [None]:
import pickle

# Persist the fitted scaler. The context manager closes the file, so the pickle is
# fully flushed to disk before it is read back later.
# (The file name spelling is kept because the loading cell opens the same name.)
with open("scaler_diabates", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
X_scaled = scaler.transform(X)

In [None]:
# Final model trained on the full scaled data set with fixed hyper-parameters.
# NOTE(review): these values are hard-coded; presumably copied from
# `grid_model.best_params_`. Verify they still match after any re-run of the search.
final_model = LogisticRegression(C= 0.1, class_weight= 'balanced',penalty= 'l1',solver= 'liblinear').fit(X_scaled, y)

In [None]:
# Persist the final model. The context manager closes the file, so the pickle is
# fully flushed to disk before it is read back later.
# (The file name spelling is kept because the loading cell opens the same name.)
with open("final_model_diabates", "wb") as model_file:
    pickle.dump(final_model, model_file)

In [None]:
X.describe().T

## Using the Model

In [None]:
my_dict = {"Pregnancies": [3, 6, 5],
           "Glucose": [117, 140, 120],
           "BloodPressure": [72, 80, 75],
           "SkinThickness": [23, 33, 25],
           "Insulin": [48, 132, 55],
           "BMI": [32, 36.5, 34],
           "DiabetesPedigreeFunction": [0.38, 0.63, 0.45],
           "Age": [29, 40, 33]
          }

In [None]:
sample = pd.DataFrame(my_dict)
sample

In [None]:
# Load the persisted scaler, closing the file handle as soon as it has been read.
# Only unpickle files you created yourself: pickle can execute arbitrary code on load.
with open("scaler_diabates", "rb") as scaler_file:
    scaler_diabates = pickle.load(scaler_file)

In [None]:
# Scale only the feature columns defined in `my_dict`, in that order. A later cell
# adds prediction columns to `sample`; selecting the features explicitly keeps this
# cell re-runnable instead of failing on the extra columns.
feature_columns = list(my_dict)
sample_scaled = scaler_diabates.transform(sample[feature_columns])
sample_scaled

In [None]:
# Load the persisted model, closing the file handle as soon as it has been read.
# Only unpickle files you created yourself: pickle can execute arbitrary code on load.
with open("final_model_diabates", "rb") as model_file:
    final_model = pickle.load(model_file)

In [None]:
# Default-threshold labels and per-class probabilities for the scaled sample rows.
predictions_proba = final_model.predict_proba(sample_scaled)
predictions = final_model.predict(sample_scaled)

# Labels under the custom threshold, as a plain list of 0/1 integers.
# NOTE(review): `optimal_threshold` comes from earlier notebook state; it is not
# stored alongside the pickled scaler and model.
sample_class1_proba = predictions_proba[:, 1]
predictions2 = (sample_class1_proba >= optimal_threshold).astype(int).tolist()

In [None]:
# Attach the model outputs to the sample rows for side-by-side inspection.
sample["pred_proba"] = predictions_proba[:,1]
sample["pred_0.50"] = predictions
# Name the column after the threshold actually used; a hard-coded "0.34" label
# becomes wrong as soon as the computed threshold changes.
sample[f"pred_{optimal_threshold:.2f}"] = predictions2
sample