In [1]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

### Import the dataset

In [3]:
# Load the diabetes dataset from a CSV file (path is relative to the
# working directory) and preview its first five rows.
DATASET_CSV = "pima_indian_diabetes.csv"
pima = pd.read_csv(DATASET_CSV)
pima.head(n=5)

Unnamed: 0,No_Times_Pregnant,Plasma_Glucose,Diastolic_BP,Triceps,Insulin,BMI,Age,Diabetes
0,1,89,66,23,94,28.1,21,0
1,0,137,40,35,168,43.1,33,1
2,3,78,50,32,88,31.0,26,1
3,2,197,70,45,543,30.5,53,1
4,1,189,60,23,846,30.1,59,1


### Train/Test Split

In [4]:
# Every column except the last one is used as a feature; the `Diabetes`
# column is the target passed to the classifier.
features = pima.iloc[:, :-1]
target = pima.Diabetes

# 70% train / 30% test, with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

### Model Training

In [5]:
# Probability cut-off: a sample is labelled 1 only when the predicted
# probability in column 1 of `predict_proba` is strictly above this value.
THRESHOLD = 0.65


def _predict_at_threshold(model, features, threshold):
    """Return `(pred_prob, pred)` for `features` using a custom probability cut-off.

    Parameters
    ----------
    model : fitted estimator exposing `predict_proba`.
    features : feature matrix passed unchanged to `model.predict_proba`.
    threshold : float; rows whose column-1 probability is strictly greater
        than this value are labelled 1, all others 0.

    Returns
    -------
    pred_prob : the full array returned by `predict_proba`.
    pred : array of 0/1 labels derived from `pred_prob[:, 1]`.
    """
    pred_prob = model.predict_proba(features)
    pred = np.where(pred_prob[:, 1] > threshold, 1, 0)
    return pred_prob, pred


def _accuracy_percent(y_true, y_pred):
    """Return the accuracy of `y_pred` against `y_true` as a string such as '78.1%'."""
    return str(round(accuracy_score(y_true, y_pred) * 100, 2)) + "%"


# Standardize the features, then fit an L1-regularized logistic regression.
# `random_state` is pinned (same seed as the train/test split) because the
# liblinear solver shuffles the data; without it the coefficients and the
# persisted model can differ between runs.
pima_pipe = Pipeline([
    ('feature_scaler', StandardScaler()),
    ('logistic_regression', LogisticRegression(penalty='l1', C=1, solver='liblinear',
                                               random_state=100)),
])
pima_pipe.fit(X_train, y_train)

# One coefficient per feature, labelled with the feature names.
summary = pd.Series(pima_pipe.named_steps['logistic_regression'].coef_[0]).round(6)
summary.index = X_train.columns

# predict with train data
train_pred_prob, train_pred = _predict_at_threshold(pima_pipe, X_train, THRESHOLD)
summary["Train accuracy"] = _accuracy_percent(y_train, train_pred)

# predict with test data
test_pred_prob, test_pred = _predict_at_threshold(pima_pipe, X_test, THRESHOLD)
summary["Test accuracy"] = _accuracy_percent(y_test, test_pred)

# Appending the accuracy strings turns the Series dtype into `object`.
summary

No_Times_Pregnant     0.27306
Plasma_Glucose         1.1809
Diastolic_BP        -0.017993
Triceps              0.125448
Insulin             -0.069581
BMI                  0.408856
Age                  0.329335
Train accuracy          78.1%
Test accuracy          79.66%
dtype: object

### Model Persistence

In [6]:
# Write the fitted pipeline (scaler + classifier) to disk. `joblib.dump`
# returns the list of file names it wrote, which the cell displays.
MODEL_FILE = "diabetes_pipeline.joblib"
joblib.dump(pima_pipe, MODEL_FILE)

['diabetes_pipeline.joblib']