# End to End Machine Learning with Deployment

### Part1- EDA of Medical Dataset 
1. Import the libraries
2. Load the data and view it
3. Clean the data 
4. Perform EDA 

### Part2- Modelling of Dataset 
5. Preprocessing
6. Fitting and Evaluation
7. Optimization
8. Interpretation
9. Model Deployment

In [2]:
!pip install xgboost --quiet

In [3]:
!pip install streamlit --quiet


In [4]:
!pip install imbalanced-learn --quiet

In [5]:
!pip install xgboost --quiet

In [8]:
!pip install scikit-learn


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\praj\\anaconda4\\envs\\demo\\Lib\\site-packages\\~klearn\\decomposition\\_cdnmf_fast.cp38-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp38-cp38-win_amd64.whl (8.3 MB)
     ---------------------------------------- 8.3/8.3 MB 2.8 MB/s eta 0:00:00
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2


### Step1: Import the libraries


In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
plt.style.use('fivethirtyeight')
warnings.filterwarnings("ignore")

# libraries for preprocessing 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# libraries for model fitting
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# libraries for model evaluation
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve

print("All libraries are imported")

AttributeError: module 'matplotlib._api' has no attribute 'define_aliases'

In [None]:
pip uninstall matplotlib

In [None]:
pip install matplotlib

### Step2:Load the data and view it

In [None]:
data=pd.read_csv("data.csv")
data.head(10)

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe().T

In [None]:
# Show the rows for which np.isreal is False in every cell.
# `axis` is passed by keyword: pandas 2.0 made the arguments of
# DataFrame.any() keyword-only, so the positional form `.any(1)` fails there.
data[~data.applymap(np.isreal).any(axis=1)]

**Observations**
1. The dataset has 768 rows and 10 columns
2. The column 'Unnamed: 0' is redundant
3. There are no nulls 
4. However there are 0's present as null which we have to take care of
5. There are no corrupt characters

### Step3: Clean the data 

In [None]:
data.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
def zerofiller(x):
    """Return `x` with every 0 replaced by the median of `x`.

    The median is taken over all values of `x`, zeros included.
    """
    return x.replace(0, x.median())


# Apply the replacement column by column to the columns at positions 1-5.
cols = data.columns[1:6]
data[cols] = data[cols].apply(zerofiller, axis=0)

In [None]:
# encode the categorical outcome variable
# Mapping from the textual outcome labels to the binary target.
d = {"Yes": 1, "No": 0}

# Work on a copy so `data` keeps its original Yes/No labels.
df = data.copy()
df["Outcome"] = df["Outcome"].map(d)

In [None]:
df.head(2)

### Step4: Perform EDA 
1. Univariates - Numerical
2. Univariates - Categorical
3. Bivariate - Categorical vs Numerical
4. Bivariate - Numerical vs Numerical 
5. Multivariate Pairplot
6. Correlations and Heatmap
7. Outcome is a binary categorical variable


In [None]:
def histograms(df):
    """Draw the pandas `DataFrame.hist()` grid for `df` and show it."""
    df.hist()
    # Tighten the layout of the subplot grid before displaying it.
    plt.tight_layout()
    plt.show()

In [None]:
histograms(df)

In [None]:
def barplot(data, feature):
    """Draw a count plot of `feature` and label each bar with its share of rows.

    Parameters
    ----------
    data : DataFrame handed to seaborn's countplot.
    feature : name of the column whose values are counted.
    """
    print("Bar plot of the variable ", feature)
    plt.figure(figsize=(10, 7))
    ax = sns.countplot(data=data, x=feature, color='green')
    # Percentages are relative to the frame that was passed in, not to the
    # notebook-level `df`, so the labels are right for any DataFrame.
    total = len(data)
    for p in ax.patches:
        corners = p.get_bbox().get_points()
        x_centre = corners[:, 0].mean()  # horizontal middle of the bar
        height = corners[1, 1]           # top edge of the bar == the count
        ax.annotate("{:.2g}%".format(100. * height / total), (x_centre, height),
                    ha='center', va='bottom')
    plt.show()

In [None]:
barplot(df, 'Outcome')

In [None]:
def boxplot_histplot(data, feature, bins=None, figsize=(12,7)):
    """Show a boxplot stacked above a histogram of `feature`, sharing the x axis.

    Parameters
    ----------
    data : DataFrame containing `feature`.
    feature : name of the column to plot.
    bins : optional bin specification forwarded to seaborn's histplot; when
        falsy, seaborn's default binning is used.
    figsize : size of the whole figure.

    The histogram also gets a dashed green line at the mean and a solid black
    line at the median of the column.
    """
    print('Boxplot and Histplot for ', feature)
    fig, (ax_box, ax_hist) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    sns.boxplot(data=data, x=feature, color='violet', ax=ax_box, showmeans=True)
    # Forward `bins` only when the caller supplied one. The earlier code called
    # the non-existent `sns.histplotplot` on this path and raised AttributeError.
    hist_kwargs = {"bins": bins} if bins else {}
    sns.histplot(data=data, x=feature, ax=ax_hist, **hist_kwargs)
    ax_hist.axvline(data[feature].mean(), color='green', linestyle='--')
    ax_hist.axvline(data[feature].median(), color='black', linestyle='-')
    plt.show()
    

In [None]:
for col in df.select_dtypes(exclude='O').columns:
    boxplot_histplot(df, col)

**Observations**
1. We see that Pregnancies, Insulin, Dpf and Age are highly right skewed
2. We see that Outcome is highly imbalanced
3. We see that skin thickness, insulin and Dpf have a very high number of outliers 

**Bivariate Analysis**

In [None]:
def catnum(data, feature1, feature2):
    """Bar-chart the mean of `feature2` within each group of `feature1`."""
    print(f"The Bivariate barchart between {feature1} and {feature2}")
    group_means = data.groupby(feature1)[feature2].mean()
    group_means.plot(kind='bar', color='orange')
    plt.show()

In [None]:
for col in df.select_dtypes(exclude='O').columns:
    catnum(df,'Outcome' ,col)

**Observation**
The graphs show that women with a higher number of pregnancies, higher glucose level, higher insulin level, higher dpf and higher age are more likely to be diabetic.

In [None]:
def lineplot_scatterplot(data, feature1, feature2):
    """Plot `feature2` against `feature1` as a line plot and a scatter plot.

    The two charts are drawn side by side in one 16x7 figure: the line plot
    on the left, the scatter plot on the right.
    """
    plt.figure(figsize=(16, 7))
    print(f"Bivariates between {feature1} and {feature2}")
    # (subplot position, seaborn plotting function, colour, title)
    panels = (
        (1, sns.lineplot, 'green', 'Lineplot'),
        (2, sns.scatterplot, 'blue', 'Scatterplot'),
    )
    for position, draw, colour, title in panels:
        plt.subplot(1, 2, position)
        draw(data=data, x=feature1, y=feature2, color=colour)
        plt.title(title)
    plt.show()
    

In [None]:
for col in df.select_dtypes(exclude='O').columns:
    lineplot_scatterplot(data=df,feature1='Age', feature2=col)
    

In [None]:
sns.pairplot(df)

**Observations**
There appears to be multicollinearity between Glucose and Insulin, and between BMI and skin thickness

In [None]:
# Lets look at variables most correlated with Outcome 
df[df.columns[:]].corr()['Outcome']

In [None]:
plt.figure(figsize=(12,7))
sns.heatmap(df.corr(), cmap='Spectral', vmax=+1, vmin=-1, annot=True)

**Observations**
Glucose and BMI are the strongest predictors of Outcome 

Let's create an app to display these charts 

In [None]:
#!pip install streamlit

In [None]:
%%writefile eda.py
import streamlit as st
st.title("The EDA Page")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
st.set_option('deprecation.showPyplotGlobalUse', False)
plt.style.use('fivethirtyeight')

#load and clean the data
data=pd.read_csv("data/data.csv")
data.drop('Unnamed: 0', axis=1, inplace=True)
zerofiller=lambda x:x.replace(0, x.median())
cols=data.columns[1:6]
data[cols]=data[cols].apply(zerofiller, 0)
# encode the categorical outcome variable
df=data.copy()
d={"Yes":1, 'No':0}
df['Outcome']=df['Outcome'].map(d)
def view_data(data):
    """Render the first 10 rows of `data` on the Streamlit page."""
    # Use the argument rather than the module-level `df`, and do not call
    # st.pyplot() here: this view draws no figure, so there is nothing to plot.
    st.write(data.head(10))

def histograms(df):
    """Draw the pandas `DataFrame.hist()` grid for `df` and render it in Streamlit."""
    df.hist()
    # Tighten the layout of the subplot grid before rendering it.
    plt.tight_layout()
    st.pyplot()
    
def barplot(data, feature):
    """Render a count plot of `feature` with each bar labelled by its share of rows.

    Parameters
    ----------
    data : DataFrame handed to seaborn's countplot.
    feature : name of the column whose values are counted.
    """
    print("Bar plot of the variable ", feature)
    plt.figure(figsize=(10, 7))
    ax = sns.countplot(data=data, x=feature, color='green')
    # Percentages are relative to the frame that was passed in, not to the
    # module-level `df`, so the labels are right for any DataFrame.
    total = len(data)
    for p in ax.patches:
        corners = p.get_bbox().get_points()
        x_centre = corners[:, 0].mean()  # horizontal middle of the bar
        height = corners[1, 1]           # top edge of the bar == the count
        ax.annotate("{:.2g}%".format(100. * height / total), (x_centre, height),
                    ha='center', va='bottom')
    st.pyplot()
    
def boxplot_histplot(data, feature, bins=None, figsize=(12,7)):
    """Render a boxplot stacked above a histogram of `feature`, sharing the x axis.

    Parameters
    ----------
    data : DataFrame containing `feature`.
    feature : name of the column to plot.
    bins : optional bin specification forwarded to seaborn's histplot; when
        falsy, seaborn's default binning is used.
    figsize : size of the whole figure.

    The histogram also gets a dashed green line at the mean and a solid black
    line at the median of the column.
    """
    print('Boxplot and Histplot for ', feature)
    fig, (ax_box, ax_hist) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    sns.boxplot(data=data, x=feature, color='violet', ax=ax_box, showmeans=True)
    # Forward `bins` only when the caller supplied one. The earlier code called
    # the non-existent `sns.histplotplot` on this path and raised AttributeError.
    hist_kwargs = {"bins": bins} if bins else {}
    sns.histplot(data=data, x=feature, ax=ax_hist, **hist_kwargs)
    ax_hist.axvline(data[feature].mean(), color='green', linestyle='--')
    ax_hist.axvline(data[feature].median(), color='black', linestyle='-')
    st.pyplot()
    
# Sidebar: pick one view, then press PLOT to render it.
st.sidebar.subheader("Choose the Plot")
plot = st.sidebar.selectbox(
    'plot',
    ('Data', 'Histograms', 'Barchart', 'Boxplot_Scatterplot', 'Correlations'),
)

if st.sidebar.button('PLOT'):
    # The selectbox yields exactly one of the labels above, so the branches
    # are mutually exclusive.
    if plot == 'Data':
        view_data(df)
    elif plot == 'Histograms':
        histograms(df)
    elif plot == 'Barchart':
        barplot(df, feature='Outcome')
    elif plot == 'Boxplot_Scatterplot':
        # One boxplot + histogram figure per non-object column.
        for col in df.select_dtypes(exclude='O').columns:
            boxplot_histplot(df, col)
    elif plot == 'Correlations':
        plt.figure(figsize=(12, 7))
        sns.heatmap(df.corr(), cmap='Spectral', vmax=+1, vmin=-1, annot=True)
        st.pyplot()
        
        
        
        

In [None]:
# Create fn to do preprocessing
def preprocess(data,label):
    """Split `data` into train/test sets and balance the training classes.

    Parameters
    ----------
    data : DataFrame holding the features and the target column.
    label : name of the target column.

    Returns
    -------
    x_train, x_test, y_train, y_test : a 70/30 split (random_state=42) in
        which only the training part has been oversampled with SMOTE.
    """
    # Use the frame that was passed in (the earlier code read the global `df`).
    X = data.drop(label, axis=1)
    y = data[label]
    # Hold out the test set BEFORE resampling. Oversampling first would put
    # synthetic points derived from test rows into the training data and make
    # the test metrics optimistic.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    # Solve the class imbalance on the training portion only.
    sm = SMOTE()
    x_train, y_train = sm.fit_resample(x_train, y_train)
    return x_train, x_test, y_train, y_test

In [None]:
x_train,x_test,y_train,y_test=preprocess(df,'Outcome')

In [None]:
# Scale the features.
# The scaler is fitted on the training split only; the test split is then
# transformed with the statistics learned from the training split.
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [None]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

In [None]:
# The data is ready for modelling

### Step 6: Fit and Evaluate the model

In [None]:
def print_metrics(y_test,y_pred,model_name):
    """Print accuracy, recall, F1, ROC-AUC and the classification report.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels.
    model_name : value printed in the header line to identify the model.
    """
    # (label printed in front of the value, scoring function)
    scorers = (
        ('Accuracy Score=', accuracy_score),
        ('Recall Score=', recall_score),
        ('f1 Score=', f1_score),
        ('Roc Score=', roc_auc_score),
    )
    print('Metrics for the model', model_name)
    for caption, scorer in scorers:
        print('')  # blank separator line ahead of every score
        print(caption, scorer(y_test, y_pred))
    print('Classification Report')
    print(classification_report(y_test, y_pred))

In [None]:
def plot_metrics(clf,x_test,y_test,model_name):
    """Plot the confusion matrix, ROC curve and precision-recall curve of `clf`.

    Parameters
    ----------
    clf : fitted classifier.
    x_test : feature matrix to evaluate on.
    y_test : true labels for `x_test`.
    model_name : value printed in the header line to identify the model.
    """
    # Imported locally so this function does not rely on the `plot_*` helper
    # functions, which were removed in scikit-learn 1.2. The
    # `*Display.from_estimator` constructors exist from scikit-learn 1.0 on.
    from sklearn.metrics import (
        ConfusionMatrixDisplay,
        PrecisionRecallDisplay,
        RocCurveDisplay,
    )

    print('Metrics for the model', model_name)
    # `display_labels` must be passed as a keyword argument; the earlier
    # `display_label[0,1]` was an undefined name and raised NameError.
    ConfusionMatrixDisplay.from_estimator(clf, x_test, y_test, display_labels=[0, 1])
    print('')
    RocCurveDisplay.from_estimator(clf, x_test, y_test)
    print('')
    PrecisionRecallDisplay.from_estimator(clf, x_test, y_test)

In [None]:
# fit and evaluate the model
knn=KNeighborsClassifier()
knn.fit(x_train,y_train)

In [None]:
y_pred=knn.predict(x_test)

In [None]:
# Pass the model's display name; the third argument is only printed in the
# header line, and the estimator object was being passed by mistake.
print_metrics(y_test,y_pred,'KNN')

In [None]:
plot_metrics(knn,x_test,y_test,'KNN')

In [None]:
# Lets tune hyperparameter of knn


# Candidate values of k and one accuracy slot per candidate for each split.
neighbors = np.arange(1, 12)
train_accuracy = np.empty(neighbors.size)
test_accuracy = np.empty(neighbors.size)

# Fit one KNN per candidate k and record its accuracy on both splits.
for slot, n_neighbors in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(x_train, y_train)
    train_accuracy[slot] = knn.score(x_train, y_train)
    test_accuracy[slot] = knn.score(x_test, y_test)

# Plot the model complexity curve: accuracy against k for both splits.
plt.plot(neighbors, train_accuracy, label='Train Accuracy')
plt.plot(neighbors, test_accuracy, label='Test Accuracy')
plt.title('Model Complexity Curves')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Refit the model with n_neighbors = 8
# NOTE(review): 8 was presumably read off the complexity curve above - confirm.

knn = KNeighborsClassifier(n_neighbors = 8)
knn.fit(x_train , y_train)
y_pred = knn.predict(x_test)
print_metrics(y_test , y_pred , 'KNN')

In [None]:
# Let's fit all the models at once and decide which one to optimise

# Candidate classifiers, all with their default hyperparameters.
clfs = {'Logreg' : LogisticRegression(),
       'KNN' : KNeighborsClassifier(),
       'Naive Bayes' : GaussianNB(),
       'Decision Tree' : DecisionTreeClassifier(),
       'RandomForest' : RandomForestClassifier(),
       'AdaBoost' : AdaBoostClassifier(),
       'Gradient Boosting' : GradientBoostingClassifier(),
       'XGBoost' : XGBClassifier(),
       'SVM' : SVC()}

report_columns = ['Model', 'Accuracy', 'Recall', 'Precision', 'F1']
report_rows = []

# Fit every classifier on the training split and score it on the test split.
for clf_name, clf in clfs.items():
    print('Fitting the Model..........', clf_name)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    report_rows.append({
        'Model': clf_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
    })

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0, and appending row by row copies the frame on every iteration.
models_report = pd.DataFrame(report_rows, columns=report_columns)
models_report = models_report.sort_values(by='F1', ascending=False)
models_report





In [None]:
# Baseline random forest with default hyperparameters.
rfc=RandomForestClassifier()
rfc.fit(x_train,y_train)
# Keep the predictions: the earlier code discarded them, so the metrics below
# were computed on whatever `y_pred` an earlier cell had left behind.
y_pred=rfc.predict(x_test)
print_metrics(y_test,y_pred,'RFC')

In [None]:
# Lets optimise the Random Forest
# Candidate hyperparameter values explored for the random forest.
params_grid = {
    'n_estimators': [100, 150, 200],
    'min_samples_leaf': range(1, 5),
    'min_samples_split': range(2, 10, 2),
    'max_depth': [1, 2, 3, 4, 5],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}

# Number of cross-validation folds used for every parameter combination.
n_folds = 3

cv = GridSearchCV(
    estimator=rfc,
    param_grid=params_grid,
    cv=n_folds,
    n_jobs=-1,
    return_train_score=False,
    verbose=3,
)

cv.fit(x_train, y_train)

In [None]:
cv.best_score_

In [None]:
cv.best_estimator_

In [None]:
# Random forest refitted with a fixed set of tuned hyperparameters.
# NOTE(review): these values look hand-copied from the grid-search result -
# confirm they still match `cv.best_estimator_` after re-running the search.
rfc_tuned = RandomForestClassifier(max_depth=5,max_features='log2',min_samples_leaf=4,min_samples_split=8)
rfc_tuned.fit(x_train,y_train)
y_pred = rfc_tuned.predict(x_test)
# The label previously contained a mis-encoded non-breaking space; use a plain space.
print_metrics(y_test,y_pred,'RFC Tuned')

# Model Interpretation

In [None]:
!pip install shap --quiet

In [None]:
import shap
# Feature frame, used only to recover the column names for the plot labels.
X=df.drop('Outcome',axis=1)
value=shap.TreeExplainer(rfc_tuned).shap_values(x_test)
# The SHAP values were computed for x_test, so x_test (not x_train) is the
# feature matrix whose rows correspond to them.
shap.summary_plot(value,x_test,plot_type='bar',feature_names=X.columns)

## Create Pipeline for deployment

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.pipeline import Pipeline

# Take a fresh split from `preprocess`; the pipeline applies the scaling
# itself as its first step.
x_train, x_test, y_train, y_test = preprocess(df, 'Outcome')

# Scaling followed by the tuned random forest, fitted as one estimator.
sc = StandardScaler()
steps = [('scaling', sc), ('rfc tuned', rfc_tuned)]
pipeline = Pipeline(steps)
pipeline.fit(x_train, y_train)

y_pred = pipeline.predict(x_test)
print_metrics(y_test, y_pred, 'Pipeline')

In [None]:
# pickle the [pipeline]
import pickle

In [None]:
# Serialise the fitted pipeline. The context manager closes the file even if
# pickling raises, so no half-open handle is left behind.
with open('rfc.pickle','wb') as model:
    pickle.dump(pipeline,model)

In [None]:
## Deployment of the pickle model

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle
# Title with the "health worker" emoji, written as escapes so the file
# encoding cannot garble it again.
st.title('Medical Diagnostic App \U0001F468\U0001F3FD\u200D\u2695\uFE0F')

# step 1: load the model
# NOTE: pickle.load can execute arbitrary code from the file - only load an
# rfc.pickle that you produced yourself.
with open('rfc.pickle','rb') as model:
    clf=pickle.load(model)

# step 2: Get the front end user input
# Each widget takes (label, min, max, default).

# NOTE(review): the minimum is 0 so that people without pregnancies can be
# entered - confirm 0 is a valid value in the training data.
pregs=st.number_input('Pregnancies',0,20,step=1)
glucose=st.slider('Glucose',40.0,200.0,40.0)
bp=st.slider('BloodPressure',24,122,24)
skin=st.slider('SkinThickness',7,99,7)
insulin=st.slider('Insulin',18.0,850.0,18.0)
bmi=st.slider('BMI',18.0,67.0,18.0)
dpf=st.slider('DiabetesPedigreeFunction',0.05,2.5,0.05)
age=st.slider('Age',21,81,21)


# step 3: Converting User input To model output
# One-row frame whose keys name the feature columns handed to the model.

data={'Pregnancies':pregs,'Glucose':glucose,'BloodPressure':bp,'SkinThickness':skin,
     'Insulin':insulin,'BMI':bmi,'DiabetesPedigreeFunction':dpf,'Age':age
     }

input_data=pd.DataFrame([data])


# Step 4: Get the predictions
# Predict only when the button is pressed; Streamlit re-runs the whole script
# on every widget change, so predicting up front wasted work on each rerun.
if st.button('Predict'):
    preds=clf.predict(input_data)[0]
    if preds==1:
        st.error('The person has Diabetes')
    elif preds==0:
        st.success('The person is Diabetes Free')