# Import Libraries

In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv), data manipulation as in SQL
import matplotlib.pyplot as plt # this is used for the plot the graph 
import seaborn as sns # used for plot interactive graph. I like it most for plot
%matplotlib inline
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression # to apply the Logistic regression
from sklearn.model_selection import train_test_split # to split the data into two parts
from sklearn.model_selection import KFold # use for cross validation
from sklearn.model_selection import GridSearchCV# for tuning parameter
from sklearn.ensemble import RandomForestClassifier # for random forest classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
#from xgboost import XGBRegressor, XGBClassifier
#from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn import svm # for Support Vector Machine
from sklearn import metrics # for the check the error and accuracy of the model
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve
# Any results you write to the current directory are saved as output.
# Note: if the model_selection imports fail, the scikit-learn version is very old (< 0.18); there these helpers lived in sklearn.cross_validation instead.

ModuleNotFoundError: No module named 'catboost'

# Dataset Description
The data is taken from "https://www.kaggle.com/mathchi/diabetes-data-set"

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. All the patients here are female and 21 years or older. It contains the following columns:

Pregnancies: Number of times pregnant
Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
BloodPressure: Diastolic blood pressure (mm Hg)
SkinThickness: Triceps skin fold thickness (mm)
Insulin: 2-Hour serum insulin (mu U/ml)
BMI: Body mass index (weight in kg/(height in m)^2)
DiabetesPedigreeFunction: Diabetes pedigree function
Age: Age (years)
Outcome: Class variable (0 or 1)

# Objectives
Clean the data and deal with missing values 

Perform EDA to get a better understanding of underlying trends

Fit different models and tune their hyperparameter for best performance

# Data Cleaning

In [None]:
data=pd.read_csv('diabetes.csv')

In [None]:
data.head(10)

In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
data.duplicated().sum()

In [None]:
data.describe()

### observation
We note that the minimum value for columns such as Glucose, BloodPressure, Insulin and BMI is 0, which is not physiologically possible; hence these must be missing values that were encoded as 0. We can use imputation techniques to deal with them.

In [None]:
# The columns from Glucose through BMI use 0 as a stand-in for "not measured".
# Replace those zeros with NaN on a derived frame (no in-place edit of a slice)
# and assign whole columns back, so the integer columns are cleanly re-created
# as float instead of having NaN forced into int64 storage.
# NOTE(review): the 'Glucose':'BMI' label slice relies on the CSV column order — confirm.
zero_coded_cols = list(data.loc[:, 'Glucose':'BMI'].columns)
corrupted_data = data[zero_coded_cols].replace(0, np.nan)
data[zero_coded_cols] = corrupted_data

In [None]:
data.describe()

In [None]:
data.isnull().sum()

Let's observe these missing values more closely and find out whether there is any relation between them.

In [None]:
import missingno as msno

In [None]:
msno.bar(data)

In [None]:
msno.matrix(data)

In [None]:
msno.matrix(data.sort_values(by="Insulin"))

Looking at the above matrix, Glucose and BMI appear to be MCAR (missing completely at random), while Insulin and SkinThickness appear to be MNAR (missing not at random).

In [4]:
msno.heatmap(data)

NameError: name 'msno' is not defined

We can see a heavy nullity correlation between SkinThickness and Insulin, i.e. the two columns tend to be missing in the same rows.


In [None]:
# Discard every row in which Glucose or BMI is missing.
data = data.dropna(subset=['Glucose', 'BMI'])

In [None]:
# Fill the remaining NaNs from the 5 nearest neighbours, using the predictor
# columns only: the Outcome label must not take part in the neighbour search,
# otherwise the target leaks into the features the models are later trained on.
# NOTE(review): the imputer is still fitted on all rows before the train/test
# split; fitting it on the training rows only would remove the remaining leakage.
feature_cols = [col for col in data.columns if col != 'Outcome']
knn_imputer = KNNImputer(n_neighbors=5)
data_knn = data.copy()
data_knn[feature_cols] = knn_imputer.fit_transform(data[feature_cols])

Using knnimputer we fill the missing values without impacting the variability of the dataset


In [None]:
data_knn.describe()

## outlier removal

In [None]:
cleaned_data=data_knn

In [None]:
sns.boxplot(data=data_knn,x='Pregnancies')

value above 13 is outlier

In [None]:
# Retain only the rows whose Pregnancies value is at most 13.
cleaned_data = cleaned_data.loc[cleaned_data['Pregnancies'].le(13)]

In [None]:
sns.boxplot(data=data_knn,x='Glucose')

there is no outlier

In [None]:
sns.boxplot(data=data_knn,x='BloodPressure')

outlier below 30 and above 110

In [None]:
# Retain only the rows whose BloodPressure lies in the inclusive range 30..110.
cleaned_data = cleaned_data[cleaned_data['BloodPressure'].between(30, 110)]

In [None]:
sns.boxplot(data=data_knn,x='SkinThickness')

outlier above 70

In [None]:
# Retain only the rows whose SkinThickness value is at most 70.
cleaned_data = cleaned_data.loc[cleaned_data['SkinThickness'].le(70)]

In [None]:
sns.boxplot(data=data_knn, x='Insulin')

outlier above 500

In [5]:
# Retain only the rows whose Insulin value is at most 500.
cleaned_data = cleaned_data.loc[cleaned_data['Insulin'].le(500)]

NameError: name 'cleaned_data' is not defined

In [None]:
sns.boxplot(data=data_knn,x='Age')

outlier above 75

In [None]:
# Retain only the rows whose Age value is at most 75.
cleaned_data = cleaned_data.loc[cleaned_data['Age'].le(75)]

### Exploratory Data Analysis

In [None]:
# Annotated heatmap of the pairwise correlations in the cleaned data,
# drawn on an explicitly created 12x8 inch axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cleaned_data.corr(), annot=True, ax=ax)

### The following conclusions can be drawn from the heatmap :

- There is not much correlation among different predictors
- Age and Pregnancies have a positive correlation, indicating that older patients tend to have had more pregnancies
- There is also a positive correlation between Insulin and Glucose, which could perhaps be explained by type 1 diabetic patients, who generally have high glucose, having been given insulin injections.
- 

In [None]:
sns.displot(data=cleaned_data,x='Pregnancies',hue='Outcome',kde=True)

It seems that the number of pregnancies does not have much impact on the outcome.

In [None]:
sns.displot(data=cleaned_data,x='Glucose', hue='Outcome',kde=True)

People who don't have diabetes have glucose levels that are roughly normally distributed around a mean of 100, while those with diabetes have much higher glucose levels, ranging between 100 and 200.

In [None]:
sns.displot(data=cleaned_data ,x='Age',hue='Outcome',kind='kde')

In [None]:
data_knn.head()

## Split Data for Training

In [None]:
# Pick the target by name instead of by position, so the feature/target split
# stays correct even if the column order changes upstream.
X = cleaned_data.drop(columns='Outcome')
y = cleaned_data['Outcome']

In [None]:
X.head()

In [None]:
y.head()

In [None]:
# Hold out 33% of the rows for testing. Passing stratify=y keeps the class
# proportions of y the same in the train and test splits, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

# Classification Models

In [6]:
# Fit one instance of every candidate classifier on the training split.
# The scikit-learn tree-based models all share random_state=42 so that a
# re-run of the notebook reproduces the same fitted models.
lj = LogisticRegression(solver="liblinear").fit(X_train,y_train)
gnb = GaussianNB().fit(X_train,y_train)
knnc = KNeighborsClassifier().fit(X_train,y_train)
cartc = DecisionTreeClassifier(random_state=42).fit(X_train,y_train)
rfc = RandomForestClassifier(random_state=42,verbose=False).fit(X_train,y_train)
# Previously the only scikit-learn tree model without a pinned seed.
gbmc = GradientBoostingClassifier(random_state=42,verbose=False).fit(X_train,y_train)
catbc = CatBoostClassifier(verbose=False).fit(X_train,y_train)

NameError: name 'X_train' is not defined

In [None]:
# Report hold-out and cross-validated metrics for every fitted classifier.
# Hold-out metrics (ACC, ROC AUC) use the test split; cross-validated metrics
# use the training split only, because cross_val_score refits clones of the
# estimator on each fold and must not be given the test data.
modelsc = [lj,gnb,knnc,cartc,rfc,gbmc,catbc]
for model in modelsc:
    name = model.__class__.__name__
    y_pred = model.predict(X_test)
    # ROC AUC needs continuous scores, so use the positive-class probability
    # rather than the hard 0/1 predictions.
    y_score = model.predict_proba(X_test)[:, 1]
    # Default scoring for a classifier is accuracy (not R^2).
    cv_acc = cross_val_score(model,X_train,y_train,cv=10).mean()
    cv_mse = -cross_val_score(model,X_train,y_train,cv=10,scoring="neg_mean_squared_error").mean()
    print(name + ": ")
    print("-" * 10)
    print("ACC-->",accuracy_score(y_test,y_pred))
    print("CV ACC-->",cv_acc)
    # The square root of the mean squared error is the RMSE, so label it as such.
    print("CV RMSE-->",np.sqrt(cv_mse))
    print("ROC AUC-->",roc_auc_score(y_test,y_score))
    print("-" * 30)


best model is CatBoostClassifier