In [None]:
from mlxtend.plotting import plot_decision_regions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## OSEMN Pipeline
* ### O - Obtaining data
* ### S - Scrubbing data (cleaning)
* ### E - Exploring data (visualizing to find patterns)
* ### M - Modeling data
* ### N - iNterpreting data

In [None]:
# Read the Pima Indians diabetes CSV from the relative ../input directory
df = pd.read_csv(
    filepath_or_buffer='../input/pima-indians-diabetes-database/diabetes.csv',
)

# Preview the first 5 rows as a quick sanity check of the load
df.head(n=5)

In [None]:
# Basic EDA (Exploratory Data Analysis): column names, dtypes and
# non-null counts of the raw frame
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

In [None]:
# Same summary statistics, transposed so that every feature becomes one row,
# which is easier to scan than one wide row of columns.
df.describe().transpose()
# Reading the `min` column: Glucose, BloodPressure, SkinThickness, Insulin and BMI
# all have a minimum of 0, which is not a valid value for these measurements.

In [None]:
# Work on a copy so the raw frame `df` stays untouched for before/after comparisons
df_new = df.copy()

# A value of 0 is not a valid measurement in these columns, so mark it as missing.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_new[zero_as_missing] = df_new[zero_as_missing].replace(0, np.nan)

# Count the missing values per column
print(df_new.isnull().sum())

In [None]:
# Before choosing a fill strategy for the NaN values, look at how each column
# is distributed. This plots the raw frame `df`, i.e. the zeros are still included.
p = df.hist(figsize=(10,10))

In [None]:
# Fill the missing values column by column:
#   distribution centered in the middle -> mean
#   skewed distribution                 -> median
# The result is assigned back instead of calling `fillna(..., inplace=True)` on a
# column selection: that is chained assignment, which pandas deprecates and which
# does not update `df_new` when Copy-on-Write is enabled.
fill_values = {
    'Glucose': df_new['Glucose'].mean(),
    'BloodPressure': df_new['BloodPressure'].mean(),
    'SkinThickness': df_new['SkinThickness'].median(),
    'Insulin': df_new['Insulin'].median(),
    'BMI': df_new['BMI'].mean(),
}
df_new = df_new.fillna(value=fill_values)

In [None]:
# Histograms of the cleaned frame `df_new`, for comparison with the raw histograms
p = df_new.hist(figsize=(10,10))

In [None]:
# (number of rows, number of columns) of the raw frame
df.shape

In [None]:
# Data type analysis: horizontal seaborn bar chart counting how many
# columns of the cleaned frame share each dtype.
dtype_names = df_new.dtypes.map(str)
sns.countplot(y=dtype_names, data=df_new)
plt.xlabel("Count of each data type")
plt.ylabel("Data types")
plt.show()

In [None]:
# Null count analysis: missingno bar chart over the columns of the cleaned frame
import missingno as msno
pmis = msno.bar(df_new)

In [None]:
# Class balance check: bar chart of how many rows fall into each Outcome value
#   0 = non-diabetic patients
#   1 = diabetic patients
# Class 0 is almost twice as frequent as class 1.
p = df['Outcome'].value_counts().plot.bar()

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of every column of the raw frame
p = scatter_matrix(frame=df, figsize=(20, 20))

In [None]:
# Pairwise plots of the cleaned frame, colored by the Outcome class.
# The pairwise patterns can be related to Pearson's correlation coefficient.
p = sns.pairplot(data=df_new, hue='Outcome')

In [None]:
# Correlation heatmap for the unclean (raw) data.
# The figure is created *before* drawing: calling plt.figure() after sns.heatmap()
# leaves the heatmap at the default size and opens a second, empty figure.
plt.figure(figsize=(18,16))
p = sns.heatmap(df.corr(), annot=True, cmap='RdYlGn')

In [None]:
# Correlation heatmap for the clean data.
# The figure is created *before* drawing: calling plt.figure() after sns.heatmap()
# leaves the heatmap at the default size and opens a second, empty figure.
plt.figure(figsize=(18,16))
p = sns.heatmap(df_new.corr(), annot=True, cmap='RdYlGn')

In [None]:
# Scale the data
from sklearn.preprocessing import StandardScaler

# Drop the target so only the feature columns remain (features --> X)
features = df_new.drop(columns=['Outcome'])

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so statistics of the test rows influence the scaling. Consider
# fitting on the training split only (or using a Pipeline) -- confirm intent.
scaler = StandardScaler()

# Column names and index are taken from the frame itself rather than a hard-coded
# list, so the labels can never drift out of sync with the scaled columns.
X = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
    index=features.index,
)

In [None]:
# Preview the scaled feature frame: all feature columns are present
# and the Outcome target column is not
X.head()

In [None]:
# Target vector: the Outcome column (0 or 1)
y = df_new['Outcome']
y.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. `stratify=y` keeps the class ratio
# the same in both splits; `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit one KNN model for every k in 1..14 (range(1, 15) stops before 15)
# and record its score on the training and the test split.
test_score = []
train_score = []

for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_score.append(knn.score(X_train, y_train))
    test_score.append(knn.score(X_test, y_test))

In [None]:
# Best training score and the k that produced it.
# Scores are stored for k = 1, 2, ... so k is the list position + 1;
# on ties the smallest k is reported.
max_train_score = max(train_score)
idx_train_max = train_score.index(max_train_score) + 1
print('Max train score {} % and k = {}'.format(max_train_score*100, idx_train_max))

In [None]:
# Best test score and the k that produced it.
# Scores are stored for k = 1, 2, ... so k is the list position + 1;
# on ties the smallest k is reported.
max_test_score = max(test_score)
idx_test_max = test_score.index(max_test_score) + 1
print('Max test score {} % and k = {}'.format(max_test_score*100, idx_test_max))

In [None]:
# Result visualization: train vs. test score for every k that was tried
plt.figure(figsize=(15,8))

# One x position per recorded score (k starts at 1), so the axis always
# matches the length of the score lists.
k_values = list(range(1, len(train_score) + 1))

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
p = sns.lineplot(x=k_values, y=train_score, marker='*', label='Train score')
p = sns.lineplot(x=k_values, y=test_score, marker='o', label='Test score')
p.set(xlabel='k (n_neighbors)', ylabel='Score')

In [None]:
# The sweep above gave its best test score at k=11, so that value is used
# for the final model.
# NOTE(review): k was picked by looking at the test split, so the score
# reported here is likely optimistic -- confirm with a separate validation set.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

### Model performance analysis
* Confusion matrix
* Classification report
* ROC-AUC

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted classes for the held-out rows, tabulated against the true classes
y_pred = knn.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the test predictions
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
from sklearn.metrics import roc_curve

# Column 1 of predict_proba is the predicted probability of class 1;
# the ROC curve is computed from those scores.
y_pred_proba = knn.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_true=y_test, y_score=y_pred_proba)

In [None]:
# ROC curve of the final KNN model.
# The dashed diagonal is the reference line from (0, 0) to (1, 1).
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr,tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('Knn(n_neighbors=11) ROC curve')
# Without this call the `label` given above is never rendered
plt.legend()
plt.show()

In [None]:
# Area under the ROC curve (AUC), computed from the class-1 probabilities
from sklearn.metrics import roc_auc_score

roc_auc_score(y_true=y_test, y_score=y_pred_proba)

## Hyper Parameter optimization
Grid search is an approach to hyperparameter tuning that will methodically build and evaluate a model for each combination of algorithm parameters specified in a grid.

In [None]:
from sklearn.model_selection import GridSearchCV

# For a KNN classifier the hyperparameter to tune is n_neighbors.
# np.arange(1, 50) excludes its upper bound, so k = 1..49 is searched.
param_grid = {'n_neighbors': np.arange(1, 50)}

# 5-fold cross-validated grid search over the full scaled data set.
# NOTE(review): X was scaled on all rows beforehand, so each validation fold
# has influenced the scaling -- consider scaling inside a Pipeline.
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
knn_cv.fit(X, y)

print("Best Score:" + str(knn_cv.best_score_))
print("Best Parameters: " + str(knn_cv.best_params_))