### In this notebook I'll try a few techniques for handling missing values and then compare several classification algorithms

### Imports

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import v_measure_score
from sklearn.model_selection import cross_val_score

In [None]:
# Read the diabetes CSV and keep an untouched deep copy of the raw frame.
raw_csv_path = "../input/pima-indians-diabetes-database/diabetes.csv"
df = pd.read_csv(raw_csv_path)
diabetes_data_copy = df.copy()  # copy() is deep by default
df.head()

In [None]:
df.info()

# looks like no null values...

In [None]:
df.describe()

# The values of 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin' and 'BMI' are not supposed to be zero;
# those zeros are probably missing (NaN) values.

In [None]:
# Let's replace the zeros in these columns with NaN so pandas treats them as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
zero_as_missing_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

df.head()

In [None]:
df.isna().sum()

In [None]:
sns.pairplot(df);

### Pregnancies, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction and Age are all right-skewed

### We can apply a few different techniques to get rid of missing values

### We will try mean, median and mode imputation as well as the random sampling method

In [None]:
def view_mmm(df,col,type='bar'):
    """Visualise how mean, median and mode imputation change ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect. It is not modified.
    col : str
        Name of the column whose missing values are filled.
    type : str, default 'bar'
        ``'bar'`` draws three side-by-side panels, each overlaying the imputed
        distribution on the original one. Any other value (the notebook passes
        ``'proba'``) draws one normal probability plot per imputation strategy.

    Returns
    -------
    None
        The function only draws figures.
    """
    # NOTE: `type` shadows the builtin; the name is kept because callers pass it by keyword.
    column = df[col]
    filled = {
        'mean': column.fillna(column.mean()),
        'median': column.fillna(column.median()),
        'mode': column.fillna(column.mode()[0]),
    }
    if type=='bar':
        fig, axes = plt.subplots(1, 3, figsize=(15,5))
        for ax, (strategy, imputed) in zip(axes, filled.items()):
            # Imputed distribution first, then the original on the same axes for comparison.
            sns.distplot(imputed, ax=ax)
            sns.distplot(column, ax=ax)
            ax.set_xlabel(col+"_"+strategy)
        plt.tight_layout()

    else:
        from scipy.stats import probplot
        for strategy, imputed in filled.items():
            probplot(imputed, dist="norm", plot=plt)
            # probplot titles every figure "Probability Plot"; say which strategy this one shows.
            plt.title("Probability Plot: "+col+"_"+strategy)
            plt.show()

def to_mmm(df,col):
    """Return mean-, median- and mode-imputed copies of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to impute. It is not modified.
    col : str
        Name of the column whose missing values are replaced.

    Returns
    -------
    pandas.DataFrame
        Three columns named ``<col>_mean``, ``<col>_median`` and ``<col>_mode``,
        indexed like ``df``. If the column has no observed values at all, the
        three columns are returned still containing NaN.
    """
    column = df[col]
    modes = column.mode()
    fill_values = {
        'mean': column.mean(),
        'median': column.median(),
        # mode() is empty when nothing is observed; indexing it would raise, so fall
        # back to NaN, which leaves the column as-is just like a NaN mean/median does.
        'mode': modes.iloc[0] if not modes.empty else np.nan,
    }
    return pd.DataFrame({col+'_'+name: column.fillna(value) for name, value in fill_values.items()})

view_mmm(df,'Glucose', type='proba');

In [None]:
view_mmm(df,'BloodPressure', type='proba');

In [None]:
view_mmm(df,'SkinThickness', type='proba');

In [None]:
view_mmm(df,'Insulin', type='proba');

In [None]:
view_mmm(df,'BMI', type='proba');

In [None]:
def rand_samp(df,col,max_rows=2000):
    """Impute ``df[col]`` by copying randomly drawn observed values into the gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to impute. It is not modified.
    col : str
        Name of the column whose missing values are replaced.
    max_rows : int or None, default 2000
        Frames with more rows than this are skipped and ``None`` is returned
        (the original hard-coded guard). Pass ``None`` to disable the guard.

    Returns
    -------
    pandas.DataFrame or None
        A single-column frame named ``<col>_rand_samp`` indexed like ``df``,
        or ``None`` when the row guard triggers.
    """
    if max_rows is not None and df.shape[0] > max_rows:
        print("rand_samp skipped:", df.shape[0], "rows exceeds max_rows =", max_rows)
        return None

    missing = df[col].isna()
    observed = df[col].dropna()
    n_missing = int(missing.sum())
    imputed = df[col].copy()

    # Nothing to fill, or nothing to draw from: return the column unchanged.
    if n_missing and not observed.empty:
        draws = observed.sample(
            n_missing,
            # Sampling without replacement fails when there are more gaps than observed values.
            replace=n_missing > len(observed),
            random_state=0,
        )
        # Assign by position so each gap receives one draw regardless of the draws' own index.
        imputed.loc[missing] = draws.to_numpy()

    return pd.DataFrame({col+"_rand_samp": imputed})

def view_rand_samp(df,col):
    """Plot the effect of random-sample imputation on ``df[col]``.

    The left panel overlays the imputed distribution on the original one; the
    right panel is a normal probability plot of the imputed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect. It is not modified.
    col : str
        Name of the column whose missing values are filled.

    Returns
    -------
    None
        The function only draws a figure.
    """
    from scipy.stats import probplot

    missing = df[col].isna()
    # One observed value per gap, drawn without replacement with a fixed seed.
    draws = df[col].dropna().sample(missing.sum(), random_state=0)
    draws.index = df[missing].index
    df_rand = df[col].copy()
    df_rand.loc[missing] = draws

    fig, (ax1,ax2) = plt.subplots(1, 2, figsize=(15,5))
    sns.distplot(df_rand, ax=ax1)
    sns.distplot(df[col],ax=ax1)
    ax1.set_title(col+": imputed vs. original distribution")
    # Target ax2 explicitly rather than relying on it being pyplot's current axes.
    probplot(df_rand, dist="norm", plot=ax2)
    plt.show()

In [None]:
view_rand_samp(df,'Glucose')

In [None]:
view_rand_samp(df,'Insulin')

In [None]:
view_rand_samp(df,'BloodPressure')

In [None]:
view_rand_samp(df,'SkinThickness')

In [None]:
view_rand_samp(df,'BMI')

In [None]:
# Glucose has the fewest missing values, so we simply fill them with the mode.
# The result is assigned back: fillna(inplace=True) on the df['Glucose'] selection is
# chained assignment, which does not update df when pandas Copy-on-Write is active.

df['Glucose'] = df['Glucose'].fillna(df['Glucose'].mode()[0])

In [None]:
# Append the <col>_mean / <col>_median / <col>_mode variants, one source column at a time.
for feature in ('BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    df = pd.concat([df, to_mmm(df, feature)], axis=1)


In [None]:
# Append the <col>_rand_samp variant of each column that still has gaps.
# Glucose is left out: it was already mode-imputed in an earlier cell.
for feature in ('BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    sampled = rand_samp(df, feature)
    df = pd.concat([df, sampled], axis=1)


In [None]:
# The imputed variants replace the original columns, which still contain NaN.
df = df.drop(columns=['BloodPressure','SkinThickness','Insulin','BMI'])

In [None]:
# Pairwise correlations rounded to two decimals, drawn as an annotated heatmap.
cor = df.corr().round(2)
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(cor, annot=True, ax=ax);

In [None]:
cor['Outcome'].sort_values(ascending=False)

In [None]:
# We'll keep one imputed variant of each feature as the model inputs
# (presumably picked from the correlation ranking with Outcome — confirm).

X = df[['Glucose','BMI_rand_samp','Age','SkinThickness_mean','Pregnancies','Insulin_mean','BloodPressure_median','DiabetesPedigreeFunction']].copy()
y = df['Outcome']

### If we apply logistic regression, correlation between features and normality matter; for algorithms based on decision trees they don't

### We can also use a decision tree to predict most of missing values (NOT USED)

In [None]:
# lets make a model that predicts our missing values

# from sklearn.tree import DecisionTreeRegressor
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.linear_model import LinearRegression

# df = pd.read_csv("diabetes.csv")
# def filling_model(df1):
    
#     df = df1.copy()
#     values = {}
#     all_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
    
#     for col in ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']:
#         model_col = [x for x in all_cols if x != col]
#         test = df[df[col] == 0].copy()
#         train = df[df[col] !=0 ].copy()
#         for i in [x for x in ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"] if x != col]:
#             train[i].replace(0,np.nan,inplace=True)
#         train.dropna(inplace=True)
#         x_train,x_test,y_train = train[model_col],test[model_col],train[col]
#         model = LinearRegression()
#         model.fit(x_train,y_train)
#         values[col] = model.predict(x_test)
#     return values

# def update_predictions(row):
#     for col in ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']:
#         if row[col] == 0:
#             if values[col] != []:
#                 val = values[col][0]
#                 values[col] = values[col][1:]
#                 row[col] = val
#     return row


# values = filling_model(df)

# df = pd.DataFrame(df.apply(update_predictions, axis=1))

# for i in ['Glucose','BloodPressure','SkinThickness','Insulin']:
#     df[i] = df[i].fillna(df[i].mean())

### Using mean of the values (some values were still NaN)

In [None]:
# df = pd.read_csv("diabetes.csv")
# for i in ['Glucose','BloodPressure','SkinThickness','Insulin']:
#     df[i] = df[i].fillna(df[i].mean())

### Scaling and splitting into train and test data

In [None]:
# Split first, then fit the scaler on the training rows only, so the test rows'
# mean and variance do not leak into the features the models are trained on.
# The same random_state selects the same rows as splitting the scaled matrix did.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=30)

scaler = StandardScaler()
x_train = scaler.fit_transform(X_train_raw)
x_test = scaler.transform(X_test_raw)

# Fully scaled feature matrix, still needed by the cross-validation cell later on.
x_scaled = StandardScaler().fit_transform(X)

### Using decision tree model

In [None]:
# Decision tree with default hyperparameters; the cell shows accuracy on the test split.
model = DecisionTreeClassifier(random_state=44).fit(x_train, y_train)
model.score(x_test, y_test)

### decision tree using hyperparameter tuning to reduce overfitting

In [None]:
# Candidate pruning strengths come from the cost-complexity pruning path of the training data.
model = DecisionTreeClassifier(random_state=44)
path = model.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas = path.ccp_alphas

# One "alpha  train_accuracy  test_accuracy" string per candidate alpha.
scores = []
for alpha in ccp_alphas:
    model = DecisionTreeClassifier(ccp_alpha=alpha, random_state=44).fit(x_train, y_train)
    train_acc = model.score(x_train, y_train)
    test_acc = model.score(x_test, y_test)
    scores.append("  ".join(map(str, (alpha, train_acc, test_acc))))


scores

In [None]:
# we have to select value which looks like it generalizes our model...
# that is where train and test scores are balanced
# highest value is 9th element from last

In [None]:
# Refit with a single hard-coded ccp_alpha (presumably read off the sweep output above — confirm).
# NOTE(review): random_state is 30 here but 44 in the sweep; confirm that is intentional.
model = DecisionTreeClassifier(ccp_alpha=0.00684675923328408,random_state=30)
model.fit(x_train,y_train)
model.score(x_test,y_test)

## Trying other classification algorithms

### Random forest

In [None]:
# Random forest with default hyperparameters; the cell shows accuracy on the test split.
model = RandomForestClassifier(random_state=44).fit(x_train, y_train)
model.score(x_test, y_test)

### logistic regression

In [None]:
# Logistic regression with default hyperparameters; the cell shows accuracy on the test split.
model = LogisticRegression(random_state=44).fit(x_train, y_train)
model.score(x_test, y_test)

### xgboost

In [None]:
# XGBoost classifier with default hyperparameters; the cell shows accuracy on the test split.
model = xgb.XGBClassifier(random_state=44).fit(x_train, y_train)
model.score(x_test, y_test)

### knn

In [None]:
# Single run with k = 25 first.
model = KNeighborsClassifier(n_neighbors=25)
model.fit(x_train,y_train)
print(model.score(x_test,y_test))


# Sweep k = 1..14 and record the accuracy on both splits.
train_scores = []
test_scores = []

for k in range(1, 15):
    knn = KNeighborsClassifier(k).fit(x_train, y_train)
    train_scores.append(knn.score(x_train, y_train))
    test_scores.append(knn.score(x_test, y_test))

# Report the best accuracy per split together with every k (1-based) that reaches it.
max_train_score = max(train_scores)
train_scores_ind = [pos for pos, score in enumerate(train_scores) if score == max_train_score]
print('Max train score {} % and k = {}'.format(max_train_score*100, [pos + 1 for pos in train_scores_ind]))

max_test_score = max(test_scores)
test_scores_ind = [pos for pos, score in enumerate(test_scores) if score == max_test_score]
print('Max test score {} % and k = {}'.format(max_test_score*100, [pos + 1 for pos in test_scores_ind]))

In [None]:
# Train vs. test accuracy for each k tried in the sweep.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
k_values = list(range(1,15))
plt.figure(figsize=(12,5))
p = sns.lineplot(x=k_values, y=train_scores, marker='*', label='Train Score')
p = sns.lineplot(x=k_values, y=test_scores, marker='o', label='Test Score')
p.set(xlabel='n_neighbors', ylabel='accuracy');

In [None]:
# Let's try 5 neighbors and print the classification report for the test split.

model = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# Second column of predict_proba — presumably the positive class (Outcome == 1); confirm via model.classes_.
y_pred_proba = model.predict_proba(x_test)[:,1]
# False/true positive rates at each score threshold, used for the ROC plot.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

In [None]:
# Dashed diagonal is the reference line from (0, 0) to (1, 1).
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr,tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('Knn(n_neighbors=5) ROC curve')
plt.legend()  # without a legend the label='Knn' set above is never displayed
plt.show()

In [None]:
roc_auc_score(y_test,y_pred_proba)

### svm

In [None]:
# Support vector classifier with default hyperparameters; the cell shows accuracy on the test split.
model = SVC().fit(x_train, y_train)
model.score(x_test, y_test)

### Naive bayes

In [None]:
# Gaussian naive Bayes with default hyperparameters; the cell shows accuracy on the test split.
model = GaussianNB().fit(x_train, y_train)
model.score(x_test, y_test)

### Let's try cross-validation scores

In [None]:
# 5-fold cross-validation of every model on the full scaled feature matrix.
# Estimators that accept a seed get the same one as the single-model cells (44),
# so the scores are reproducible from run to run.
models = [
    LogisticRegression(random_state=44),
    DecisionTreeClassifier(random_state=44),
    RandomForestClassifier(random_state=44),
    xgb.XGBClassifier(random_state=44),
    KNeighborsClassifier(),
    GaussianNB(),
    SVC(),
]
results = []
for model in models:
    # Each entry pairs the estimator with its array of per-fold scores.
    results.append((model, cross_val_score(model, x_scaled, y, cv=5)))

In [None]:
results