In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input tree,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os 
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Directory under the Kaggle input tree from which the diabetes CSV is read.
DATASET_PATH = "/kaggle/input/pima-indians-diabetes-database/"

In [None]:
# Build the CSV location from the dataset directory, load it, and preview the first rows.
csv_path = os.path.join(DATASET_PATH, "diabetes.csv")
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Count explicit NaN entries per column of the raw frame.
df.isna().sum()

# No Null values

In [None]:
# Print each column's dtype and non-null count.
df.info()

### No categorical/datetime columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

### Some features appear to have outliers at the high end (very large max values)
### Many columns contain 0 values; we assume these mean the value was not recorded

In [None]:
# Work on a deep copy so the raw frame `df` stays untouched.
df_copy = df.copy(deep=True)

# A zero in these measurements is assumed to mean "not recorded", so it is recoded as NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_copy[zero_as_missing_cols] = df_copy[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Count NaN entries per column now that zeros have been recoded as missing.
df_copy.isna().sum()

In [None]:
# Fraction of missing entries per column (mean of the boolean NaN mask).
df_copy.isna().mean()

# Big diff now

In [None]:
# Histogram of every numeric column after zeros were recoded as NaN.
df_copy.hist(figsize = (20,20))

### Some features are right-skewed (Pregnancies, Insulin, BMI, DiabetesPedigreeFunction, Age); the others are almost normally distributed

In [None]:
# Per-outcome means of Glucose and BloodPressure; a later cell uses them to fill these columns' NaNs.
# The columns are selected with a list: tuple-style selection on a groupby
# (`gb["a", "b"]`) raises in pandas 2.x.
temp_df_mean = df_copy.groupby(["Outcome"])[["Glucose", "BloodPressure"]].mean().reset_index()
# NOTE: "BloodPressure_maen" (sic) is kept as-is because the imputation cell
# looks the column up by this exact name.
temp_df_mean.columns = ["Outcome", "Glucose_mean", "BloodPressure_maen"]
temp_df_mean

In [None]:
# Per-outcome medians of SkinThickness, Insulin and BMI; a later cell uses them to fill these columns' NaNs.
# The columns are selected with a list: tuple-style selection on a groupby
# (`gb["a", "b", "c"]`) raises in pandas 2.x.
temp_df_median = df_copy.groupby(["Outcome"])[["SkinThickness", "Insulin", "BMI"]].median().reset_index()
temp_df_median.columns = ["Outcome", "SkinThickness_median", "Insulin_median", "BMI_median"]
temp_df_median

### For the right-skewed features we will use the median to fill NaN
### For the normally distributed features we will use the mean to fill NaN

In [None]:
#Glucose                     0.006510
#BloodPressure               0.045573
#SkinThickness               0.295573
#Insulin                     0.486979
#BMI                         0.014323
# assign flag for the big fraction of missing values

# Indicator columns for the two features with the largest share of missing values.
# Despite the "_missflag" name, the flag is 1 when the value is PRESENT and 0 when it is NaN
# (same encoding as before; only the per-row lambda was replaced by the vectorised `notna()`).
df_copy["SkinThickness_missflag"] = df_copy["SkinThickness"].notna().astype(int)
df_copy["Insulin_missflag"] = df_copy["Insulin"].notna().astype(int)

In [None]:
#df_copy['Glucose'].fillna(df_copy['Glucose'].mean(), inplace = True)
#df_copy['BloodPressure'].fillna(df_copy['BloodPressure'].mean(), inplace = True)

# Attach the per-outcome means as helper columns so each row can be filled from its own class.
# NOTE(review): the fill value depends on the target `Outcome`, which leaks label
# information into the features — confirm this is intended.
df_copy = pd.merge(df_copy, temp_df_mean, on="Outcome", how="inner")

# Assign the filled columns back explicitly. `df[col].fillna(..., inplace=True)` is a
# chained in-place call: it warns in pandas 2.2 and no longer updates `df_copy`
# once Copy-on-Write is enabled.
df_copy["Glucose"] = df_copy["Glucose"].fillna(df_copy["Glucose_mean"])
df_copy["BloodPressure"] = df_copy["BloodPressure"].fillna(df_copy["BloodPressure_maen"])

# The helper columns have served their purpose.
df_copy = df_copy.drop(columns=["Glucose_mean", "BloodPressure_maen"])

In [None]:
#df_copy['SkinThickness'].fillna(df_copy['SkinThickness'].median(), inplace = True)
#df_copy['Insulin'].fillna(df_copy['Insulin'].median(), inplace = True)
#df_copy['BMI'].fillna(df_copy['BMI'].median(), inplace = True)


# Attach the per-outcome medians as helper columns so each row can be filled from its own class.
# NOTE(review): the fill value depends on the target `Outcome`, which leaks label
# information into the features — confirm this is intended.
df_copy = pd.merge(df_copy, temp_df_median, on="Outcome", how="inner")

# Assign the filled columns back explicitly. `df[col].fillna(..., inplace=True)` is a
# chained in-place call: it warns in pandas 2.2 and no longer updates `df_copy`
# once Copy-on-Write is enabled.
df_copy["SkinThickness"] = df_copy["SkinThickness"].fillna(df_copy["SkinThickness_median"])
df_copy["Insulin"] = df_copy["Insulin"].fillna(df_copy["Insulin_median"])
df_copy["BMI"] = df_copy["BMI"].fillna(df_copy["BMI_median"])

# The helper columns have served their purpose.
df_copy = df_copy.drop(columns=["SkinThickness_median", "Insulin_median", "BMI_median"])

In [None]:
# Display the frame after imputation.
df_copy

In [None]:
# Re-plot the per-column histograms, now with the missing values filled in.
df_copy.hist(figsize = (20,20))

In [None]:
# Verify the imputation: count the NaN entries left in each column.
df_copy.isna().sum()

# No null now

In [None]:
# Bar chart of how many rows fall into each Outcome class.
df_copy["Outcome"].value_counts().plot.bar()

### The data is imbalanced: class 1 has roughly half as many samples as class 0

In [None]:
# Pairwise scatter plots of all columns, coloured by the Outcome class.
p = sns.pairplot(data=df_copy, hue="Outcome")

### almost nothing is linearly correlated except (SkinThickness, BMI)

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap on a large canvas.
corr_matrix = df_copy.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True)

### Some pairs have a correlation coefficient of almost 0.5, e.g. (Pregnancies, Age), (SkinThickness, BMI), (Glucose, Outcome)

In [None]:
# Feature matrix: every column except the target.
X = df_copy.drop(columns=["Outcome"])
# Target vector: the Outcome class label.
y = df_copy["Outcome"]

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Hold out 20% for testing BEFORE any resampling, stratified on the outcome.
# Oversampling first would let synthetic points interpolated from test rows
# end up in the training set and inflate the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=0, stratify=y
)

# Balance the classes in the training split only; the test split keeps the
# original class distribution. The seed makes the synthetic samples reproducible.
oversample = SMOTE(random_state=0)
X_train, y_train = oversample.fit_resample(X_train, y_train)

# Splitting our dataset into 20% test and 80% train, stratified on the outcome y to minimize bias

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

def fit(model, param_grid, X_train, y_train, cv=10):
    """Grid-search `model` over `param_grid` and return the best estimator found.

    Args:
        model: estimator handed to GridSearchCV.
        param_grid: parameter grid forwarded as GridSearchCV's `param_grid`.
        X_train: training features passed to the search's `fit`.
        y_train: training labels passed to the search's `fit`.
        cv: cross-validation setting forwarded to GridSearchCV (default 10).

    Returns:
        The `best_estimator_` attribute of the fitted search.
    """
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)
    return grid_search.fit(X_train, y_train).best_estimator_

In [None]:
def predict(model, X_test):
    """Return whatever `model.predict` produces for `X_test`.

    Args:
        model: any object exposing a `predict` method.
        X_test: the data forwarded unchanged to `model.predict`.
    """
    predictions = model.predict(X_test)
    return predictions

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report

def get_accuracy(y_true, y_hat, text=""):
    """Print a header followed by the classification report for the predictions.

    Despite the name, nothing is returned: `text` is printed first, then the
    output of `classification_report(y_true, y_hat)`.

    Args:
        y_true: ground-truth labels.
        y_hat: predicted labels.
        text: header printed before the report (default: empty string).
    """
    print(text)
    report = classification_report(y_true, y_hat)
    print(report)

In [None]:
# Search grid for LogisticRegression.
# The default `lbfgs` solver only supports the l2 penalty, so the l1 and elasticnet
# candidates would fail to fit and be scored NaN. `saga` supports all three penalties.
# The elasticnet candidates sit in their own sub-grid because they also need `l1_ratio`.
# `max_iter` is raised from the default as a precaution against saga stopping early —
# NOTE(review): tune if convergence warnings still appear.
param_grid = [
    {
        "solver": ["saga"],
        "penalty": ["l1", "l2"],
        "C": [1, 0.8, 0.6],
        "max_iter": [1000],
    },
    {
        "solver": ["saga"],
        "penalty": ["elasticnet"],
        "l1_ratio": [0.5],  # even mix of l1 and l2
        "C": [1, 0.8, 0.6],
        "max_iter": [1000],
    },
]

In [None]:
# Tune a logistic regression on the training split, predict the held-out split,
# and print the report with the chosen estimator as the header.
model = fit(model=LogisticRegression(), param_grid=param_grid, X_train=X_train, y_train=y_train)
y_hat = predict(model=model, X_test=X_test)
get_accuracy(y_true=y_test, y_hat=y_hat, text=model)

### The linear model did not do a great job, as the data itself is somehow not linear

In [None]:
# Candidate neighbourhood sizes for the KNN grid search.
param_grid = dict(n_neighbors=[1, 2, 3, 5, 6, 7])

In [None]:
# Tune a KNN classifier on the training split, predict the held-out split,
# and print the report with the chosen estimator as the header.
model = fit(model=KNeighborsClassifier(), param_grid=param_grid, X_train=X_train, y_train=y_train)
y_hat = predict(model=model, X_test=X_test)
get_accuracy(y_true=y_test, y_hat=y_hat, text=model)

### The KNN classifier did not improve much on precision, recall or accuracy

In [None]:
# Search grid for SVC.
# `degree` only affects the polynomial kernel, so it is searched in a separate
# sub-grid; crossing it with the linear and rbf kernels would refit identical
# models three times each.
param_grid = [
    {"kernel": ["linear", "rbf"], "C": [1, 0.8, 0.6]},
    {"kernel": ["poly"], "C": [1, 0.8, 0.6], "degree": [3, 4, 5]},
]

In [None]:
# Tune a support-vector classifier on the training split, predict the held-out split,
# and print the report with the chosen estimator as the header.
model = fit(model=SVC(), param_grid=param_grid, X_train=X_train, y_train=y_train)
y_hat = predict(model=model, X_test=X_test)
get_accuracy(y_true=y_test, y_hat=y_hat, text=model)

### The regularization coefficient C gives a good jump in SVM accuracy