# import Libraries

In [None]:
import pandas as pd
import seaborn as sns

# read dataset 

In [None]:
# Load the dataset; a literal 0 in the listed columns is parsed as missing (NaN).
data_df = pd.read_csv(
    '/kaggle/input/pima-indians-diabetes-database/diabetes.csv',
    na_values={
        column: 0
        for column in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI')
    },
)

## explore dataframe characteristics

In [None]:
# Preview the first five rows.
data_df.head(n=5)

In [None]:
# Print the frame summary (column dtypes and non-null counts).
data_df.info()

In [None]:
# Summary statistics, transposed so each column becomes a row.
data_df.describe().transpose()

### We notice many null values (zeros in Glucose, BloodPressure, SkinThickness, Insulin and BMI were read as missing)

## explore data

In [None]:
# Pairwise scatter/distribution grid of every column, coloured by Outcome.
sns.pairplot(data=data_df, hue='Outcome')

In [None]:
# Pairwise correlations between columns, drawn as an annotated heatmap.
correlation_matrix = data_df.corr()
sns.heatmap(data=correlation_matrix, annot=True, cmap='Reds')

### Get columns with null values

In [None]:
# Count missing values per column and show only the columns that have any.
isna_c = data_df.isnull().sum()
isna_c.loc[isna_c.gt(0)]

In [None]:
# Median of every column per Outcome class, transposed so columns become rows.
data_df.groupby(by='Outcome').median().transpose()

## Fix Glucose data

In [None]:
# Distribution of Glucose before the missing values are filled.
sns.displot(data_df['Glucose'])

In [None]:
# Fill missing Glucose with the median Glucose of the rows sharing the same Outcome.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data_df.Glucose` is chained assignment, which warns in recent pandas and no
# longer updates `data_df` once Copy-on-Write is enabled.
# NOTE(review): the fill value is derived from Outcome, the target predicted
# later in this notebook, so label information leaks into this feature.
data_df['Glucose'] = data_df['Glucose'].fillna(
    data_df.groupby('Outcome')['Glucose'].transform('median')
)

In [None]:
# Distribution of Glucose after the missing values have been filled.
sns.displot(data_df['Glucose'])

In [None]:
# Glucose distribution split by Outcome.
sns.pairplot(data=data_df.loc[:, ['Outcome', 'Glucose']], hue='Outcome')

## fix BloodPressure

In [None]:
# Bin BMI and Age into labelled ranges with pd.cut. `right=False` makes every
# interval left-closed, so the edges match the original `x < edge` tests:
#   BMI: <18.5 'U', <25 'N', <30 'V', otherwise 'O'
#   Age: <30 'A',  <40 'B', <50 'C', otherwise 'D'
# BMI can still be missing at this point. With plain `<` comparisons a NaN
# fails every test and silently landed in 'O'; here missing BMI is binned
# explicitly using the column median instead.
# pd.cut already returns a categorical column, so no astype('category') is needed.
bmi_for_binning = data_df['BMI'].fillna(data_df['BMI'].median())
data_df['bmi_cat'] = pd.cut(
    bmi_for_binning,
    bins=[float('-inf'), 18.5, 25, 30, float('inf')],
    labels=['U', 'N', 'V', 'O'],
    right=False,
)
data_df['age_cat'] = pd.cut(
    data_df['Age'],
    bins=[float('-inf'), 30, 40, 50, float('inf')],
    labels=['A', 'B', 'C', 'D'],
    right=False,
)

In [None]:
# BloodPressure distribution coloured by BMI category.
sns.pairplot(
    data=data_df.loc[:, ['bmi_cat', 'BloodPressure', 'age_cat']],
    hue='bmi_cat',
)

In [None]:
# Median BloodPressure for each (bmi_cat, age_cat) combination.
data_df.groupby(['bmi_cat', 'age_cat'])[['BloodPressure']].median()

In [None]:
# Fill missing BloodPressure with the median of the row's (bmi_cat, age_cat) group.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data_df.BloodPressure` is chained assignment, which warns in recent pandas
# and no longer updates `data_df` once Copy-on-Write is enabled.
# observed=True: a row-aligned transform only needs the category combinations
# that actually occur.
bp_group_median = (
    data_df.groupby(['bmi_cat', 'age_cat'], observed=True)['BloodPressure']
    .transform('median')
)
data_df['BloodPressure'] = (
    data_df['BloodPressure']
    .fillna(bp_group_median)
    # A group with no observed value has a NaN median; fall back to the overall median.
    .fillna(data_df['BloodPressure'].median())
)

In [None]:
# BloodPressure distribution split by Outcome.
sns.pairplot(data=data_df.loc[:, ['Outcome', 'BloodPressure']], hue='Outcome')

In [None]:
# Re-check which columns still contain missing values.
isna_c = data_df.isnull().sum()
isna_c.loc[isna_c.gt(0)]

In [None]:
# BloodPressure spread within each BMI category.
sns.boxplot(x='bmi_cat', y='BloodPressure', data=data_df)

In [None]:
import matplotlib.pyplot as plt

# BloodPressure plotted against Age.
plt.scatter('Age', 'BloodPressure', data=data_df)

In [None]:
# Fill missing BMI with the median BMI of the row's (Outcome, age_cat) group.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data_df.BMI` is chained assignment, which warns in recent pandas and no
# longer updates `data_df` once Copy-on-Write is enabled.
# NOTE(review): the fill value depends on Outcome, the target predicted later
# in this notebook, so label information leaks into this feature.
bmi_group_median = (
    data_df.groupby(['Outcome', 'age_cat'], observed=True)['BMI']
    .transform('median')
)
data_df['BMI'] = (
    data_df['BMI']
    .fillna(bmi_group_median)
    # A group with no observed value has a NaN median; fall back to the overall median.
    .fillna(data_df['BMI'].median())
)

In [None]:
# BMI distribution split by Outcome.
sns.pairplot(data=data_df.loc[:, ['Outcome', 'BMI']], hue='Outcome')

In [None]:
# Re-check which columns still contain missing values.
isna_c = data_df.isnull().sum()
isna_c.loc[isna_c.gt(0)]

## Exploring SkinThickness

In [None]:
# Fill missing SkinThickness with the median of the row's (Outcome, bmi_cat) group.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data_df.SkinThickness` is chained assignment, which warns in recent pandas
# and no longer updates `data_df` once Copy-on-Write is enabled.
# NOTE(review): the fill value depends on Outcome, the target predicted later
# in this notebook, so label information leaks into this feature.
skin_group_median = (
    data_df.groupby(['Outcome', 'bmi_cat'], observed=True)['SkinThickness']
    .transform('median')
)
data_df['SkinThickness'] = (
    data_df['SkinThickness']
    .fillna(skin_group_median)
    # A group with no observed value has a NaN median; fall back to the overall median.
    .fillna(data_df['SkinThickness'].median())
)

In [None]:
# SkinThickness distribution split by Outcome.
sns.pairplot(data=data_df.loc[:, ['Outcome', 'SkinThickness']], hue='Outcome')

## Insulin preparation

In [None]:
# Insulin distribution split by Outcome, before the missing values are filled.
sns.pairplot(data=data_df.loc[:, ['Outcome', 'Insulin']], hue='Outcome')

In [None]:
# Glucose spread within each Outcome class.
sns.boxplot(x='Outcome', y='Glucose', data=data_df)

In [None]:
# Label each row by glucose level: 'N' when Glucose is below 140, otherwise 'U'.
data_df['glu_cat'] = data_df['Glucose'].lt(140).map({True: 'N', False: 'U'})


In [None]:
# Fill missing Insulin with the median of the row's (bmi_cat, glu_cat, Outcome) group.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data_df.Insulin` is chained assignment, which warns in recent pandas and no
# longer updates `data_df` once Copy-on-Write is enabled.
# NOTE(review): the fill value depends on Outcome, the target predicted later
# in this notebook, so label information leaks into this feature.
insulin_group_median = (
    data_df.groupby(['bmi_cat', 'glu_cat', 'Outcome'], observed=True)['Insulin']
    .transform('median')
)
data_df['Insulin'] = (
    data_df['Insulin']
    .fillna(insulin_group_median)
    # A group with no observed value has a NaN median; fall back to the overall median.
    .fillna(data_df['Insulin'].median())
)

In [None]:
# Insulin distribution split by Outcome, after the missing values have been filled.
sns.pairplot(data=data_df.loc[:, ['Outcome', 'Insulin']], hue='Outcome')

In [None]:
# Pairwise grid of every column after imputation, coloured by Outcome.
sns.pairplot(data=data_df, hue='Outcome')

# modeling

In [None]:
# Print the frame summary (column dtypes and non-null counts) before modeling.
data_df.info()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The target and the helper category
# columns are removed from the feature matrix; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_df.drop(columns=['Outcome', 'glu_cat', 'age_cat', 'bmi_cat']),
    data_df['Outcome'],
    test_size=0.20,
    random_state=42,
)

## using KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# k-NN classifies by distance, so any feature measured on a larger numeric
# scale would dominate the neighbour search unless inputs are standardized.
# Putting the scaler and the classifier in one pipeline keeps the
# fit/score/predict interface of `neigh`, and the scaler is re-fitted on
# whatever data the pipeline is fitted on (e.g. each fold in cross-validation).
neigh = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=7))
neigh.fit(X_train, y_train)
# Accuracy on the training split.
neigh.score(X_train, y_train)

In [None]:
# Accuracy of the k-NN model on the held-out test split.
neigh.score(X=X_test, y=y_test)

In [None]:
from sklearn import metrics

# Confusion matrix of the k-NN predictions on the held-out test split.
y_pred = neigh.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
# Annotate every cell with its count and label the axes, matching the annotated
# correlation heatmap earlier in the notebook; a bare colour grid does not show
# which cell is which. In a scikit-learn confusion matrix the rows are the true
# classes and the columns the predicted ones.
ax = sns.heatmap(cnf_matrix, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

## using logistic regression

In [None]:
# Summary statistics of the training features, one feature per row.
X_train.describe().transpose()

## Error with Logistic Regression

In [None]:
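# NOTE(review): this model is disabled. The heading above reports an error, but
# the error itself is not recorded here; re-run to capture it before re-enabling (TODO).
# from sklearn.linear_model import LogisticRegression
# clf = LogisticRegression(random_state=42)
# clf.fit(X_train, y_train)
# clf.score(X_train,y_train)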

In [None]:
# clf.score(X_test,y_test)

In [None]:
# y_pred = clf.predict(X_test)
# cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
# cnf_matrix

In [None]:
# sns.heatmap(cnf_matrix)

## SVM

In [None]:
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SVC's default RBF kernel is computed from distances between samples, so any
# feature measured on a larger numeric scale would dominate unless inputs are
# standardized. Putting the scaler and the classifier in one pipeline keeps the
# fit/score/predict interface of `clf2`, and the scaler is re-fitted on
# whatever data the pipeline is fitted on (e.g. each fold in cross-validation).
clf2 = make_pipeline(StandardScaler(), svm.SVC())
clf2.fit(X_train, y_train)
# Accuracy on the training split.
clf2.score(X_train, y_train)

In [None]:
# Accuracy of the SVM model on the held-out test split.
clf2.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of the SVM predictions on the held-out test split.
y_pred = clf2.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
# Annotate every cell with its count and label the axes, matching the annotated
# correlation heatmap earlier in the notebook; a bare colour grid does not show
# which cell is which. In a scikit-learn confusion matrix the rows are the true
# classes and the columns the predicted ones.
ax = sns.heatmap(cnf_matrix, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
 # 3-fold cross-validation of the k-NN model on all rows
 # (target and helper category columns removed from the features).
 cross_validate(
     neigh,
     data_df.drop(columns=['Outcome', 'glu_cat', 'age_cat', 'bmi_cat']),
     data_df['Outcome'],
     cv=3,
 )

In [None]:
 # 3-fold cross-validation of the SVM model on all rows
 # (target and helper category columns removed from the features).
 cross_validate(
     clf2,
     data_df.drop(columns=['Outcome', 'glu_cat', 'age_cat', 'bmi_cat']),
     data_df['Outcome'],
     cv=3,
 )