In [None]:
# libraries we will use
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve,confusion_matrix,f1_score,precision_recall_curve
from sklearn.model_selection import GridSearchCV, cross_val_score,StratifiedShuffleSplit


Reading data and taking a quick look at it

In [None]:
# Load the diabetes table and keep an untouched copy next to the working frame.
csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
data = pd.read_csv(csv_path)
data_copy = data.copy()

# Preview the first rows.
data.head()

In [None]:
# Count each class and order the counts by label (0 first, then 1).
# sort_index() returns a new Series, so its result has to be kept; otherwise the
# hard-coded wedge labels below only match by coincidence of the frequency order.
outcome_values = data['Outcome'].value_counts().sort_index()
plt.pie(outcome_values.values, labels=['Negative', 'Positive'], autopct='%1.1f%%')
plt.title('Diabete result')
plt.show()

There are clearly enough examples of both classes, so we can split the data into train and test sets without trouble.

In [None]:
# Summarise the columns: dtypes and non-null counts.
data.info()

It looks like there are no missing values and no categorical variables, which makes the dataset easier to work with.

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()

Some columns have a minimum value of 0, which suggests that missing values may be encoded as 0, even though there are no NaN values in the dataset.

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr = data.corr()
fig, heat_ax = plt.subplots(figsize=(17, 14))
sns.heatmap(corr, annot=True, ax=heat_ax)
plt.show()

From the correlation matrix, we can see that no feature is highly correlated with Outcome, although Glucose stands out with a correlation of 0.47. There is also a moderate correlation between Pregnancies and Age, which is understandable. It can also be seen that BloodPressure and SkinThickness aren't correlated with Outcome.
Let's look at the distribution of features:

In [None]:
# One histogram per column, laid out row by row on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(18, 16))

for position, feature in enumerate(data.columns):
    grid_row, grid_col = divmod(position, 3)
    sns.histplot(data[feature], ax=axes[grid_row, grid_col])

There are several columns that can't be 0. If your blood pressure or glucose level is 0, then you are dead. BMI also can't be 0, as it is weight/(height\*height). Skin thickness can't be 0 either. Although insulin can be 0, it is unlikely that about half of the women in this dataset have 0 insulin. Let's compare insulin, glucose level and outcome:

In [None]:
# Glucose against insulin, coloured by diagnosis.
fig, scatter_ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=data, x='Glucose', y='Insulin', hue='Outcome', ax=scatter_ax)
plt.show()

I am no medical expert, but I know that 0 insulin leads to type-I diabetes, and in this scatterplot we see several patients with an insulin level of 0 who are not diagnosed with diabetes, which means there is missing data. To be honest, I can't be sure that all of those 0 insulin levels are missing values; nevertheless, I will fill them. In the scatterplot we can also see that the Outcome classes don't have the same distribution, at least in the Glucose-Insulin relation, so when we impute values we have to take the class into account. Let's also look at how many zeros there are in each column.

In [None]:
# Columns in which a value of 0 is treated as a missing measurement.
missing_cols = ['Glucose', 'Insulin', 'SkinThickness', 'BloodPressure', 'BMI']
total_rows = len(data)

# Number of zero entries per column.
missing_counts = {col: (data[col] == 0).sum() for col in missing_cols}

fig, ax = plt.subplots(figsize=(13, 10))
sns.barplot(x=list(missing_counts), y=list(missing_counts.values()), ax=ax)

# Write each bar's share of all rows just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    share_label = '{:1.2f}%'.format(bar_height * 100 / total_rows)
    ax.text(bar.get_x() + bar.get_width() / 2, bar_height + 3, share_label, ha="center")

ax.set_title('Distribution of missing values')
plt.show()

Let's mark 0 values as NaN to make things easier

In [None]:
# Turn the placeholder zeros into NaN for every affected column in one pass.
# replace() builds new float columns, so the integer columns are swapped out as
# a whole instead of being upcast implicitly by writing None into them via .loc.
data[missing_cols] = data[missing_cols].replace(0, np.nan)

In [None]:
# One box plot per zero-cleaned column, split by Outcome.
fig, ax = plt.subplots(2, 3, figsize=(15, 10))
n_rows = data.shape[0]
for panel, col in zip(ax.flat, missing_cols):
    # A constant empty x category puts both Outcome boxes in a single group.
    sns.boxplot(y=data[col], x=[""] * n_rows, hue=data['Outcome'], ax=panel)

# The last cell of the grid is not drawn on by the loop above; hide it.
ax[1][2].set_visible(False)

Now we can clearly see that we can't simply impute one overall median into the missing entries of each column. We should take Outcome into account.

In [None]:

# Fill every missing entry with the median of its own Outcome class.
# NOTE(review): the fill value depends on Outcome, i.e. the target label is used
# to build the features, and the medians are taken over all rows of `data`.
for col in missing_cols:
    is_missing = data[col].isna()
    for label in (0, 1):
        in_class = data['Outcome'] == label
        # The median ignores NaN, so only observed values of this class contribute.
        class_median = data.loc[in_class, col].median()
        data.loc[in_class & is_missing, col] = class_median

Let's look at Age, Pregnancies and BMI with regard to Outcome. Even though they are continuous variables, it doesn't make sense to use them as they are.

In [None]:
# BMI as a stacked histogram; Age and Pregnancies as per-value counts, all split by Outcome.
fig, ax = plt.subplots(3, 1, figsize=(15, 15))
bmi_ax, age_ax, pregnancies_ax = ax

sns.histplot(x=data['BMI'], hue=data['Outcome'], multiple='stack', ax=bmi_ax)
for feature, panel in (('Age', age_ax), ('Pregnancies', pregnancies_ax)):
    sns.countplot(x=data[feature], hue=data['Outcome'], ax=panel)

plt.show()

Older people's chances of having diabetes are slightly higher than younger people's, but that is not very clear from the bar plot (look at ages 37, 38 and 39). Let's divide these features into ranges and treat them as categorical variables.

In [None]:
# Cut each feature into five equal-frequency bins (quintiles).
binned_from = {
    'CategoricalAge': 'Age',
    'CategoricalBMI': 'BMI',
    'CategoricalPregnancies': 'Pregnancies',
}
for binned_name, source_name in binned_from.items():
    data[binned_name] = pd.qcut(data[source_name], q=5)

# Class counts per bin, one panel per binned feature.
fig, ax = plt.subplots(3, 1, figsize=(15, 15))
plot_order = ('CategoricalBMI', 'CategoricalAge', 'CategoricalPregnancies')
for panel, binned_name in zip(ax, plot_order):
    sns.countplot(x=data[binned_name], hue=data['Outcome'], ax=panel)

plt.show()

It now looks much better, let's one-hot encode these categorical columns and drop previous features

In [None]:
# One-hot encode the binned columns, then discard the raw features they were derived from.
data = pd.get_dummies(data).drop(columns=['Age', 'Pregnancies', 'BMI'])

In [None]:
# Features that stay numeric (not binned above).
continuous_variables = ['Glucose','BloodPressure','SkinThickness','Insulin','DiabetesPedigreeFunction']

# pairplot is a figure-level seaborn function that creates its own figure, so a
# preceding plt.figure(...) call would only leave an empty figure behind.
sns.pairplot(data[continuous_variables+['Outcome']],hue='Outcome')

There are outliers in the data; we will deal with them after checking the accuracies.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous columns.
# The scaled array is assigned directly, so values are written back by position.
# Wrapping it in a new DataFrame would give it a fresh 0..n-1 index and make the
# assignment depend on `data` having exactly that index.
# NOTE(review): the scaler is fitted on every row of `data`, before any
# train/test split is made.
scaler = StandardScaler()
data[continuous_variables] = scaler.fit_transform(data[continuous_variables])

Since the classes are not equally represented, we should use StratifiedShuffleSplit to keep the class ratio the same in the train and test sets.

In [None]:
# Separate features from the target.
data_x = data.drop(columns=['Outcome'])
data_y = data['Outcome']

# Single stratified 70/30 split with a fixed seed.
sss = StratifiedShuffleSplit(test_size=0.3, n_splits=1, random_state=4321)
train_val_index, test_index = next(sss.split(data_x, data_y))

# split() yields positional indices, so select with .iloc for the features and
# the labels alike; label-based [] only agrees with it on a default RangeIndex.
X_train, X_test = data_x.iloc[train_val_index], data_x.iloc[test_index]
y_train, y_test = data_y.iloc[train_val_index], data_y.iloc[test_index]

# Renumber the training rows from 0; reassign rather than mutating a slice in place.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [None]:
# Candidate classifiers to compare.
models = [LogisticRegression(max_iter=10000),
          KNeighborsClassifier(),
          RandomForestClassifier(random_state=42),
          GradientBoostingClassifier(random_state=42)]

# Mean cross-validated score (as a percentage) per model, keyed by class name.
scores = {
    candidate.__class__.__name__: np.mean(cross_val_score(candidate, X_train, y_train)) * 100
    for candidate in models
}

ax = sns.barplot(y=list(scores), x=list(scores.values()), orient='h')

# Print each score in the middle of its bar.
for bar in ax.patches:
    bar_width = bar.get_width()
    ax.text(bar_width / 2, bar.get_y() + 0.5, '{:1.2f}%'.format(bar_width))

RandomForestClassifier looks okay, so let's fine-tune it.

In [None]:
# Search grid: 10 forest sizes x 20 depth limits = 200 candidates,
# each fitted on 5 folds (1000 fits in total).
params = {'n_estimators': np.arange(100, 1001, 100),
          'max_depth': np.arange(2, 41, 2)}

# Seed the forest so the search returns the same result on every run, matching
# the seeded RandomForestClassifier used in the model comparison.
rfc_cv = GridSearchCV(RandomForestClassifier(random_state=42),
                      param_grid=params,
                      cv=5,
                      verbose=2,
                      n_jobs=5,
                      scoring='f1').fit(X_train, y_train)

In [None]:
# Keep the estimator that the grid search selected as best.
rfc_model = rfc_cv.best_estimator_
# Show the best cross-validated score together with the parameters that achieved it.
rfc_cv.best_score_,rfc_cv.best_params_

In [None]:
def scoring(model, X_test, y_test):
    """Print classification metrics for ``model`` and plot its confusion matrix.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` (and ideally ``predict_proba``
        or ``decision_function``).
    X_test : feature matrix to evaluate on.
    y_test : true binary labels for ``X_test``.

    Recall, precision and F1 are computed from the hard predictions. ROC-AUC is a
    ranking metric, so it is computed from continuous scores whenever the model
    provides them and only falls back to the hard predictions otherwise.
    """
    y_pred = model.predict(X_test)

    if hasattr(model, 'predict_proba'):
        # Column 1 is taken as the positive class (binary problem assumed).
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred

    rfc_mat = confusion_matrix(y_test, y_pred)
    print('Recall: ', recall_score(y_test, y_pred))
    print('Precision: ', precision_score(y_test, y_pred))
    print('Roc-auc score: ', roc_auc_score(y_test, y_score))
    print('F1 score: ', f1_score(y_test, y_pred))

    # confusion_matrix puts true classes on rows and predictions on columns;
    # heatmap draws rows along the y axis and columns along the x axis.
    sns.heatmap(rfc_mat, annot=True, fmt='1')
    plt.xlabel('Predictions')
    plt.ylabel('True classes')
    plt.show()
    
    

In [None]:
# Evaluate the tuned random forest on the test split.
scoring(rfc_model, X_test, y_test)

This is sensitive data, and we must increase recall in order to catch as many diabetic patients as possible. Let's look at precision and recall and see if we can choose a better decision threshold.

In [None]:
# Class probabilities on the test split; column 1 is used as the score.
predict_proba = rfc_model.predict_proba(X_test)
positive_scores = predict_proba[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, positive_scores)
# Append a final threshold of 1.0 so that thresholds can be plotted against
# precision and recall, which both have one more element.
thresholds = np.append(thresholds, 1.0)


In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
pr_ax, threshold_ax = ax

# Left panel: precision against recall, with a guide line at recall = 0.9.
pr_ax.grid()
pr_ax.plot(recall, precision)
pr_ax.set(xlabel='Recall', ylabel='Precision')
pr_ax.plot([0.9, 0.9], [min(precision), 1], 'g--')

# Right panel: precision and recall against the threshold, with a guide line at 0.9.
threshold_ax.grid()
threshold_ax.plot(thresholds, precision, 'r', label='Precision')
threshold_ax.plot(thresholds, recall, label='Recall')
threshold_ax.set_xlabel('Thresholds')
threshold_ax.legend()
threshold_ax.plot([min(thresholds), 1], [0.9, 0.9], 'g--')

plt.show()

It looks like we can get about 0.85 precision at 0.9 recall. We can pick the threshold at which recall is still at least 0.9 while precision isn't too low.

In [None]:
# argmin finds the first position where recall drops below 0.90; the entry just
# before it is the highest threshold that still reaches the target recall.
ind = np.argmin(recall >= 0.90)-1
threshold = thresholds[ind]
print('Threshold is {}'.format(threshold))

# precision_recall_curve reports recall[i] for predictions with
# score >= thresholds[i], so the comparison has to be inclusive; a strict '>'
# drops samples scoring exactly at the threshold and can push recall below target.
# NOTE(review): the threshold is selected on the same test split it is evaluated on.
y_pred = (predict_proba[:,1] >= threshold).astype(np.int32)
sns.heatmap(confusion_matrix(y_test, y_pred), annot = True,fmt = '1')
plt.show()

In [None]:
# ROC-AUC measures how well the scores rank positives above negatives, so it is
# computed from the predicted probabilities rather than from thresholded labels.
print('AUC:', roc_auc_score(y_test, predict_proba[:, 1]))

We can get 90% without handling outliers, so I will leave them as they are. This is my first time sharing a notebook here, and I have surely made mistakes. Please tell me what I can improve. If you like my work, consider upvoting.