### Context
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

### Content
The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

### Features
* Pregnancies: Number of times pregnant
* Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
* BloodPressure: Diastolic blood pressure (mm Hg)
* SkinThickness: Triceps skin fold thickness (mm)
* Insulin: 2-Hour serum insulin (mu U/ml)
* BMI: Body mass index (weight in kg/(height in m)^2)
* DiabetesPedigreeFunction: Diabetes pedigree function
* Age: Age (years)
* Outcome: Class variable (0 or 1)

In [None]:
%matplotlib inline
RANDOM_STATE = 0

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
pd.set_option('display.float_format', '{:.3f}'.format)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, ParameterGrid, StratifiedKFold
from sklearn import metrics
from tqdm import tqdm

import eli5 
from eli5.sklearn import PermutationImportance

import itertools

import catboost as cb
from catboost import CatBoostClassifier
from catboost import Pool

In [None]:
def summary(df):
    """Return a per-column overview of ``df``.

    The result has one row per column of ``df`` with the column name
    (``index``), its dtype (``dtypes``), the number of null entries
    (``Missing``) and the number of distinct non-null values (``Uniques``).
    """
    overview = df.dtypes.to_frame('dtypes').reset_index()
    # .values drops the column-name index so the assignment is positional.
    overview['Missing'] = df.isnull().sum().values
    overview['Uniques'] = df.nunique().values
    return overview

In [None]:
def plot_cf_matrix_and_roc(model,
                           X_train,
                           y_train,
                           X_test,
                           y_test,
                           y_pred,
                           classes=(0, 1),
                           normalize=False,
                           cmap=plt.cm.Blues):
    """Plot a confusion matrix for ``y_pred`` and print a one-row metric table.

    Despite the name, no ROC curve is drawn; only the ROC-AUC value is
    reported, and it is computed from the hard labels in ``y_pred``.

    Parameters
    ----------
    model, X_train, y_train, X_test
        Unused. Kept so existing call sites keep working.
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Predicted labels for the same rows as ``y_test``; rounded before use.
    classes : sequence
        Tick labels for both axes of the matrix.
    normalize : bool
        If True, every row of the matrix is divided by its row total.
    cmap : matplotlib colormap
        Colormap used for the heatmap.

    Returns
    -------
    None
        The figure is shown and the metric table is printed.
    """
    # Round once; every metric below works on the same hard labels.
    y_hat = np.asarray(y_pred).round()
    classes = list(classes)

    cm = metrics.confusion_matrix(y_test, y_hat)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # An empty row would divide by zero; dividing by 1 leaves it at 0.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
        title, cell_format = "Normalized confusion matrix", "{:0.2f}"
    else:
        title, cell_format = 'Confusion matrix', "{}"

    # the main plot
    plt.figure(figsize=(15,5))

    # the confusion matrix -- drawn AFTER normalisation so that the colours,
    # the colour bar and the text threshold all refer to the same values
    plt.subplot(1,2,1)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # White text on dark cells, black text on light cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    def _fmt(value):
        # Metrics are reported as strings rounded to three decimals.
        return str(round(value, 3))

    # the result metrics
    summary_df = pd.DataFrame([[str(np.unique( y_pred )),
                               _fmt(metrics.precision_score(y_test, y_hat)),
                               _fmt(metrics.accuracy_score(y_test, y_hat)),
                               _fmt(metrics.recall_score(y_test, y_hat, average='binary')),
                               _fmt(metrics.roc_auc_score(y_test, y_hat)),
                               _fmt(metrics.cohen_kappa_score(y_test, y_hat)),
                               _fmt(metrics.f1_score(y_test, y_hat, average='binary'))]],
                              columns=['Class', 'Precision', 'Accuracy', 'Recall', 'ROC-AUC', 'Kappa', 'F1-score'])
    # print the metrics
    print("\n")
    print(summary_df)
    print("\n")

    plt.show()

In [None]:
def cross_val(X, y, param, cat_features='', class_weights = '', n_splits=3):
    """Run stratified K-fold cross-validation of a CatBoostClassifier.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (indexed with ``.iloc``).
    y : pandas.Series
        Binary target aligned with ``X``.
    param : dict
        Must contain ``iterations``, ``loss_function``, ``depth``,
        ``l2_leaf_reg``, ``eval_metric`` and ``early_stopping_rounds``.
        ``learning_rate`` and ``leaf_estimation_iterations`` are used when
        present (defaults: CatBoost's automatic rate and 10).
    cat_features : list or ''
        Categorical feature names/indices; ``''`` or ``None`` means none.
    class_weights : list or ''
        Class weights; ``''`` or ``None`` means unweighted.
    n_splits : int
        Number of folds.

    Returns
    -------
    float
        Mean over the folds of the metric named by ``param['eval_metric']``
        (accuracy for unrecognised names).

    Notes
    -----
    Each validation fold is also passed as ``eval_set`` with
    ``use_best_model=True``, so the fold used for early stopping is the same
    one that is scored; the reported scores are therefore optimistic.
    """
    def _given(value):
        # '' (the legacy default) and None both mean "not supplied". The
        # isinstance guard avoids an element-wise == on array-like values.
        return not (value is None or (isinstance(value, str) and value == ''))

    scorers = {
        'Recall': metrics.recall_score,
        'Accuracy': metrics.accuracy_score,
        'F1': metrics.f1_score,
        'AUC': metrics.roc_auc_score,
        'Kappa': metrics.cohen_kappa_score,
    }
    scorer = scorers.get(param['eval_metric'], metrics.accuracy_score)

    model_kwargs = dict(
        iterations=param['iterations'],
        loss_function=param['loss_function'],
        depth=param['depth'],
        l2_leaf_reg=param['l2_leaf_reg'],
        eval_metric=param['eval_metric'],
        # Previously dropped on the floor, which made any learning-rate
        # dimension of a parameter grid a no-op. None = CatBoost default.
        learning_rate=param.get('learning_rate'),
        leaf_estimation_iterations=param.get('leaf_estimation_iterations', 10),
        use_best_model=True,
        logging_level='Silent',
        od_type="Iter",
        early_stopping_rounds=param['early_stopping_rounds'],
    )
    if _given(class_weights):
        model_kwargs['class_weights'] = class_weights

    fit_kwargs = {}
    if _given(cat_features):
        fit_kwargs['cat_features'] = cat_features

    results = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for tr_ind, val_ind in skf.split(X, y):
        X_train_i, y_train_i = X.iloc[tr_ind], y.iloc[tr_ind]
        X_valid_i, y_valid_i = X.iloc[val_ind], y.iloc[val_ind]

        # A fresh model per fold so nothing carries over between folds.
        clf = CatBoostClassifier(**model_kwargs)
        clf.fit(X_train_i,
                y_train_i,
                eval_set=(X_valid_i, y_valid_i),
                **fit_kwargs)

        # predict
        y_pred = clf.predict(X_valid_i)
        y_hat = y_pred.round()

        # score the fold with the selected metric
        results.append(scorer(y_valid_i, y_pred))

        print('Classes: '+str(np.unique( y_pred )))
        print('Precision: '+str(round(metrics.precision_score(y_valid_i, y_hat),3)))
        print('Accuracy: '+str(round(metrics.accuracy_score(y_valid_i, y_hat),3)))
        print('Recall: '+str(round(metrics.recall_score(y_valid_i, y_hat, average='binary'),3)))
        print('Roc_Auc: '+str(round(metrics.roc_auc_score(y_valid_i, y_hat),3)))
        print('F1 score: '+str(round(metrics.f1_score(y_valid_i, y_hat, average='binary'),3)))
        # Running statistics over the folds completed so far.
        print('Mean for '+param['eval_metric']+' OOF prediction: ',np.mean(results))
        print('Standard deviation for '+param['eval_metric']+' OOF prediction: ',np.std(results))
        print("\n")
    return sum(results)/n_splits

In [None]:
def catboost_GridSearchCV(X, y, params, cat_features='', class_weights='', n_splits=5):
    """Exhaustively search ``params`` with ``cross_val`` and return the best set.

    Parameters
    ----------
    X, y
        Training features and target, forwarded to ``cross_val``.
    params : dict
        Mapping of parameter name to a list of candidate values, expanded
        with ``sklearn.model_selection.ParameterGrid``.
    cat_features, class_weights, n_splits
        Forwarded unchanged to ``cross_val``.

    Returns
    -------
    dict
        The parameter combination with the highest cross-validated score
        (``[]`` only if no combination produced a comparable score).
    """
    # Start below every possible score: metrics such as Kappa can be <= 0, and
    # a starting value of 0 would then leave the result empty.
    best_score = float('-inf')
    best_param = []
    for prms in tqdm(list(ParameterGrid(params)), ascii=True, desc='Params Tuning:'):
        score = cross_val(X, y, prms, cat_features, class_weights, n_splits)
        if score > best_score:
            best_score, best_param = score, prms
    print('Score: '+str(best_score))
    print('Params: '+str(best_param))
    return best_param

In [None]:
def check_target(df, target):
    """Plot the class counts of a binary target column and print the shares.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column.
    target : str
        Name of the column; rows equal to 1 count as YES, rows equal to 0 as NO.
    """
    # Pass the values as x explicitly: newer seaborn releases reserve the
    # first positional argument for `data`.
    sns.countplot(x=df[target])
    count_no = int((df[target] == 0).sum())
    count_yes = int((df[target] == 1).sum())
    total = count_no + count_yes
    # Guard against a frame that holds neither class.
    pct_of_no_sub = count_no / total * 100 if total else 0.0
    pct_of_sub = count_yes / total * 100 if total else 0.0
    print('{} {} % YES '.format(count_yes, pct_of_sub))
    print('{} {} % NO '.format(count_no, pct_of_no_sub))

In [None]:
def num_vs_ctr(df, var1, var2):
    """Tabulate, for each value of ``var1``, the count and the mean of ``var2``.

    Returns a frame with columns ``[var1, 'count', 'ctr%']`` ordered by
    descending count; ``count`` is the number of non-null ``var2`` entries in
    the group and ``ctr%`` is the group mean of ``var2``.
    """
    grouped = df[[var1, var2]].groupby(var1, as_index=False)
    rates = grouped.mean().sort_values(var2, ascending=False)
    sizes = grouped.count().sort_values(var2, ascending=False)
    # Left merge keeps the count ordering of `sizes`.
    table = sizes.merge(rates, on=var1, how='left')
    table.columns = [var1, 'count', 'ctr%']
    return table

def crosstab(df, features, target, label_cutoff = 'none'):
    """Plot a stacked feature/target bar chart and tabulate counts and rates.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is never modified.
    features : list of str
        Columns to cross-tabulate against ``target``.
    target : str
        Target column name.
    label_cutoff : int, 'none' or None
        If a positive integer and a feature has more distinct values than
        this, only the ``label_cutoff`` most frequent values are kept and all
        others are grouped under ``'Other'``.

    Returns
    -------
    pandas.DataFrame, list of pandas.DataFrame, or None
        The ``num_vs_ctr`` table when exactly one feature is given, a list of
        tables (in ``features`` order) when several are given, ``None`` when
        ``features`` is empty.
    """
    cutoff_active = (label_cutoff is not None
                     and label_cutoff != 'none'
                     and label_cutoff > 0)
    tables = []
    for feature in features:
        frame = df
        # Count uniques on the frame that was passed in, not on a global.
        if cutoff_active and df[feature].nunique() > label_cutoff:
            # select the most common values
            most_common_values = (df.groupby(feature)[target].count()
                                    .sort_values(ascending=False)
                                    .nlargest(label_cutoff))
            # Relabel on a copy so the caller's data is left untouched.
            frame = df.copy()
            frame[feature] = np.where(df[feature].isin(most_common_values.index),
                                      df[feature], 'Other')

        # plot the crosstab
        pd.crosstab(frame[feature], frame[target]).plot(kind='bar', figsize=(20,5), stacked=True)
        plt.title(feature+' / '+target)
        plt.xlabel(feature)
        plt.ylabel(feature+' / '+target)

        # Collect instead of returning here, so every feature gets processed.
        tables.append(num_vs_ctr(frame, feature, target))

    if not tables:
        return None
    if len(tables) == 1:
        return tables[0]
    return tables
        

In [None]:
## Import the data file
data = pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
data.head()

In [None]:
summary(data)

All the features in the data set are numerical. We will take a deeper look at them and check for outliers, distributions and missing values.

## Pregnancies

In [None]:
data['Pregnancies'].describe()

The feature also contains zeros, but these are not necessarily outliers, because some women have never been pregnant.

In [None]:
crosstab(data, ['Pregnancies'], 'Outcome')

### **Interpretation**

The data set includes only females at least 21 years old of Pima Indian heritage. During the pregnancy women can develop **gestational diabetes**. It is high blood sugar (glucose) that develops during pregnancy and usually disappears after giving birth.
It can happen at any stage of pregnancy, but is more common in the second or third trimester.

In women with gestational diabetes, blood sugar usually returns to normal soon after delivery. But if you've had gestational diabetes, you have a higher risk of getting type 2 diabetes. You'll need to be tested for changes in blood sugar more often.

In our data set we see that women with more pregnancies have a higher chance of developing diabetes.

## Was pregnant
Let's create a new feature indicating whether the woman has ever been pregnant. This feature could be helpful in interaction with other features like BMI or Insulin.

In [None]:
# Binary flag: 'Yes' when at least one pregnancy is recorded, otherwise 'No'.
data['Was_pregnant'] = np.where(data.Pregnancies > 0, 'Yes', 'No')

In [None]:
crosstab(data, ['Was_pregnant'], 'Outcome')

**Interpretation**
* Women who have been pregnant have a slightly higher risk of diabetes
* This feature is also very unbalanced: we have 5 times more women who have been pregnant than women who have not

## Glucose
The blood sugar level, blood sugar concentration, or blood glucose level is the concentration of glucose present in the blood of humans and other animals.

In [None]:
data['Glucose'].describe()

In [None]:
# Vertical box plot of the raw glucose values. The column is passed as y=
# because newer seaborn releases no longer accept it positionally.
plot = sns.boxplot(y='Glucose', data=data, color='green')

It seems that there are some outliers with values equal to 0. We need to fix this and replace the 0 values with the mean value.

In [None]:
# A glucose reading of 0 is treated as invalid. The fill value is the mean of
# the non-zero readings only; including the zeros would bias it downwards.
glucose_mean = data.loc[data['Glucose'] != 0, 'Glucose'].mean()
data['Glucose'] = data['Glucose'].replace(0, glucose_mean)

In [None]:
# Same vertical box plot after the zero readings have been replaced. The
# column is passed as y= because newer seaborn releases no longer accept it
# positionally.
plot = sns.boxplot(y='Glucose', data=data, color='green')

Let's check the correlation with the target variable 'Outcome'

In [None]:
crosstab(data, ['Glucose'], 'Outcome')

### **Interpretation**

The unit of measurement for the 2-hour OGTT in this dataset is assumed to be milligrams per deciliter (mg/dl). It can be converted to millimoles per liter (mmol/l) so that we can apply a qualitative test result to the numeric results. Multiplying the current values by 0.0555 converts them to mmol/l.

In [None]:
# Rescale glucose to mmol/l, assuming the raw column is in mg/dl
# (1 mg/dl is taken as 0.0555 mmol/l).
data['Glucose_Mmol'] = data['Glucose']*0.0555

In [None]:
crosstab(data, ['Glucose_Mmol'], 'Outcome')

Normal blood glucose level (tested while fasting) for non-diabetics is between 3.9 and 7.1 mmol/L (70 to 130 mg/dL). The global mean fasting plasma blood glucose level in humans is about 5.5 mmol/L (100 mg/dL); however, this level fluctuates throughout the day. Blood sugar levels for those without diabetes and who are not fasting should be below 6.9 mmol/L (125 mg/dL). The blood glucose target range for diabetics, according to the American Diabetes Association, should be 5.0–7.2 mmol/l (90–130 mg/dL) before meals, and less than 10 mmol/L (180 mg/dL) two hours after meals (as measured by a blood glucose monitor). 
Source: [Wikipedia](https://en.wikipedia.org/wiki/Blood_sugar_level)

### **Interpretation**

- The higher the level of blood sugar, the higher the chance of diabetes

## Glucose_Mmol_Range
Let's create a new feature for the glucose levels, based on the ranges in Wikipedia for adults

In [None]:
# Bucket glucose (mmol/l): below 3.9 is hypoglycemia, 7 and above is
# hyperglycemia, everything else (including missing values) is 'Normal'.
data['Glucose_Mmol_Range'] = np.select(
    [data.Glucose_Mmol < 3.9, data.Glucose_Mmol >= 7],
    ['Hypoglycemia', 'Hyperglycemia'],
    default='Normal',
)

* Hypoglycemia: Blood sugar level < 3.9 mmol/L (Source: [Wikipedia](https://en.wikipedia.org/wiki/Hypoglycemia))
* Hyperglycemia: Blood sugar level > 7 mmol/L (Source: [Wikipedia](https://en.wikipedia.org/wiki/Hyperglycemia))


In [None]:
crosstab(data, ['Glucose_Mmol_Range'], 'Outcome')

### **Interpretation**

- The data shows very clearly that people with hyperglycemia have a very high risk of developing diabetes

## BloodPressure
Blood pressure (BP) is the pressure of circulating blood against the walls of blood vessels. Most of this pressure results from the heart pumping blood through the circulatory system. When used without qualification, the term "blood pressure" refers to the pressure in the large arteries. Blood pressure is usually expressed in terms of the systolic pressure (maximum pressure during one heartbeat) over diastolic pressure (minimum pressure between two heartbeats) in the cardiac cycle. 

Source [Wikipedia](https://en.wikipedia.org/wiki/Blood_pressure)



In [None]:
data['BloodPressure'].describe()

In [None]:
# Vertical box plot of the raw blood-pressure values. The column is passed as
# y= because newer seaborn releases no longer accept it positionally.
plot = sns.boxplot(y='BloodPressure', data=data, color='green')

It seems that there are some outliers with values equal to 0. We need to fix this and replace the 0 values with the mean value.

In [None]:
# A blood pressure of 0 is treated as invalid. The fill value is the mean of
# the non-zero readings only; including the zeros would bias it downwards.
blood_pressure_mean = data.loc[data['BloodPressure'] != 0, 'BloodPressure'].mean()
data['BloodPressure'] = data['BloodPressure'].replace(0, blood_pressure_mean)

In [None]:
# Same vertical box plot after the zero readings have been replaced. The
# column is passed as y= because newer seaborn releases no longer accept it
# positionally.
plot = sns.boxplot(y='BloodPressure', data=data, color='green')

The feature list describes this column as diastolic blood pressure (mm Hg). The observed values, which range roughly between 40 and 120, are consistent with diastolic BP.

In [None]:
crosstab(data, ['BloodPressure'], 'Outcome')

## BloodPressure_Range
Let's create a new feature for the BP levels, based on the ranges in Wikipedia for adults

In [None]:
# Bucket diastolic blood pressure. The conditions are evaluated in order and
# the first match wins, so the bins are contiguous: non-integer values such as
# 84.5 or 89.5 land in the proper bin instead of falling through to 'Normal'.
# Missing values match nothing and keep the previous default, 'Normal'.
data['BloodPressure_Range'] = np.select(
    [
        data.BloodPressure < 80,
        data.BloodPressure < 85,
        data.BloodPressure < 90,
        data.BloodPressure < 100,
        data.BloodPressure < 110,
        data.BloodPressure >= 110,
    ],
    [
        'Optimal',
        'Normal',
        'High normal',
        'Grade 1 hypertension',
        'Grade 2 hypertension',
        'Grade 3 hypertension',
    ],
    default='Normal',
)

In [None]:
crosstab(data, ['BloodPressure_Range'], 'Outcome')

### **Interpretation**

- The data shows very clearly that people with grade 1, 2 or 3 hypertension have a very high risk of developing diabetes

## SkinThickness
Triceps skin fold thickness (mm). Open question: what does this feature tell us about diabetes risk?

In [None]:
crosstab(data, ['SkinThickness'], 'Outcome')

In [None]:
# A skin thickness of 0 is treated as invalid. The fill value is the mean of
# the non-zero readings only; including the zeros would bias it downwards.
skin_thickness_mean = data.loc[data['SkinThickness'] != 0, 'SkinThickness'].mean()
data['SkinThickness'] = data['SkinThickness'].replace(0, skin_thickness_mean)

In [None]:
crosstab(data, ['SkinThickness'], 'Outcome')

## Insulin
2-hour serum insulin (mu U/ml). Open question: how exactly is this feature defined and measured?

In [None]:
crosstab(data, ['Insulin'], 'Outcome')

## DiabetesPedigreeFunction
Diabetes pedigree function. Open question: what does this score represent and how is it computed?

In [None]:
crosstab(data, ['DiabetesPedigreeFunction'], 'Outcome')

### BMI
* Underweight: BMI is less than 18.5
* Normal weight: BMI is 18.5 to 24.9
* Overweight: BMI is 25 to 29.9
* Obese: BMI is 30 or more

In [None]:
data['BMI'].describe()

In [None]:
#data.BMI = data.BMI.replace(0,data.BMI.mean())

In [None]:
# Bucket BMI into the documented ranges: < 18.5 underweight, 18.5-24.9 normal,
# 25-29.9 overweight, >= 30 obese. The conditions are evaluated in order and
# the first match wins, so the bins are contiguous. This removes the old gap
# that left values in [29.9, 30) labelled 'Underweight' and the boundary slip
# that labelled 24.9 as 'Overweight'.
# NOTE(review): BMI values of 0 are not imputed before this step and are
# therefore labelled 'Underweight' -- confirm that this is intended.
data['BMI_Range'] = np.select(
    [
        data.BMI < 18.5,
        data.BMI < 25,
        data.BMI < 30,
        data.BMI >= 30,
    ],
    ['Underweight', 'Normal weight', 'Overweight', 'Obese'],
    default='Underweight',  # only reached for missing values, as before
)

In [None]:
crosstab(data, ['BMI_Range'], 'Outcome')

**Interpretation**

In [None]:
crosstab(data, ['Age'], 'Outcome')

In [None]:
summary(data)

In [None]:
# Interaction features: two columns are cast to strings and joined with '|',
# producing one string-valued column per pair. The commented-out lines are
# further combinations that are currently disabled.
#data['Age_BMI_Range']=data['Age'].astype('str')+'|'+data['BMI'].astype('str')
data['Age_Glucose_Mmol_Range']=data['Age'].astype('str')+'|'+data['Glucose_Mmol_Range'].astype('str')
data['Was_pregnant_BMI_Range']=data['Was_pregnant'].astype('str')+'|'+data['BMI_Range'].astype('str')
data['Was_pregnant_Insulin']=data['Was_pregnant'].astype('str')+'|'+data['Insulin'].astype('str')
#data['Was_pregnant_SkinThickness']=data['Was_pregnant'].astype('str')+'|'+data['SkinThickness'].astype('str')
#data['Age_Was_pregnant']=data['Age'].astype('str')+'|'+data['Was_pregnant'].astype('str')
#data['Age_Insulin']=data['Age'].astype('str')+'|'+data['Insulin'].astype('str')
#data['Age_SkinThickness']=data['Age'].astype('str')+'|'+data['SkinThickness'].astype('str')
#data['Was_pregnant_Glucose']=data['Was_pregnant'].astype('str')+'|'+data['Glucose'].astype('str')
#data['Age_BloodPressure_Range']=data['Age'].astype('str')+'|'+data['BloodPressure_Range'].astype('str')


In [None]:
# Remove two columns before modelling. 'Glucose' is kept in rescaled form as
# 'Glucose_Mmol', and 'Was_pregnant' is still present inside the
# 'Was_pregnant_*' interaction columns -- presumably the reason both are
# dropped here (TODO confirm).
data = data.drop(columns=[
    'Was_pregnant', 
    'Glucose'
 ])

## Split train and test

In [None]:
# Separate the features from the target. `columns=` replaces the positional
# axis argument, which current pandas releases no longer accept.
X = data.drop(columns='Outcome')
y = data['Outcome']
# Hold out 30% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=47)
X_train.shape, X_test.shape

### Check the distribution of the target (Outcome) for train and test

In [None]:
train_df = pd.concat([X_train, y_train], axis=1)
check_target(train_df, 'Outcome')

In [None]:
test_df = pd.concat([X_test, y_test], axis=1)
check_target(test_df, 'Outcome')

In [None]:
summary(data)

In [None]:
# Every training column whose dtype is neither int64 nor float64 is handed to
# CatBoost as a categorical feature.
cat_features = X_train.select_dtypes(exclude=['int64', 'float64']).columns.tolist()
cat_features

In [None]:
# int64 columns that take exactly two distinct values.
bool_features = [
    column
    for column in X_train.select_dtypes(include=['int64']).columns
    if X_train[column].nunique() == 2
]
bool_features

In [None]:
# int64 columns with more than two distinct values.
# NOTE(review): float64 columns are not included here -- confirm whether
# "numeric" is meant to cover them as well.
num_features = [
    column
    for column in X_train.select_dtypes(include=['int64']).columns
    if X_train[column].nunique() > 2
]
num_features

In [None]:
from sklearn.utils import class_weight
# 'balanced' weights are inversely proportional to the class frequencies.
# The arguments are passed by keyword because current scikit-learn releases
# reject them positionally.
# NOTE(review): the weights are derived from the full data set (train + test),
# not from y_train only -- confirm that this is intended.
cw = list(class_weight.compute_class_weight(class_weight='balanced',
                                             classes=np.unique(data['Outcome']),
                                             y=data['Outcome']))

In [None]:
# Candidate values for the grid search; every key maps to the list of values
# to try, and the grid is the cross product of all lists.
params = {'depth':[2, 3, 4, 5],
          'iterations':[1500],
          'loss_function': ['Logloss'],
          'l2_leaf_reg':np.logspace(-19,-20,3),  # three values from 1e-19 down to 1e-20
          'early_stopping_rounds': [500],
          'learning_rate':[0.01],
          'eval_metric':['F1']
}

# parameter tuning
# (disabled: the search is not run here; un-comment to re-run it)
#param = catboost_GridSearchCV(X_train, y_train, params, cat_features, cw)
#param

In [None]:
# pre-optimized parameters
param = {'depth': 3,
 'early_stopping_rounds': 500,
 'eval_metric': 'F1',
 'iterations': 1500,
 'l2_leaf_reg': 1e-19,
 'learning_rate': 0.01,
 'loss_function': 'Logloss',
 'leaf_estimation_iterations': 10
}

# create the model
clf2 = CatBoostClassifier(iterations=param['iterations'],
                        loss_function = param['loss_function'],
                        depth=param['depth'],
                        l2_leaf_reg = param['l2_leaf_reg'],
                        eval_metric = param['eval_metric'],
                        leaf_estimation_iterations = param['leaf_estimation_iterations'],
                        use_best_model=True,
                        early_stopping_rounds=param['early_stopping_rounds'],
                        class_weights = cw
)

# train the model
clf2.fit(X_train, 
        y_train,
        cat_features=cat_features,
        logging_level='Silent',
        eval_set=(X_test, y_test)
)

In [None]:
# Feature importances of the fitted model, evaluated on the training pool.
train_pool = Pool(X_train, label=train_df['Outcome'], cat_features=cat_features)
importances = clf2.get_feature_importance(train_pool)

feature_score = pd.DataFrame({'Feature': X_train.columns, 'Score': importances})
feature_score = feature_score.sort_values(by='Score', ascending=False)

# Bar chart of the ranking.
plt.rcParams["figure.figsize"] = (15,8)
ax = feature_score.plot('Feature', 'Score', kind='bar', color='c')
ax.set_title("Catboost Feature Importance Ranking", fontsize = 14)
ax.set_xlabel('')

# Write the rounded score just above each bar; bars and scores share the
# sorted row order, so zipping them pairs each bar with its own value.
rects = ax.patches
labels = feature_score['Score'].round(2)
for rect, label in zip(rects, labels):
    x_centre = rect.get_x() + rect.get_width()/2
    ax.text(x_centre, rect.get_height() + 0.35, label, ha='left', va='bottom')

plt.xticks(rotation=85)
plt.gca().invert_xaxis()

plt.show()
print(feature_score)


In [None]:
pred_catboost2_train = clf2.predict(X_train)

In [None]:
plot_cf_matrix_and_roc(clf2, X_train, y_train, X_train, y_train, pred_catboost2_train , classes=['NO','YES'])

In [None]:
print(metrics.classification_report(y_train, pred_catboost2_train))

In [None]:
pred_catboost2_train = clf2.predict(X_test)

In [None]:
plot_cf_matrix_and_roc(clf2, X_train, y_train, X_test, y_test, pred_catboost2_train , classes=['NO','YES'])

In [None]:
print(metrics.classification_report(y_test, pred_catboost2_train))

## Cross validate the model

In [None]:
score = cross_val(X_train, y_train, param, cat_features, cw, 10)

The model does not seem very stable: the standard deviation of F1 across the folds is 0.07, which is not so good!