In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction
This is my first data science project on Kaggle. Using the Pima Indians Diabetes dataset, I will build a classification model that predicts whether a patient has diabetes.

I use the scikit-learn and XGBoost libraries to prepare the data, perform a grid search, and evaluate the model with various metrics.

# Prepare the dataset

In [None]:
# Load the diabetes CSV into a DataFrame, then display per-column summary
# statistics as the cell output.
csv_path = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
input_data = pd.read_csv(csv_path)
input_data.describe()

At first glance, the dataset contains only numerical features and is small: 768 rows and 9 columns (8 features plus the `Outcome` label).

In [None]:
# Preview the first five rows of the loaded data.
input_data.head(n=5)

Since all features are numerical and there are no missing values, there is no need to encode categorical features or to impute missing values.

In [None]:
# split data
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Separate the binary target column from the feature columns.
y = input_data.Outcome
X = input_data.drop("Outcome", axis=1)

# Hold out 20% of the rows as a test set.
# - random_state fixes the shuffle so the split (and every metric computed
#   from it) is the same on each "Restart & Run All".
# - stratify=y keeps the proportion of each Outcome class the same in the
#   train and test subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y
)

# Training and Prediction
For model selection, I used `XGBClassifier`, since this is a classic binary classification problem. I also demonstrate a grid search with `sklearn.model_selection.GridSearchCV` to find the best hyperparameters.

In [None]:
# use xgboost and fit the model
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Base estimator that GridSearchCV re-fits for every parameter combination.
model = XGBClassifier(use_label_encoder=False, verbosity=0)

# Parameters to be searched: 5 learning rates x 5 ensemble sizes.
# NOTE: the key has to be `n_estimators` (plural), which is the name of
# XGBClassifier's number-of-trees parameter; with a misspelled key the grid
# would not vary the number of trees at all.
params = {'learning_rate': np.arange(0.01, 0.1, 0.02),
          'n_estimators': np.arange(100, 1000, 200)}

# Stratified 5-fold cross-validation; seeding the shuffle makes the fold
# assignment, and therefore the selected parameters, reproducible.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate every combination in `params` with F1 as the selection metric.
gscv = GridSearchCV(estimator=model, param_grid=params, cv=folds, scoring='f1')
gscv.fit(X_train, y_train)
print("Best Parameters: ", gscv.best_params_)

Lastly, I performed the model evaluation with various metrics using `sklearn.metrics.classification_report`.

In [None]:
# evaluate the model on test set
from sklearn.metrics import classification_report

# Take the best estimator found by the grid search and predict labels for
# the held-out test features.
my_model = gscv.best_estimator_
predictions = my_model.predict(X_test)

# Build the classification report for the test labels, then print it.
report = classification_report(y_test, predictions)
print(report)

# Conclusion
In this notebook, I used scikit-learn and XGBoost to build a model that predicts diabetes. Things to improve include model selection, feature engineering, etc.