# EDA and simple baselines 

In this notebook I will do an Exploratory Data Analysis (EDA) of this dataset and create some baselines with Linear Regression, Random Forest, Logistic Regression, Gradient Boost, and XGBoost.

# 0. Importing dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Defining plotting style: use seaborn's "ticks" style, with color codes enabled,
# for the plots drawn in this notebook
sns.set(style="ticks", color_codes=True)

# 1. EDA

## 1.1 Getting the data
First, let's create a dataframe from the `.csv` file.

In [None]:
# Load the Pima Indians Diabetes CSV from the Kaggle input directory into a dataframe
df = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

## 1.2 Checking statistics and correlations of the data

Let's check the statistics and see if there are any null values in the data.

In [None]:
# Summary statistics of the dataframe's columns
df.describe()

In [None]:
# Report whether any cell of the dataframe is null
has_missing = df.isnull().values.any()
if has_missing:
    print("There is missing values in the data, you need to preprocess those values.")
else:
    print("No missing values in the data.")

In [None]:
# Box plot of Insulin grouped by the number of pregnancies
sns.boxplot(x="Pregnancies", y="Insulin", data=df)

In order to visualize the correlations between the variables, let's do a pair plot using seaborn! I chose to plot only the lower triangle, since the upper triangle would just mirror it.

In [None]:
# Pairwise plots of every column, coloured by Outcome; corner=True draws only the
# lower triangle. The trailing ";" hides the returned object's repr in the cell output.
sns.pairplot(df, hue="Outcome", markers=["o", "s"], corner=True);

Looking at the pairplot, we see that there is some data with Blood Pressure = 0, which seems odd... Let's look into it further.

In [None]:
# Histogram of BloodPressure with a kernel-density overlay.
# `sns.distplot` is deprecated (seaborn >= 0.11) and scheduled for removal;
# `histplot` with `kde=True` on a density scale draws the equivalent figure.
sns.histplot(df.BloodPressure.dropna(), kde=True, stat="density");

Analysing this, we see that it is highly likely that null values are written as 0, thus we can remove them.

In [None]:
# Summary statistics restricted to the rows whose BloodPressure is recorded as 0
df.loc[df['BloodPressure'].eq(0)].describe()

We see that there are 35 rows that don't have data for insulin and blood pressure. Since those are important factors for diabetes, I chose to delete these rows, as they seem to be out of place. We shall also delete other data points that have 0 as the value in columns such as BMI and Glucose. Let's analyze the distributions:

In [None]:
# Side-by-side distributions of the columns suspected of storing missing values as 0.
# `sns.distplot` is deprecated (seaborn >= 0.11) and scheduled for removal;
# `histplot` with `kde=True` on a density scale draws the equivalent figure.
fig, axs = plt.subplots(ncols=3, figsize=(20,10))
for ax, column in zip(axs, ["Glucose", "BMI", "Insulin"]):
    sns.histplot(df[column], kde=True, stat="density", ax=ax)
plt.show()

So, analysing those plots, we see that we can discard the 0 values for Glucose and BMI, because they are probably null values.

In [None]:
# Keep only the rows where BloodPressure, BMI and Glucose are all non-zero
# (a 0 in any of these columns is treated as a missing measurement)
nonzero_rows = (
    df['BloodPressure'].ne(0)
    & df['BMI'].ne(0)
    & df['Glucose'].ne(0)
)
df_clean = df[nonzero_rows]
df_clean.describe()

## 1.3 Creating a training/test split

Now we create a training/test split in order to see how our model works for unseen data.

In [None]:
# Target: the Outcome column; features: every remaining column
y = df_clean["Outcome"]
x = df_clean.drop(columns="Outcome")

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the cleaned rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

# 2. Creating models

Let's create some models to see which one works better.

## 2.1 Linear Regression

In [None]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Build the linear regression model and train it on the training split
# (`fit` returns the estimator itself, so creation and training are chained)
regr = linear_model.LinearRegression().fit(X_train, y_train)

# Continuous predictions for the held-out test rows
diabetes_y_pred = regr.predict(X_test)

In [None]:
# Score the continuous predictions against the true test labels
test_mse = mean_squared_error(y_test, diabetes_y_pred)
test_r2 = r2_score(y_test, diabetes_y_pred)  # 1 is a perfect prediction

# Fitted coefficients, followed by the two scores rounded to two decimals
print('Coefficients: \n', regr.coef_)
print('Mean squared error: %.2f' % test_mse)
print('Coefficient of determination: %.2f' % test_r2)

Let's consider y_pred > 0.5 as patients with diabetes and y_pred <= 0.5 as patients without diabetes, and evaluate our accuracy.

In [None]:
# Threshold the regression output at 0.5 to get binary labels (1 = diabetes, 0 = no diabetes)
y_pred = (diabetes_y_pred > 0.5).astype(int)

In [None]:
# Percentage of test rows whose thresholded prediction matches the true label
n_correct = sum(y_pred == y_test)
accuracy_pct = np.around(n_correct / len(y_test) * 100, 1)
print(f"Accuracy: {accuracy_pct}%")

### 2.1.2 Evaluating the model using ROC Curve

In [None]:
from sklearn.metrics import roc_curve, auc

def roc(y_test, y_pred, model_name, title="ROC", ax=None):
    """Plot the ROC curve of a model and show its AUC in the legend.

    The curve is added to ``ax``, or to the current matplotlib axes when ``ax``
    is None. No new figure is created and ``plt.show()`` is not called, so
    consecutive calls overlay their curves on the same axes.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Score of the positive class for each sample (a probability or any
        continuous output); passed unchanged to ``roc_curve``.
    model_name : str
        Name used as the prefix of the legend entry.
    title : str, optional
        Title of the plot.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).

    Returns
    -------
    None
    """
    if ax is None:
        ax = plt.gca()

    fpr, tpr, _ = roc_curve(y_test, y_pred)
    roc_auc = auc(fpr, tpr)
    lw = 2

    # Model curve (colour left to the axes' colour cycle so overlaid models differ)
    ax.plot(fpr, tpr,
            lw=lw, label=f'{model_name} ROC curve area = {roc_auc:0.2f}')
    # Diagonal reference line from (0, 0) to (1, 1)
    ax.plot([0, 1], [0, 1], color='red', lw=lw, linestyle='--')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])  # small headroom above TPR = 1
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend(loc="lower right")

## 2.2 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees, a single job and a fixed seed
randomforest = RandomForestClassifier(
    n_estimators=100,
    n_jobs=1,
    random_state=0,
)
randomforest.fit(X_train, y_train)

In [None]:
# Class probabilities for the test rows
y_pred_RF = randomforest.predict_proba(X_test)

# The most probable class is the predicted label
rf_labels = y_pred_RF.argmax(axis=1)
rf_accuracy = np.around(sum(rf_labels == y_test) / len(y_test) * 100, 1)
print(f"Accuracy: {rf_accuracy}%")

# ROC curve built from column 1 of the probabilities
roc(y_test, y_pred_RF[:,1], "Random Forest")

## 2.3 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

# Rescale the training features before fitting
X_scaled = preprocessing.scale(X_train)

# Logistic regression with default settings, trained on the rescaled features
Model = LogisticRegression()
Model.fit(X_scaled, y_train);

In [None]:
# Scale the test features with the TRAINING mean/std. Standardising the test set
# with its own statistics (`preprocessing.scale(X_test)`) gives the model features
# on a different scale from the one it was fitted on.
# `StandardScaler().fit(X_train)` reproduces the transform that
# `preprocessing.scale(X_train)` applied to the training data.
scaler = preprocessing.StandardScaler().fit(X_train)
X_scale_test = scaler.transform(X_test)

# Class probabilities, accuracy of the most probable class, and ROC curve
y_pred_Log = Model.predict_proba(X_scale_test)
print(f"Accuracy: {np.around(sum(np.argmax(y_pred_Log, axis=1) == y_test)/len(y_test)*100,1)}%")
roc(y_test, y_pred_Log[:, 1], "Logistic regression")

## 2.4 Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 100 estimators of depth up to 10, at most 3 features per split,
# learning rate 0.05, fixed seed
GB = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_features=3,
    max_depth=10,
    random_state=0,
)
GB.fit(X_train, y_train)

In [None]:
# Class probabilities for the test rows
y_pred_GB = GB.predict_proba(X_test)

# The most probable class is the predicted label
gb_labels = y_pred_GB.argmax(axis=1)
gb_accuracy = np.around(sum(gb_labels == y_test) / len(y_test) * 100, 1)
print(f"Accuracy: {gb_accuracy}%")

# ROC curve built from column 1 of the probabilities
roc(y_test, y_pred_GB[:,1], "Gradient Boost")

## 2.5 XGBoost

In [None]:
from xgboost import XGBRegressor, XGBClassifier

# Early stopping needs a validation set. Carve it out of the training data so the
# test set stays unseen: monitoring (X_test, y_test) would choose the number of
# boosting rounds using the very data the models are scored on afterwards.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Define the models. `early_stopping_rounds` is given to the constructor because
# recent XGBoost releases no longer accept it as a `fit()` argument.
XGBR = XGBRegressor(n_estimators=1000, learning_rate=0.05, early_stopping_rounds=5)
XGBC = XGBClassifier(n_estimators=1000, learning_rate=0.05, early_stopping_rounds=5)

# Fit the models; training stops once the validation metric has not improved
# for 5 consecutive rounds
XGBR.fit(X_fit, y_fit,
         eval_set=[(X_val, y_val)],
         verbose=0)

XGBC.fit(X_fit, y_fit,
         eval_set=[(X_val, y_val)],
         verbose=0)

In [None]:
# XGBoost classifier: class probabilities, accuracy of the most probable class, ROC.
# The curves are labelled "XGBoost ..." so they are not confused with the
# scikit-learn "Gradient Boost" model evaluated earlier.
y_pred_XGBC = XGBC.predict_proba(X_test)
print(f"Classifier Accuracy: {np.around(sum(np.argmax(y_pred_XGBC, axis=1) == y_test)/len(y_test)*100,1)}%")
roc(y_test, y_pred_XGBC[:, 1], "XGBoost Classifier")

# XGBoost regressor: continuous output thresholded at 0.5 for the accuracy,
# raw output used as the score for the ROC curve
y_pred_XGBR = XGBR.predict(X_test)
y_pred = (y_pred_XGBR > 0.5).astype(int)
print(f"Regressor Accuracy: {np.around(sum(y_pred == y_test)/len(y_test)*100,1)}%")
roc(y_test, y_pred_XGBR, "XGBoost Regressor")

# Analysing all models

We see that all the models have similar ROC curves, and the one that has the best accuracy and ROC curve area is the XGBoost Regressor.

In [None]:
# Overlay the ROC curve of every model on the same axes for a direct comparison.
# Each entry pairs a legend label with that model's test-set scores.
model_scores = [
    ("Linear regression", diabetes_y_pred),
    ("Random Forest", y_pred_RF[:,1]),
    ("Logistic regression", y_pred_Log[:, 1]),
    ("Gradient Boost", y_pred_GB[:,1]),
    ("XGBoost Regressor", y_pred_XGBR),
    ("XGBoost Classifier", y_pred_XGBC[:, 1]),
]
for model_label, test_scores in model_scores:
    roc(y_test, test_scores, model_label)

### If you like this notebook, please upvote! :)