In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory, echoing every file found, and collect the
# paths so the dataset to load is chosen deterministically instead of being
# whichever file os.walk happened to yield last.
all_paths = []
csv_paths = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        found_path = os.path.join(dirname, filename)
        print(found_path)
        all_paths.append(found_path)
        if filename.lower().endswith('.csv'):
            csv_paths.append(found_path)

# Prefer CSV files; fall back to any file so non-".csv" inputs still work.
candidate_paths = csv_paths or all_paths
if not candidate_paths:
    # Fail here with a clear message rather than with a NameError on
    # `file_path` when the dataset is loaded.
    raise FileNotFoundError("No input file found under '/kaggle/input'")

# With several candidates present, the last one in sorted order is used.
file_path = sorted(candidate_paths)[-1]

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Visualisation :
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Read the file at `file_path` into a DataFrame, report its shape,
# and preview the first record.
data = pd.read_csv(file_path)
dataset_shape = data.shape
print('Shape of the data:', dataset_shape)
data.head(n=1)

In [None]:
# Summary statistics for the columns of `data`, as produced by DataFrame.describe().
data.describe()

In [None]:
# Inspect column dtypes and non-null counts via DataFrame.info().
data.info()

### Exploratory Data Analysis (EDA)

In [None]:

# Horizontal count plot of the target column; each bar is labelled with the
# share of all records it represents.
sns.set_style(style="darkgrid")
plt.figure(figsize=(10, 8))
ax = sns.countplot(y=data['Outcome'], data=data, palette="Set3")
ax.set_title('Distribution of Outcome')

total = len(data['Outcome'])
for bar in ax.patches:
    bar_length = bar.get_width()  # horizontal bars: the width carries the count
    label = f'{100 * bar_length / total:.1f}%'
    # Place the label just past the end of the bar, vertically centred on it.
    label_x = bar.get_x() + bar_length + 0.02
    label_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(label, (label_x, label_y))


In [None]:
# Distribution of Age:
    
# Left panel: distribution of Age (histogram plus KDE).
# Right panel: Age broken down by Outcome.
# The panels deliberately do NOT share a y-axis: the left y-axis is a density
# while the right one carries Age values, and forcing a common scale squashes
# the histogram flat against the x-axis.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 8))

# histplot(kde=True, stat='density') is the supported replacement for the
# deprecated sns.distplot and gives an equivalent histogram-plus-KDE view.
sns.histplot(data=data, x='Age', kde=True, stat='density', ax=ax1)
ax1.set_title('Distribution of Age')

sns.boxplot(x='Outcome', y='Age', data=data, ax=ax2)
ax2.set_title('Outcome Vs Age');  # trailing ';' hides the Text(...) repr

In [None]:
# Show the first row of `data` again as a column reference for the plots that follow.
data.head(1)

In [None]:
# Scatter of Glucose against Age, coloured by Outcome.
# The figure is created first and the style set before the axes are added,
# so the darkgrid style applies to the axes that get drawn on.
fig = plt.figure(figsize=(10, 8))
sns.set_style(style="darkgrid")
ax = fig.add_subplot(111)
sns.scatterplot(data=data, x='Age', y='Glucose', hue='Outcome', ax=ax)
ax.set_title('Age Vs Glucose')

In [None]:
# Scatter of Insulin against Age, coloured by Outcome using the Dark2 palette.
plt.figure(figsize=(10, 8))
sns.set_style(style="darkgrid")
insulin_ax = sns.scatterplot(
    data=data,
    x='Age',
    y='Insulin',
    hue='Outcome',
    palette='Dark2',
)
insulin_ax.set_title('Age Vs Insulin')

In [None]:
# Show the first row of `data` again as a column reference for the next plots.
data.head(1)

In [None]:
# Blood pressure against BMI (coloured by Outcome), next to blood pressure per
# Outcome class. Both panels plot BloodPressure on y, so a shared y-axis is valid.
# The style is set BEFORE the axes are created; set afterwards, it would not
# apply to this figure.
sns.set_style(style="darkgrid")
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 8), sharey=True)

sns.scatterplot(y='BloodPressure', x='BMI', hue='Outcome', data=data, palette='Dark2', ax=ax1)
ax1.set_title('BP Vs BMI')

sns.boxplot(x='Outcome', y='BloodPressure', data=data, ax=ax2)
ax2.set_title('Diabetic Vs BP');  # trailing ';' hides the Text(...) repr


In [None]:
# Scatter of SkinThickness against BMI, coloured by Outcome.
plt.figure(figsize=(10, 8))
sns.set_style(style="darkgrid")
sns.scatterplot(y='SkinThickness', x='BMI', hue='Outcome', data=data, palette='Dark2')
# Title spelling corrected (was "thickeness").
plt.title('BMI Vs Skin thickness')

> **Observations:**

1. The data is slightly imbalanced with respect to `Outcome`.
2. Higher glucose levels are associated with a higher chance of diabetes.
3. BMI and skin thickness show a roughly linear relationship.

In [None]:
# Percentage of rows whose BloodPressure value is exactly zero.
zero_bp_rows = data.loc[data['BloodPressure'] == 0]
print('% of records with BP ZERO :', len(zero_bp_rows) * 100 / len(data))

In [None]:
# Remove outliers: keep only rows in which every feature lies within
# 3 standard deviations of that feature's mean.
from scipy import stats

print('Initial shape of the data :', data.shape)

# Score the feature columns only. Filtering on the target label as well would
# drop every sample of a sufficiently rare class, because its z-score alone
# would exceed the threshold.
feature_columns = data.columns.drop('Outcome')
feature_z = np.abs(stats.zscore(data[feature_columns]))
data = data[(feature_z < 3).all(axis=1)]

print('shape of the data after removing outliers :', data.shape)

###  Model Building :

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from catboost import CatBoostClassifier

In [None]:
# Separate the target vector (Y) from the feature matrix (X).
target_column = 'Outcome'
Y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
# Hold out 10% of the rows for testing.
# random_state makes the split (and every metric downstream) reproducible
# across kernel restarts; stratify=Y keeps the class proportions of the
# target the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42, stratify=Y
)
print('Shape of the train data:', X_train.shape)
print('Shape of the test data:', X_test.shape)

In [None]:
# Train a 10-tree random forest with a fixed seed on the training split.
# `fit` returns the estimator itself, so construction and training are chained;
# the bare name on the last line displays the fitted model.
model_rf = RandomForestClassifier(random_state=42, n_estimators=10).fit(X_train, y_train)
model_rf

* Predicting on unseen data 

In [None]:
# Predict labels for the held-out test features with the fitted random forest,
# then display the resulting prediction array.
y_pred = model_rf.predict(X_test)
y_pred

In [None]:
# Score the random-forest predictions against the held-out labels:
# confusion matrix, classification report, and overall accuracy.
result = confusion_matrix(y_test, y_pred)
result1 = classification_report(y_test, y_pred)
result2 = accuracy_score(y_test, y_pred)

# sep="\n" puts each heading on its own line above the value it introduces.
print("Confusion Matrix:", result, sep="\n")
print("Classification Report:", result1, sep="\n")
print("Accuracy:", result2)

### CatBoost classifier :


In [None]:
# Preview the first row of the feature matrix.
X.head(1)

In [None]:
# List each feature's dtype; the next cell looks for object-typed columns.
X.dtypes

In [None]:
# Positional indices of the object-dtype (string-like) columns of X, later
# passed to CatBoost as `cat_features`.
# The builtin `object` is used because the `np.object` alias was removed in
# NumPy 1.24 and raises AttributeError there.
categorical_features_indices = np.where(X.dtypes == object)[0]
categorical_features_indices

In [None]:
# Build a CatBoost classifier limited to 5 boosting iterations at learning
# rate 0.1, then train it silently on the training split, treating the
# columns listed in `categorical_features_indices` as categorical.
clf = CatBoostClassifier(learning_rate=0.1, iterations=5)

clf.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    verbose=False,
)

In [None]:
# Predict on the held-out test features with the CatBoost model.
# Note: this overwrites the random-forest predictions previously held in `y_pred`.
y_pred = clf.predict(X_test)

In [None]:
# Score the CatBoost predictions against the held-out labels:
# confusion matrix, classification report, and overall accuracy.
result = confusion_matrix(y_test, y_pred)
result1 = classification_report(y_test, y_pred)
result2 = accuracy_score(y_test, y_pred)

# The two multi-line results are printed under their own heading line.
for heading, value in (
    ("Confusion Matrix:", result),
    ("Classification Report:", result1),
):
    print(heading)
    print(value)
print("Accuracy:", result2)