In [5]:
import pickle
import xgboost
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'xgboost'

In [3]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/99.8 MB 2.3 MB/s eta 0:00:44
   ---------------------------------------- 0.5/99.8 MB 6.0 MB/s eta 0:00:17
   ---------------------------------------- 1.2/99.8 MB 9.7 MB/s eta 0:00:11
    --------------------------------------- 2.0/99.8 MB 12.9 MB/s eta 0:00:08
   - -------------------------------------- 2.9/99.8 MB 14.4 MB/s eta 0:00:07
   - -------------------------------------- 4.0/99.8 MB 15.8 MB/s eta 0:00:07
   -- ------------------------------------- 5.0/99.8 MB 16.9 MB/s eta 0:00:06
   -- ------------------------------------- 5.9/99.8 MB 17.1 MB/s eta 0:00:06
   -- ------------------------------------- 7.0/99.8 MB 17.9 MB/s eta 0:00:06
   --- ------------------------------------ 8.0/99.8 MB 18.2 MB/s eta 0:00:06
   ---

ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\welcome\AppData\Local\Programs\Python\Python311\Lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "C:\Users\welcome\AppData\Local\Programs\Python\Python311\Lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\welcome\AppData\Local\Programs\Python\Python311\Lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\welcome\AppData\Local\Programs\Python\Python311\Lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 98, in read
    data: bytes = self.__fp.read(amt)
                  ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\welcome\AppData\Local\Programs\Python\Python311\Lib\http\client.py", line 466, in read
    s = se

In [None]:
# Read the diabetes records from the CSV next to the notebook and preview the first rows.
csv_path = "diabetes.csv"
df = pd.read_csv(csv_path)
df.head()

FEATURE ENGINEERING

In [None]:
# dataset shape: (number of records, number of columns) -- the column count
# covers every column of df, including the 'Outcome' label used further down
print(df.shape)

In [None]:
# Print pandas' summary of df (per-column non-null counts and dtypes) to check
# the column types before any modelling.
df.info()

### Checking correlation between features

In [None]:
# Pairwise correlation between the columns of df.
corr_mat = df.corr()
top_corr_features = corr_mat.index
plt.figure(figsize=(11, 11))
# Plot corr_mat itself. Calling .corr() on it a second time would chart the
# correlation *between the correlation coefficients*, not the correlation
# between the dataset's columns, and the annotated numbers would be misleading.
g = sns.heatmap(corr_mat, annot=True, cmap="Blues")

In [None]:
# Cast the target column df['Outcome'] to int so the label is a plain integer.
# NOTE(review): the earlier note described the column as boolean -- confirm against the CSV.
outcome_as_int = df['Outcome'].astype(int)
df['Outcome'] = outcome_as_int
df.head()

In [None]:
# Class balance: number of records carrying each value of the target label.
outcome = df['Outcome']
diabetes_true_count = int((outcome == 1).sum())
diabetes_false_count = int((outcome == 0).sum())

print("Data having 1 as output: {}".format(diabetes_true_count))
print("Data having 0 as output: {}".format(diabetes_false_count))

### Train Test Split

In [None]:
# The eight columns fed to the models as inputs.
all_features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]

# Feature matrix and target label.
X, y = df[all_features], df['Outcome']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Checking for zero values (zeros stand in for missing measurements)

In [None]:
# Report how many rows hold a 0 in each column; zeros are treated as missing
# values by the imputation step that follows.
print("total number of rows : {0}".format(len(df)))

# (message template, column to inspect) -- kept in the original reporting order.
zero_checks = [
    ("number of rows missing Glucose: {0}", 'Glucose'),
    ("number of rows missing BloodPressure: {0}", 'BloodPressure'),
    ("number of rows missing insulin: {0}", 'Insulin'),
    ("number of rows missing bmi: {0}", 'BMI'),
    ("number of rows missing DiabetesPedigreeFunction: {0}", 'DiabetesPedigreeFunction'),
    ("number of rows missing age: {0}", 'Age'),
    ("number of rows missing SkinThickness: {0}", 'SkinThickness'),
]
for template, column in zero_checks:
    zero_rows = df[df[column] == 0]
    print(template.format(len(zero_rows)))

### Imputing these missing/zero values

In [None]:
# Treat 0 as the missing-value marker and replace it with the column mean.
# NOTE(review): this applies to every feature column, including 'Pregnancies',
# where 0 may be a genuine value -- confirm whether that column should be excluded.
missing_values_imputer = SimpleImputer(missing_values=0, strategy='mean')

# Learn the per-column means from the training split only ...
X_train = missing_values_imputer.fit_transform(X_train)
# ... and reuse those same means for the test split. Re-fitting on X_test would
# impute with test-set statistics, leaking evaluation data into preprocessing
# and filling the two splits with different values for the same column.
X_test = missing_values_imputer.transform(X_test)

In [None]:
# Sanity check: (rows, columns) of the train and test feature matrices.
X_train.shape, X_test.shape

### Training

In [None]:
# Fit a random forest on the training split; fit() returns the fitted estimator,
# so construction and training are chained. The seed keeps the forest repeatable.
rfc = RandomForestClassifier(random_state=10).fit(X_train, y_train)

# Accuracy of the forest on the held-out split, shown as a percentage.
y_preds = rfc.predict(X_test)
print(f"Accuracy : {accuracy_score(y_test, y_preds)*100}%")

### Random Forest Feature Importance

In [None]:
# Random-forest feature importances as percentages, one row per feature,
# ordered from most to least important.
f_importance = pd.DataFrame(
    rfc.feature_importances_ * 100,
    index=all_features,
    columns=['Importance'],
).sort_values(by='Importance', ascending=False)
f_importance

### Using XGBoost

In [None]:
# Fit an XGBoost classifier with default settings on the training split;
# fit() returns the fitted estimator, so construction and training are chained.
clf = xgboost.XGBClassifier().fit(X_train, y_train)

# Accuracy of the XGBoost model on the held-out split, shown as a percentage.
y_preds = clf.predict(X_test)
print(f"Accuracy : {accuracy_score(y_test, y_preds)*100}%")

### XGBoost Feature Importance

In [None]:
# XGBoost feature importances as percentages, one row per feature,
# ordered from most to least important.
f_importance = pd.DataFrame(
    clf.feature_importances_ * 100,
    index=all_features,
    columns=['Importance'],
).sort_values(by='Importance', ascending=False)
f_importance

In [None]:
# Persist the trained random forest (rfc) to disk; the XGBoost model is not saved here.
filename = '../models/diabetes.sav'
# The context manager closes the handle even if pickling raises, so the file is
# flushed and released deterministically instead of whenever the garbage
# collector gets to the orphaned file object.
with open(filename, 'wb') as model_file:
    pickle.dump(rfc, model_file)