In [3]:
# Classic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


#Models
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

#Metrics, Preprocessing and Tuning Tools
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import missingno as msno
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

#Customization
import warnings
warnings.filterwarnings("ignore")
from termcolor import colored

In [4]:
# Read the diabetes dataset from the working directory and preview the first rows.
csv_path = r"diabetes.csv"
data = pd.read_csv(csv_path)

data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Summary of the Dataset 

The dataset consists of 768 rows and 9 columns: 8 predictor variables plus the target.
 
The target variable is Outcome, which contains categorical binary values 0 and 1.

The variables other than Outcome are numerical. 

Technically there are no missing values, since the dataset contains no NaN entries. On closer inspection, however, some of the 0s in the dataset are actually missing values.

Descriptive statistics show that some features may have outliers (for example, 17 pregnancies can be an outlier).

In [5]:
# Count the NaN entries in each column (literal zeros are not counted here).
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
# For each measurement column, print how many rows hold a literal 0.
for column in ['BloodPressure', 'Glucose', 'SkinThickness', 'Insulin', 'BMI']:
    zero_rows = data[data[column] == 0]
    print(zero_rows.shape[0])

35
5
227
374
11


# Data Preprocessing

# This step will include filling missing values, outlier detection and removal, feature engineering, scaling and splitting data.

As you know, missing values and outliers always have the potential to directly affect the model. Since this may lead to undesired consequences, we should handle them correctly.

There are many methods, each useful to some degree depending on the case, but a common approach to handling missing values is to fill them with a simple statistic such as the mode, mean or median. This is preferred because removing the affected rows means losing information, while more elaborate imputation methods may change the distribution of the features.

As for outliers, the initial action is to detect them. Then we need to examine them carefully and determine if they affect the model or not. Analysts generally tend to keep them in the dataset, because removing them can lead to loss of information, which is an awful thing for the accuracy of models. However, if they contain extreme values, then removal of the outliers can be taken into account.

After these processes comes the feature engineering. This step is the key of a successful model.

As you can guess, raw data often lack enough information or are bloated with irrelevant variables. We should fix these issues whenever we see them in our projects, and the way to do so is feature engineering. Selecting the relevant features or creating new ones can increase the accuracy of models.

Once everything is done, don't forget to scale your data and split it into two groups: training data and test data.

In [7]:
# The notebook treats a 0 in these measurement columns as "missing",
# so every such zero is replaced by NaN.
missing_values = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
measurements = data[missing_values]
data[missing_values] = np.where(measurements.eq(0), np.nan, measurements)

In [8]:
def missing_percentage(data):
    """Summarize the columns of ``data`` that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null value, ordered by
        null count (descending), with two columns:

        * ``Total``   - number of null values in the column
        * ``Percent`` - share of rows that are null, in percent, rounded
          to two decimals

        The result is empty when no column has missing values.
    """
    # Count the nulls once instead of rebuilding the same sorted series
    # for every use.
    null_counts = data.isnull().sum().sort_values(ascending=False)
    total = null_counts[null_counts != 0]

    # Derive the percentage from the already-filtered counts so both columns
    # always cover the same rows: a share that rounds to 0.00 keeps its row,
    # and an empty frame yields an empty summary instead of NaN rows.
    percent = (total / len(data) * 100).round(2)

    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [9]:
# Show which columns now contain NaN, and how many, after zeros were converted.
missing_percentage(data)

Unnamed: 0,Total,Percent
Insulin,374,48.7
SkinThickness,227,29.56
BloodPressure,35,4.56
BMI,11,1.43
Glucose,5,0.65


In [10]:
# Fill each remaining NaN with the median of its column, computed separately
# for each Outcome class (groupby + transform keeps the original row alignment).
# NOTE(review): the medians depend on the target and are computed on the full
# dataset before the train/test split, so label information leaks into the
# features and the reported test score is likely optimistic.
columns_to_impute = ["Insulin", "SkinThickness", "BloodPressure", "BMI", "Glucose"]
for column in columns_to_impute:
    class_median = data.groupby("Outcome")[column].transform("median")
    data[column] = data[column].fillna(class_median)

In [11]:
# Preview the first rows after imputation.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,169.5,33.6,0.627,50,1
1,1,85.0,66.0,29.0,102.5,26.6,0.351,31,0
2,8,183.0,64.0,32.0,169.5,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [12]:
# NOTE(review): this repeats the definition from cell In [8]; the later
# definition silently shadows the earlier one, so consider deleting this cell.
def missing_percentage(data):
    """Summarize the columns of ``data`` that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null value, ordered by
        null count (descending), with two columns:

        * ``Total``   - number of null values in the column
        * ``Percent`` - share of rows that are null, in percent, rounded
          to two decimals

        The result is empty when no column has missing values.
    """
    # Count the nulls once instead of rebuilding the same sorted series
    # for every use.
    null_counts = data.isnull().sum().sort_values(ascending=False)
    total = null_counts[null_counts != 0]

    # Derive the percentage from the already-filtered counts so both columns
    # always cover the same rows: a share that rounds to 0.00 keeps its row,
    # and an empty frame yields an empty summary instead of NaN rows.
    percent = (total / len(data) * 100).round(2)

    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [13]:
# Sanity check: the summary should be empty now that every NaN has been filled.
missing_percentage(data)

Unnamed: 0,Total,Percent


In [14]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column.
X = data.drop(columns=["Outcome"])
y = data["Outcome"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [15]:
# Baseline model: a random forest with default hyper-parameters and a fixed seed.
random_state = 42
modelx = RandomForestClassifier(random_state=random_state)

# Fit only; the original also called .predict(X_test) here and discarded the result.
modelx.fit(X_train, y_train)

# Mean accuracy on the held-out test set (displayed as the cell output).
modelx.score(X_test, y_test)

0.8831168831168831

In [16]:
# List the column names, for reference when building a manual input row.
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [17]:
# Preview the first rows again; the first row is reused as a manual test input.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,169.5,33.6,0.627,50,1
1,1,85.0,66.0,29.0,102.5,26.6,0.351,31,0
2,8,183.0,64.0,32.0,169.5,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [18]:
# Feature values of one patient, in the same order as the training columns
# (these are the values of the first row shown by data.head()).
qw = (6, 148, 72, 35, 169.5, 33.6, 0.627, 50)

qw_q = np.asarray(qw)
# The model was fitted on a DataFrame, so wrap the single sample in a one-row
# frame carrying the training column names. A wrong number of values now fails
# loudly, and the feature names seen at predict time match those seen at fit time.
qw_shape = pd.DataFrame(qw_q.reshape(1, -1), columns=X_train.columns)
pred = modelx.predict(qw_shape)
print(pred)

[1]


In [19]:
import pickle

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises (the original left it open).
with open('pima-model.pkl', 'wb') as model_file:
    pickle.dump(modelx, model_file)

In [20]:
# List the packages installed in the active conda environment (shell command).
!conda list


# packages in environment at /home/patrick/anaconda3:
#
# Name                    Version                   Build  Channel
_ipyw_jlab_nb_ext_conf    0.1.0                    py38_0  
_libgcc_mutex             0.1                        main  
aiohttp                   3.9.3                    pypi_0    pypi
aiosignal                 1.3.1                    pypi_0    pypi
alabaster                 0.7.12             pyhd3eb1b0_0  
anaconda                  2021.05                  py38_0  
anaconda-client           1.7.2                    py38_0  
anaconda-navigator        2.0.3                    py38_0  
anaconda-project          0.9.1              pyhd3eb1b0_1  
anyio                     2.2.0            py38h06a4308_1  
appdirs                   1.4.4                      py_0  
argh                      0.26.2                   py38_0  
argon2-cffi               20.1.0           py38h27cfd23_1  
asn1crypto                1.4.0                      py_0  
astroid 