#### Use pima_Indian_Diabetes dataset  and perform the following:
1. Use preprocessing methods to clean the dataset.
2. Spit the dataset into Train and Test dataset, respectively.
3. Visualize your training data to know whether the relationship between dependent and independent variable is linear.
4. Use train dataset to create a decision tree model.
5. Use test dataset for predictions.
6. Access the accuracy of your model.

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/rahul96rajan/sample_datasets/master/diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


Glucose, BloodPressure, SkinThickness, Insulin, BMI can never be zero. So, we need to process on them.

In [5]:
cols_2_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[cols_2_replace] = data[cols_2_replace].replace(0, np.nan)

In [6]:
data.isna().sum().sort_values(ascending=False)

Insulin                     374
SkinThickness               227
BloodPressure                35
BMI                          11
Glucose                       5
Outcome                       0
Age                           0
DiabetesPedigreeFunction      0
Pregnancies                   0
dtype: int64

In [7]:
print(data.dropna(subset = ['BloodPressure','BMI','Glucose']).shape)
print(data.dropna(subset = ['BloodPressure','BMI','Glucose']).isna().sum())

(724, 9)
Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness               192
Insulin                     332
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [8]:
data.dropna(subset = ['BloodPressure','BMI','Glucose'], inplace=True)

In [9]:
X = data.drop(axis=1, labels='Outcome')
y = data['Outcome']

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [11]:
X_train.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

def preprop(features):
    X = features.copy()
    t = [('i_mode', SimpleImputer(strategy='most_frequent'), ['SkinThickness']),
         ('i_mean', SimpleImputer(strategy='mean'), ['Insulin'])]

    transformer = ColumnTransformer(transformers=t)
    X_trans =  transformer.fit_transform(X)
    X.loc[:, 'SkinThickness'] = X_trans[:,0]
    X.loc[:, 'Insulin'] = X_trans[:,1]
    return X

In [13]:
X_train = preprop(X_train)

In [14]:
X_train.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=4,shuffle=True, random_state=1)

lg = LogisticRegression(solver='liblinear')

param_grid = [{'penalty': ['l1','l2'], 'C': [0.001,0.01,0.1,1,10,100,1000]}]

grid_search = GridSearchCV(lg, param_grid, cv=skf, scoring='accuracy',
                           return_train_score=True)
grid_search.fit(X_train, y_train)
grid_search.best_estimator_

LogisticRegression(C=10, penalty='l1', solver='liblinear')

In [16]:
# Training using the best_estimators_
lg = LogisticRegression(C=10, penalty='l1', solver='liblinear')
lg.fit(X_train, y_train)

X_test = preprop(X_test)
y_pred = lg.predict(X_test)

In [17]:
from sklearn.metrics import accuracy_score

print("Accuracy score : ", accuracy_score(y_test, y_pred))

Accuracy score :  0.7322175732217573


## There is a 1.255 % increase in accuracy in Logistic Regression model than DecsionTreeClassifier model on Diabetes database.