In [68]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from mlxtend.classifier import StackingCVClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


In [6]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.18.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.18.0


In [2]:
df = pd.read_csv("diabetes_data.csv")

In [3]:
df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.shape

(768, 9)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pregnancies  768 non-null    int64  
 1   glucose      768 non-null    int64  
 2   diastolic    768 non-null    int64  
 3   triceps      768 non-null    int64  
 4   insulin      768 non-null    int64  
 5   bmi          768 non-null    float64
 6   dpf          768 non-null    float64
 7   age          768 non-null    int64  
 8   diabetes     768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [12]:
df.stack()

0    pregnancies      6.000
     glucose        148.000
     diastolic       72.000
     triceps         35.000
     insulin          0.000
                     ...   
767  insulin          0.000
     bmi             30.400
     dpf              0.315
     age             23.000
     diabetes         0.000
Length: 6912, dtype: float64

In [13]:
X = df.drop('diabetes', axis = 1)

In [14]:
X.shape

(768, 8)

In [15]:
y = df["diabetes"]

In [16]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: diabetes, Length: 768, dtype: int64

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2, stratify=y)

In [19]:
df['diabetes'].value_counts()

0    500
1    268
Name: diabetes, dtype: int64

In [20]:
y_train.value_counts()

0    400
1    214
Name: diabetes, dtype: int64

In [21]:
y_test.value_counts()

0    100
1     54
Name: diabetes, dtype: int64

In [22]:
#create KNN model

In [23]:
knn = KNeighborsClassifier()

In [26]:
params_knn = {"n_neighbors": np.arange(1,25)}

In [27]:
knn_gs = GridSearchCV(knn, params_knn, cv = 5)

In [28]:
knn_gs.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])})

In [29]:
knn_best = knn_gs.best_estimator_

In [30]:
knn_best

KNeighborsClassifier(n_neighbors=19)

In [31]:
#create  random forest

In [32]:
rf = RandomForestClassifier()

In [36]:
params_rf = {"n_neigbors": [50,100,200]}

In [38]:
rf_gs = GridSearchCV(rf,params_rf, cv = 5)

In [59]:
rf = RandomForestClassifier()
params_rf = {'n_estimators':[50, 100, 150, 200]}
grid_rf = GridSearchCV(estimator=rf, param_grid=params_rf, cv = 5)
grid_rf.fit(x_train, y_train)
rf_best= grid_rf.best_estimator_

In [41]:
#log model

In [44]:
log_reg = LogisticRegression()

In [45]:
log_reg.fit(x_train,y_train)

LogisticRegression()

In [60]:
print('knn: {}'.format(knn_best.score(x_test,y_test)))
print("rf: {}".format(rf_best.score(x_test,y_test)))
print("log_reg: {}".format(log_reg.score(x_test,y_test)))

knn: 0.7207792207792207
rf: 0.7662337662337663
log_reg: 0.7857142857142857


In [50]:
#dict for our model

In [61]:
estiamators = [('knn',knn_best), ("log_reg",log_reg), ('rf',rf_best)]

In [53]:
#voting classifer 

In [114]:
vc = VotingClassifier(estiamators,voting="hard")
vc.fit(x_train,y_train)
vc.score(x_test,y_test)

0.7792207792207793

In [64]:
#simple stacking cv classifier

In [104]:
clf1 = KNeighborsClassifier(n_neighbors=10)
clf2 = RandomForestClassifier(random_state=42)
clf3 = GaussianNB()
clf4 = SVC(probability=True)
lr= LogisticRegression()

In [105]:
sclf = StackingCVClassifier(classifiers= [clf1, clf2, clf3, clf4],meta_classifier=lr, random_state=42)

In [106]:
sclf.fit(x_train,y_train)

StackingCVClassifier(classifiers=[KNeighborsClassifier(n_neighbors=10),
                                  RandomForestClassifier(random_state=42),
                                  GaussianNB(), SVC(probability=True)],
                     meta_classifier=LogisticRegression(), random_state=42)

In [107]:
sclf.score(x_test,y_test)

0.7402597402597403

In [108]:
sclf1 = StackingCVClassifier(classifiers= [clf1, clf2, clf3, clf4],meta_classifier=lr, random_state=42, use_probas=True)

In [109]:
sclf1.fit(x_train,y_train)

StackingCVClassifier(classifiers=[KNeighborsClassifier(n_neighbors=10),
                                  RandomForestClassifier(random_state=42),
                                  GaussianNB(), SVC(probability=True)],
                     meta_classifier=LogisticRegression(), random_state=42,
                     use_probas=True)

In [110]:
sclf1.score(x_test,y_test)

0.7662337662337663

In [90]:
#vif - testing for one variable

In [93]:
variance_inflation_factor(x_train.values, 0)

3.2706232526702292

In [94]:
#vif for all variables 

In [95]:
for i in range(len(x_train.columns)):
    print(variance_inflation_factor(x_train.values, i))

3.2706232526702292
16.656903886323693
15.348854182776305
4.209071131999817
2.0916143382864227
20.809004143745693
3.18977919954901
13.719056033410537


In [96]:
x_train.shape

(614, 8)

In [97]:
x_train.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age
77,5,95,72,33,0,37.7,0.37,27
240,1,91,64,24,0,29.2,0.192,21
690,8,107,80,0,0,24.6,0.856,34
177,0,129,110,46,130,67.1,0.319,26
54,7,150,66,42,342,34.7,0.718,42
