#### Classification model practice

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df = pd.read_csv(r'E:\classification\diabetis\diabetes.csv')

In [3]:
df.shape

(768, 9)

In [4]:
df.head(1)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [13]:
x = df.iloc[:,:8].values
y = df.iloc[:,8].values

#### train_test_split

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=0, stratify=y)

In [15]:
len(x_train), len(x_test), len(y_train), len(y_test)

(537, 231, 537, 231)

### logistic regression

In [18]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=10000)
lr.fit(x_train, y_train)


# maxiter to avoid ConvergenceWarning: Liblinear failed to converge, increase the number of iterations

LogisticRegression(max_iter=10000)

In [20]:
y_pred_lr = lr.predict(x_test)

In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [22]:
confusion_matrix(y_test, y_pred_lr)

array([[133,  17],
       [ 35,  46]], dtype=int64)

In [23]:
accuracy_score(y_test, y_pred_lr)

0.7748917748917749

In [24]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       150
           1       0.73      0.57      0.64        81

    accuracy                           0.77       231
   macro avg       0.76      0.73      0.74       231
weighted avg       0.77      0.77      0.77       231



#### decision tree classifier

In [25]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

DecisionTreeClassifier()

In [26]:
y_pred_dt = dt.predict(x_test)

In [27]:
confusion_matrix(y_test, y_pred_dt)

array([[117,  33],
       [ 35,  46]], dtype=int64)

In [28]:
accuracy_score(y_test, y_pred_dt)

0.7056277056277056

#### random forest classifier

In [29]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10)
rf.fit(x_train, y_train)

RandomForestClassifier(n_estimators=10)

In [30]:
y_pred_rf = rf.predict(x_test)

In [31]:
confusion_matrix(y_test, y_pred_rf)

array([[128,  22],
       [ 38,  43]], dtype=int64)

In [32]:
accuracy_score(y_test, y_pred_rf)

0.7402597402597403

############## grid search with random forest

In [37]:
from sklearn.model_selection import GridSearchCV


param = [{'n_estimators':list(range(25))}]

In [38]:
grid_search = GridSearchCV(estimator=rf, param_grid=param, scoring='accuracy', n_jobs=-1)
grid_search = grid_search.fit(x_train, y_train)

In [39]:
best_accuracy = grid_search.best_score_
best_param = grid_search.best_params_

In [40]:
best_accuracy, best_param

(0.7672377985462098, {'n_estimators': 17})

In [41]:
rf = RandomForestClassifier(n_estimators=17)
rf.fit(x_train, y_train)

RandomForestClassifier(n_estimators=17)

In [42]:
y_pred_rf = rf.predict(x_test)

In [43]:
accuracy_score(y_test, y_pred_rf)

0.7922077922077922

#### SVM

In [48]:
from sklearn.svm import SVC
sv = SVC(kernel='linear')
sv.fit(x_train, y_train)

SVC(kernel='linear')

In [53]:
param = [{'kernel':['linear', 'poly', 'sigmoid', 'rbf']}]

In [54]:
grid_search = GridSearchCV(estimator=sv, param_grid=param, scoring='accuracy', n_jobs=-1)
grid_search = grid_search.fit(x_train, y_train)

In [55]:
best_accuracy = grid_search.best_score_
best_param = grid_search.best_params_

In [56]:
best_accuracy, best_param

(0.7652821045344409, {'kernel': 'linear'})

In [57]:
sv = SVC(kernel='linear')
sv.fit(x_train, y_train)

SVC(kernel='linear')

In [58]:
y_pred_sv = sv.predict(x_test)

In [59]:
accuracy_score(y_test, y_pred_sv)

0.7705627705627706

#### knn

In [60]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [63]:
param = [{'n_neighbors' : list(range(1,25))}]

In [64]:
grid_search = GridSearchCV(estimator=knn, param_grid=param, scoring='accuracy', n_jobs=-1)
grid_search = grid_search.fit(x_train, y_train)

In [65]:
best_accuracy = grid_search.best_score_
best_param = grid_search.best_params_

In [66]:
best_accuracy, best_param

(0.7430425752855659, {'n_neighbors': 20})

In [67]:
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [70]:
y_pred_knn = knn.predict(x_test)

In [71]:
accuracy_score(y_test, y_pred_knn)

0.7445887445887446

#### naive bayes

In [72]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)

GaussianNB()

In [74]:
y_pred_gnb = gnb.predict(x_test)

In [75]:
accuracy_score(y_test, y_pred_gnb)

0.7748917748917749