# <center>Early Stage Diabetes</center>
---

# Classification

## Data Understanding

In [1]:
import numpy as np
import pandas as pd
import sklearn.preprocessing as prepro
import sklearn.feature_selection as feat_sel
import sklearn.metrics as metrics
import sklearn.ensemble as ensemble
from sklearn.model_selection import train_test_split

In [2]:
# load data
data = pd.read_csv("data/diabetes_data_upload.csv")

In [3]:
data.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [4]:
data.shape

(520, 17)

### Data Description
#### Data Set Information:

This has been col- 
lected using direct questionnaires from the patients of Sylhet Diabetes 
Hospital in Sylhet, Bangladesh and approved by a doctor.

#### Attribute Information:

1) Age 1.20-65 
2) Sex 1. Male, 2.Female 
3) Polyuria 1.Yes, 2.No. 
4) Polydipsia 1.Yes, 2.No. 
5) sudden weight loss 1.Yes, 2.No. 
6) weakness 1.Yes, 2.No. 
7) Polyphagia 1.Yes, 2.No. 
8) Genital thrush 1.Yes, 2.No. 
9) visual blurring 1.Yes, 2.No. 
10) Itching 1.Yes, 2.No. 
11) Irritability 1.Yes, 2.No. 
12) delayed healing 1.Yes, 2.No. 
13) partial paresis 1.Yes, 2.No. 
14) muscle stifness 1.Yes, 2.No. 
15) Alopecia 1.Yes, 2.No. 
16) Obesity 1.Yes, 2.No. 
17) Class 1.Positive, 2.Negative. `Target`

In [5]:
# missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 no

## Data Preparation

### Feature Engineering

#### Convert to lowercase

In [6]:
features = []
# converting all the features into lower case
for col in data.columns:
    features.append(col.lower())


data.columns = features

In [7]:
# display top 5 rows
data.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden weight loss,weakness,polyphagia,genital thrush,visual blurring,itching,irritability,delayed healing,partial paresis,muscle stiffness,alopecia,obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


#### Convert to numeric

In [8]:
def to_numeric(row):
    if row == "No" or row == "Negative":
        return 0
    elif row == "Yes" or row == "Male" or row == "Positive":
        return 1
    elif row == "Female":
        return 2
    else:
        return np.nan

In [9]:
for col in data.iloc[:,1:].columns:
    data[col] = data[col].apply(to_numeric)

In [10]:
data.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden weight loss,weakness,polyphagia,genital thrush,visual blurring,itching,irritability,delayed healing,partial paresis,muscle stiffness,alopecia,obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


### Sample Data

In [11]:
data = data.sample(data.shape[0])
data.shape

(520, 17)

In [12]:
data.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden weight loss,weakness,polyphagia,genital thrush,visual blurring,itching,irritability,delayed healing,partial paresis,muscle stiffness,alopecia,obesity,class
105,69,2,1,0,1,1,1,1,1,1,0,1,0,0,1,0,1
159,38,2,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1
22,39,1,1,0,1,0,0,1,0,1,1,0,0,0,1,0,1
424,43,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,1
143,53,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,1


### Splitting Data

In [13]:
X = data.drop("class",axis=1)
y = data["class"]

In [14]:
# train test split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=11)

## Modelling

In [15]:
model = ensemble.RandomForestClassifier(criterion="entropy",max_depth=4)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

## Evaluation

In [16]:
model.score(X_test,y_test)

0.9423076923076923

### Confusion Matrix

In [17]:
metrics.confusion_matrix(y_test,y_pred)

array([[39,  2],
       [ 4, 59]])

### Accuracy Score

In [18]:
metrics.accuracy_score(y_test,y_pred)

0.9423076923076923

# Feature Selection | Filter Methods | Mutual Information for Classification

In [19]:
X1 = X.copy(deep=True)
y1 = y.copy(deep=True)

## Splitting Data

In [20]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X1,y1,test_size=0.2,random_state=11)

In [21]:
X1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 520 entries, 105 to 160
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   age                 520 non-null    int64
 1   gender              520 non-null    int64
 2   polyuria            520 non-null    int64
 3   polydipsia          520 non-null    int64
 4   sudden weight loss  520 non-null    int64
 5   weakness            520 non-null    int64
 6   polyphagia          520 non-null    int64
 7   genital thrush      520 non-null    int64
 8   visual blurring     520 non-null    int64
 9   itching             520 non-null    int64
 10  irritability        520 non-null    int64
 11  delayed healing     520 non-null    int64
 12  partial paresis     520 non-null    int64
 13  muscle stiffness    520 non-null    int64
 14  alopecia            520 non-null    int64
 15  obesity             520 non-null    int64
dtypes: int64(16)
memory usage: 69.1 KB


In [22]:
X1.dtypes == "int64"

age                   True
gender                True
polyuria              True
polydipsia            True
sudden weight loss    True
weakness              True
polyphagia            True
genital thrush        True
visual blurring       True
itching               True
irritability          True
delayed healing       True
partial paresis       True
muscle stiffness      True
alopecia              True
obesity               True
dtype: bool

Age is the only continuous feature, and rest all are discrete features.

In [23]:
discrete = list(X1.dtypes == "int64")
discrete[0] = False
discrete

[False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

### Mutual Information for Classification

In [24]:
# Mutual Information for Classification
feature_selection = feat_sel.mutual_info_classif(X1,y1,discrete_features=discrete,random_state=11)

In [25]:
# boolean mask
selected_Features = feature_selection > 0.029

In [26]:
X_train = X_train.loc[:, selected_Features]
X_test = X_test.loc[:, selected_Features]

In [27]:
print(f"No. of features before feature selection: {X.shape[1]}")
print(f"No. of features after feature selection: {X_train.shape[1]}")

No. of features before feature selection: 16
No. of features after feature selection: 10


## Modelling

In [28]:
model = ensemble.RandomForestClassifier(n_estimators=300, max_depth=4,random_state=11)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

## Evaluation

In [29]:
model.score(X_test,y_test)

0.9326923076923077

### Confusion Matrix

In [30]:
metrics.confusion_matrix(y_test,y_pred)

array([[38,  3],
       [ 4, 59]])

### Accuracy Score

In [31]:
metrics.accuracy_score(y_test,y_pred)

0.9326923076923077

# Feature Selection | Filter Methods | Chi Square

In [32]:
X2 = X.copy(deep=True)
y2 = y.copy(deep=True)

In [33]:
X2.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden weight loss,weakness,polyphagia,genital thrush,visual blurring,itching,irritability,delayed healing,partial paresis,muscle stiffness,alopecia,obesity
105,69,2,1,0,1,1,1,1,1,1,0,1,0,0,1,0
159,38,2,1,1,1,1,1,0,1,1,1,1,1,1,0,0
22,39,1,1,0,1,0,0,1,0,1,1,0,0,0,1,0
424,43,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0
143,53,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0


## Train Test Split

In [34]:
X_train,X_test,y_train,y_test = train_test_split(X2,y2,test_size=0.2,random_state=11)

## Chi Square
`.chi2():` Compute chi-squared stats between each non-negative feature and class.

This score can be used to select the n_features features with the
highest values for the test chi-squared statistic from X, which must
contain only non-negative features such as booleans or frequencies
(e.g., term counts in document classification), relative to the classes.

Recall that the chi-square test measures dependence between stochastic
variables, so using this function "weeds out" the features that are the
most likely to be independent of class and therefore irrelevant for
classification.


In [35]:
feature_selection = feat_sel.chi2(X2,y2)

In [36]:
feature_selection

(array([1.88457668e+01, 1.78500351e+01, 1.16184593e+02, 1.20785515e+02,
        5.77493088e+01, 1.27242623e+01, 3.31984177e+01, 4.91400862e+00,
        1.81245708e+01, 4.78260870e-02, 3.53341270e+01, 6.20188285e-01,
        5.53142857e+01, 4.87500000e+00, 2.44027933e+01, 2.25028409e+00]),
 array([1.41725286e-05, 2.39014859e-05, 4.33053165e-27, 4.25762360e-28,
        2.97744668e-14, 3.60942204e-04, 8.32182204e-09, 2.66397473e-02,
        2.06914149e-05, 8.26890166e-01, 2.77722772e-09, 4.30977327e-01,
        1.02719337e-13, 2.72484409e-02, 7.81552909e-07, 1.33589875e-01]))

### SelectKBest

In [37]:
kbest = feat_sel.SelectKBest(score_func=feat_sel.chi2,k=12)

In [38]:
selected_features = kbest.fit_transform(X2,y2)

In [39]:
kbest.get_feature_names_out()

array(['age', 'gender', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'irritability', 'partial paresis', 'alopecia'], dtype=object)

In [44]:
X_train = X_train[kbest.get_feature_names_out()]
X_test = X_test[kbest.get_feature_names_out()]

In [46]:
model = ensemble.RandomForestClassifier(n_estimators=300,max_depth=4,random_state=11)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

## Evaluation

In [47]:
model.score(X_test,y_test)

0.9326923076923077

### Confusion Matrix

In [48]:
metrics.confusion_matrix(y_test,y_pred)

array([[38,  3],
       [ 4, 59]])

### Accuracy Score

In [49]:
metrics.accuracy_score(y_test,y_pred)

0.9326923076923077