In [1]:
import pandas as pd

In [3]:
df = pd.read_csv("/content/diabetes.csv")

In [4]:
df.shape

(768, 9)

In [5]:
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

## Separate X & Y

In [6]:
X = df.drop("Outcome", axis=1)
Y = df['Outcome']

In [7]:
X.shape, Y.shape

((768, 8), (768,))

In [8]:
X.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

In [9]:
X.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
dtype: object

In [10]:
X_ohe = pd.get_dummies(X)

In [12]:
X_ohe.shape

(768, 8)

In [13]:
X_ohe.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
dtype: object

## Format the Target Y

In [14]:
Y.dtype

dtype('int64')

In [17]:
Y.value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [18]:
Y.loc[Y == 0] = 'Healthy'
Y.loc[Y == 1] = 'Diabetic'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y.loc[Y == 0] = 'Healthy'


In [19]:
Y.unique()

array(['Diabetic', 'Healthy'], dtype=object)

In [20]:
Y.dtype

dtype('O')

## Train Test Split

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X_ohe, Y,
                                                    test_size=0.3,
                                                    random_state = 7,
                                                    stratify = Y)

In [24]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((537, 8), (231, 8), (537,), (231,))

## **Feature Selection**

In [25]:
from sklearn.decomposition import PCA

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
pca = PCA(n_components=0.99, random_state=7)

In [28]:
pca.fit(X_train)

In [29]:
pca.components_

array([[-1.37732832e-03,  9.51139842e-02,  1.86451108e-02,
         6.08687285e-02,  9.93322216e-01,  1.43998971e-02,
         6.88260289e-04, -1.39145429e-03],
       [-2.80834702e-02, -9.69385346e-01, -1.22604296e-01,
         9.90740633e-02,  8.94326849e-02, -4.40725646e-02,
        -1.01547507e-03, -1.57207694e-01],
       [-2.94257537e-02,  1.19000040e-01, -9.34629290e-01,
        -2.84012965e-01,  2.49410889e-02, -1.11440583e-01,
        -6.36338300e-04, -1.33177984e-01],
       [-4.00376841e-02,  1.53874457e-01, -2.28888920e-01,
         8.72630175e-01, -6.78217271e-02,  2.35104514e-01,
         3.24408809e-03, -3.17744603e-01],
       [ 1.54378642e-01, -9.64937972e-02, -2.38190443e-01,
         2.77827381e-01, -2.64637666e-03,  5.66244058e-02,
         2.04099601e-03,  9.10887451e-01]])

In [30]:
pca.n_components_

5

In [31]:
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [32]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
dt = DecisionTreeClassifier(criterion = 'gini', random_state=7, class_weight='balanced')

In [34]:
dt.fit(X_train, Y_train)

In [35]:
dt.feature_importances_

array([0.02214651, 0.30384573, 0.05319688, 0.05334427, 0.09948071,
       0.16187174, 0.14743958, 0.15867457])

In [36]:
feature_tup = tuple(zip(X_train.columns, dt.feature_importances_))

In [37]:
sorted(feature_tup, key = lambda x:-x[1])

[('Glucose', 0.3038457303917415),
 ('BMI', 0.16187173981802866),
 ('Age', 0.1586745724613922),
 ('DiabetesPedigreeFunction', 0.14743958161426934),
 ('Insulin', 0.09948070939651132),
 ('SkinThickness', 0.053344271322059614),
 ('BloodPressure', 0.05319688030330543),
 ('Pregnancies', 0.022146514692691927)]

## Library mwthod for selecting best features using Decision Tree

In [38]:
from sklearn.feature_selection import SelectFromModel

In [39]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
sel = SelectFromModel(DecisionTreeClassifier(criterion = 'gini',
                                             random_state=7,
                                             class_weight='balanced'),
                                             threshold = "mean")

In [43]:
sel.fit(X_train, Y_train)

In [44]:
sel.get_feature_names_out

In [45]:
X_train_DT = sel.transform(X_train)
X_test_DT = sel.transform(X_test)

In [46]:
sel.get_feature_names_out(X_train.columns)

array(['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age'], dtype=object)

In [47]:
sel.estimator_.feature_importances_

array([0.02214651, 0.30384573, 0.05319688, 0.05334427, 0.09948071,
       0.16187174, 0.14743958, 0.15867457])

In [48]:
X_train.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')