In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression

In [43]:
columns = ["Pregnancies", "GLucose", "BloodPressure", "Skin Thickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
dataset = pd.read_csv('./master_pima-indians-diabetes.csv')
dataset.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [5]:
array = dataset.values
X = array[:, 0:8]
y = array[:, 8]

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

### Univariate Analysis

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [8]:
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, y)

In [11]:
np.set_printoptions(precision=3)
features = fit.transform(X)

In [13]:
print("Scores: ")
print(fit.scores_)

Scores: 
[ 110.727 1406.59    17.505   51.008 2219.398  127.671    5.356  178.011]


In [14]:
print("Features: ")
print(features[0:5, :])

Features: 
[[ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]
 [116.    0.   25.6  30. ]]


### Recursive Feature Elimination

In [27]:
from sklearn.feature_selection import RFE

In [28]:
array = dataset.values
X = array[:, 0:8]
y = array[:, 8]

In [None]:
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=4)
fit = rfe.fit(X,y)

In [32]:
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 4
Selected Features: [ True  True False False False  True  True False]
Feature Ranking: [1 1 3 5 4 1 1 2]


In [36]:
reduced_dataset = dataset.iloc[:, :8].loc[:, fit.support_]
reduced_dataset.head()

Unnamed: 0,6,148,33.6,0.627
0,1,85,26.6,0.351
1,8,183,23.3,0.672
2,1,89,28.1,0.167
3,0,137,43.1,2.288
4,5,116,25.6,0.201


### Lasso - L1 Regularization

In [60]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [61]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [65]:
lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)

In [66]:
lasso_coefficients = lasso.coef_

In [67]:
selected_features = [feature for feature, coef in zip(columns, lasso_coefficients) if coef != 0]
print("Selected Features:", selected_features)

Selected Features: ['Pregnancies', 'GLucose', 'BMI', 'DiabetesPedigreeFunction']


### Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [47]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [48]:
rf_importances = rf_model.feature_importances_

In [51]:
sorted_indices = rf_importances.argsort()[::-1]
selected_features_rf = np.array(columns)[sorted_indices]

In [52]:
print("Feature Importances (Random Forest):")
for feature, importance in zip(selected_features_rf, rf_importances[sorted_indices]):
    print(f"{feature}: {importance:.4f}")

Feature Importances (Random Forest):
GLucose: 0.2590
BMI: 0.1503
Age: 0.1357
DiabetesPedigreeFunction: 0.1280
Pregnancies: 0.0976
BloodPressure: 0.0871
Insulin: 0.0768
Skin Thickness: 0.0655


### XGBoost

In [53]:
import xgboost as xgb

In [54]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

In [55]:
xgb_importances = xgb_model.feature_importances_

In [57]:
sorted_indices = xgb_importances.argsort()[::-1]
selected_features_xgb = np.array(columns)[sorted_indices]

In [59]:
print("Feature Importances (XGBoost):")
for feature, importance in zip(selected_features_xgb, xgb_importances[sorted_indices]):
    print(f"{feature}: {importance:.4f}")

Feature Importances (XGBoost):
GLucose: 0.2428
Age: 0.1437
BMI: 0.1319
Pregnancies: 0.1155
Skin Thickness: 0.1063
Insulin: 0.1061
DiabetesPedigreeFunction: 0.0844
BloodPressure: 0.0693
