In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Load the dataset from the CSV file next to the notebook.
data = pd.read_csv('diabetes.csv')

# Explore the data.
print(data.head())  # first few rows
print(data.shape)  # (number of rows, number of columns)
# DataFrame.info() writes its report to stdout on its own and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
data.info()  # column names, non-null counts and dtypes


   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  
(100000, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age   

In [2]:

# Count the missing entries in each column (isna is the same check as isnull).
missing_values = data.isna().sum()
print(missing_values)


gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


In [3]:

# One-hot encode the two categorical columns: each distinct value of
# "gender" and "smoking_history" becomes its own indicator column.
data_encoded = pd.get_dummies(
    data=data,
    columns=["gender", "smoking_history"],
)
data_encoded

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,1,0,0,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,1,0,0,1,0,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,0,1,0,0,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,1,0,0,0,1,0,0,0,0
4,76.0,1,1,20.14,4.8,155,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,0,1,0,0,1,0,0,0,0,0
99996,2.0,0,0,17.37,6.5,100,0,1,0,0,1,0,0,0,0,0
99997,66.0,0,0,27.83,5.7,155,0,0,1,0,0,0,0,1,0,0
99998,24.0,0,0,35.42,4.0,100,0,1,0,0,0,0,0,0,1,0


In [4]:

# Pairwise covariance and correlation between all encoded columns.
covariance_matrix = data_encoded.cov()
correlation_matrix = data_encoded.corr()

# Keep only the column that relates every variable to the `diabetes` label.
diabetes_correlation = correlation_matrix.loc[:, 'diabetes']

print("Correlation\n", diabetes_correlation)


Correlation
 age                            0.258008
hypertension                   0.197823
heart_disease                  0.171727
bmi                            0.214357
HbA1c_level                    0.400660
blood_glucose_level            0.419558
diabetes                       1.000000
gender_Female                 -0.037553
gender_Male                    0.037666
gender_Other                  -0.004090
smoking_history_No Info       -0.118939
smoking_history_current        0.019606
smoking_history_ever           0.024080
smoking_history_former         0.097917
smoking_history_never          0.027267
smoking_history_not current    0.020734
Name: diabetes, dtype: float64


In [5]:
# Sweep over the number of input features: for every k from 1 up to the total
# number of features, train a logistic-regression model on the k features whose
# absolute correlation with the `diabetes` label is largest, then print and
# record the accuracy on the held-out test split.
#
# NOTE(review): `diabetes_correlation` is computed on the full dataset, so the
# feature ranking has already seen the rows that later land in the test split.

# Loop-invariant pieces: the label column and the ranking criterion
# (the label's own entry is removed so it can never be selected as a feature).
y = data_encoded['diabetes']
abs_correlation = diabetes_correlation.drop('diabetes').abs()

accuracy_records = []  # one {'n_features', 'accuracy'} record per trained model
for i in range(1, len(abs_correlation) + 1):
    top_features = abs_correlation.nlargest(i).index.tolist()

    # Split the dataset; the fixed random_state keeps the split reproducible.
    X = data_encoded[top_features]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Build the logistic-regression model. max_iter is raised to 2000 because,
    # per the original author's note, the default iteration budget was not
    # enough.
    model = LogisticRegression(solver='lbfgs', max_iter=2000)

    # Train the model.
    model.fit(X_train, y_train)

    # Predict on the test set.
    y_pred = model.predict(X_test)

    # Evaluate the accuracy.
    accuracy = accuracy_score(y_test, y_pred)
    print('準確度 ', i, ": ",  accuracy)
    accuracy_records.append({'n_features': i, 'accuracy': accuracy})

# Tabular view of the sweep, available to later cells for inspection/plotting.
accuracy_by_n_features = pd.DataFrame(accuracy_records)



準確度  1 :  0.9408
準確度  2 :  0.9536
準確度  3 :  0.9561
準確度  4 :  0.9586
準確度  5 :  0.9588
準確度  6 :  0.95875
準確度  7 :  0.95895
準確度  8 :  0.95885
準確度  9 :  0.9588
準確度  10 :  0.9588
準確度  11 :  0.9588
準確度  12 :  0.95885
準確度  13 :  0.959
準確度  14 :  0.9591
準確度  15 :  0.959
