In [1]:
import pandas as pd
data = pd.read_csv("pima-indians-diabetes.csv",delimiter=",")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
print(data.columns.tolist())

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class']


In [3]:
import numpy as np
# data['Pregnancies'].replace((0),(np.nan),inplace=True)  不該轉換成 NaN，0 是可能合理的值，有可能沒懷孕過
data['Glucose'].replace((0),(np.nan),inplace=True)
data['BloodPressure'].replace((0),(np.nan),inplace=True)
data['SkinThickness'].replace((0),(np.nan),inplace=True)
data['Insulin'].replace((0),(np.nan),inplace=True)
data['BMI'].replace((0),(np.nan),inplace=True)
# data['DiabetesPedigreeFunction'].replace((0),(np.nan),inplace=True) 一個人家族中有可能完全沒有人有糖尿病
data['Age'].replace((0),(np.nan),inplace=True)
data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Glucose'].replace((0),(np.nan),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['BloodPressure'].replace((0),(np.nan),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [4]:
data.isna().sum() #check missing values amount

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Class                         0
dtype: int64

In [5]:
cols_to_fill = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for col in cols_to_fill:
    data[col] = data[col].fillna(data.groupby('Class')[col].transform('mean').round())

In [6]:
data.isna().sum() #再次確認是否有遺漏miss value

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Class                       0
dtype: int64

In [7]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148.0,72.0,35.0,207.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,130.0,26.6,0.351,31,0
2,8,183.0,64.0,33.0,207.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [8]:
# 取出最後 100 筆作為預測資料
inference_data = data.tail(100)

# 匯出成 CSV 檔案
inference_data.to_csv('pima-indians-diabetes_tableau.csv', index=False)

In [9]:
Q1 = data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction']].quantile(0.25)
Q3 = data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction']].quantile(0.75)
IQR = Q3 - Q1

# 篩選出異常值
outliers = ((data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction']] < (Q1 - 1.5 * IQR)) |
            (data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction']] > (Q3 + 1.5 * IQR)))

print(outliers.sum())  # 每個欄位異常值數量

Glucose                      0
BloodPressure               14
SkinThickness               53
Insulin                     26
DiabetesPedigreeFunction    29
dtype: int64


In [10]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler

minmax_scaler = MinMaxScaler()
robust_scaler = RobustScaler()

# 用 MinMaxScaler 處理沒有離群值的欄位
data[['Glucose']] = minmax_scaler.fit_transform(data[['Glucose']])

# 用 RobustScaler 處理有離群值的欄位
cols_with_outliers = ['BloodPressure', 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction']
data[cols_with_outliers] = robust_scaler.fit_transform(data[cols_with_outliers])

data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,0.670968,0.0,0.875,0.900585,33.6,0.665359,50,1
1,1,0.264516,-0.375,0.125,0.0,26.6,-0.056209,31,0
2,8,0.896774,-0.5,0.625,0.900585,23.3,0.783007,32,1
3,1,0.290323,-0.375,-0.625,-0.421053,28.1,-0.537255,21,0
4,0,0.6,-2.0,0.875,0.444444,43.1,5.007843,33,1


In [11]:
bins = [-np.inf, 18.5, 25, 30, np.inf]
labels = [0, 1, 2, 3]
# Underweight
# Healthy Weight
# Overweight
# Obesity
data['BMI'] = pd.cut(data['BMI'], bins=bins, labels=labels)

In [12]:
data['BMI'].unique() #查看BMI資料有哪些

[3, 2, 1, 0]
Categories (4, int64): [0 < 1 < 2 < 3]

In [13]:
import pandas as pd

# 定義年齡區間邊界
bins = [10, 20, 30, 40, 50, 60, 70, 80, float('inf')]
labels = [1, 2, 3, 4, 5, 6, 7, 8]

# 套用分箱
data['Age'] = pd.cut(data['Age'], bins=bins, labels=labels)

In [14]:
data['Age'].unique() #查看Age資料有哪些

[4, 3, 2, 5, 6, 7, 8]
Categories (8, int64): [1 < 2 < 3 < 4 < 5 < 6 < 7 < 8]

In [15]:
# 分出訓練 + 測試資料（前 668 筆）
train_test_data = data.head(len(data) - 100)
# 分出訓練 + 測試資料（後 100 筆）
inference_data = data.tail(100)

# 定義特徵 X 和目標變數 Y
X = train_test_data.drop(['Class'], axis=1)
Y = train_test_data['Class']


# 建立隨機森林模型並訓練
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, Y)

# 計算特徵的重要性
feature_importance = model.feature_importances_

# 顯示每個特徵的重要性
feature_names = X.columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
})

# 根據重要性排序
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                    Feature  Importance
4                   Insulin    0.372540
1                   Glucose    0.172805
3             SkinThickness    0.153023
6  DiabetesPedigreeFunction    0.089585
2             BloodPressure    0.066835
0               Pregnancies    0.062168
7                       Age    0.048802
5                       BMI    0.034242


In [16]:
train_test_data.to_csv('pima-indians-diabetes_traintest.csv', index=False, encoding='utf-8-sig')
inference_data.to_csv('pima-indians-diabetes_predict.csv', index=False, encoding='utf-8-sig')

In [19]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.activations import relu, sigmoid

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer
import pandas as pd

# 1. 定義建立模型的函數
def create_model(layers, activation):
    model = Sequential()
    for i, nodes in enumerate(layers):
        if i == 0:
            model.add(Dense(nodes, input_dim=X_train.shape[1]))
        else:
            model.add(Dense(nodes))
        model.add(Activation(activation))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# 2. 包裝成 KerasClassifier
nn_model = KerasClassifier(build_fn=create_model, verbose=0)

# 3. 定義參數搜尋空間
param_grid = {
    'layers': [[8,6], [50,34], [100,68], [8,6,5]],
    'activation': ['sigmoid', 'relu'],
    'batch_size': [8, 16, 32],
    'epochs': [100, 150, 200]
}

# 4. 使用 GridSearchCV 做模型搜尋
grid = GridSearchCV(estimator=nn_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_result = grid.fit(X_train, Y_train)

# 5. 取得最佳模型進行測試
best_nn_model = grid_result.best_estimator_
y_pred = best_nn_model.predict(X_test)

# 6. 評估模型
nn_accuracy = accuracy_score(Y_test, y_pred)
nn_precision = precision_score(Y_test, y_pred, zero_division=0)
nn_recall = recall_score(Y_test, y_pred, zero_division=0)
nn_f1 = f1_score(Y_test, y_pred, zero_division=0)

# 7. 輸出結果與其他模型比較（可加到原本表格）
nn_result = pd.DataFrame([{
    'Model': 'NeuralNetwork',
    'Accuracy': nn_accuracy,
    'Precision': nn_precision,
    'Recall': nn_recall,
    'F1': nn_f1
}])

# 顯示結果
print("✅ Best parameters from GridSearchCV for NN:", grid_result.best_params_)
print("\n🔍 Performance on Test Set:")
print(nn_result)


ModuleNotFoundError: No module named 'tensorflow'