## Baseline + Normalization + Feature Selection

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Read Normal Datasets
df_path= 'Dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv'
df = pd.read_csv(df_path)

In [3]:
X = df.iloc[:,1:]
y = df['Diabetes_binary']

# Bagi dataset menjadi data latih (training) dan data uji (testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Normalization
X_train_norm = (X_train - X_train.min()) / (X_train.max() - X_train.min())
X_test_norm= (X_test - X_test.min()) / (X_test.max() - X_test.min())

In [5]:
# feature selection dengan nilai korelasi
# Mengukur korelasi antara setiap fitur dengan target/class (output)
correlation_with_target = df.corr()['Diabetes_binary'][1:]  # Mengabaikan korelasi dengan dirinya sendiri

threshold = 0.1  # Misalnya, ambang batas korelasi
selected_features = correlation_with_target[correlation_with_target.abs() > threshold]

print(df.columns, len(df.columns))
print("\nKorelasi dengan Target (Pearson):")
print(selected_features, len(selected_features))


Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object') 22

Korelasi dengan Target (Pearson):
HighBP                  0.381516
HighChol                0.289213
CholCheck               0.115382
BMI                     0.293373
Stroke                  0.125427
HeartDiseaseorAttack    0.211523
PhysActivity           -0.158666
GenHlth                 0.407612
PhysHlth                0.213081
DiffWalk                0.272646
Age                     0.278738
Education              -0.170481
Income                 -0.224449
Name: Diabetes_binary, dtype: float64 13


In [6]:
# Assign data train dan data test normalisasi+feature selection

X_train_norm_fs = X_train_norm[selected_features.index]
X_test_norm_fs = X_test_norm[selected_features.index]

## SVM Model

In [7]:
# Inisialisasi model SVM
from sklearn.svm import SVC
svm_model = SVC(kernel='linear')

In [8]:
# Latih model pada data latih
svm_model.fit(X_train_norm_fs, y_train)

# with open('pickle/svm_model_normalized.pkl', 'wb') as file:
#     pickle.dump(svm_model, file)

In [9]:
# Lakukan prediksi pada data uji
svm_pred = svm_model.predict(X_test_norm_fs)

In [10]:
from sklearn.metrics import classification_report,confusion_matrix

print(confusion_matrix(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

[[5055 2035]
 [1570 5479]]
              precision    recall  f1-score   support

         0.0       0.76      0.71      0.74      7090
         1.0       0.73      0.78      0.75      7049

    accuracy                           0.75     14139
   macro avg       0.75      0.75      0.74     14139
weighted avg       0.75      0.75      0.74     14139



In [11]:
# Hitung precision, recall, dan F1-score
precision = precision_score(y_test, svm_pred, average='weighted')
recall = recall_score(y_test, svm_pred, average='weighted')
f1 = f1_score(y_test, svm_pred, average='weighted')

# Tampilkan hasil evaluasi
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 0.7461446138386598
Recall: 0.7450314732300728
F1-score: 0.7447797874248188


## Random Forest Model

In [12]:
from sklearn.ensemble import RandomForestClassifier
rfc_model = RandomForestClassifier(n_estimators=500)

In [13]:
rfc_model.fit(X_train_norm_fs, y_train)

# with open('pickle/rfc_model_normalized.pkl', 'wb') as file:
#     pickle.dump(rfc_model, file)

In [14]:
rfc_pred = rfc_model.predict(X_test_norm_fs)

In [15]:
from sklearn.metrics import classification_report,confusion_matrix

print(confusion_matrix(y_test, rfc_pred))
print(classification_report(y_test, rfc_pred))

[[4991 2099]
 [1823 5226]]
              precision    recall  f1-score   support

         0.0       0.73      0.70      0.72      7090
         1.0       0.71      0.74      0.73      7049

    accuracy                           0.72     14139
   macro avg       0.72      0.72      0.72     14139
weighted avg       0.72      0.72      0.72     14139



In [16]:
# Hitung precision, recall, dan F1-score
precision = precision_score(y_test, rfc_pred, average='weighted')
recall = recall_score(y_test, rfc_pred, average='weighted')
f1 = f1_score(y_test, rfc_pred, average='weighted')

# Tampilkan hasil evaluasi
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 0.7229824083702975
Recall: 0.7226112172006507
F1-score: 0.722521195245792


## Save to Pickle

In [None]:
# with open('pickle/svm_model_norm_fs.pkl', 'wb') as file:
#     pickle.dump(svm_model, file)
# with open('pickle/rfc_model_norm_fs.pkl', 'wb') as file:
#     pickle.dump(rfc_model, file)