## Baseline + Normalization + Feature Selection + Dimentional Reduction

In [9]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix

from imblearn.under_sampling import RandomUnderSampler

In [10]:
# Read Normal Datasets
#DATASET
df_path= 'Dataset\hypertension_data.csv'
df = pd.read_csv(df_path)

# mengisi missing value pada kolom sex dengan modus
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])

In [11]:
# # Splitting Data
X = df.drop(columns=['target'])
y = df['target']

# Terapkan Random Under Sampling
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Bagi dataset menjadi data latih (training) dan data uji (testing) dengan proporsi 80:20
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [12]:
# Normalisasi Data Train:
X_train_norm = (X_train - X_train.min()) / (X_train.max() - X_train.min())
# Normalisasi Data Test: Gunakan max dan min dari data train
X_test_norm = (X_test - X_train.min()) / (X_train.max() - X_train.min())

In [13]:
# feature selection dengan nilai korelasi
# Mengukur korelasi antara setiap fitur dengan target/class (output)
correlation_with_target = df.corr()['Diabetes_binary'][1:]  # Mengabaikan korelasi dengan dirinya sendiri

threshold = 0.1  # Misalnya, ambang batas korelasi
selected_features = correlation_with_target[correlation_with_target.abs() > threshold]

print(df.columns, len(df.columns))
print("\nKorelasi dengan Target (Pearson):")
print(selected_features, len(selected_features))


Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object') 22

Korelasi dengan Target (Pearson):
HighBP                  0.381516
HighChol                0.289213
CholCheck               0.115382
BMI                     0.293373
Stroke                  0.125427
HeartDiseaseorAttack    0.211523
PhysActivity           -0.158666
GenHlth                 0.407612
PhysHlth                0.213081
DiffWalk                0.272646
Age                     0.278738
Education              -0.170481
Income                 -0.224449
Name: Diabetes_binary, dtype: float64 13


In [14]:
# Assign data train dan data test normalisasi+feature selection

X_train_norm_fs = X_train_norm[selected_features.index]
X_test_norm_fs = X_test_norm[selected_features.index]

In [15]:
# Dimentional Reduction

# Inisialisasi model PCA dengan jumlah komponen utama yang diinginkan
pca = PCA(n_components=5)

# Fit dan transformasi data ke ruang fitur baru
X_train_norm_fs_dr = pca.fit_transform(X_train_norm_fs)
X_test_norm_fs_dr = pca.fit_transform(X_test_norm_fs)
print(X_train_norm_fs_dr)
# sekarang berisi data yang telah direduksi menjadi 5 komponen utama

[[-0.15908547 -0.07896947 -0.72954111 -0.14355551 -0.09492074]
 [ 1.66718759  0.52849479  0.12370811  0.23679716  0.42442295]
 [ 0.45760611  0.52402    -0.77043881  0.52542883 -0.47663795]
 ...
 [ 0.25041334 -0.71011289 -0.1055331  -0.01020016 -0.0301584 ]
 [-0.01534184  0.33004991 -0.34239182 -0.94713942  0.03955407]
 [-0.97793045  0.21702665 -0.00387839  0.1355357  -0.00704249]]


## SVM Model

In [16]:
# Inisialisasi model SVM
from sklearn.svm import SVC
svm_model = SVC(kernel='linear')

In [17]:
# Latih model pada data latih
svm_model.fit(X_train_norm_fs_dr, y_train)

# with open('pickle/svm_model_normalized.pkl', 'wb') as file:
#     pickle.dump(svm_model, file)

In [18]:
# Lakukan prediksi pada data uji
svm_pred = svm_model.predict(X_test_norm_fs_dr)

In [19]:
from sklearn.metrics import classification_report,confusion_matrix

print(confusion_matrix(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

[[4405 2685]
 [1751 5298]]
              precision    recall  f1-score   support

         0.0       0.72      0.62      0.67      7090
         1.0       0.66      0.75      0.70      7049

    accuracy                           0.69     14139
   macro avg       0.69      0.69      0.69     14139
weighted avg       0.69      0.69      0.68     14139



In [20]:
# Hitung precision, recall, dan F1-score
precision = precision_score(y_test, svm_pred, average='weighted')
recall = recall_score(y_test, svm_pred, average='weighted')
f1 = f1_score(y_test, svm_pred, average='weighted')

# Tampilkan hasil evaluasi
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 0.689686417570397
Recall: 0.6862578683075182
F1-score: 0.6849436435013044


## Random Forest Model

In [21]:
from sklearn.ensemble import RandomForestClassifier
rfc_model = RandomForestClassifier(n_estimators=500)

In [22]:
rfc_model.fit(X_train_norm_fs_dr, y_train)

# with open('pickle/rfc_model_normalized.pkl', 'wb') as file:
#     pickle.dump(rfc_model, file)

In [23]:
rfc_pred = rfc_model.predict(X_test_norm_fs_dr)

In [24]:
from sklearn.metrics import classification_report,confusion_matrix

print(confusion_matrix(y_test, rfc_pred))
print(classification_report(y_test, rfc_pred))

[[5207 1883]
 [2345 4704]]
              precision    recall  f1-score   support

         0.0       0.69      0.73      0.71      7090
         1.0       0.71      0.67      0.69      7049

    accuracy                           0.70     14139
   macro avg       0.70      0.70      0.70     14139
weighted avg       0.70      0.70      0.70     14139



In [25]:
# Hitung precision, recall, dan F1-score
precision = precision_score(y_test, rfc_pred, average='weighted')
recall = recall_score(y_test, rfc_pred, average='weighted')
f1 = f1_score(y_test, rfc_pred, average='weighted')

# Tampilkan hasil evaluasi
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 0.7017743280385624
Recall: 0.7009689511280854
F1-score: 0.7006209033627455


## Save to Pickle

In [None]:
# with open('pickle/svm_model_norm_fs_dr.pkl', 'wb') as file:
#     pickle.dump(svm_model, file)
# with open('pickle/rfc_model_norm_fs_dr.pkl', 'wb') as file:
#     pickle.dump(rfc_model, file)