## Baseline + Normalization + Feature Selection + Dimentional Reduction

In [62]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
from scipy.stats import chi2_contingency

from imblearn.under_sampling import RandomUnderSampler

In [63]:
# Read Normal Datasets
#DATASET
df_path= 'Dataset\hypertension_data.csv'
df = pd.read_csv(df_path)

# mengisi missing value pada kolom sex dengan modus
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])

In [64]:
# # Splitting Data
X = df.drop(columns=['target'])
y = df['target']

# Terapkan Random Under Sampling
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Bagi dataset menjadi data latih (training) dan data uji (testing) dengan proporsi 80:20
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [65]:
# Normalisasi Data Train:
X_train_norm = (X_train - X_train.min()) / (X_train.max() - X_train.min())
# Normalisasi Data Test: Gunakan max dan min dari data train
X_test_norm = (X_test - X_train.min()) / (X_train.max() - X_train.min())

In [66]:
#Cramer's V Function
def cramers_V(var1,var2) :
  crosstab =np.array(pd.crosstab(var1,var2, rownames=None, colnames=None)) # Cross table building
  stat = chi2_contingency(crosstab)[0] # Keeping of the test statistic of the Chi2 test
  obs = np.sum(crosstab) # Number of observations
  mini = min(crosstab.shape)-1 # Take the minimum value between the columns and the rows of the cross table
  
  return (stat/(obs*mini))

In [67]:
rows= []
for var1 in df:
   cramers =cramers_V(df[var1], df['target']) # Cramer's V test
   rows.append(round(cramers,2)) # Keeping of the rounded value of the Cramer's V
  
cramers_results = np.array(rows)
Cramer_matrix_target = pd.DataFrame(cramers_results[:-1], index =df.columns[:-1], columns=['target'])
threshold = 0.1  # Misalnya, ambang batas korelasi
selected_features = Cramer_matrix_target[Cramer_matrix_target.abs() > threshold].dropna()

print(df.columns, len(df.columns))
print("\nKorelasi dengan Target (Cramer's V):")
print(selected_features)
print('\nJumlah feature yang dipilih:', len(selected_features))

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object') 14

Korelasi dengan Target (Cramer's V):
          target
cp          0.27
trestbps    0.16
chol        0.58
thalach     0.36
exang       0.19
oldpeak     0.30
slope       0.16
ca          0.24
thal        0.29

Jumlah feature yang dipilih: 9


In [68]:
# Assign data train dan data test normalisasi+feature selection

X_train_norm_fs = X_train_norm[selected_features.index]
X_test_norm_fs = X_test_norm[selected_features.index]

In [69]:
# Dimentional Reduction

# Inisialisasi model PCA dengan jumlah komponen utama yang diinginkan
pca = PCA(n_components=5)

# Fit dan transformasi data ke ruang fitur baru
X_train_norm_fs_dr = pca.fit_transform(X_train_norm_fs)
X_test_norm_fs_dr = pca.fit_transform(X_test_norm_fs)
print(X_train_norm_fs_dr)
# sekarang berisi data yang telah direduksi menjadi 5 komponen utama

[[-0.3772907   0.41759906  0.21665664 -0.11710908  0.00357453]
 [ 0.78749097 -0.02771718  0.00691276 -0.02459766 -0.08831041]
 [-0.2930117  -0.30826362 -0.41138838 -0.0381119   0.08765825]
 ...
 [-0.37116778  0.31120431  0.09878603  0.12710279 -0.19170133]
 [-0.24074584 -0.2723968  -0.52798493  0.23220919 -0.17287224]
 [-0.46627862  0.22262179  0.22874512 -0.16019983  0.05423537]]


## 3. Metode klasifikasi:
1. Support Vector Machine
2. Gaussian Naive Bayes
3. Ridge Classifier
4. Linear Discriminant Analysis
5. Logistic Regression

### 1. Support Vector Machine

In [70]:
# Inisialisasi model SVM
svm_model = SVC(kernel='linear')

In [71]:
# Latih model pada data latih
svm_model.fit(X_train_norm_fs_dr, y_train)

In [72]:
# Lakukan prediksi pada data uji
svm_pred = svm_model.predict(X_test_norm_fs_dr)

### 2. Gaussian Naive Bayes

In [73]:
# Inisialisasi model Gaussian Naive Bayes
gnb = GaussianNB()

In [74]:
# Latih model pada data latih
gnb.fit(X_train_norm_fs_dr, y_train)

In [75]:
gnb_pred = gnb.predict(X_test_norm_fs_dr)

### 3. Ridge Classifier

In [76]:
ridge_classifier = RidgeClassifier(alpha=1.0, solver='auto', random_state=42)

In [77]:
# Melatih model pada data pelatihan
ridge_classifier.fit(X_train_norm_fs_dr, y_train)

In [78]:
# Melakukan prediksi pada data uji
rc_pred = ridge_classifier.predict(X_test_norm_fs_dr)

### 4. Linear Discriminant Analysist

In [79]:
lda = LinearDiscriminantAnalysis()

In [80]:
lda.fit(X_train_norm_fs_dr, y_train)

In [81]:
lda_pred = lda.predict(X_test_norm_fs_dr)

### 5. Logistic Regression

In [82]:
logistic_regression = LogisticRegression(max_iter=1000, random_state=42)

In [83]:
logistic_regression.fit(X_train_norm_fs_dr, y_train)

In [84]:
lr_pred = logistic_regression.predict(X_test_norm_fs_dr)

## 4. Model evaluator:
1. Precision
2. Recall
3. F1-measure

In [85]:
# Hitung precision, recall, dan F1-score
def eval(y_test,model):
    precision = precision_score(y_test, model, average='weighted')
    recall = recall_score(y_test, model, average='weighted')
    f1 = f1_score(y_test, model, average='weighted')
    print(f'Precision: {precision:.3f}')
    print(f'Recall: {recall:.3f}')
    print(f'F1-score: {f1:.3f}')
    print()
    print(confusion_matrix(y_test, model))
    print(classification_report(y_test, model))
    return

### 1. Support Vector Machine

In [86]:
eval(y_test, svm_pred)

Precision: 0.827
Recall: 0.824
F1-score: 0.824

[[1832  520]
 [ 311 2061]]
              precision    recall  f1-score   support

           0       0.85      0.78      0.82      2352
           1       0.80      0.87      0.83      2372

    accuracy                           0.82      4724
   macro avg       0.83      0.82      0.82      4724
weighted avg       0.83      0.82      0.82      4724



### 2. Gaussian Naive Bayes

In [87]:
eval(y_test, gnb_pred)

Precision: 0.798
Recall: 0.798
F1-score: 0.798

[[1854  498]
 [ 456 1916]]
              precision    recall  f1-score   support

           0       0.80      0.79      0.80      2352
           1       0.79      0.81      0.80      2372

    accuracy                           0.80      4724
   macro avg       0.80      0.80      0.80      4724
weighted avg       0.80      0.80      0.80      4724



### 3. Ridge Classifier

In [88]:
eval(y_test, rc_pred)

Precision: 0.830
Recall: 0.827
F1-score: 0.827

[[1838  514]
 [ 302 2070]]
              precision    recall  f1-score   support

           0       0.86      0.78      0.82      2352
           1       0.80      0.87      0.84      2372

    accuracy                           0.83      4724
   macro avg       0.83      0.83      0.83      4724
weighted avg       0.83      0.83      0.83      4724



### 4. Linear Discriminant Analysist

In [89]:
eval(y_test, lda_pred)

Precision: 0.830
Recall: 0.827
F1-score: 0.827

[[1838  514]
 [ 302 2070]]
              precision    recall  f1-score   support

           0       0.86      0.78      0.82      2352
           1       0.80      0.87      0.84      2372

    accuracy                           0.83      4724
   macro avg       0.83      0.83      0.83      4724
weighted avg       0.83      0.83      0.83      4724



### 5. Logistic Regression

In [90]:
eval(y_test, lr_pred)

Precision: 0.827
Recall: 0.826
F1-score: 0.826

[[1867  485]
 [ 338 2034]]
              precision    recall  f1-score   support

           0       0.85      0.79      0.82      2352
           1       0.81      0.86      0.83      2372

    accuracy                           0.83      4724
   macro avg       0.83      0.83      0.83      4724
weighted avg       0.83      0.83      0.83      4724

