## Baseline + Normalization + Feature Selection

In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
from scipy.stats import chi2_contingency

from imblearn.under_sampling import RandomUnderSampler

In [9]:
# Read Normal Datasets
#DATASET
df_path= 'Dataset\hypertension_data.csv'
df = pd.read_csv(df_path)

# mengisi missing value pada kolom sex dengan modus
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])

In [10]:
# # Splitting Data
X = df.drop(columns=['target'])
y = df['target']

# Terapkan Random Under Sampling
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Bagi dataset menjadi data latih (training) dan data uji (testing) dengan proporsi 80:20
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [11]:
# Normalisasi Data Train:
X_train_norm = (X_train - X_train.min()) / (X_train.max() - X_train.min())
# Normalisasi Data Test: Gunakan max dan min dari data train
X_test_norm = (X_test - X_train.min()) / (X_train.max() - X_train.min())

In [12]:
df.nunique()

age          88
sex           2
cp            4
trestbps     49
chol        152
fbs           2
restecg       3
thalach      91
exang         2
oldpeak      40
slope         3
ca            5
thal          4
target        2
dtype: int64

In [13]:
#Cramer's V Function
def cramers_V(var1,var2) :
  crosstab =np.array(pd.crosstab(var1,var2, rownames=None, colnames=None)) # Cross table building
  stat = chi2_contingency(crosstab)[0] # Keeping of the test statistic of the Chi2 test
  obs = np.sum(crosstab) # Number of observations
  mini = min(crosstab.shape)-1 # Take the minimum value between the columns and the rows of the cross table
  
  return (stat/(obs*mini))

In [14]:
rows= []
for var1 in df:
   cramers =cramers_V(df[var1], df['target']) # Cramer's V test
   rows.append(round(cramers,2)) # Keeping of the rounded value of the Cramer's V
  
cramers_results = np.array(rows)
Cramer_matrix_target = pd.DataFrame(cramers_results[:-1], index =df.columns[:-1], columns=['target'])
threshold = 0.1  # Misalnya, ambang batas korelasi
selected_features = Cramer_matrix_target[Cramer_matrix_target.abs() > threshold].dropna()

print(df.columns, len(df.columns))
print("\nKorelasi dengan Target (Cramer's V):")
print(selected_features)
print('\nJumlah feature yang dipilih:', len(selected_features))
print(Cramer_matrix_target)

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object') 14

Korelasi dengan Target (Cramer's V):
          target
cp          0.27
trestbps    0.16
chol        0.58
thalach     0.36
exang       0.19
oldpeak     0.30
slope       0.16
ca          0.24
thal        0.29

Jumlah feature yang dipilih: 9
          target
age         0.00
sex         0.00
cp          0.27
trestbps    0.16
chol        0.58
fbs         0.00
restecg     0.03
thalach     0.36
exang       0.19
oldpeak     0.30
slope       0.16
ca          0.24
thal        0.29


In [15]:
# Assign data train dan data test normalisasi+feature selection

X_train_norm_fs = X_train_norm[selected_features.index]
X_test_norm_fs = X_test_norm[selected_features.index]

X_train_norm_fs

Unnamed: 0,cp,trestbps,chol,thalach,exang,oldpeak,slope,ca,thal
16520,0.666667,0.433962,0.134703,0.641221,0.0,0.483871,0.5,0.00,0.666667
10263,0.000000,0.339623,0.011416,0.335878,1.0,0.193548,0.5,0.25,1.000000
24718,0.000000,0.320755,0.404110,0.671756,0.0,0.000000,1.0,0.25,0.666667
2801,0.000000,0.528302,0.276256,0.763359,0.0,0.241935,1.0,0.00,0.666667
13360,0.000000,0.490566,0.210046,0.259542,0.0,0.322581,0.5,0.25,1.000000
...,...,...,...,...,...,...,...,...,...
22248,0.000000,0.169811,0.052511,0.412214,0.0,0.258065,0.5,0.00,0.666667
3789,0.333333,0.066038,0.162100,0.648855,0.0,0.000000,1.0,0.00,1.000000
11908,0.666667,0.320755,0.235160,0.603053,0.0,0.064516,0.5,0.25,1.000000
1954,0.000000,0.292453,0.397260,0.763359,0.0,0.000000,1.0,0.50,1.000000


## 3. Metode klasifikasi:
1. Support Vector Machine
2. Gaussian Naive Bayes
3. Ridge Classifier
4. Linear Discriminant Analysis
5. Logistic Regression

### 1. Support Vector Machine

In [16]:
# Inisialisasi model SVM
from sklearn.svm import SVC
svm_model = SVC(kernel='linear')

In [17]:
# Latih model pada data latih
svm_model.fit(X_train_norm_fs, y_train)

# with open('pickle/svm_model_normalized.pkl', 'wb') as file:
#     pickle.dump(svm_model, file)

In [18]:
# Lakukan prediksi pada data uji
svm_pred = svm_model.predict(X_test_norm_fs)

### 2. Gaussian Naive Bayes

In [19]:
# Inisialisasi model Gaussian Naive Bayes
gnb = GaussianNB()

In [20]:
# Latih model pada data latih
gnb.fit(X_train_norm_fs, y_train)

In [21]:
gnb_pred = gnb.predict(X_test_norm_fs)

### 3. Ridge Classifier

In [22]:
ridge_classifier = RidgeClassifier(alpha=1.0, solver='auto', random_state=42)

In [23]:
# Melatih model pada data pelatihan
ridge_classifier.fit(X_train_norm_fs, y_train)

In [24]:
# Melakukan prediksi pada data uji
rc_pred = ridge_classifier.predict(X_test_norm_fs)

### 4. Linear Discriminant Analysist

In [25]:
lda = LinearDiscriminantAnalysis()

In [26]:
lda.fit(X_train_norm_fs, y_train)

In [27]:
lda_pred = lda.predict(X_test_norm_fs)

### 5. Logistic Regression

In [28]:
logistic_regression = LogisticRegression(max_iter=1000, random_state=42)

In [29]:
logistic_regression.fit(X_train_norm_fs, y_train)

In [30]:
lr_pred = logistic_regression.predict(X_test_norm_fs)

## 4. Model evaluator:
1. Precision
2. Recall
3. F1-measure

In [31]:
# Hitung precision, recall, dan F1-score
def eval(y_test,model):
    precision = precision_score(y_test, model, average='weighted')
    recall = recall_score(y_test, model, average='weighted')
    f1 = f1_score(y_test, model, average='weighted')
    print(f'Precision: {precision:.3f}')
    print(f'Recall: {recall:.3f}')
    print(f'F1-score: {f1:.3f}')
    print()
    print(confusion_matrix(y_test, model))
    print(classification_report(y_test, model))
    return

### 1. Support Vector Machine

In [32]:
eval(y_test, svm_pred)

Precision: 0.851
Recall: 0.847
F1-score: 0.846

[[1859  493]
 [ 231 2141]]
              precision    recall  f1-score   support

           0       0.89      0.79      0.84      2352
           1       0.81      0.90      0.86      2372

    accuracy                           0.85      4724
   macro avg       0.85      0.85      0.85      4724
weighted avg       0.85      0.85      0.85      4724



### 2. Gaussian Naive Bayes

In [33]:
eval(y_test, gnb_pred)

Precision: 0.829
Recall: 0.827
F1-score: 0.827

[[1873  479]
 [ 336 2036]]
              precision    recall  f1-score   support

           0       0.85      0.80      0.82      2352
           1       0.81      0.86      0.83      2372

    accuracy                           0.83      4724
   macro avg       0.83      0.83      0.83      4724
weighted avg       0.83      0.83      0.83      4724



### 3. Ridge Classifier

In [34]:
eval(y_test, rc_pred)

Precision: 0.839
Recall: 0.831
F1-score: 0.830

[[1766  586]
 [ 214 2158]]
              precision    recall  f1-score   support

           0       0.89      0.75      0.82      2352
           1       0.79      0.91      0.84      2372

    accuracy                           0.83      4724
   macro avg       0.84      0.83      0.83      4724
weighted avg       0.84      0.83      0.83      4724



### 4. Linear Discriminant Analysist

In [35]:
eval(y_test, lda_pred)

Precision: 0.839
Recall: 0.831
F1-score: 0.830

[[1766  586]
 [ 214 2158]]
              precision    recall  f1-score   support

           0       0.89      0.75      0.82      2352
           1       0.79      0.91      0.84      2372

    accuracy                           0.83      4724
   macro avg       0.84      0.83      0.83      4724
weighted avg       0.84      0.83      0.83      4724



### 5. Logistic Regression

In [36]:
eval(y_test=y_test, model=lr_pred)

Precision: 0.848
Recall: 0.846
F1-score: 0.846

[[1906  446]
 [ 282 2090]]
              precision    recall  f1-score   support

           0       0.87      0.81      0.84      2352
           1       0.82      0.88      0.85      2372

    accuracy                           0.85      4724
   macro avg       0.85      0.85      0.85      4724
weighted avg       0.85      0.85      0.85      4724



## 4. Model evaluator:
1. Precision
2. Recall
3. F1-measure

In [37]:
# Hitung precision, recall, dan F1-score
def eval(y_test,model):
    precision = precision_score(y_test, model, average='weighted')
    recall = recall_score(y_test, model, average='weighted')
    f1 = f1_score(y_test, model, average='weighted')
    print(f'Precision: {precision:.3f}')
    print(f'Recall: {recall:.3f}')
    print(f'F1-score: {f1:.3f}')
    print()
    print(confusion_matrix(y_test, model))
    print(classification_report(y_test, model))
    return

### 1. Support Vector Machine

In [38]:
eval(y_test, svm_pred)

Precision: 0.851
Recall: 0.847
F1-score: 0.846

[[1859  493]
 [ 231 2141]]
              precision    recall  f1-score   support

           0       0.89      0.79      0.84      2352
           1       0.81      0.90      0.86      2372

    accuracy                           0.85      4724
   macro avg       0.85      0.85      0.85      4724
weighted avg       0.85      0.85      0.85      4724



### 2. Gaussian Naive Bayes

In [39]:
eval(y_test, gnb_pred)

Precision: 0.829
Recall: 0.827
F1-score: 0.827

[[1873  479]
 [ 336 2036]]
              precision    recall  f1-score   support

           0       0.85      0.80      0.82      2352
           1       0.81      0.86      0.83      2372

    accuracy                           0.83      4724
   macro avg       0.83      0.83      0.83      4724
weighted avg       0.83      0.83      0.83      4724



### 3. Ridge Classifier

In [40]:
eval(y_test, rc_pred)

Precision: 0.839
Recall: 0.831
F1-score: 0.830

[[1766  586]
 [ 214 2158]]
              precision    recall  f1-score   support

           0       0.89      0.75      0.82      2352
           1       0.79      0.91      0.84      2372

    accuracy                           0.83      4724
   macro avg       0.84      0.83      0.83      4724
weighted avg       0.84      0.83      0.83      4724



### 4. Linear Discriminant Analysist

In [41]:
eval(y_test, lda_pred)

Precision: 0.839
Recall: 0.831
F1-score: 0.830

[[1766  586]
 [ 214 2158]]
              precision    recall  f1-score   support

           0       0.89      0.75      0.82      2352
           1       0.79      0.91      0.84      2372

    accuracy                           0.83      4724
   macro avg       0.84      0.83      0.83      4724
weighted avg       0.84      0.83      0.83      4724



### 5. Logistic Regression

In [42]:
eval(y_test, lr_pred)

Precision: 0.848
Recall: 0.846
F1-score: 0.846

[[1906  446]
 [ 282 2090]]
              precision    recall  f1-score   support

           0       0.87      0.81      0.84      2352
           1       0.82      0.88      0.85      2372

    accuracy                           0.85      4724
   macro avg       0.85      0.85      0.85      4724
weighted avg       0.85      0.85      0.85      4724

