In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_curve
from scipy import stats

I tried comparing between adaboost using skit-learn, lightGBM liberary and XGboost liberary, and using the basic model with no additional optimization skitlearn gave higher accuracy so it is what I will be using

# loading dataset and checking for outliers

In [11]:
# Load your dataset
file_path = 'Dataset/cleaned_hypertension_data.csv'  # Replace with the path to your dataset
data = pd.read_csv(file_path)

# Outlier detection and removal
z_scores = np.abs(stats.zscore(data))
data = data[(z_scores < 3).all(axis=1)]

# Finding correlation between features and choosing the top K feastures

In [12]:
# Calculating the correlation matrix
correlation_matrix = data.corr()

# Extracting correlations with the target variable (assuming the target variable is 'Class')
feature_correlation = abs(correlation_matrix["Class"])
sorted_features = feature_correlation.sort_values(ascending=False).drop('Class')

# Loop to train the model using top k features and print the accuracy for each k
for k in range(1, len(sorted_features) + 1):
    top_k_features = sorted_features.head(k).index
    X = data[top_k_features]
    y = data['Class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    ada_boost_model = AdaBoostClassifier(random_state=42)
    ada_boost_model.fit(X_train, y_train)
    y_pred_proba = ada_boost_model.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
    optimal_threshold = thresholds[ix]
    y_pred = (y_pred_proba >= optimal_threshold).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Using top {k} features with optimal threshold {optimal_threshold}: Accuracy: {accuracy}")

Using top 1 features with optimal threshold 0.5045387755369072: Accuracy: 0.725
Using top 2 features with optimal threshold 0.49917187361254284: Accuracy: 0.685
Using top 3 features with optimal threshold 0.499970619167732: Accuracy: 0.79
Using top 4 features with optimal threshold 0.5025145679959081: Accuracy: 0.8125
Using top 5 features with optimal threshold 0.5025145679959081: Accuracy: 0.8125
Using top 6 features with optimal threshold 0.501703405077917: Accuracy: 0.805
Using top 7 features with optimal threshold 0.5024233203277794: Accuracy: 0.805
Using top 8 features with optimal threshold 0.4969743937441496: Accuracy: 0.74
Using top 9 features with optimal threshold 0.4992962538045883: Accuracy: 0.8825
Using top 10 features with optimal threshold 0.4975949879322776: Accuracy: 0.88
Using top 11 features with optimal threshold 0.4975949879322776: Accuracy: 0.88
Using top 12 features with optimal threshold 0.4975949879322776: Accuracy: 0.88


# Manual Hyperparameter tuning using top 9 features

In [13]:
# Manual Hyperparameter Tuning for top 9 features
top_9_features = sorted_features.head(9).index
X_top9 = data[top_9_features]
y_top9 = data['Class']

# Define a range of hyperparameters for tuning
n_estimators_range = [50, 100, 150, 200]
learning_rate_range = [0.01, 0.1, 1.0]

best_accuracy = 0
best_params = {}

for n_estimators in n_estimators_range:
    for learning_rate in learning_rate_range:
        model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=42)
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        cv_results = cross_val_score(model, X_top9, y_top9, cv=skf, scoring='accuracy')
        max_accuracy_index = np.argmax(cv_results)
        if cv_results[max_accuracy_index] > best_accuracy:
            best_accuracy = cv_results[max_accuracy_index]
            best_params = {'n_estimators': n_estimators, 'learning_rate': learning_rate}

# Finding the optimal threshold while training the model

In [14]:
# Training the best model with optimal threshold
best_model = AdaBoostClassifier(**best_params, random_state=42)
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X_top9, y_top9, test_size=0.2, random_state=42)
best_model.fit(X_train_final, y_train_final)
y_pred_proba_final = best_model.predict_proba(X_test_final)[:, 1]
precision_final, recall_final, thresholds_final = precision_recall_curve(y_test_final, y_pred_proba_final)
fscore_final = (2 * precision_final * recall_final) / (precision_final + recall_final)
ix_final = np.argmax(fscore_final)
optimal_threshold_final = thresholds_final[ix_final]
y_pred_final = (y_pred_proba_final >= optimal_threshold_final).astype(int)

# Results
 ( when experementing the focus was on achieving the highest Recal since it's medical data , "high sensitivity and avoiding false negative is most important" )

In [15]:
precision_final = precision_score(y_test_final, y_pred_final)
recall_final = recall_score(y_test_final, y_pred_final)
f1_final = f1_score(y_test_final, y_pred_final)
conf_matrix_final = confusion_matrix(y_test_final, y_pred_final)

print(f"Best Parameters: {best_params}")
print(f"Optimal Threshold: {optimal_threshold_final}")
print(f"Accuracy: {accuracy_score(y_test_final, y_pred_final)}")
print(f"Precision: {precision_final}")
print(f"Recall: {recall_final}")
print(f"F1 Score: {f1_final}")
print(f"Confusion Matrix:\n{conf_matrix_final}")

Best Parameters: {'n_estimators': 50, 'learning_rate': 0.1}
Optimal Threshold: 0.47485040925205974
Accuracy: 0.8775
Precision: 0.8168316831683168
Recall: 0.9322033898305084
F1 Score: 0.8707124010554088
Confusion Matrix:
[[186  37]
 [ 12 165]]
