In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier

# Test set: its 'num' column supplies the ground-truth labels.
data_test = pd.read_csv('../data/heart_disease/data_test.csv')
true_labels = data_test['num']

# Saved base-model predictions, each reduced to one integer Series.
# The BERT and Mistral files are read positionally (first column), while the
# traditional-model file is read by its 'Predictions' column.
bert_predictions = pd.read_csv('results/bert.csv').iloc[:, 0].astype(int)
mistral_predictions = pd.read_csv('results/mistral.csv').iloc[:, 0].astype(int)
traditional_predictions = (
    pd.read_csv('results/traditional.csv')['Predictions'].astype(int)
)

# One column per base model; these columns are the meta-model's input features.
# NOTE(review): rows are matched by index, so every file is assumed to list
# the test samples in the same order as data_test — confirm.
stacking_features = pd.DataFrame(
    data={
        'BERT': bert_predictions,
        'Mistral': mistral_predictions,
        'Random_Forest': traditional_predictions,
    }
)

# Candidate meta-models, keyed by the display name used when reporting results.
# The tree-based estimators draw random numbers while fitting, so they share a
# fixed seed to make repeated runs of this cell produce the same numbers.
RANDOM_STATE = 42

models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
}

# Train and evaluate each meta-model.
# Scoring a model on the very rows it was fitted on overstates its accuracy,
# so every reported prediction is out-of-fold: it comes from a clone of the
# model that was fitted without that row (5-fold cross-validation).
for name, model in models.items():
    predictions = cross_val_predict(model, stacking_features, true_labels, cv=5)
    accuracy = accuracy_score(true_labels, predictions)
    print(f'Accuracy of {name}: {accuracy:.2f}')

    # Generate the confusion matrix (rows: true labels, columns: predictions)
    print(confusion_matrix(true_labels, predictions))

    # cross_val_predict only fits clones; fit the original estimator on all
    # rows so the entries of `models` are still fitted after this loop.
    model.fit(stacking_features, true_labels)






Accuracy of Logistic Regression: 0.91
[[ 67   8]
 [  9 100]]
Accuracy of Decision Tree: 0.91
[[ 67   8]
 [  9 100]]
Accuracy of Gradient Boosting: 0.91
[[ 67   8]
 [  9 100]]
Accuracy of Random Forest: 0.91
[[ 67   8]
 [  9 100]]
