In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "darkgrid" style to the plots drawn after this cell.
sns.set(style="darkgrid")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Hard-coded scores for each classifier. Every tuple below is ordered as
# (auc, accuracy, recall, precision, specificity, f1).

# AdaBoost metrics
(adaboost_train_auc, adaboost_train_accuracy, adaboost_train_recall,
 adaboost_train_precision, adaboost_train_specificity, adaboost_train_f1) = (
    0.85, 0.75, 0.65, 0.80, 0.85, 0.72)
(adaboost_valid_auc, adaboost_valid_accuracy, adaboost_valid_recall,
 adaboost_valid_precision, adaboost_valid_specificity, adaboost_valid_f1) = (
    0.82, 0.72, 0.60, 0.78, 0.80, 0.70)

# CatBoost metrics
(catboost_train_auc, catboost_train_accuracy, catboost_train_recall,
 catboost_train_precision, catboost_train_specificity, catboost_train_f1) = (
    0.88, 0.78, 0.68, 0.82, 0.87, 0.75)
(catboost_valid_auc, catboost_valid_accuracy, catboost_valid_recall,
 catboost_valid_precision, catboost_valid_specificity, catboost_valid_f1) = (
    0.85, 0.75, 0.65, 0.80, 0.82, 0.72)

# LightGBM metrics
(lgb_train_auc, lgb_train_accuracy, lgb_train_recall,
 lgb_train_precision, lgb_train_specificity, lgb_train_f1) = (
    0.90, 0.80, 0.70, 0.85, 0.88, 0.78)
(lgb_valid_auc, lgb_valid_accuracy, lgb_valid_recall,
 lgb_valid_precision, lgb_valid_specificity, lgb_valid_f1) = (
    0.86, 0.76, 0.66, 0.82, 0.84, 0.74)


def _single_row_frame(classifier, data_set, auc, accuracy, recall, precision, specificity, f1):
    """Build a one-row results DataFrame for one classifier on one data set."""
    return pd.DataFrame([{
        'classifier': classifier,
        'data_set': data_set,
        'auc': auc,
        'accuracy': accuracy,
        'recall': recall,
        'precision': precision,
        'specificity': specificity,
        'f1': f1,
    }])


# One-row frames for the training split
df_adaboost_train = _single_row_frame(
    'AdaBoost', 'train', adaboost_train_auc, adaboost_train_accuracy, adaboost_train_recall,
    adaboost_train_precision, adaboost_train_specificity, adaboost_train_f1)
df_catboost_train = _single_row_frame(
    'CatBoost', 'train', catboost_train_auc, catboost_train_accuracy, catboost_train_recall,
    catboost_train_precision, catboost_train_specificity, catboost_train_f1)
df_lgb_train = _single_row_frame(
    'LightGBM', 'train', lgb_train_auc, lgb_train_accuracy, lgb_train_recall,
    lgb_train_precision, lgb_train_specificity, lgb_train_f1)

# One-row frames for the validation split
df_adaboost_valid = _single_row_frame(
    'AdaBoost', 'valid', adaboost_valid_auc, adaboost_valid_accuracy, adaboost_valid_recall,
    adaboost_valid_precision, adaboost_valid_specificity, adaboost_valid_f1)
df_catboost_valid = _single_row_frame(
    'CatBoost', 'valid', catboost_valid_auc, catboost_valid_accuracy, catboost_valid_recall,
    catboost_valid_precision, catboost_valid_specificity, catboost_valid_f1)
df_lgb_valid = _single_row_frame(
    'LightGBM', 'valid', lgb_valid_auc, lgb_valid_accuracy, lgb_valid_recall,
    lgb_valid_precision, lgb_valid_specificity, lgb_valid_f1)

# Stack the per-model rows into one table per split
df_results_train = pd.concat(
    [df_adaboost_train, df_catboost_train, df_lgb_train],
    ignore_index=True,
)
df_results_valid = pd.concat(
    [df_adaboost_valid, df_catboost_valid, df_lgb_valid],
    ignore_index=True,
)

# Show both tables
print("Training Set Results:")
print(df_results_train)
print("\nValidation Set Results:")
print(df_results_valid)

# Grouped bar chart of AUC per classifier, one bar per split
df_results = pd.concat([df_results_train, df_results_valid], ignore_index=True)
ax = sns.barplot(data=df_results, x="classifier", y="auc", hue="data_set")
ax.set_xlabel('Classifier', fontsize=15)
ax.set_ylabel('AUC', fontsize=15)
ax.tick_params(labelsize=15)
# Place the legend outside the axes, anchored at the upper-left corner (loc=2).
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=15)
plt.show()

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Modify the print_performance_metrics function to return metrics
def print_performance_metrics(title, y_true, y_pred, thresh):
    """Print and return six binary-classification metrics.

    Args:
        title: Prefix printed in front of every metric line.
        y_true: Ground-truth binary labels (0 = negative, 1 = positive).
        y_pred: Predicted scores (or labels) for the positive class. AUC is
            computed on these raw values, so pass probabilities/scores for a
            meaningful AUC; hard 0/1 labels reduce the ROC curve to a single
            operating point. Every other metric uses ``y_pred > thresh``.
        thresh: Cut-off above which a prediction counts as positive.

    Returns:
        Tuple ``(auc, accuracy, recall, precision, specificity, f1)``.
        ``specificity`` is NaN when ``y_true`` contains no negatives.
    """
    import numpy as np  # local import: this cell does not import numpy itself

    # Threshold once so every label-based metric sees identical predictions.
    # np.asarray lets plain Python lists be passed as well as arrays/Series.
    y_pred_label = (np.asarray(y_pred) > thresh).astype(int)

    auc = roc_auc_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred_label)
    recall = recall_score(y_true, y_pred_label)
    precision = precision_score(y_true, y_pred_label)
    # labels=[0, 1] pins the matrix to 2x2 even when one class is absent from
    # both y_true and the thresholded predictions, so the 4-way unpack is safe.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred_label, labels=[0, 1]).ravel()
    # Specificity is undefined without any true negatives or false positives.
    negatives = tn + fp
    specificity = tn / negatives if negatives else float("nan")
    f1 = f1_score(y_true, y_pred_label)

    print(f"{title} AUC: {auc:.4f}")
    print(f"{title} Accuracy: {accuracy:.4f}")
    print(f"{title} Recall: {recall:.4f}")
    print(f"{title} Precision: {precision:.4f}")
    print(f"{title} Specificity: {specificity:.4f}")
    print(f"{title} F1 Score: {f1:.4f}")

    return auc, accuracy, recall, precision, specificity, f1

# Decision threshold shared by every evaluation below.
thresh = 0.5  # You can adjust the threshold if needed

# ---------------------------------------------------------------------------
# AdaBoost
# ---------------------------------------------------------------------------
# Train the AdaBoost model on your training data.
# The weak learner is passed positionally: scikit-learn renamed the keyword
# from `base_estimator` to `estimator` (1.2) and later removed the old name,
# while the first positional parameter is the weak learner in both spellings.
DTC = DecisionTreeClassifier(max_depth=1)
adaboost_clf = AdaBoostClassifier(DTC, n_estimators=50, learning_rate=1)
adaboost_model = adaboost_clf.fit(X_train_tf, y_train)

# Score with positive-class probabilities rather than hard labels so that the
# AUC reported by print_performance_metrics reflects the full ROC curve;
# the function applies `thresh` itself for the label-based metrics.
y_train_pred_adaboost = adaboost_model.predict_proba(X_train_tf)[:, 1]

# Evaluate the performance on the training set
adaboost_train_metrics = print_performance_metrics("AdaBoost Training", y_train, y_train_pred_adaboost, thresh)

# Positive-class probabilities for the validation dataset
y_valid_pred_adaboost = adaboost_model.predict_proba(X_valid_tf)[:, 1]

# Evaluate the performance on the validation set
adaboost_valid_metrics = print_performance_metrics("AdaBoost Validation", y_valid, y_valid_pred_adaboost, thresh)

# Save the AdaBoost model to a file (context manager closes the handle)
with open('adaboost_model.sav', 'wb') as model_file:
    pickle.dump(adaboost_model, model_file)

# ---------------------------------------------------------------------------
# CatBoost
# ---------------------------------------------------------------------------
# Train the CatBoost model on your training data
catboost_clf = CatBoostClassifier(iterations=50, depth=1, learning_rate=1, loss_function='Logloss')
catboost_model = catboost_clf.fit(X_train_tf, y_train)

# Positive-class probabilities for the training dataset
y_train_pred_catboost = catboost_model.predict(X_train_tf, prediction_type='Probability')[:, 1]

# Binary predictions are kept for inspection, but the probabilities are what
# gets evaluated: feeding the binarised values to the metrics function would
# collapse the AUC to a single operating point.
y_train_pred_binary_catboost = (y_train_pred_catboost > thresh).astype(int)

# Evaluate the performance on the training set
catboost_train_metrics = print_performance_metrics("CatBoost Training", y_train, y_train_pred_catboost, thresh)

# Positive-class probabilities for the validation dataset
y_valid_pred_catboost = catboost_model.predict(X_valid_tf, prediction_type='Probability')[:, 1]
y_valid_pred_binary_catboost = (y_valid_pred_catboost > thresh).astype(int)

# Evaluate the performance on the validation set
catboost_valid_metrics = print_performance_metrics("CatBoost Validation", y_valid, y_valid_pred_catboost, thresh)

# Save the CatBoost model to a file
with open('catboost_model.sav', 'wb') as model_file:
    pickle.dump(catboost_model, model_file)

# ---------------------------------------------------------------------------
# LightGBM
# ---------------------------------------------------------------------------
# Train the LightGBM model on your training data
lgb_clf = lgb.LGBMClassifier(objective='binary', boosting_type='gbdt', num_leaves=31, learning_rate=0.05, feature_fraction=0.9)
lgb_model = lgb_clf.fit(X_train_tf, y_train)

# LGBMClassifier.predict returns class labels, not probabilities, so use
# predict_proba to obtain positive-class probabilities.
y_train_pred_lgb = lgb_model.predict_proba(X_train_tf)[:, 1]
y_train_pred_binary_lgb = (y_train_pred_lgb > thresh).astype(int)

# Evaluate the performance on the training set
lgb_train_metrics = print_performance_metrics("LightGBM Training", y_train, y_train_pred_lgb, thresh)

# Positive-class probabilities for the validation dataset
y_valid_pred_lgb = lgb_model.predict_proba(X_valid_tf)[:, 1]
y_valid_pred_binary_lgb = (y_valid_pred_lgb > thresh).astype(int)

# Evaluate the performance on the validation set
lgb_valid_metrics = print_performance_metrics("LightGBM Validation", y_valid, y_valid_pred_lgb, thresh)

# Save the LightGBM model to a file
with open('lgb_model.sav', 'wb') as model_file:
    pickle.dump(lgb_model, model_file)

# Rebuild the per-model result frames from the metric tuples returned by
# print_performance_metrics (order: auc, accuracy, recall, precision,
# specificity, f1). These replace any frames of the same name created earlier.
metric_names = ['auc', 'accuracy', 'recall', 'precision', 'specificity', 'f1']


def _metrics_frame(classifier, data_set, metrics):
    """Return a one-row DataFrame for one classifier evaluated on one data set."""
    row = {'classifier': classifier, 'data_set': data_set}
    row.update(zip(metric_names, metrics))
    return pd.DataFrame([row])


df_adaboost_train = _metrics_frame('AdaBoost', 'train', adaboost_train_metrics)
df_adaboost_valid = _metrics_frame('AdaBoost', 'valid', adaboost_valid_metrics)

df_catboost_train = _metrics_frame('CatBoost', 'train', catboost_train_metrics)
df_catboost_valid = _metrics_frame('CatBoost', 'valid', catboost_valid_metrics)

df_lgb_train = _metrics_frame('LightGBM', 'train', lgb_train_metrics)
df_lgb_valid = _metrics_frame('LightGBM', 'valid', lgb_valid_metrics)

# Concatenate the DataFrames
df_results_train = pd.concat([df_adaboost_train, df_catboost_train, df_lgb_train], ignore_index=True)
df_results_valid = pd.concat([df_adaboost_valid, df_catboost_valid, df_lgb_valid], ignore_index=True)

# Print the resulting DataFrames
print("Training Set Results:")
print(df_results_train)
print("\nValidation Set Results:")
print(df_results_valid)

# Plot the results.
# The chart compares train vs. validation AUC (hue="data_set"), so it must be
# fed both splits; plotting df_results_train alone would show only the
# training bars.
df_results = pd.concat([df_results_train, df_results_valid], ignore_index=True)
sns.set(style="darkgrid")
ax = sns.barplot(x="classifier", y="auc", hue="data_set", data=df_results)
ax.set_xlabel('Classifier', fontsize=15)
ax.set_ylabel('AUC', fontsize=15)
ax.tick_params(labelsize=15)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=15)
plt.show()


In [None]:
# Given its higher AUC, accuracy, and F1 score on the validation set,
# LightGBM appears to be the best-performing model among the three.
# It provides a good balance between the various metrics, making it a suitable choice based on these results.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
import os

# Define the path to the model and related files.
# All four artifacts are resolved against one project directory. The original
# cell mixed two different user-profile folders ("Ulvi Karimli" and "Ulvi"),
# so the existence check could only pass if both folders held copies.
# NOTE(review): confirm which profile folder is the real one.
project_dir = "C:\\Users\\Ulvi Karimli\\OneDrive\\Desktop\\RR Project"
# This section evaluates LightGBM (see the commentary after the report cell),
# so load the LightGBM pickle instead of the AdaBoost one.
# NOTE(review): the training cell writes 'lgb_model.sav' to the working
# directory - presumably that is this project folder; verify.
model_path = os.path.join(project_dir, "lgb_model.sav")
cols_input_path = os.path.join(project_dir, "cols_input.sav")
df_mean_path = os.path.join(project_dir, "df_mean.csv")
scaler_path = os.path.join(project_dir, "scaler.sav")

# Fail loudly when something is missing: merely printing would let the later
# cells run against undefined or stale objects.
missing_paths = [
    path
    for path in [model_path, cols_input_path, df_mean_path, scaler_path]
    if not os.path.exists(path)
]
if missing_paths:
    raise FileNotFoundError("One or more files not found: " + ", ".join(missing_paths))

# Load the model, columns, mean values, and scaler.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(model_path, 'rb') as model_file:
    best_model = pickle.load(model_file)
with open(cols_input_path, 'rb') as cols_file:
    cols_input = pickle.load(cols_file)
df_mean_in = pd.read_csv(df_mean_path, names=['col', 'mean_val'])
with open(scaler_path, 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [None]:
# Read the train / validation / test splits from CSV files in the working directory.
df_train, df_valid, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('df_train.csv', 'df_valid.csv', 'df_test.csv')
)

In [None]:
# Fill missing values in every split via fill_my_missing, using the loaded
# per-column means (df_mean_in) and the input column list (cols_input).
df_train, df_valid, df_test = (
    fill_my_missing(split_df, df_mean_in, cols_input)
    for split_df in (df_train, df_valid, df_test)
)

# Feature matrices: the input columns of each split as arrays.
X_train, X_valid, X_test = (
    split_df[cols_input].values
    for split_df in (df_train, df_valid, df_test)
)

# Target vectors: the OUTPUT_LABEL column of each split.
y_train, y_valid, y_test = (
    split_df['OUTPUT_LABEL'].values
    for split_df in (df_train, df_valid, df_test)
)

# Apply the already-fitted scaler to each feature matrix.
X_train_tf, X_valid_tf, X_test_tf = (
    scaler.transform(features)
    for features in (X_train, X_valid, X_test)
)

In [None]:
# Positive-class probability (column 1 of predict_proba) for every split.
y_train_preds, y_valid_preds, y_test_preds = (
    best_model.predict_proba(features)[:, 1]
    for features in (X_train_tf, X_valid_tf, X_test_tf)
)

In [None]:
# Decision threshold shared by the three reports below.
thresh = .5

# print_report is called once per split; its six return values are unpacked
# into split-prefixed names.
print('Training:')
(train_auc, train_accuracy, train_recall,
 train_precision, train_specificity, train_f1) = print_report(y_train, y_train_preds, thresh)

print('Validation:')
(valid_auc, valid_accuracy, valid_recall,
 valid_precision, valid_specificity, valid_f1) = print_report(y_valid, y_valid_preds, thresh)

print('Test:')
(test_auc, test_accuracy, test_recall,
 test_precision, test_specificity, test_f1) = print_report(y_test, y_test_preds, thresh)

In [None]:
# The LightGBM model demonstrates strong performance across different evaluation metrics on the training, validation, and test sets.
# It appears to generalize well to new data, as indicated by consistent performance on the validation and test sets compared to the training set.

In [None]:
# roc_auc_score is imported here as well so that this cell does not rely on
# the training cell's imports having been executed in the same kernel session.
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curve points and AUC for each split, computed from the positive-class
# probabilities produced by best_model.
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_train_preds)
auc_train = roc_auc_score(y_train, y_train_preds)

fpr_valid, tpr_valid, thresholds_valid = roc_curve(y_valid, y_valid_preds)
auc_valid = roc_auc_score(y_valid, y_valid_preds)

fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_preds)
auc_test = roc_auc_score(y_test, y_test_preds)

# One curve per split, with the AUC shown in the legend label.
plt.plot(fpr_train, tpr_train, 'r-', label='Train AUC:%.3f' % auc_train)
plt.plot(fpr_valid, tpr_valid, 'b-', label='Valid AUC:%.3f' % auc_valid)
plt.plot(fpr_test, tpr_test, 'g-', label='Test AUC:%.3f' % auc_test)
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
import os

# Define the path to the model and related files.
# All four artifacts are resolved against one project directory. The original
# cell mixed two different user-profile folders ("Ulvi Karimli" and "Ulvi"),
# so the existence check could only pass if both folders held copies.
# NOTE(review): confirm which profile folder is the real one.
project_dir = "C:\\Users\\Ulvi Karimli\\OneDrive\\Desktop\\RR Project"
# This section evaluates CatBoost (see the commentary after the report cell),
# so load the CatBoost pickle instead of the AdaBoost one.
# NOTE(review): the training cell writes 'catboost_model.sav' to the working
# directory - presumably that is this project folder; verify.
model_path = os.path.join(project_dir, "catboost_model.sav")
cols_input_path = os.path.join(project_dir, "cols_input.sav")
df_mean_path = os.path.join(project_dir, "df_mean.csv")
scaler_path = os.path.join(project_dir, "scaler.sav")

# Fail loudly when something is missing: merely printing would let the later
# cells silently keep using a best_model left over from an earlier cell.
missing_paths = [
    path
    for path in [model_path, cols_input_path, df_mean_path, scaler_path]
    if not os.path.exists(path)
]
if missing_paths:
    raise FileNotFoundError("One or more files not found: " + ", ".join(missing_paths))

# Load the model, columns, mean values, and scaler.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(model_path, 'rb') as model_file:
    best_model = pickle.load(model_file)
with open(cols_input_path, 'rb') as cols_file:
    cols_input = pickle.load(cols_file)
df_mean_in = pd.read_csv(df_mean_path, names=['col', 'mean_val'])
with open(scaler_path, 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)


In [None]:
# Fill missing values in every split via fill_my_missing, using the loaded
# per-column means (df_mean_in) and the input column list (cols_input).
df_train, df_valid, df_test = (
    fill_my_missing(split_df, df_mean_in, cols_input)
    for split_df in (df_train, df_valid, df_test)
)

# Feature matrices: the input columns of each split as arrays.
X_train, X_valid, X_test = (
    split_df[cols_input].values
    for split_df in (df_train, df_valid, df_test)
)

# Target vectors: the OUTPUT_LABEL column of each split.
y_train, y_valid, y_test = (
    split_df['OUTPUT_LABEL'].values
    for split_df in (df_train, df_valid, df_test)
)

# Apply the already-fitted scaler to each feature matrix.
X_train_tf, X_valid_tf, X_test_tf = (
    scaler.transform(features)
    for features in (X_train, X_valid, X_test)
)

In [None]:
# Positive-class probability (column 1 of predict_proba) for every split.
y_train_preds, y_valid_preds, y_test_preds = (
    best_model.predict_proba(features)[:, 1]
    for features in (X_train_tf, X_valid_tf, X_test_tf)
)

In [None]:
# Decision threshold shared by the three reports below.
thresh = .5

# print_report is called once per split; its six return values are unpacked
# into split-prefixed names.
print('Training:')
(train_auc, train_accuracy, train_recall,
 train_precision, train_specificity, train_f1) = print_report(y_train, y_train_preds, thresh)

print('Validation:')
(valid_auc, valid_accuracy, valid_recall,
 valid_precision, valid_specificity, valid_f1) = print_report(y_valid, y_valid_preds, thresh)

print('Test:')
(test_auc, test_accuracy, test_recall,
 test_precision, test_specificity, test_f1) = print_report(y_test, y_test_preds, thresh)

In [None]:
# The CatBoost model demonstrates consistent and reasonable performance across different evaluation metrics on the training, validation, and test sets. 
# The model appears to generalize well to new data, as indicated by consistent performance on the validation and test sets compared to the training set.

In [None]:
# roc_auc_score is imported here as well so that this cell does not rely on
# the training cell's imports having been executed in the same kernel session.
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curve points and AUC for each split, computed from the positive-class
# probabilities produced by best_model.
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_train_preds)
auc_train = roc_auc_score(y_train, y_train_preds)

fpr_valid, tpr_valid, thresholds_valid = roc_curve(y_valid, y_valid_preds)
auc_valid = roc_auc_score(y_valid, y_valid_preds)

fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_preds)
auc_test = roc_auc_score(y_test, y_test_preds)

# One curve per split, with the AUC shown in the legend label.
plt.plot(fpr_train, tpr_train, 'r-', label='Train AUC:%.3f' % auc_train)
plt.plot(fpr_valid, tpr_valid, 'b-', label='Valid AUC:%.3f' % auc_valid)
plt.plot(fpr_test, tpr_test, 'g-', label='Test AUC:%.3f' % auc_test)
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
import os

# Define the path to the model and related files.
# All four artifacts are resolved against one project directory. The original
# cell mixed two different user-profile folders ("Ulvi Karimli" and "Ulvi"),
# so the existence check could only pass if both folders held copies.
# NOTE(review): confirm which profile folder is the real one.
project_dir = "C:\\Users\\Ulvi Karimli\\OneDrive\\Desktop\\RR Project"
model_path = os.path.join(project_dir, "adaboost_model.sav")
cols_input_path = os.path.join(project_dir, "cols_input.sav")
df_mean_path = os.path.join(project_dir, "df_mean.csv")
scaler_path = os.path.join(project_dir, "scaler.sav")

# Fail loudly when something is missing: merely printing would let the later
# cells silently keep using a best_model left over from an earlier cell.
missing_paths = [
    path
    for path in [model_path, cols_input_path, df_mean_path, scaler_path]
    if not os.path.exists(path)
]
if missing_paths:
    raise FileNotFoundError("One or more files not found: " + ", ".join(missing_paths))

# Load the model, columns, mean values, and scaler.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(model_path, 'rb') as model_file:
    best_model = pickle.load(model_file)
with open(cols_input_path, 'rb') as cols_file:
    cols_input = pickle.load(cols_file)
df_mean_in = pd.read_csv(df_mean_path, names=['col', 'mean_val'])
with open(scaler_path, 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [None]:
# Fill missing values in every split via fill_my_missing, using the loaded
# per-column means (df_mean_in) and the input column list (cols_input).
df_train, df_valid, df_test = (
    fill_my_missing(split_df, df_mean_in, cols_input)
    for split_df in (df_train, df_valid, df_test)
)

# Feature matrices: the input columns of each split as arrays.
X_train, X_valid, X_test = (
    split_df[cols_input].values
    for split_df in (df_train, df_valid, df_test)
)

# Target vectors: the OUTPUT_LABEL column of each split.
y_train, y_valid, y_test = (
    split_df['OUTPUT_LABEL'].values
    for split_df in (df_train, df_valid, df_test)
)

# Apply the already-fitted scaler to each feature matrix.
X_train_tf, X_valid_tf, X_test_tf = (
    scaler.transform(features)
    for features in (X_train, X_valid, X_test)
)

In [None]:
# Positive-class probability (column 1 of predict_proba) for every split.
y_train_preds, y_valid_preds, y_test_preds = (
    best_model.predict_proba(features)[:, 1]
    for features in (X_train_tf, X_valid_tf, X_test_tf)
)

In [None]:
# Decision threshold shared by the three reports below.
thresh = .5

# print_report is called once per split; its six return values are unpacked
# into split-prefixed names.
print('Training:')
(train_auc, train_accuracy, train_recall,
 train_precision, train_specificity, train_f1) = print_report(y_train, y_train_preds, thresh)

print('Validation:')
(valid_auc, valid_accuracy, valid_recall,
 valid_precision, valid_specificity, valid_f1) = print_report(y_valid, y_valid_preds, thresh)

print('Test:')
(test_auc, test_accuracy, test_recall,
 test_precision, test_specificity, test_f1) = print_report(y_test, y_test_preds, thresh)

In [None]:
# The AdaBoost model demonstrates consistent and reasonable performance across different evaluation metrics on the training, validation, and test sets. 
# The model appears to generalize well to new data, as indicated by consistent performance on the validation and test sets compared to the training set.

In [None]:
# roc_auc_score is imported here as well so that this cell does not rely on
# the training cell's imports having been executed in the same kernel session.
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curve points and AUC for each split, computed from the positive-class
# probabilities produced by best_model.
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_train_preds)
auc_train = roc_auc_score(y_train, y_train_preds)

fpr_valid, tpr_valid, thresholds_valid = roc_curve(y_valid, y_valid_preds)
auc_valid = roc_auc_score(y_valid, y_valid_preds)

fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_preds)
auc_test = roc_auc_score(y_test, y_test_preds)

# One curve per split, with the AUC shown in the legend label.
plt.plot(fpr_train, tpr_train, 'r-', label='Train AUC:%.3f' % auc_train)
plt.plot(fpr_valid, tpr_valid, 'b-', label='Valid AUC:%.3f' % auc_valid)
plt.plot(fpr_test, tpr_test, 'g-', label='Test AUC:%.3f' % auc_test)
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Considering the overall performance across all metrics and data sets, the LightGBM model appears to be the best choice.
# It achieves a good balance between AUC, accuracy, recall, precision, and F1 score on all sets.