In [None]:
import os, sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score, recall_score, accuracy_score, f1_score, precision_score
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

In [None]:
# Suffix appended to the cutoff-specific file names read and written by this notebook.
cutoff = ' (cutoff 500)'

In [None]:
# Load the main table: first CSV column becomes the index, first row the header.
NT = pd.read_csv(
    'IAD With Hist and Demos (120)',
    header=0,
    index_col=0,
)
NT

In [None]:
# Read the 'TFIDF (120)' file for the configured cutoff suffix
# (first column -> index, first row -> header).
TFIDF = pd.read_csv(
    f'TFIDF (120){cutoff}',
    header=0,
    index_col=0,
)
TFIDF

In [None]:
# Read the 'BioWord2Vecs (120)' file for the configured cutoff suffix
# (first column -> index, first row -> header).
BW2V = pd.read_csv(
    f'BioWord2Vecs (120){cutoff}',
    header=0,
    index_col=0,
)
BW2V

In [None]:
# Target vector: the 'P:C(30)>=1.8' column of NT as a NumPy array.
Labels = NT['P:C(30)>=1.8'].values
# Persist the labels to disk (np.save appends '.npy' when the name has no such extension).
np.save('Labels (120)', Labels)
# The bare expression must be the LAST statement of the cell to be rendered;
# placed before np.save (which returns None) it was silently discarded.
Labels

In [None]:
# Feature table: every NT column from positional index 18 onward, minus the two
# site/location columns and any column whose name contains 'PreferredLanguage'.
# Non-in-place drops are used so RV is always a fresh frame instead of an
# in-place mutation of a slice of NT (which can trigger SettingWithCopyWarning,
# currently hidden by the global warnings filter).
# NOTE(review): the positional offset 18 depends on NT's exact column order — confirm
# it still matches the input file if the upstream export changes.
RV = NT.iloc[:, 18:].drop(columns=['Primary Arterial Site', 'Occlusion Location'])
columns_to_drop = RV.filter(like='PreferredLanguage', axis=1).columns
RV = RV.drop(columns=columns_to_drop)
RV

In [None]:
# Shared preprocessing objects; each is re-fitted per table below, so after this
# cell both hold the parameters learned from RV (the last table processed).
scaler = MinMaxScaler()
imputer = KNNImputer(n_neighbors=5)


def _scale_then_impute(frame):
    """Min-max scale ``frame``, then KNN-impute it, keeping the column labels.

    Both steps rebuild the DataFrame from a bare array without passing an index,
    so the result carries a default RangeIndex rather than ``frame``'s index.
    """
    scaled = pd.DataFrame(scaler.fit_transform(frame), columns=frame.columns)
    return pd.DataFrame(imputer.fit_transform(scaled), columns=scaled.columns)


# NOTE(review): scaler and imputer are fitted on the full tables, i.e. before the
# train/test split performed inside run_model — held-out rows influence the
# preprocessing. Consider fitting inside the split (e.g. via a Pipeline).
BW2V = _scale_then_impute(BW2V)
RV = _scale_then_impute(RV)


In [None]:
# Write the current RV table to disk as CSV (the index is written as the first column).
RV.to_csv('RV (120)')

In [None]:
# Index-on-index join of the two feature tables (merge defaults to how='inner';
# overlapping column names, if any, receive the default '_x'/'_y' suffixes).
# NOTE(review): both frames were rebuilt without an index in the preprocessing cell,
# so rows are paired by position — confirm both source files share the same row order.
RV_BW2V = RV.merge(BW2V, left_index=True, right_index=True)

In [None]:
# Persist the merged feature table under a cutoff-specific file name, then display it.
RV_BW2V.to_csv(f'RV_BW2V (120){cutoff}')
RV_BW2V

In [None]:
# Persist the current BW2V table under a cutoff-specific file name, then display it.
BW2V.to_csv(f'BW2V (120){cutoff}')
BW2V

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
import xgboost as xgb

def run_model(X, y, model, test_size=0.3, random_state=None):
    """Fit ``model`` on one stratified random split and score the held-out rows.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix; passed to ``train_test_split`` and ``model.fit``.
    y : array-like
        Labels; also used as the stratification target so both splits keep
        the class proportions of ``y``.
    model : estimator
        Object exposing ``fit`` and ``predict_proba``. It is fitted in place
        (the caller's instance is modified).
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``; the default is the value that was
        previously hard-coded.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour (a different, unseeded split on every call).

    Returns
    -------
    tuple
        ``(y_proba, y_test)``: column 1 of ``model.predict_proba`` for the
        held-out rows, and the corresponding true labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state,
    )
    model.fit(X_train, y_train)
    # Column 1 is the probability of the second entry of model.classes_
    # (the positive class for a binary 0/1 or False/True target).
    y_proba = model.predict_proba(X_test)[:, 1]
    return y_proba, y_test

def bootstrap(X, y, n_bootstraps, model):
    """Estimate the spread of a model's ROC AUC over repeated random splits.

    ``run_model`` is called ``n_bootstraps`` times; each call draws a new
    stratified train/test split, refits ``model`` and returns held-out
    probabilities, from which one AUC is computed. Note that this is repeated
    random hold-out evaluation, not resampling with replacement.

    Parameters
    ----------
    X, y : passed unchanged to ``run_model``. ``y`` must support ``.shape``
        and ``.sum()`` (e.g. a NumPy array).
    n_bootstraps : int
        Number of split/fit/score repetitions; must be at least 1.
    model : estimator passed to ``run_model`` (refitted on every repetition).

    Returns
    -------
    dict
        ``'number of samples'``, ``'number of positive labels'`` and ``'auc'``,
        the latter holding the arithmetic ``'mean'``, the ``'median'`` and the
        2.5% / 97.5% order statistics of the collected AUC values.
        The dict is also printed.

    Raises
    ------
    ValueError
        If ``n_bootstraps`` is smaller than 1.
    """
    # Fail before any fitting; previously this surfaced as an IndexError
    # on the empty result list after the loop.
    if n_bootstraps < 1:
        raise ValueError(f'n_bootstraps must be >= 1, got {n_bootstraps}')

    auc_values = []
    for _ in range(n_bootstraps):
        y_proba, y_label = run_model(X, y, model)
        fpr, tpr, _thresholds = roc_curve(y_label, y_proba)
        auc_values.append(auc(fpr, tpr))

    auc_values.sort()

    # Positions of the order statistics in the sorted list; floor keeps every
    # index strictly below n_bootstraps.
    low_ci = int(np.floor(.025 * n_bootstraps))
    median = int(np.floor(.5 * n_bootstraps))
    high_ci = int(np.floor(.975 * n_bootstraps))

    metrics = {
        'number of samples': y.shape[0],
        'number of positive labels': int(y.sum()),
        'auc': {
            # 'mean' used to hold the median order statistic; it is now the
            # true arithmetic mean and the median is reported under its own key.
            'mean': np.mean(auc_values),
            'median': auc_values[median],
            '95% CI lower bound': auc_values[low_ci],
            '95% CI upper bound': auc_values[high_ci],
        },
    }
    print(metrics)
    return metrics

# Candidate classifiers, all with library-default hyperparameters.
# SVC needs probability=True because run_model calls predict_proba.
model_list = [
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('SVC', SVC(probability=True)),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Ada Boost', AdaBoostClassifier()),
    ('Gradient Boosted', GradientBoostingClassifier()),
    ('Gaussian Naive Bayes', GaussianNB()),
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
    ('Quadratic Discriminant Analysis', QuadraticDiscriminantAnalysis()),
    ('XGBoost', xgb.XGBClassifier())
]

# Number of split/fit/score repetitions per model (lower it for a quick smoke run).
N_BOOTSTRAPS = 1000

with open('model_results '+cutoff+'.txt', 'w') as file:
    # Evaluate every model on the merged feature table and log one line per model.
    for model_name, model in model_list:
        print(f'Running bootstrap for {model_name}')
        metrics = bootstrap(RV_BW2V, Labels, N_BOOTSTRAPS, model)
        output = f'{model_name} metrics: {metrics}\n'
        print(output)
        file.write(output)
        # Push each finished result to disk immediately: the full run is long,
        # and a kernel restart/kill would otherwise discard buffered lines.
        file.flush()
