<a href="https://colab.research.google.com/github/stedua22/6372-Project-2/blob/main/ML1_Project_Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 2: Classification - An AIDS Dataset
* Authors: Aaron Abromowitz | Catherine Ticzon | Stephanie Duarte | David Camacho
* Date of Submission: June 30, 2024

## Data Preparation

In [None]:
# Imports
import copy
import pandas as pd
import pandas.api.types as ptypes
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import make_scorer, log_loss
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import statsmodels.formula.api as smf
import xgboost as xgb
from xgboost import XGBClassifier

### Define and prepare your class variables.

In [None]:
# Pull in data
# Read the AIDS classification CSV straight from GitHub (needs network access).
# `df` is the working DataFrame that the rest of the notebook mutates in place.
url = "https://raw.githubusercontent.com/aabromowitz/ML1_Project/main/AIDS_Classification_50000.csv"
df = pd.read_csv(url)

Many of the variables are categorical, but their categories are stored as numeric codes that each mean something specific.  To make the data clearer, we replace the codes with more descriptive category labels.

In [None]:
# Re-label the coded variables.
# Each entry maps a column to its {numeric code: descriptive label} lookup.
# Codes that are not listed are left as they are by Series.replace.
_code_labels = {
  'trt': {0: '0: ZDV only', 1: '1: ZDV + ddl', 2: '2: ZDV + Zal', 3: '3: ddl only'},
  'race': {0: '0: white', 1: '1: non-white'},
  'gender': {0: '0: Female', 1: '1: Male'},
  'str2': {0: '0: naive', 1: '1: experienced'},
  'strat': {1: '1: Antiretroviral Naive', 2: '2: <= 52 weeks', 3: '3: > 52 weeks'},
  'symptom': {0: '0: asymp', 1: '1: symp'},
  'treat': {0: '0: ZDV only', 1: '1: others'},
}
for _column, _labels in _code_labels.items():
  df[_column] = df[_column].replace(_labels)

# Make the 0 / 1 (i.e. True / False) columns and the re-labelled columns categorical
_categorical_columns = [
  'hemo', 'homo', 'drugs', 'oprior', 'z30', 'offtrt', 'infected',
  'trt', 'race', 'gender', 'str2', 'strat', 'symptom', 'treat',
]
for _column in _categorical_columns:
  df[_column] = pd.Categorical(df[_column])

In the first lab, we decided to add some additional data columns: change in CD4 counts from baseline to 20 weeks, change in CD8 counts from baseline to 20 weeks, an overall risk score, and if they were on a combined treatment regime.

In [None]:
# Add extra columns
# Change in CD4 / CD8 counts from baseline to 20 weeks
df['cd4_change'] = df['cd420'] - df['cd40']
df['cd8_change'] = df['cd820'] - df['cd80']

# Risk score: the number of hemo / homo / drugs flags that are set, stored as a category
df['risk_score'] = pd.Categorical(df[['hemo', 'homo', 'drugs']].astype(int).sum(axis=1))

# Combined-treatment flag: 0 for the two single-drug arms, 1 for every other arm
_single_drug_arms = ['0: ZDV only', '3: ddl only']
df['trt_comb'] = pd.Categorical((~df['trt'].isin(_single_drug_arms)).astype('int64'))

We also created PCA and LDA columns in the first lab.  Since these are dimension-reduction columns that take all the numeric columns into account, including them could be useful for predictions.

In [None]:
# Normalization
# Min-max scale the nine numeric columns so PCA and LDA see them on a common scale.
selected_columns = df[['time','age','wtkg','karnof','preanti','cd40','cd420','cd80','cd820']]
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(selected_columns)
normalized_df = pd.DataFrame(normalized_data, columns=selected_columns.columns)

# PCA
# Fit on all rows and store the two component scores as new columns of df.
pca = PCA(n_components=2)  # Reduce to 2 principal components
principal_components = pca.fit_transform(normalized_df)
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
df['PC1'] = pca_df['PC1']
df['PC2'] = pca_df['PC2']

# LDA
# NOTE(review): the LDA is fit on every row with df['infected'] as the label, and the
# resulting 'lda' column is later used as a feature when predicting 'infected' under
# cross-validation. That looks like target leakage - consider fitting inside each
# training fold instead.
lda = LDA(n_components=1) # Can only use 1 component, since only 2 classes
# X_lda holds the fitted transform output; it is not used again in this cell.
X_lda = lda.fit(normalized_df, df['infected']).transform(normalized_df)
# The stored 'lda' column is the raw projection onto lda.scalings_, not X_lda.
df_lda = normalized_df.dot(lda.scalings_)
df['lda'] = df_lda
# df.head()

In order to use certain algorithms (logistic regression, svm, etc.), the categorical variables need to be in a one hot encoding representation.  So this function creates that encoding for all the categorical columns.

In [None]:
# Make a function that does the one-hot encoding
def one_hot_encoding(input_df, target_var):
  """Return a copy of ``input_df`` with its non-numeric columns one-hot encoded.

  Columns are processed in their original order:

  * the ``target_var`` column is passed through untouched;
  * ``int64`` / ``float64`` columns are passed through untouched;
  * every other column is expanded by ``OneHotEncoder`` into one 0.0 / 1.0
    column per category, named ``<column>_<k>`` where ``k`` is the position
    of that category in the encoder output.

  All values are read from ``input_df`` itself (never from a notebook-level
  ``df``), so the function is safe to call on derived frames such as a
  dataset with columns dropped or recomputed.

  Args:
    input_df: DataFrame to encode. It is not modified.
    target_var: Name of the target column, which must not be encoded.

  Returns:
    A new DataFrame whose rows share ``input_df``'s index.
  """
  pieces = []
  for col in input_df.columns:
    column = input_df[col]
    is_plain_numeric = column.dtype == 'int64' or column.dtype == 'float64'
    if col == target_var or is_plain_numeric:
      # Target and numeric columns are carried over unchanged
      pieces.append(column)
      continue

    # fit_transform refits on every call, so a fresh encoder per column is
    # equivalent and keeps the encoder local to the branch that needs it.
    encoder = OneHotEncoder(sparse_output=False)
    encoded = encoder.fit_transform(column.to_numpy().reshape(-1, 1))
    names = [f"{col}_{k}" for k in range(encoded.shape[1])]
    # Reuse the input index so every piece aligns row-for-row in the concat
    pieces.append(pd.DataFrame(encoded, columns=names, index=input_df.index))

  if not pieces:
    return pd.DataFrame(index=input_df.index)
  return pd.concat(pieces, axis=1)

In [None]:
# Create a dataframe with one hot encoded columns.
# 'infected' is the classification target, so it is passed through un-encoded.
df_onehot = one_hot_encoding(df, 'infected')
# print(df_onehot.columns)

### Describe the final dataset that is used for classification/regression

In [None]:
# Look at dataset
# Show the first rows of df, including the derived columns added above.
df.head()

Unnamed: 0,time,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,...,cd80,cd820,infected,cd4_change,cd8_change,risk_score,trt_comb,PC1,PC2,lda
0,1073,1: ZDV + ddl,37,79.46339,0,1,0,100,0,1,...,882,754,1,147,-128,1,1,-0.182701,-0.12985,-4.002009
1,324,0: ZDV only,33,73.02314,0,1,0,90,0,1,...,1035,1525,1,407,490,1,0,0.480324,0.233016,-2.867419
2,495,1: ZDV + ddl,43,69.47793,0,1,0,100,0,1,...,1147,1088,1,-44,-59,1,1,0.318144,-0.154143,-3.074864
3,1201,3: ddl only,42,89.15934,0,1,0,100,1,1,...,775,1019,1,86,244,1,0,-0.272211,-0.07909,-2.848368
4,934,0: ZDV only,37,137.46581,0,1,0,100,0,0,...,1601,849,0,-57,-752,1,0,-0.082183,-0.155079,-5.065466


In [None]:
# Look at one hot encoding
# Show the first rows of df_onehot; encoded columns are named <column>_<k>.
df_onehot.head()

Unnamed: 0,time,trt_0,trt_1,trt_2,trt_3,age,wtkg,hemo_0,hemo_1,homo_0,...,cd8_change,risk_score_0,risk_score_1,risk_score_2,risk_score_3,trt_comb_0,trt_comb_1,PC1,PC2,lda
0,1073,0.0,1.0,0.0,0.0,37,79.46339,1.0,0.0,0.0,...,-128,0.0,1.0,0.0,0.0,0.0,1.0,-0.182701,-0.12985,-4.002009
1,324,1.0,0.0,0.0,0.0,33,73.02314,1.0,0.0,0.0,...,490,0.0,1.0,0.0,0.0,1.0,0.0,0.480324,0.233016,-2.867419
2,495,0.0,1.0,0.0,0.0,43,69.47793,1.0,0.0,0.0,...,-59,0.0,1.0,0.0,0.0,0.0,1.0,0.318144,-0.154143,-3.074864
3,1201,0.0,0.0,0.0,1.0,42,89.15934,1.0,0.0,0.0,...,244,0.0,1.0,0.0,0.0,1.0,0.0,-0.272211,-0.07909,-2.848368
4,934,1.0,0.0,0.0,0.0,37,137.46581,1.0,0.0,0.0,...,-752,0.0,1.0,0.0,0.0,1.0,0.0,-0.082183,-0.155079,-5.065466


The final dataset is the original dataset with some variable updates and additions.  All the categorical variables were originally represented by numbers, so we decided to replace the numbers with more descriptive categories.  We thought that the change in CD4 and CD8 counts over the 20 weeks could be a useful feature, so we created these by subtracting CD40 from CD420 and CD80 from CD820.  We also created an overall risk score based on the hemo, homo, and drugs variables, as well as a variable that tracks whether a combined drug treatment regimen was used.

We have 10 numeric features (originally), which could be combined into more condensed features using dimension reduction techniques.  We used PCA to create two variables which included most of the variability of the numeric features, which are uncorrelated with each other.  LDA was also used to create a combined feature which was associated with the infected variable.

Lastly, some algorithms in SciKitLearn (Logistic Regression, SVM, etc.) need categorical variables to be onehot encoded before they work.  So we created a dataset with each of the categorical variables onehot encoded.  We created a function that does this, which can be used on datasets with subsets of the columns.

## Modeling and Evaluation

### Choose and explain your evaluation metrics that you will use

For the models in this project, we will be evaluating on: accuracy, with an accompanying cost matrix; precision, recall, and F-1 scores.

**Accuracy.** Accuracy can serve as a simple preliminary evaluation metric for our models. It is a straightforward metric: the number of correct predictions divided by the total number of predictions. However, it is important to note the limitations of using accuracy as an evaluation metric on its own. For one, accuracy can be severely misleading, especially with imbalanced datasets. Additionally, accuracy does not take into account the cost of misclassified predictions. For this reason, we will use an accompanying cost matrix with this metric.

**Cost Matrix.**

We will use the following weights for the cost matrix:

|  | Predicted Negative | Predicted Positive |
|----------|----------|----------|
|    Actual Negative    |    X    |    X    |
|    Actual Positive    |   X    |   X  |

**Precision.**

**Recall.**

**F-1 Score.**


### Choose the method you will use for dividing your data into training and testing splits

### Create three different classification/regression models for each task

For the classification models, we will use the "infected" variable as our target variable.  This is the same target variable we used in the previous lab.

For the regression models, we decided to try to predict the change in CD4 count over 20 weeks.  This is the cd4_change variable that we calculated earlier.  The original paper emphasised the importance of trying to predict whether the CD4 T cell count had dropped over the course of the study.  The CD40 variable measures the CD4 count at the beginning of the study, and the CD420 variable measures the CD4 count after 20 weeks of the study.

In [None]:
# Build the regression dataset from df.
# cd4_change (= cd420 - cd40) is the target, so drop cd40 / cd420 themselves and
# the lda, PC1 and PC2 columns, which were all derived from inputs that include them.
df_reg = df.drop(columns=['cd420', 'cd40', 'lda', 'PC1', 'PC2'])

# Normalization of the numeric columns that remain
selected_columns = df_reg[['time','age','wtkg','karnof','preanti','cd80','cd820']]
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(selected_columns)
normalized_df = pd.DataFrame(normalized_data, columns=selected_columns.columns)

# PCA, recomputed without cd40 / cd420 and reduced to 2 principal components
pca = PCA(n_components=2)
principal_components = pca.fit_transform(normalized_df)
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
for _component in ['PC1', 'PC2']:
  df_reg[_component] = pca_df[_component]

# One-hot encode, leaving the regression target untouched
df_reg_onehot = one_hot_encoding(df_reg, 'cd4_change')

#### Model 1 - Classification: XGBoost

A well known example of a boosting algorithm is XGBoost.  We wanted to test this model out to see how it performed against our other models.

In [None]:
# Get the features and labels
# Keep every one-hot column whose name contains one of these substrings.
# NOTE(review): matching is by substring, so 'trt' also selects the offtrt_* and
# trt_comb_* columns - confirm that is intended.
search_strings = ['lda','strat','z30','treat','preanti','str2','trt','offtrt']
filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
X = df_onehot[filtered_columns]
y = df['infected']

# For an initial test, just try a 90/10 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

# Convert the data into DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# XGBoost Parameters
params = {
    'objective': 'binary:logistic',
    'max_depth': 4,
    'eta': 0.3,
    'eval_metric': 'logloss'
}

# Train the model
# The watchlist makes xgb.train print train / eval logloss after every boosting round.
num_rounds = 100
watchlist = [(dtrain, 'train'), (dtest, 'eval')]
bst = xgb.train(params, dtrain, num_rounds, watchlist)

[0]	train-logloss:0.59497	eval-logloss:0.59557
[1]	train-logloss:0.58198	eval-logloss:0.58309
[2]	train-logloss:0.57405	eval-logloss:0.57562
[3]	train-logloss:0.56925	eval-logloss:0.57103
[4]	train-logloss:0.56621	eval-logloss:0.56832
[5]	train-logloss:0.56411	eval-logloss:0.56641
[6]	train-logloss:0.56272	eval-logloss:0.56527
[7]	train-logloss:0.56163	eval-logloss:0.56480
[8]	train-logloss:0.56078	eval-logloss:0.56438
[9]	train-logloss:0.56017	eval-logloss:0.56392
[10]	train-logloss:0.55962	eval-logloss:0.56358
[11]	train-logloss:0.55916	eval-logloss:0.56345
[12]	train-logloss:0.55884	eval-logloss:0.56345




[13]	train-logloss:0.55845	eval-logloss:0.56333
[14]	train-logloss:0.55821	eval-logloss:0.56334
[15]	train-logloss:0.55785	eval-logloss:0.56316
[16]	train-logloss:0.55760	eval-logloss:0.56321
[17]	train-logloss:0.55736	eval-logloss:0.56313
[18]	train-logloss:0.55705	eval-logloss:0.56322
[19]	train-logloss:0.55685	eval-logloss:0.56318
[20]	train-logloss:0.55666	eval-logloss:0.56309
[21]	train-logloss:0.55640	eval-logloss:0.56325
[22]	train-logloss:0.55625	eval-logloss:0.56337
[23]	train-logloss:0.55609	eval-logloss:0.56322
[24]	train-logloss:0.55591	eval-logloss:0.56333
[25]	train-logloss:0.55566	eval-logloss:0.56338
[26]	train-logloss:0.55536	eval-logloss:0.56347
[27]	train-logloss:0.55524	eval-logloss:0.56344
[28]	train-logloss:0.55508	eval-logloss:0.56357
[29]	train-logloss:0.55473	eval-logloss:0.56364
[30]	train-logloss:0.55446	eval-logloss:0.56406
[31]	train-logloss:0.55420	eval-logloss:0.56408
[32]	train-logloss:0.55405	eval-logloss:0.56399
[33]	train-logloss:0.55397	eval-logloss:

In [None]:
# Make predictions
# bst was trained with the binary:logistic objective, so predict() yields probabilities;
# rounding them turns each one into a 0 / 1 class label.
y_pred_prob = bst.predict(dtest)
y_pred = np.round(y_pred_prob)

# Evaluate the model
# Accuracy on the 10% hold-out split created above.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 70.62%


70.62% accuracy is a great starting point.  We used feature selection (in the appendix) to arrive at the following variables: ['lda', 'strat', 'preanti', 'trt', 'race', 'age', 'z30', 'drugs']. With these we had 70.638% accuracy using 10-fold cross-validation.  Even though this isn't much higher, we can be more confident in it, since it is the average accuracy across the cross-validation folds rather than the result of a single split.

In [None]:
# Get the 10 fold CV accuracy for ['lda', 'strat', 'preanti', 'trt', 'race', 'age', 'z30', 'drugs']
seed = 6
y = df['infected']
# NOTE(review): columns are selected by substring, so 'trt' also selects the
# offtrt_* and trt_comb_* columns - confirm that is intended.
search_strings = ['lda', 'strat', 'preanti', 'trt', 'race', 'age', 'z30', 'drugs']
filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
X = df_onehot[filtered_columns]
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
# Shuffled, stratified folds; the fixed seed keeps the split reproducible.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
print(f'Mean Accuracy: {scores.mean()}')

Mean Accuracy: 0.70638


In the Extra Work below, we used grid search to determine that the best eta is 0.07 and the best max_depth is 4.  Let's try those out to see if they perform better.

In [None]:
# Try using eta of 0.07 and max_depth of 4 that we got from grid search (below)
# Same seed, folds and feature set as the eta=0.3 run, so only eta differs.
seed = 6
y = df['infected']
search_strings = ['lda', 'strat', 'preanti', 'trt', 'race', 'age', 'z30', 'drugs']
filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
X = df_onehot[filtered_columns]
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.07, eval_metric='logloss', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
print(f'Mean Accuracy: {scores.mean()}')

Mean Accuracy: 0.70572


It looks like they aren't performing better than the original eta of 0.3.  We'll just stick with that number then.  They might be close, and it just depends on which seed you choose.  

We know the accuracy, but it would be good to determine other metrics.

In [None]:
# Sanity check of the cross-validation splits: print the train / test shapes per fold.
# The model and the metric lists are set up here but are not used in this cell.
seed = 6
y = df['infected']
search_strings = ['lda', 'strat', 'preanti', 'trt', 'race', 'age', 'z30', 'drugs']
filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
X = df_onehot[filtered_columns]
model = xgb.XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
accuracy = []
sensitivity = []
specificity = []
precision = []
recall = []
f1 = []
true_labels = []
predicted_labels = []

# Perform cross-validation
for train_index, test_index in cv.split(X, y):
  train_index = np.array(train_index)
  test_index = np.array(test_index)
  X_train = X.iloc[train_index]
  X_test = X.iloc[test_index]
  print(X_train.shape)
  print(X_test.shape)

(45000, 20)
(5000, 20)
(45000, 20)
(5000, 20)
(45000, 20)
(5000, 20)
(45000, 20)
(5000, 20)
(45000, 20)
(5000, 20)
(45000, 20)
(5000, 20)
(45000, 20)
(5000, 20)
(45000, 20)
(5000, 20)
(45000, 20)
(5000, 20)
(45000, 20)
(5000, 20)


In [None]:
# 10-fold cross-validated metrics for the eta=0.3 / max_depth=4 XGBoost model
seed = 6
y = df['infected']
search_strings = ['lda', 'strat', 'preanti', 'trt', 'race', 'age', 'z30', 'drugs']
filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
X = df_onehot[filtered_columns]
model = xgb.XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Per-fold metrics
accuracy = []
sensitivity = []
specificity = []
precision = []
recall = []
f1 = []

# Labels pooled over all folds (kept as plain lists; later cells read them)
true_labels = []
predicted_labels = []

# Perform cross-validation
for train_index, test_index in cv.split(X, y):
  X_train = X.iloc[train_index]
  X_test = X.iloc[test_index]
  y_train = y.iloc[train_index]
  y_test = y.iloc[test_index]

  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  # Compare as NumPy arrays so the element-wise masks are well defined
  _fold_true = np.asarray(y_test)
  _fold_pred = np.asarray(y_pred)
  tp = np.sum((_fold_true == 1) & (_fold_pred == 1))
  fp = np.sum((_fold_true == 0) & (_fold_pred == 1))
  tn = np.sum((_fold_true == 0) & (_fold_pred == 0))
  fn = np.sum((_fold_true == 1) & (_fold_pred == 0))
  prec = tp / (tp + fp)
  rec = tp / (tp + fn)

  accuracy.append((tp + tn) / (tp + fp + tn + fn))
  sensitivity.append(rec)
  specificity.append(tn / (tn + fp))
  precision.append(prec)
  recall.append(rec)
  f1.append(2*prec*rec / (prec + rec))

  true_labels.extend(y_test)
  predicted_labels.extend(y_pred)

# Print Metrics
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print(len(true_labels))
print(len(predicted_labels))
print(np.unique(true_labels))

# true_labels / predicted_labels are Python lists: `some_list == 1` is a single
# False, not an element-wise mask, which made every pooled count 0 and every
# metric nan. Convert to arrays before comparing.
_all_true = np.asarray(true_labels)
_all_pred = np.asarray(predicted_labels)
tp = np.sum((_all_true == 1) & (_all_pred == 1))
fp = np.sum((_all_true == 0) & (_all_pred == 1))
tn = np.sum((_all_true == 0) & (_all_pred == 0))
fn = np.sum((_all_true == 1) & (_all_pred == 0))
print(tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
print("Confusion Matrix:\n", conf_matrix)
print(f"Sensitivity: {rec:.4f}")
print(f"Specificity: {tn / (tn + fp):.4f}")
print(f"Accuracy: {(tp + tn) / (tp + fp + tn + fn):.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {2*prec*rec / (prec + rec):.4f}")


50000
50000
[0 1]
0
Confusion Matrix:
 [[31591  2903]
 [11778  3728]]
Sensitivity: nan
Specificity: nan
Accuracy: nan
Precision: nan
Recall: nan
F1 Score: nan


  prec = tp / (tp + fp)
  rec = tp / (tp + fn)
  print(f"Specificity: {tn / (tn + fp):.4f}")
  print(f"Accuracy: {(tp + tn) / (tp + fp + tn + fn):.4f}")


In [None]:
# Print Metrics
# Recompute the pooled metrics from the true_labels / predicted_labels lists
# filled by the cross-validation loop.
conf_matrix = confusion_matrix(true_labels, predicted_labels)
predicted_labels = list(map(int, predicted_labels))
print(len(true_labels))
print(len(predicted_labels))
print(np.unique(true_labels))

# Report the distinct element types once rather than one entry per label
element_types = sorted({type(item).__name__ for item in true_labels})
print("Types of all elements in the list:", element_types)
element_types = sorted({type(item).__name__ for item in predicted_labels})
print("Types of all elements in the list:", element_types)

# The labels are Python lists, so they must become arrays before an
# element-wise comparison; `some_list == 1` would just be a single False.
_all_true = np.array(true_labels)
_all_pred = np.array(predicted_labels)
tp = np.sum(np.logical_and(_all_true == 1, _all_pred == 1))
fp = np.sum(np.logical_and(_all_true == 0, _all_pred == 1))
tn = np.sum(np.logical_and(_all_true == 0, _all_pred == 0))
fn = np.sum(np.logical_and(_all_true == 1, _all_pred == 0))
print(tp)
print(fp)
print(tn)
print(fn)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
print("Confusion Matrix:\n", conf_matrix)
print(f"Sensitivity: {rec:.4f}")
print(f"Specificity: {tn / (tn + fp):.4f}")
print(f"Accuracy: {(tp + tn) / (tp + fp + tn + fn):.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {2*prec*rec / (prec + rec):.4f}")

50000
50000
[0 1]
Types of all elements in the list: [<class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <

### Analyze the results using your chosen method of evaluation

#### XGBoost

In [None]:
# Try using the auc version of XG Boost
# NOTE(review): fit() is called without an eval_set, so eval_metric='auc' presumably
# does not change the fitted model - confirm against the XGBoost docs. What does
# differ from the earlier runs is the feature list and the seed.
seed = 40
y = df['infected']
search_strings = ['lda','strat','z30','trt','PC2','PC1','str2','treat','preanti','cd40']
filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
X = df_onehot[filtered_columns]
model = xgb.XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Labels pooled over all folds; the next cell turns them into metrics.
true_labels = []
predicted_labels = []

# Perform cross-validation
for train_index, test_index in cv.split(X, y):
  X_train = X.iloc[train_index]
  X_test = X.iloc[test_index]
  y_train = y.iloc[train_index]
  y_test = y.iloc[test_index]

  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  true_labels.extend(y_test)
  predicted_labels.extend(y_pred)


In [None]:
# Print Metrics
# Pooled confusion-matrix counts and derived metrics over all CV folds.
conf_matrix = confusion_matrix(true_labels, predicted_labels)
predicted_labels = list(map(int, predicted_labels))

# Build each boolean mask once, then count the four confusion-matrix cells
_actual_pos = np.array(true_labels) == 1
_actual_neg = np.array(true_labels) == 0
_pred_pos = np.array(predicted_labels) == 1
_pred_neg = np.array(predicted_labels) == 0
tp = np.sum(_actual_pos & _pred_pos)
fp = np.sum(_actual_neg & _pred_pos)
tn = np.sum(_actual_neg & _pred_neg)
fn = np.sum(_actual_pos & _pred_neg)
print(f"tp: {tp}")
print(f"fp: {fp}")
print(f"tn: {tn}")
print(f"fn: {fn}")

prec = tp / (tp + fp)
rec = tp / (tp + fn)  # recall and sensitivity are the same quantity
print("Confusion Matrix:\n", conf_matrix)
print(f"Accuracy: {(tp + tn) / (tp + fp + tn + fn):.4f}")
print(f"Sensitivity: {rec:.4f}")
print(f"Specificity: {tn / (tn + fp):.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {2*prec*rec / (prec + rec):.4f}")

tp: 3842
fp: 3140
tn: 31354
fn: 11664
Confusion Matrix:
 [[31354  3140]
 [11664  3842]]
Accuracy: 0.7039
Sensitivity: 0.2478
Specificity: 0.9090
Precision: 0.5503
Recall: 0.2478
F1 Score: 0.3417


In [None]:
# Try using the f1 version of XG Boost
# NOTE(review): eval_metric is still 'auc' here; "f1 version" presumably refers to the
# feature list below rather than to the model settings - confirm.
seed = 60
y = df['infected']
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40','PC2','race','cd820']
filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
X = df_onehot[filtered_columns]
model = xgb.XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Labels pooled over all folds; the next cell turns them into metrics.
true_labels = []
predicted_labels = []

# Perform cross-validation
for train_index, test_index in cv.split(X, y):
  X_train = X.iloc[train_index]
  X_test = X.iloc[test_index]
  y_train = y.iloc[train_index]
  y_test = y.iloc[test_index]

  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  true_labels.extend(y_test)
  predicted_labels.extend(y_pred)

In [None]:
# Print Metrics
# Pooled confusion-matrix counts and derived metrics over all CV folds.
conf_matrix = confusion_matrix(true_labels, predicted_labels)
predicted_labels = list(map(int, predicted_labels))

# Build each boolean mask once, then count the four confusion-matrix cells
_actual_pos = np.array(true_labels) == 1
_actual_neg = np.array(true_labels) == 0
_pred_pos = np.array(predicted_labels) == 1
_pred_neg = np.array(predicted_labels) == 0
tp = np.sum(_actual_pos & _pred_pos)
fp = np.sum(_actual_neg & _pred_pos)
tn = np.sum(_actual_neg & _pred_neg)
fn = np.sum(_actual_pos & _pred_neg)
print(f"tp: {tp}")
print(f"fp: {fp}")
print(f"tn: {tn}")
print(f"fn: {fn}")

prec = tp / (tp + fp)
rec = tp / (tp + fn)  # recall and sensitivity are the same quantity
print("Confusion Matrix:\n", conf_matrix)
print(f"Accuracy: {(tp + tn) / (tp + fp + tn + fn):.4f}")
print(f"Sensitivity: {rec:.4f}")
print(f"Specificity: {tn / (tn + fp):.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {2*prec*rec / (prec + rec):.4f}")

tp: 3882
fp: 3124
tn: 31370
fn: 11624
Confusion Matrix:
 [[31370  3124]
 [11624  3882]]
Accuracy: 0.7050
Sensitivity: 0.2504
Specificity: 0.9094
Precision: 0.5541
Recall: 0.2504
F1 Score: 0.3449


In [None]:
# Try using max_depth of 8
# NOTE(review): the seed is incremented from whatever value the previous run left
# behind, so every re-run of this cell uses different folds (the captured output
# shows seed 67, not 61). Set the seed explicitly if the result must be reproducible.
seed = seed + 1
print(f'seed: {seed}')
y = df['infected']
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40','PC2','race','cd820']
filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
X = df_onehot[filtered_columns]
# NOTE(review): eta=1.5 differs from the 0.3 used elsewhere and appears to be outside
# the [0, 1] range the XGBoost docs give for the learning rate; the header only
# mentions max_depth - confirm whether 0.15 or 0.3 was intended.
model = xgb.XGBClassifier(objective='binary:logistic', max_depth=8, eta=1.5, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
true_labels = []
predicted_labels = []

# Perform cross-validation
for train_index, test_index in cv.split(X, y):
  X_train = X.iloc[train_index]
  X_test = X.iloc[test_index]
  y_train = y.iloc[train_index]
  y_test = y.iloc[test_index]

  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  true_labels.extend(y_test)
  predicted_labels.extend(y_pred)

# Print Metrics
# Pooled counts over all folds; the lists are converted to arrays for element-wise comparison.
conf_matrix = confusion_matrix(true_labels, predicted_labels)
predicted_labels = list(map(int, predicted_labels))
tp = np.sum(np.logical_and(np.array(true_labels) == 1, np.array(predicted_labels) == 1))
fp = np.sum(np.logical_and(np.array(true_labels) == 0, np.array(predicted_labels) == 1))
tn = np.sum(np.logical_and(np.array(true_labels) == 0, np.array(predicted_labels) == 0))
fn = np.sum(np.logical_and(np.array(true_labels) == 1, np.array(predicted_labels) == 0))
print(f"tp: {tp}")
print(f"fp: {fp}")
print(f"tn: {tn}")
print(f"fn: {fn}")
prec = tp / (tp + fp)
rec = tp / (tp + fn)
print("Confusion Matrix:\n", conf_matrix)
print(f"Accuracy: {(tp + tn) / (tp + fp + tn + fn):.4f}")
print(f"Sensitivity: {rec:.4f}")
print(f"Specificity: {tn / (tn + fp):.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {2*prec*rec / (prec + rec):.4f}")

seed: 67
tp: 5819
fp: 8312
tn: 26182
fn: 9687
Confusion Matrix:
 [[26182  8312]
 [ 9687  5819]]
Accuracy: 0.6400
Sensitivity: 0.3753
Specificity: 0.7590
Precision: 0.4118
Recall: 0.3753
F1 Score: 0.3927


### Discuss the advantages of each model for each classification task

### Which attributes from your analysis are most important

## Deployment

### How useful is your model for interested parties

## Extra Work

### Grid Search Parameters

#### XGBoost

In [None]:
# Grid search over eta (with max_depth fixed) using accuracy as the score
seed = 10

# Predictors: every one-hot column whose name contains a selected substring
search_strings = ['lda', 'strat', 'preanti', 'trt', 'race', 'age', 'z30', 'drugs']
filtered_columns = [name for name in df_onehot.columns if any(key in name for key in search_strings)]
X = df_onehot[filtered_columns]
y = df['infected']

# Candidate hyperparameters. Sweeps noted from earlier runs:
#   max_depth: [3, 4, 5]
#   [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2, 0.3, 0.4]
#   (the list above was labelled 'max_depth' but is presumably an eta sweep)
param_grid = dict(max_depth=[4], eta=[0.07, 0.08, 0.09])

# Estimator, fold generator and search object
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X, y)

# Report the winning combination
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Accuracy: {grid_search.best_score_ * 100:.2f}%')
# Result noted from a previous run:
#   Best Parameters: {'eta': 0.07, 'max_depth': 4}
#   Best Accuracy: 70.61%

Fitting 10 folds for each of 3 candidates, totalling 30 fits
Best Parameters: {'eta': 0.07, 'max_depth': 4}
Best Accuracy: 70.61%


In [None]:
# Grid search over eta and max_depth using F1 as the score
seed = seed + 1  # continues the seed sequence from whichever cell ran before
print(f'seed: {seed}')

# Predictors: every one-hot column whose name contains a selected substring
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40','PC2','race','cd820']
filtered_columns = [name for name in df_onehot.columns if any(key in name for key in search_strings)]
X = df_onehot[filtered_columns]
y = df['infected']

# Candidate hyperparameters. Sweeps noted from earlier runs:
#   max_depth: [2, 3, 4, 5, 6] / [6, 7, 8, 9, 10] / [8, 9, 10, 11]
#   eta: [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2, 0.3, 0.4]
#   eta: [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
#   eta: [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0]
# NOTE(review): XGBoost documents eta's range as [0, 1]; most of the values
# searched here lie above it -- confirm this is intended.
param_grid = dict(
    max_depth=[6, 7, 8, 9],
    eta=[1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
)

# Estimator, fold generator and search object
model = XGBClassifier(use_label_encoder=False, eval_metric='auc')
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='f1',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X, y)

# Report the winning combination
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best F1: {grid_search.best_score_ * 100:.2f}%')
# Results noted from previous runs:
#   {'eta': 0.4, 'max_depth': 6} -> Best F1: 36.76%
#   {'eta': 1.0, 'max_depth': 9} -> Best F1: 39.01%
#   {'eta': 1.5, 'max_depth': 8} -> Best F1: 39.41%

seed: 64
Fitting 10 folds for each of 44 candidates, totalling 440 fits
Best Parameters: {'eta': 1.1, 'max_depth': 8}
Best F1: 39.29%


### Interaction Linear Model

In [None]:
# Baseline: plain linear regression of cd4_change on every one-hot predictor
y = df_reg['cd4_change']
X = df_reg_onehot.drop(columns='cd4_change')

# Hold out 10% of the rows as a quick first test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

# Fit, then show what was learned
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

# Score the held-out rows.
# NOTE(review): the recorded RMSE for this cell is ~2e-9, which suggests the
# predictors determine cd4_change exactly (presumably via the CD4 count
# columns) -- confirm before reading anything into this fit.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", rmse)

Intercept: -232626.26838072395
Coefficients: [-1.15086788e+02 -7.80346454e-10  2.08313510e-09  1.36674227e-09
  2.22529383e-09 -8.81429981e+01 -1.75841772e+01  7.31294840e-12
 -7.10596415e-11  2.02338681e-11  8.50737461e-11  4.19052639e-11
  7.56463620e-11  3.49870127e+03 -6.43559361e-11 -2.23271763e-10
  4.91964768e-10 -5.51991418e-10 -1.68969288e+00 -2.83958989e-11
 -8.43814441e-11 -4.53659161e-11  4.90038949e-11  1.64321879e-10
  1.31772381e-10 -7.61597715e-11 -8.74113850e-12 -1.15021095e-10
 -4.22486983e-11  2.55869819e-10 -5.88258444e-11  2.24460563e-11
  2.41825452e-11 -2.41824338e-11 -5.55763602e-02 -1.52702959e-01
 -4.95075092e-11 -7.73563436e-11 -9.71265992e-02  1.97078132e-10
  9.43021217e-11 -2.25483451e-11 -1.21863332e-10 -9.71390302e-10
  7.69931452e-10 -1.27876419e+05  9.34250640e+04]
Root Mean Squared Error (RMSE): 2.0503310833882502e-09


In [None]:
# Single-predictor linear regression: cd4_change explained by 'time' columns only
search_strings = ['time']
filtered_columns = [name for name in df_reg_onehot.columns if any(key in name for key in search_strings)]
X = df_reg_onehot[filtered_columns]
y = df_reg['cd4_change']

# Hold out 10% of the rows as a quick first test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

# Fit, then show what was learned
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

# Score the held-out rows
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", rmse)

Intercept: 93.46680869270725
Coefficients: [0.02913458]
Root Mean Squared Error (RMSE): 162.11604753418422


In [None]:
# Interaction terms via the statsmodels formula API
# The X / y built here are not used by the formula fit below, which reads its
# columns straight from df_reg.
search_strings = ['time','strat','age','str2']
filtered_columns = [name for name in df_reg_onehot.columns if any(key in name for key in search_strings)]
X = df_reg_onehot[filtered_columns]
y = df_reg['cd4_change']

# 90/10 split of the full frame; X_train / X_test therefore still contain the
# cd4_change target column, which the formula refers to by name.
X_train, X_test = train_test_split(df_reg, test_size=0.1, random_state=1)

# 'a:b' adds only the interaction term; 'time' is also included as a main effect.
# Alternative tried: 'cd4_change ~ time*age'
ols_spec = smf.ols('cd4_change ~ time:age + time:strat + strat:str2 + time', data=X_train)
model = ols_spec.fit()
print(model.summary())

# Score the held-out rows
y_test = X_test['cd4_change']
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", rmse)

                            OLS Regression Results                            
Dep. Variable:             cd4_change   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                     102.9
Date:                Mon, 24 Jun 2024   Prob (F-statistic):          1.46e-191
Time:                        03:29:23   Log-Likelihood:            -2.9294e+05
No. Observations:               45000   AIC:                         5.859e+05
Df Residuals:                   44990   BIC:                         5.860e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------

In [None]:
# Polynomial term via the statsmodels formula API
# 90/10 split of the full frame; both halves keep the cd4_change target column
X_train, X_test = train_test_split(df_reg, test_size=0.1, random_state=1)

# I(...) makes the formula evaluate time**2 arithmetically; the model has an
# intercept and the squared term only (no linear 'time' term).
# Alternative tried: 'cd4_change ~ time*age'
ols_spec = smf.ols(formula = 'cd4_change ~ I(time**2)', data=X_train)
model = ols_spec.fit()
print(model.summary())

# Score the held-out rows
y_test = X_test['cd4_change']
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", rmse)

                            OLS Regression Results                            
Dep. Variable:             cd4_change   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     129.0
Date:                Mon, 24 Jun 2024   Prob (F-statistic):           7.39e-30
Time:                        03:37:57   Log-Likelihood:            -2.9334e+05
No. Observations:               45000   AIC:                         5.867e+05
Df Residuals:                   44998   BIC:                         5.867e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      103.3900      1.578     65.508   

## Backup

### XGBoost Feature Selection

In [None]:
# XGBoost's built-in cross-validation on the training portion of an 80/20 split
y = df['infected']
search_strings = ['lda','strat','z30','treat','preanti','str2','trt','offtrt']
filtered_columns = [name for name in df_onehot.columns if any(key in name for key in search_strings)]
X = df_onehot[filtered_columns]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap both splits as DMatrix objects (dtest is built but not used by xgb.cv below)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster settings
params = dict(
    objective='binary:logistic',
    max_depth=4,
    eta=0.3,
    eval_metric='logloss',
)

# 5-fold CV for up to 100 rounds, stopping once logloss has not improved for 10 rounds
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=100,
    nfold=5,
    metrics={'logloss'},
    early_stopping_rounds=10,
    seed=42,
)

# One row per boosting round: mean/std of train and test logloss
print(cv_results)

    train-logloss-mean  train-logloss-std  test-logloss-mean  test-logloss-std
0             0.595140           0.000463           0.596022          0.001925
1             0.581732           0.000501           0.583360          0.001815
2             0.573734           0.000481           0.576270          0.001927
3             0.568863           0.000515           0.572242          0.002129
4             0.565706           0.000567           0.569443          0.002183
5             0.563465           0.000670           0.567925          0.002338
6             0.561928           0.000704           0.566694          0.002518
7             0.560774           0.000757           0.566029          0.002712
8             0.559851           0.000753           0.565556          0.002902
9             0.559162           0.000741           0.565252          0.003070
10            0.558527           0.000807           0.565031          0.003121
11            0.557967           0.000801           

In [None]:
# Sanity check: which one-hot columns does the substring 'strat' select?
search_strings = 'strat'
filtered_columns = [name for name in df_onehot.columns if name.find(search_strings) != -1]
print(filtered_columns)
X = df_onehot[filtered_columns]
X.head()

['strat_0', 'strat_1', 'strat_2']


Unnamed: 0,strat_0,strat_1,strat_2
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [None]:
# Try making a for loop to test all of the columns
# Screen each variable on its own: 10-fold CV accuracy of an XGBoost model
# fit on that single variable, stored in df_metrics['Test_01'].
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
df_metrics['Test_01'] = np.nan  # NaN = variable skipped / not evaluated
y = df['infected']
to_skip = ['infected']  # the target itself is not a candidate predictor
for index, var in df_metrics['Var'].items():
    if var in to_skip:
        continue
    if ptypes.is_numeric_dtype(df[var]):
        # Single numeric column as an (n, 1) frame with a string column name
        X = pd.DataFrame(df[var].to_numpy().reshape(-1, 1))
        X.columns = X.columns.astype(str)
    else: # categorical
        # Use the one-hot columns whose names contain the variable name
        filtered_columns = [col for col in df_onehot.columns if var in col]
        X = df_onehot[filtered_columns]
    model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
    scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
    # Write the score to this variable's row only; assigning to the whole
    # column would leave every row holding just the last variable's score.
    df_metrics.loc[index, 'Test_01'] = scores.mean()

Var: time, Mean Accuracy: 0.69036
Var: trt, Mean Accuracy: 0.6898799999999999
Var: age, Mean Accuracy: 0.68994
Var: wtkg, Mean Accuracy: 0.6898799999999999
Var: hemo, Mean Accuracy: 0.6898799999999999
Var: homo, Mean Accuracy: 0.6898799999999999
Var: drugs, Mean Accuracy: 0.6898799999999999
Var: karnof, Mean Accuracy: 0.6898199999999999
Var: oprior, Mean Accuracy: 0.6898799999999999
Var: z30, Mean Accuracy: 0.6898799999999999
Var: preanti, Mean Accuracy: 0.6890599999999999
Var: race, Mean Accuracy: 0.6898799999999999
Var: gender, Mean Accuracy: 0.6898799999999999
Var: str2, Mean Accuracy: 0.6898799999999999
Var: strat, Mean Accuracy: 0.6898799999999999
Var: symptom, Mean Accuracy: 0.6898799999999999
Var: treat, Mean Accuracy: 0.6898799999999999
Var: offtrt, Mean Accuracy: 0.6898799999999999
Var: cd40, Mean Accuracy: 0.69094
Var: cd420, Mean Accuracy: 0.6945400000000002
Var: cd80, Mean Accuracy: 0.6898799999999999
Var: cd820, Mean Accuracy: 0.6898799999999999
Var: cd4_change, Mean Accur

In [None]:
# Try adding a variable to LDA
# Forward-selection step: pair 'lda' with each other variable and record the
# 10-fold CV accuracy of the two-variable model in df_metrics['Test_02'].
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
df_metrics['Test_02'] = np.nan  # NaN = variable skipped / not evaluated
y = df['infected']
selected = ['lda']
to_skip = ['infected', 'lda']  # the target, plus what is already selected
for index, var in df_metrics['Var'].items():
    if var in to_skip:
        continue
    search_strings = selected + [var]
    # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
    filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
    X = df_onehot[filtered_columns]
    model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
    scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
    # Write the score to this variable's row only; assigning to the whole
    # column would leave every row holding just the last variable's score.
    df_metrics.loc[index, 'Test_02'] = scores.mean()

Var: time, Mean Accuracy: 0.6950599999999999
Var: trt, Mean Accuracy: 0.69714
Var: age, Mean Accuracy: 0.69466
Var: wtkg, Mean Accuracy: 0.6921999999999999
Var: hemo, Mean Accuracy: 0.6956999999999999
Var: homo, Mean Accuracy: 0.69298
Var: drugs, Mean Accuracy: 0.6941200000000001
Var: karnof, Mean Accuracy: 0.6938799999999999
Var: oprior, Mean Accuracy: 0.69362
Var: z30, Mean Accuracy: 0.6973199999999999
Var: preanti, Mean Accuracy: 0.6990999999999999
Var: race, Mean Accuracy: 0.69378
Var: gender, Mean Accuracy: 0.6945
Var: str2, Mean Accuracy: 0.69678
Var: strat, Mean Accuracy: 0.69786
Var: symptom, Mean Accuracy: 0.6943
Var: treat, Mean Accuracy: 0.69526
Var: offtrt, Mean Accuracy: 0.6938
Var: cd40, Mean Accuracy: 0.6931999999999999
Var: cd420, Mean Accuracy: 0.69768
Var: cd80, Mean Accuracy: 0.69134
Var: cd820, Mean Accuracy: 0.69404
Var: cd4_change, Mean Accuracy: 0.6939200000000001
Var: cd8_change, Mean Accuracy: 0.69328
Var: risk_score, Mean Accuracy: 0.69296
Var: trt_comb, Mean 

In [None]:
# Try to beat 0.69786
# Forward-selection step: add each remaining variable to ['lda', 'strat'] and
# record the 10-fold CV accuracy in df_metrics['Test_03'].
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
df_metrics['Test_03'] = np.nan  # NaN = variable skipped / not evaluated
y = df['infected']
selected = ['lda', 'strat']
to_skip = ['infected', 'lda', 'strat']  # the target, plus what is already selected
for index, var in df_metrics['Var'].items():
    if var in to_skip:
        continue
    search_strings = selected + [var]
    # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
    filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
    X = df_onehot[filtered_columns]
    model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
    scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
    # Write the score to this variable's row only; assigning to the whole
    # column would leave every row holding just the last variable's score.
    df_metrics.loc[index, 'Test_03'] = scores.mean()

Var: time, Mean Accuracy: 0.6959799999999999
Var: trt, Mean Accuracy: 0.69994
Var: age, Mean Accuracy: 0.69856
Var: wtkg, Mean Accuracy: 0.69538
Var: hemo, Mean Accuracy: 0.6983199999999999
Var: homo, Mean Accuracy: 0.6965000000000001
Var: drugs, Mean Accuracy: 0.6977800000000001
Var: karnof, Mean Accuracy: 0.69862
Var: oprior, Mean Accuracy: 0.6987599999999999
Var: z30, Mean Accuracy: 0.6999000000000001
Var: preanti, Mean Accuracy: 0.70084
Var: race, Mean Accuracy: 0.6981999999999999
Var: gender, Mean Accuracy: 0.69636
Var: str2, Mean Accuracy: 0.69914
Var: symptom, Mean Accuracy: 0.69738
Var: treat, Mean Accuracy: 0.6992400000000001
Var: offtrt, Mean Accuracy: 0.6992800000000001
Var: cd40, Mean Accuracy: 0.69634
Var: cd420, Mean Accuracy: 0.69804
Var: cd80, Mean Accuracy: 0.69478
Var: cd820, Mean Accuracy: 0.6955600000000001
Var: cd4_change, Mean Accuracy: 0.6962599999999999
Var: cd8_change, Mean Accuracy: 0.6959
Var: risk_score, Mean Accuracy: 0.69758
Var: trt_comb, Mean Accuracy: 0

In [None]:
# Try testing removing a variable
# Backward check: drop each selected variable in turn and record the 10-fold
# CV accuracy of the model fit on the remaining ones. The score printed for
# `var` is the accuracy WITHOUT that variable.
df_metrics['Test_04'] = np.nan  # NaN = variable not part of this check
y = df['infected']
search_strings = ['lda', 'strat', 'preanti']
for var in search_strings:
  loop_strings = [s for s in search_strings if s != var]
  # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in loop_strings)]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
  scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
  print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
  # Write the score to the row whose 'Var' equals the removed variable (no-op
  # if there is no such row); assigning to the whole column would keep only
  # the last variable's score.
  df_metrics.loc[df_metrics['Var'] == var, 'Test_04'] = scores.mean()

lda
['preanti', 'strat_0', 'strat_1', 'strat_2']
   preanti  strat_0  strat_1  strat_2
0       18      0.0      1.0      0.0
1      224      0.0      0.0      1.0
2        0      1.0      0.0      0.0
3      513      0.0      0.0      1.0
4        4      0.0      0.0      1.0
Var: lda, Mean Accuracy: 0.6836
strat
['preanti', 'lda']
   preanti       lda
0       18 -4.002009
1      224 -2.867419
2        0 -3.074864
3      513 -2.848368
4        4 -5.065466
Var: strat, Mean Accuracy: 0.6990999999999999
preanti
['strat_0', 'strat_1', 'strat_2', 'lda']
   strat_0  strat_1  strat_2       lda
0      0.0      1.0      0.0 -4.002009
1      0.0      0.0      1.0 -2.867419
2      1.0      0.0      0.0 -3.074864
3      0.0      0.0      1.0 -2.848368
4      0.0      0.0      1.0 -5.065466
Var: preanti, Mean Accuracy: 0.69786


In [None]:
# Try to beat 0.70084
# Forward-selection step: add each remaining variable to the current set and
# record the 10-fold CV accuracy in df_metrics['Test_05'].
df_metrics['Test_05'] = np.nan  # NaN = variable skipped / not evaluated
y = df['infected']
selected = ['lda', 'strat', 'preanti']
to_skip = ['infected', 'lda', 'strat', 'preanti']  # the target, plus what is already selected
for index, var in df_metrics['Var'].items():
    if var in to_skip:
        continue
    search_strings = selected + [var]
    # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
    filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
    X = df_onehot[filtered_columns]
    model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
    scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
    # Write the score to this variable's row only; assigning to the whole
    # column would leave every row holding just the last variable's score.
    df_metrics.loc[index, 'Test_05'] = scores.mean()

Var: time, Mean Accuracy: 0.70202
Var: trt, Mean Accuracy: 0.7028000000000001
Var: age, Mean Accuracy: 0.70024
Var: wtkg, Mean Accuracy: 0.70038
Var: hemo, Mean Accuracy: 0.7000399999999999
Var: homo, Mean Accuracy: 0.6998000000000001
Var: drugs, Mean Accuracy: 0.70104
Var: karnof, Mean Accuracy: 0.7018
Var: oprior, Mean Accuracy: 0.70088
Var: z30, Mean Accuracy: 0.7019400000000001
Var: race, Mean Accuracy: 0.70084
Var: gender, Mean Accuracy: 0.70048
Var: str2, Mean Accuracy: 0.70234
Var: symptom, Mean Accuracy: 0.70134
Var: treat, Mean Accuracy: 0.70198
Var: offtrt, Mean Accuracy: 0.7021200000000001
Var: cd40, Mean Accuracy: 0.701
Var: cd420, Mean Accuracy: 0.70062
Var: cd80, Mean Accuracy: 0.70088
Var: cd820, Mean Accuracy: 0.7011200000000001
Var: cd4_change, Mean Accuracy: 0.7000599999999999
Var: cd8_change, Mean Accuracy: 0.7005
Var: risk_score, Mean Accuracy: 0.70004
Var: trt_comb, Mean Accuracy: 0.70238
Var: PC1, Mean Accuracy: 0.70198
Var: PC2, Mean Accuracy: 0.7012


In [None]:
# Try testing removing a variable
# Backward check: drop each selected variable in turn and record the 10-fold
# CV accuracy of the model fit on the remaining ones. The score printed for
# `var` is the accuracy WITHOUT that variable.
df_metrics['Test_06'] = np.nan  # NaN = variable not part of this check
y = df['infected']
search_strings = ['lda', 'strat', 'preanti', 'trt']
for var in search_strings:
  loop_strings = [s for s in search_strings if s != var]
  # NOTE(review): matching is by substring, so 'trt' also selects any other
  # column whose name contains 'trt' -- confirm that is intended.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in loop_strings)]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
  scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
  print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
  # Write the score to the row whose 'Var' equals the removed variable (no-op
  # if there is no such row); assigning to the whole column would keep only
  # the last variable's score.
  df_metrics.loc[df_metrics['Var'] == var, 'Test_06'] = scores.mean()

Var: lda, Mean Accuracy: 0.6862999999999999
Var: strat, Mean Accuracy: 0.70122
Var: preanti, Mean Accuracy: 0.69994
Var: trt, Mean Accuracy: 0.70084


In [None]:
# Scratch check: build the skip list as the selected variables plus the target,
# leaving search_strings itself unmodified
search_strings = ['lda', 'strat', 'preanti', 'trt']
print(search_strings)
to_skip = [*search_strings, 'infected']
print(to_skip)

['lda', 'strat', 'preanti', 'trt']
['lda', 'strat', 'preanti', 'trt', 'infected']


In [None]:
# Try to beat 0.7028000000000001
# Forward-selection step: add each remaining variable to the current set and
# record the 10-fold CV accuracy of the enlarged model.
# NOTE(review): 'Test_05' is also the column written by an earlier
# forward-selection cell, so this resets those results -- confirm whether a
# new column name was intended.
df_metrics['Test_05'] = np.nan  # NaN = variable skipped / not evaluated
y = df['infected']
selected = ['lda', 'strat', 'preanti', 'trt']
to_skip = selected + ['infected']  # already selected, or the target itself
for index, var in df_metrics['Var'].items():
  if var in to_skip:
      continue
  search_strings = selected + [var]
  # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
  scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
  print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
  # Write the score to this variable's row only; assigning to the whole
  # column would leave every row holding just the last variable's score.
  df_metrics.loc[index, 'Test_05'] = scores.mean()

Var: time, Mean Accuracy: 0.70312
Var: age, Mean Accuracy: 0.7035799999999999
Var: wtkg, Mean Accuracy: 0.70374
Var: hemo, Mean Accuracy: 0.70314
Var: homo, Mean Accuracy: 0.70342
Var: drugs, Mean Accuracy: 0.70278
Var: karnof, Mean Accuracy: 0.7023800000000001
Var: oprior, Mean Accuracy: 0.70186
Var: z30, Mean Accuracy: 0.7038
Var: race, Mean Accuracy: 0.70426
Var: gender, Mean Accuracy: 0.70324
Var: str2, Mean Accuracy: 0.70358
Var: symptom, Mean Accuracy: 0.7021
Var: treat, Mean Accuracy: 0.7028399999999999
Var: offtrt, Mean Accuracy: 0.7028000000000001
Var: cd40, Mean Accuracy: 0.7020200000000001
Var: cd420, Mean Accuracy: 0.70338
Var: cd80, Mean Accuracy: 0.70306
Var: cd820, Mean Accuracy: 0.7026
Var: cd4_change, Mean Accuracy: 0.7016600000000001
Var: cd8_change, Mean Accuracy: 0.7028999999999999
Var: risk_score, Mean Accuracy: 0.70402
Var: trt_comb, Mean Accuracy: 0.7028000000000001
Var: PC1, Mean Accuracy: 0.70104
Var: PC2, Mean Accuracy: 0.70312


In [None]:
# Try testing removing a variable
# Backward check: drop each selected variable in turn and record the 10-fold
# CV accuracy of the model fit on the remaining ones. The score printed for
# `var` is the accuracy WITHOUT that variable.
# NOTE(review): 'Test_06' is also the column written by an earlier removal
# cell, so this resets those results -- confirm whether a new column name was
# intended.
df_metrics['Test_06'] = np.nan  # NaN = variable not part of this check
seed = 1
y = df['infected']
search_strings = ['lda', 'strat', 'preanti', 'trt', 'race']
# Fixed integer seed -> every candidate model is scored on the same folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for var in search_strings:
  loop_strings = [s for s in search_strings if s != var]
  # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in loop_strings)]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
  scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
  print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
  # Write the score to the row whose 'Var' equals the removed variable (no-op
  # if there is no such row); assigning to the whole column would keep only
  # the last variable's score.
  df_metrics.loc[df_metrics['Var'] == var, 'Test_06'] = scores.mean()

Var: lda, Mean Accuracy: 0.6876
Var: strat, Mean Accuracy: 0.7013800000000001
Var: preanti, Mean Accuracy: 0.70012
Var: trt, Mean Accuracy: 0.70084
Var: race, Mean Accuracy: 0.7028000000000001


In [None]:
# Try to beat 0.70426
# Forward-selection step: add each remaining variable to the current set and
# record the 10-fold CV accuracy in df_metrics['Test_07'].
df_metrics['Test_07'] = np.nan  # NaN = variable skipped / not evaluated
seed = 2
y = df['infected']
selected = ['lda', 'strat', 'preanti', 'trt', 'race']
to_skip = selected + ['infected']  # already selected, or the target itself
# Fixed integer seed -> every candidate model is scored on the same folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for index, var in df_metrics['Var'].items():
  if var in to_skip:
      continue
  search_strings = selected + [var]
  # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
  scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
  print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
  # Write the score to this variable's row only; assigning to the whole
  # column would leave every row holding just the last variable's score.
  df_metrics.loc[index, 'Test_07'] = scores.mean()

Var: time, Mean Accuracy: 0.70198
Var: age, Mean Accuracy: 0.70442
Var: wtkg, Mean Accuracy: 0.7032
Var: hemo, Mean Accuracy: 0.70252
Var: homo, Mean Accuracy: 0.70174
Var: drugs, Mean Accuracy: 0.70198
Var: karnof, Mean Accuracy: 0.7022
Var: oprior, Mean Accuracy: 0.70242
Var: z30, Mean Accuracy: 0.7019
Var: gender, Mean Accuracy: 0.70156
Var: str2, Mean Accuracy: 0.7026999999999999
Var: symptom, Mean Accuracy: 0.70196
Var: treat, Mean Accuracy: 0.70174
Var: offtrt, Mean Accuracy: 0.70188
Var: cd40, Mean Accuracy: 0.7024799999999999
Var: cd420, Mean Accuracy: 0.70312
Var: cd80, Mean Accuracy: 0.7016600000000002
Var: cd820, Mean Accuracy: 0.7021200000000001
Var: cd4_change, Mean Accuracy: 0.7017800000000001
Var: cd8_change, Mean Accuracy: 0.70026
Var: risk_score, Mean Accuracy: 0.7022799999999999
Var: trt_comb, Mean Accuracy: 0.70188
Var: PC1, Mean Accuracy: 0.70208
Var: PC2, Mean Accuracy: 0.7017599999999999


In [None]:
# Try testing removing a variable
# Backward check: drop each selected variable in turn and record the 10-fold
# CV accuracy of the model fit on the remaining ones. The score printed for
# `var` is the accuracy WITHOUT that variable.
df_metrics['Test_08'] = np.nan  # NaN = variable not part of this check
seed = 3
y = df['infected']
search_strings = ['lda', 'strat', 'preanti', 'trt', 'race', 'age']
# Fixed integer seed -> every candidate model is scored on the same folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for var in search_strings:
  loop_strings = [s for s in search_strings if s != var]
  # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in loop_strings)]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
  scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
  print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
  # Write the score to the row whose 'Var' equals the removed variable (no-op
  # if there is no such row); assigning to the whole column would keep only
  # the last variable's score.
  df_metrics.loc[df_metrics['Var'] == var, 'Test_08'] = scores.mean()

Var: lda, Mean Accuracy: 0.68784
Var: strat, Mean Accuracy: 0.70138
Var: preanti, Mean Accuracy: 0.6999799999999999
Var: trt, Mean Accuracy: 0.70114
Var: race, Mean Accuracy: 0.7043200000000001
Var: age, Mean Accuracy: 0.7021000000000001


In [None]:
# Try to beat 0.70442
# Forward-selection step: add each remaining variable to the current set and
# record the 10-fold CV accuracy in df_metrics['Test_09'].
df_metrics['Test_09'] = np.nan  # NaN = variable skipped / not evaluated
seed = 4
y = df['infected']
selected = ['lda', 'strat', 'preanti', 'trt', 'race', 'age']
to_skip = selected + ['infected']  # already selected, or the target itself
# Fixed integer seed -> every candidate model is scored on the same folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for index, var in df_metrics['Var'].items():
  if var in to_skip:
      continue
  search_strings = selected + [var]
  # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
  scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
  print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
  # Write the score to this variable's row only; assigning to the whole
  # column would leave every row holding just the last variable's score.
  df_metrics.loc[index, 'Test_09'] = scores.mean()

Var: time, Mean Accuracy: 0.70308
Var: wtkg, Mean Accuracy: 0.7035
Var: hemo, Mean Accuracy: 0.70282
Var: homo, Mean Accuracy: 0.7028800000000001
Var: drugs, Mean Accuracy: 0.70312
Var: karnof, Mean Accuracy: 0.70182
Var: oprior, Mean Accuracy: 0.70272
Var: z30, Mean Accuracy: 0.7049999999999998
Var: gender, Mean Accuracy: 0.70234
Var: str2, Mean Accuracy: 0.70328
Var: symptom, Mean Accuracy: 0.70318
Var: treat, Mean Accuracy: 0.7024
Var: offtrt, Mean Accuracy: 0.70312
Var: cd40, Mean Accuracy: 0.70272
Var: cd420, Mean Accuracy: 0.7025800000000001
Var: cd80, Mean Accuracy: 0.7016
Var: cd820, Mean Accuracy: 0.70308
Var: cd4_change, Mean Accuracy: 0.70254
Var: cd8_change, Mean Accuracy: 0.70126
Var: risk_score, Mean Accuracy: 0.7036399999999999
Var: trt_comb, Mean Accuracy: 0.70312
Var: PC1, Mean Accuracy: 0.70244
Var: PC2, Mean Accuracy: 0.7027599999999999


In [None]:
# Try testing removing a variable
# Backward check: drop each selected variable in turn and record the 10-fold
# CV accuracy of the model fit on the remaining ones. The score printed for
# `var` is the accuracy WITHOUT that variable.
df_metrics['Test_10'] = np.nan  # NaN = variable not part of this check
seed = 5
y = df['infected']
search_strings = ['lda', 'strat', 'preanti', 'trt', 'race', 'age', 'z30']
# Fixed integer seed -> every candidate model is scored on the same folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for var in search_strings:
  loop_strings = [s for s in search_strings if s != var]
  # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in loop_strings)]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
  scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
  print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
  # Write the score to the row whose 'Var' equals the removed variable (no-op
  # if there is no such row); assigning to the whole column would keep only
  # the last variable's score.
  df_metrics.loc[df_metrics['Var'] == var, 'Test_10'] = scores.mean()

Var: lda, Mean Accuracy: 0.68864
Var: strat, Mean Accuracy: 0.70218
Var: preanti, Mean Accuracy: 0.70052
Var: trt, Mean Accuracy: 0.70292
Var: race, Mean Accuracy: 0.70382
Var: age, Mean Accuracy: 0.70358
Var: z30, Mean Accuracy: 0.7034


In [None]:
# Try to beat 0.7049999999999998
# Forward-selection step: add each remaining variable to the current set and
# record the 10-fold CV accuracy in df_metrics['Test_11'].
df_metrics['Test_11'] = np.nan  # NaN = variable skipped / not evaluated
seed = 6
y = df['infected']
selected = ['lda', 'strat', 'preanti', 'trt', 'race', 'age', 'z30']
to_skip = selected + ['infected']  # already selected, or the target itself
# Fixed integer seed -> every candidate model is scored on the same folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for index, var in df_metrics['Var'].items():
  if var in to_skip:
      continue
  search_strings = selected + [var]
  # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
  scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
  print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
  # Write the score to this variable's row only; assigning to the whole
  # column would leave every row holding just the last variable's score.
  df_metrics.loc[index, 'Test_11'] = scores.mean()

Var: time, Mean Accuracy: 0.70348
Var: wtkg, Mean Accuracy: 0.7050000000000001
Var: hemo, Mean Accuracy: 0.7057000000000001
Var: homo, Mean Accuracy: 0.7047
Var: drugs, Mean Accuracy: 0.70638
Var: karnof, Mean Accuracy: 0.7052799999999999
Var: oprior, Mean Accuracy: 0.70506
Var: gender, Mean Accuracy: 0.7045
Var: str2, Mean Accuracy: 0.7053
Var: symptom, Mean Accuracy: 0.70518
Var: treat, Mean Accuracy: 0.7052799999999999
Var: offtrt, Mean Accuracy: 0.70506
Var: cd40, Mean Accuracy: 0.7047200000000001
Var: cd420, Mean Accuracy: 0.7038
Var: cd80, Mean Accuracy: 0.7030000000000001
Var: cd820, Mean Accuracy: 0.7048599999999999
Var: cd4_change, Mean Accuracy: 0.70442
Var: cd8_change, Mean Accuracy: 0.7038
Var: risk_score, Mean Accuracy: 0.7045399999999999
Var: trt_comb, Mean Accuracy: 0.70506
Var: PC1, Mean Accuracy: 0.7042400000000001
Var: PC2, Mean Accuracy: 0.7046000000000001


In [None]:
# Try testing removing a variable
# Backward check: drop each selected variable in turn and record the 10-fold
# CV accuracy of the model fit on the remaining ones. The score printed for
# `var` is the accuracy WITHOUT that variable.
df_metrics['Test_12'] = np.nan  # NaN = variable not part of this check
seed = 7
y = df['infected']
search_strings = ['lda', 'strat', 'preanti', 'trt', 'race', 'age', 'z30', 'drugs']
# Fixed integer seed -> every candidate model is scored on the same folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for var in search_strings:
  loop_strings = [s for s in search_strings if s != var]
  # Substring match on column names (e.g. 'strat' picks up 'strat_0', 'strat_1', ...)
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in loop_strings)]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
  scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
  print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
  # Write the score to the row whose 'Var' equals the removed variable (no-op
  # if there is no such row); assigning to the whole column would keep only
  # the last variable's score.
  df_metrics.loc[df_metrics['Var'] == var, 'Test_12'] = scores.mean()

Var: lda, Mean Accuracy: 0.6880599999999999
Var: strat, Mean Accuracy: 0.7030799999999999
Var: preanti, Mean Accuracy: 0.70178
Var: trt, Mean Accuracy: 0.70282
Var: race, Mean Accuracy: 0.7049200000000001
Var: age, Mean Accuracy: 0.70428
Var: z30, Mean Accuracy: 0.70292
Var: drugs, Mean Accuracy: 0.7040200000000001


In [None]:
# Forward-selection round (accuracy). Target to beat: 0.70638.
# Each variable not yet in the base set is added on its own, the XGBoost model
# is scored with shuffled 10-fold stratified CV, and the mean accuracy is
# stored in that variable's own row of df_metrics['Test_13'].
df_metrics['Test_13'] = 0.0  # float default; skipped variables keep 0.0
seed = 8
y = df['infected']
base_vars = ['lda', 'strat', 'preanti', 'trt', 'race', 'age', 'z30', 'drugs']
excluded = base_vars + ['infected']  # already selected, or the target itself
# The estimator and the splitter do not depend on the candidate, so build them
# once; cross_val_score clones the estimator for every fold.
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='logloss', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for index, var in df_metrics['Var'].items():
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # NOTE(review): `s in col` is a substring test, so a short name such as
  # 'trt' also selects any column whose name merely contains it - confirm
  # against df_onehot.columns that this is intended.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in search_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
  print(f'Var: {var}, Mean Accuracy: {scores.mean()}')
  # Write only this candidate's row; assigning to the whole column would
  # overwrite every earlier result with the latest score.
  df_metrics.loc[index, 'Test_13'] = scores.mean()

Var: time, Mean Accuracy: 0.7028399999999999
Var: wtkg, Mean Accuracy: 0.70498
Var: hemo, Mean Accuracy: 0.70516
Var: homo, Mean Accuracy: 0.7042
Var: karnof, Mean Accuracy: 0.70444
Var: oprior, Mean Accuracy: 0.7047399999999999
Var: gender, Mean Accuracy: 0.70534
Var: str2, Mean Accuracy: 0.70418
Var: symptom, Mean Accuracy: 0.7034
Var: treat, Mean Accuracy: 0.7043
Var: offtrt, Mean Accuracy: 0.70424
Var: cd40, Mean Accuracy: 0.7035199999999999
Var: cd420, Mean Accuracy: 0.7051999999999998
Var: cd80, Mean Accuracy: 0.70312
Var: cd820, Mean Accuracy: 0.70298
Var: cd4_change, Mean Accuracy: 0.7034999999999999
Var: cd8_change, Mean Accuracy: 0.70402
Var: risk_score, Mean Accuracy: 0.70366
Var: trt_comb, Mean Accuracy: 0.70424
Var: PC1, Mean Accuracy: 0.7041799999999999
Var: PC2, Mean Accuracy: 0.70414


### XGBoost Feature Selection - AUC

In [None]:
# ROC AUC, first round: score every variable on its own (the target
# 'infected' is skipped) with shuffled 10-fold stratified CV.
seed = 20
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
for var in df_metrics['Var']:
  if var == 'infected':
    continue
  search_strings = [var]
  # Keep every one-hot column whose name contains the variable name.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(
      objective='binary:logistic',
      max_depth=4,
      eta=0.3,
      eval_metric='auc',
      use_label_encoder=False,
  )
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

Var: time, Mean AUC: 0.5509135981087536
Var: trt, Mean AUC: 0.5418980699187885
Var: age, Mean AUC: 0.5239515070180145
Var: wtkg, Mean AUC: 0.5225346775878668
Var: hemo, Mean AUC: 0.5051352948726879
Var: homo, Mean AUC: 0.5037759505849742
Var: drugs, Mean AUC: 0.5094043651085202
Var: karnof, Mean AUC: 0.5119818798925374
Var: oprior, Mean AUC: 0.5094460784754233
Var: z30, Mean AUC: 0.6236998183616065
Var: preanti, Mean AUC: 0.6300014350052453
Var: race, Mean AUC: 0.5137375859580218
Var: gender, Mean AUC: 0.508365990118762
Var: str2, Mean AUC: 0.6270392189793927
Var: strat, Mean AUC: 0.6464728386583507
Var: symptom, Mean AUC: 0.5074717469160677
Var: treat, Mean AUC: 0.5268234624533342
Var: offtrt, Mean AUC: 0.513622272000011
Var: cd40, Mean AUC: 0.5841594428396821
Var: cd420, Mean AUC: 0.6134521949150173
Var: cd80, Mean AUC: 0.509877969559935
Var: cd820, Mean AUC: 0.5081407560487545
Var: cd4_change, Mean AUC: 0.5400308962583551
Var: cd8_change, Mean AUC: 0.504842610161856
Var: risk_score,

In [None]:
# ROC AUC forward round. Target to beat: 0.6588.
# Base set is ['lda']; each other variable is added alone and the model is
# scored with shuffled 10-fold stratified CV.
seed += 1
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
base_vars = ['lda']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(
      objective='binary:logistic',
      max_depth=4,
      eta=0.3,
      eval_metric='auc',
      use_label_encoder=False,
  )
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

Var: time, Mean AUC: 0.6560791262752559
Var: trt, Mean AUC: 0.6591039666629162
Var: age, Mean AUC: 0.6570707002688349
Var: wtkg, Mean AUC: 0.6545965271871687
Var: hemo, Mean AUC: 0.6573913563705004
Var: homo, Mean AUC: 0.6574258805528708
Var: drugs, Mean AUC: 0.6569052166699272
Var: karnof, Mean AUC: 0.656797287705351
Var: oprior, Mean AUC: 0.6576963715911444
Var: z30, Mean AUC: 0.6790874434230678
Var: preanti, Mean AUC: 0.6713365812052122
Var: race, Mean AUC: 0.6562273085640966
Var: gender, Mean AUC: 0.6561073366730125
Var: str2, Mean AUC: 0.6753263063103264
Var: strat, Mean AUC: 0.6794048775407295
Var: symptom, Mean AUC: 0.6564914521325995
Var: treat, Mean AUC: 0.6589262442937598
Var: offtrt, Mean AUC: 0.6574598723316963
Var: cd40, Mean AUC: 0.6554115322179983
Var: cd420, Mean AUC: 0.6567754238947712
Var: cd80, Mean AUC: 0.6536028936151084
Var: cd820, Mean AUC: 0.6529340082800619
Var: cd4_change, Mean AUC: 0.6523564783887861
Var: cd8_change, Mean AUC: 0.6521620980030887
Var: risk_sco

In [None]:
# ROC AUC forward round. Target to beat: 0.6794.
# Base set is ['lda', 'strat']; each other variable is added alone and the
# model is scored with shuffled 10-fold stratified CV.
seed += 1
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
base_vars = ['lda', 'strat']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(
      objective='binary:logistic',
      max_depth=4,
      eta=0.3,
      eval_metric='auc',
      use_label_encoder=False,
  )
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

Var: time, Mean AUC: 0.6799101390308703
Var: trt, Mean AUC: 0.6836337360748572
Var: age, Mean AUC: 0.6813140269503167
Var: wtkg, Mean AUC: 0.6782985228946694
Var: hemo, Mean AUC: 0.6794860600034528
Var: homo, Mean AUC: 0.6795443575832696
Var: drugs, Mean AUC: 0.6808418710550109
Var: karnof, Mean AUC: 0.6799831788111996
Var: oprior, Mean AUC: 0.6805874069550109
Var: z30, Mean AUC: 0.6865228205064198
Var: preanti, Mean AUC: 0.6849398304304459
Var: race, Mean AUC: 0.6793309564089351
Var: gender, Mean AUC: 0.6798210260968752
Var: str2, Mean AUC: 0.6840590400315478
Var: symptom, Mean AUC: 0.6795898676968859
Var: treat, Mean AUC: 0.6823201247610601
Var: offtrt, Mean AUC: 0.6810151713227691
Var: cd40, Mean AUC: 0.6791495477772757
Var: cd420, Mean AUC: 0.6795456966474807
Var: cd80, Mean AUC: 0.6766205023616071
Var: cd820, Mean AUC: 0.6772331704600368
Var: cd4_change, Mean AUC: 0.6782489371088039
Var: cd8_change, Mean AUC: 0.6771441612755373
Var: risk_score, Mean AUC: 0.6795004634467638
Var: tr

In [None]:
# ROC AUC forward round. Target to beat: 0.6865.
# Base set is ['lda', 'strat', 'z30']; each other variable is added alone and
# the model is scored with shuffled 10-fold stratified CV.
seed += 1
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
base_vars = ['lda', 'strat', 'z30']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(
      objective='binary:logistic',
      max_depth=4,
      eta=0.3,
      eval_metric='auc',
      use_label_encoder=False,
  )
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

Var: time, Mean AUC: 0.6882169514335661
Var: trt, Mean AUC: 0.6898460482444092
Var: age, Mean AUC: 0.688406673536731
Var: wtkg, Mean AUC: 0.6857472568758061
Var: hemo, Mean AUC: 0.6867477756146995
Var: homo, Mean AUC: 0.6858967488785762
Var: drugs, Mean AUC: 0.6870863993863234
Var: karnof, Mean AUC: 0.6870446647519961
Var: oprior, Mean AUC: 0.6876186155641245
Var: preanti, Mean AUC: 0.6898579953174941
Var: race, Mean AUC: 0.6860910346817383
Var: gender, Mean AUC: 0.686510797054179
Var: str2, Mean AUC: 0.687823926005501
Var: symptom, Mean AUC: 0.686763308128014
Var: treat, Mean AUC: 0.6897353968276365
Var: offtrt, Mean AUC: 0.6875279218941093
Var: cd40, Mean AUC: 0.6854342458204504
Var: cd420, Mean AUC: 0.6853235897778261
Var: cd80, Mean AUC: 0.684165748302794
Var: cd820, Mean AUC: 0.6830492035353355
Var: cd4_change, Mean AUC: 0.6828858363570788
Var: cd8_change, Mean AUC: 0.6837189133131196
Var: risk_score, Mean AUC: 0.6860975301381846
Var: trt_comb, Mean AUC: 0.6880171592718805
Var: PC

In [None]:
# ROC AUC backward check: leave out each of the four selected variables in
# turn and report the shuffled 10-fold stratified CV mean AUC of the rest.
seed += 1
y = df['infected']
search_strings = ['lda', 'strat', 'z30', 'trt']
for var in search_strings:
  loop_strings = [name for name in search_strings if name != var]
  # Keep every one-hot column whose name contains one of the remaining names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in loop_strings)
  ]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(
      objective='binary:logistic',
      max_depth=4,
      eta=0.3,
      eval_metric='auc',
      use_label_encoder=False,
  )
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

Var: lda, Mean AUC: 0.6759419993240504
Var: strat, Mean AUC: 0.681597015628417
Var: z30, Mean AUC: 0.6837047914384904
Var: trt, Mean AUC: 0.6865855253034681


In [None]:
# ROC AUC forward round. Target to beat: 0.6898.
# Base set is ['lda', 'strat', 'z30', 'trt']; each other variable is added
# alone and the model is scored with shuffled 10-fold stratified CV.
seed += 1
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
base_vars = ['lda', 'strat', 'z30', 'trt']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  # NOTE(review): this is a substring test, so 'trt' would also pick up any
  # column whose name merely contains 'trt' - confirm that is intended.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(
      objective='binary:logistic',
      max_depth=4,
      eta=0.3,
      eval_metric='auc',
      use_label_encoder=False,
  )
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

Var: time, Mean AUC: 0.6913928414124896
Var: age, Mean AUC: 0.6906872368858553
Var: wtkg, Mean AUC: 0.688025104009562
Var: hemo, Mean AUC: 0.6898403073458683
Var: homo, Mean AUC: 0.6894461633246556
Var: drugs, Mean AUC: 0.6902750225707376
Var: karnof, Mean AUC: 0.6901249155138671
Var: oprior, Mean AUC: 0.6899330269828764
Var: preanti, Mean AUC: 0.6939986467916668
Var: race, Mean AUC: 0.689137578384545
Var: gender, Mean AUC: 0.6900491293952379
Var: str2, Mean AUC: 0.6906303845634113
Var: symptom, Mean AUC: 0.6886828750329054
Var: treat, Mean AUC: 0.6904946887294836
Var: offtrt, Mean AUC: 0.6891343917786591
Var: cd40, Mean AUC: 0.6889558704830263
Var: cd420, Mean AUC: 0.6897089348709097
Var: cd80, Mean AUC: 0.6871982997557937
Var: cd820, Mean AUC: 0.6877852042042939
Var: cd4_change, Mean AUC: 0.6872270249873087
Var: cd8_change, Mean AUC: 0.6868793047907286
Var: risk_score, Mean AUC: 0.6897970391005901
Var: trt_comb, Mean AUC: 0.6891343917786591
Var: PC1, Mean AUC: 0.6904113928545821
Var:

In [None]:
# ROC AUC backward check: leave out each of the five selected variables in
# turn and report the shuffled 10-fold stratified CV mean AUC of the rest.
seed += 1
y = df['infected']
search_strings = ['lda', 'strat', 'z30', 'trt', 'PC2']
for var in search_strings:
  loop_strings = [name for name in search_strings if name != var]
  # Keep every one-hot column whose name contains one of the remaining names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in loop_strings)
  ]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(
      objective='binary:logistic',
      max_depth=4,
      eta=0.3,
      eval_metric='auc',
      use_label_encoder=False,
  )
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

Var: lda, Mean AUC: 0.6715859507893278
Var: strat, Mean AUC: 0.682362221566038
Var: z30, Mean AUC: 0.684756049477692
Var: trt, Mean AUC: 0.6875693149854195
Var: PC2, Mean AUC: 0.68938186533599


In [None]:
# ROC AUC forward round. Target to beat: 0.6918.
# Base set is ['lda', 'strat', 'z30', 'trt', 'PC2']; each other variable is
# added alone and the model is scored with shuffled 10-fold stratified CV.
seed += 1
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
base_vars = ['lda', 'strat', 'z30', 'trt', 'PC2']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(
      objective='binary:logistic',
      max_depth=4,
      eta=0.3,
      eval_metric='auc',
      use_label_encoder=False,
  )
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

Var: time, Mean AUC: 0.6918018564698801
Var: age, Mean AUC: 0.6923546626511847
Var: wtkg, Mean AUC: 0.6898069864833175
Var: hemo, Mean AUC: 0.6922135883448014
Var: homo, Mean AUC: 0.6918657015071896
Var: drugs, Mean AUC: 0.6920339363934476
Var: karnof, Mean AUC: 0.6919434018340695
Var: oprior, Mean AUC: 0.6922221054521643
Var: preanti, Mean AUC: 0.6939892946192513
Var: race, Mean AUC: 0.6920479010261741
Var: gender, Mean AUC: 0.6915136163148072
Var: str2, Mean AUC: 0.6929862324952683
Var: symptom, Mean AUC: 0.6916854524535861
Var: treat, Mean AUC: 0.6927820823415318
Var: offtrt, Mean AUC: 0.6916703190988736
Var: cd40, Mean AUC: 0.6910544038375933
Var: cd420, Mean AUC: 0.6921460705410666
Var: cd80, Mean AUC: 0.689709233448604
Var: cd820, Mean AUC: 0.6911342203061033
Var: cd4_change, Mean AUC: 0.6891277253805891
Var: cd8_change, Mean AUC: 0.6886490068504207
Var: risk_score, Mean AUC: 0.6921125203728573
Var: trt_comb, Mean AUC: 0.6916703190988736
Var: PC1, Mean AUC: 0.6939928265146684


In [None]:
# ROC AUC backward check: leave out each of the six selected variables in
# turn and report the shuffled 10-fold stratified CV mean AUC of the rest.
seed += 1
y = df['infected']
search_strings = ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1']
for var in search_strings:
  loop_strings = [name for name in search_strings if name != var]
  # Keep every one-hot column whose name contains one of the remaining names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in loop_strings)
  ]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(
      objective='binary:logistic',
      max_depth=4,
      eta=0.3,
      eval_metric='auc',
      use_label_encoder=False,
  )
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

Var: lda, Mean AUC: 0.6904074692290052
Var: strat, Mean AUC: 0.6833603146509896
Var: z30, Mean AUC: 0.6863909704070472
Var: trt, Mean AUC: 0.6881209591291972
Var: PC2, Mean AUC: 0.6909580443574477
Var: PC1, Mean AUC: 0.6918132928614972


In [None]:
# ROC AUC forward round. Target to beat: 0.6939.
# Base set is ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1']; each other
# variable is added alone and scored with shuffled 10-fold stratified CV.
seed += 1
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
base_vars = ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(
      objective='binary:logistic',
      max_depth=4,
      eta=0.3,
      eval_metric='auc',
      use_label_encoder=False,
  )
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

Var: time, Mean AUC: 0.692721245858748
Var: age, Mean AUC: 0.6932960554361545
Var: wtkg, Mean AUC: 0.6924815498700709
Var: hemo, Mean AUC: 0.6930188579061012
Var: homo, Mean AUC: 0.6926592657790399
Var: drugs, Mean AUC: 0.6922981982065219
Var: karnof, Mean AUC: 0.6933835576631752
Var: oprior, Mean AUC: 0.6929164399080401
Var: preanti, Mean AUC: 0.695471534341035
Var: race, Mean AUC: 0.6928668735358455
Var: gender, Mean AUC: 0.6922085666724542
Var: str2, Mean AUC: 0.6944714705094447
Var: symptom, Mean AUC: 0.6925745477209756
Var: treat, Mean AUC: 0.6938196370228885
Var: offtrt, Mean AUC: 0.6928576107772836
Var: cd40, Mean AUC: 0.6921668442574388
Var: cd420, Mean AUC: 0.6933410529550339
Var: cd80, Mean AUC: 0.6922226336501642
Var: cd820, Mean AUC: 0.6921383395906917
Var: cd4_change, Mean AUC: 0.6905909811296737
Var: cd8_change, Mean AUC: 0.690823165872089
Var: risk_score, Mean AUC: 0.6922048844969106
Var: trt_comb, Mean AUC: 0.6928576107772836


In [None]:
# ROC AUC backward check: leave out each of the seven selected variables in
# turn and report the shuffled 10-fold stratified CV mean AUC of the rest.
seed += 1
y = df['infected']
search_strings = ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1', 'str2']
for var in search_strings:
  loop_strings = [name for name in search_strings if name != var]
  # Keep every one-hot column whose name contains one of the remaining names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in loop_strings)
  ]
  X = df_onehot[filtered_columns]
  model = XGBClassifier(
      objective='binary:logistic',
      max_depth=4,
      eta=0.3,
      eval_metric='auc',
      use_label_encoder=False,
  )
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

Var: lda, Mean AUC: 0.6912454559450165
Var: strat, Mean AUC: 0.6896371857566581
Var: z30, Mean AUC: 0.6907731097864425
Var: trt, Mean AUC: 0.691218346303084
Var: PC2, Mean AUC: 0.6943346272833182
Var: PC1, Mean AUC: 0.6938566248911882
Var: str2, Mean AUC: 0.6927690855392724


In [None]:
# ROC AUC forward round. Target to beat: 0.6944.
# Base set is ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1', 'str2']; each other
# variable is added alone and scored with shuffled 10-fold stratified CV.
seed += 1
print(f'seed: {seed}')
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
model = XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    eta=0.3,
    eval_metric='auc',
    use_label_encoder=False,
)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
base_vars = ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1', 'str2']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

seed: 31
Var: time, Mean AUC: 0.6944757702120462
Var: age, Mean AUC: 0.694207396114753
Var: wtkg, Mean AUC: 0.6949154353384497
Var: hemo, Mean AUC: 0.6940506532423255
Var: homo, Mean AUC: 0.6945002300784814
Var: drugs, Mean AUC: 0.6947719247396538
Var: karnof, Mean AUC: 0.6938099929754492
Var: oprior, Mean AUC: 0.6951608762143081
Var: preanti, Mean AUC: 0.6962902312054697
Var: race, Mean AUC: 0.6934961289052144
Var: gender, Mean AUC: 0.6944239344225219
Var: symptom, Mean AUC: 0.6951918985325178
Var: treat, Mean AUC: 0.696281474770329
Var: offtrt, Mean AUC: 0.6942804701987354
Var: cd40, Mean AUC: 0.6944396200475141
Var: cd420, Mean AUC: 0.6947906309911727
Var: cd80, Mean AUC: 0.6931185722813029
Var: cd820, Mean AUC: 0.693550482239073
Var: cd4_change, Mean AUC: 0.6937186158480273
Var: cd8_change, Mean AUC: 0.6919755691953919
Var: risk_score, Mean AUC: 0.6943007598405531
Var: trt_comb, Mean AUC: 0.6942804701987354


In [None]:
# ROC AUC backward check: leave out each of the seven selected variables in
# turn and report the shuffled 10-fold stratified CV mean AUC of the rest.
seed += 1
print(f'seed: {seed}')
y = df['infected']
model = XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    eta=0.3,
    eval_metric='auc',
    use_label_encoder=False,
)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
search_strings = ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1', 'str2']
for var in search_strings:
  loop_strings = [name for name in search_strings if name != var]
  # Keep every one-hot column whose name contains one of the remaining names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in loop_strings)
  ]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

seed: 32
Var: lda, Mean AUC: 0.6918591016392237
Var: strat, Mean AUC: 0.6891166837499323
Var: z30, Mean AUC: 0.6901479457621219
Var: trt, Mean AUC: 0.6906664440286352
Var: PC2, Mean AUC: 0.6925576595261449
Var: PC1, Mean AUC: 0.6937583201527057
Var: str2, Mean AUC: 0.6927499783916348


In [None]:
# ROC AUC forward round. Target to beat: 0.6962.
# Base set is ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1', 'str2', 'treat'];
# each other variable is added alone and scored with shuffled 10-fold
# stratified CV.
seed += 1
print(f'seed: {seed}')
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
model = XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    eta=0.3,
    eval_metric='auc',
    use_label_encoder=False,
)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
base_vars = ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1', 'str2', 'treat']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

seed: 33
Var: time, Mean AUC: 0.6957282419729333
Var: age, Mean AUC: 0.6964099617398685
Var: wtkg, Mean AUC: 0.6964001451134743
Var: hemo, Mean AUC: 0.6954579937430607
Var: homo, Mean AUC: 0.6962258403398623
Var: drugs, Mean AUC: 0.6957794095275445
Var: karnof, Mean AUC: 0.6961389198802413
Var: oprior, Mean AUC: 0.6951805291472773
Var: preanti, Mean AUC: 0.6972653360823322
Var: race, Mean AUC: 0.6949393012696347
Var: gender, Mean AUC: 0.6955240082393106
Var: symptom, Mean AUC: 0.6968499393310645
Var: offtrt, Mean AUC: 0.6959353577312939
Var: cd40, Mean AUC: 0.6950268204018902
Var: cd420, Mean AUC: 0.6954168410820734
Var: cd80, Mean AUC: 0.6945742047074497
Var: cd820, Mean AUC: 0.6946575812347533
Var: cd4_change, Mean AUC: 0.6951859669293958
Var: cd8_change, Mean AUC: 0.6940371110295522
Var: risk_score, Mean AUC: 0.6954748622979874
Var: trt_comb, Mean AUC: 0.6959353577312939


In [None]:
# ROC AUC backward check: leave out each selected variable in turn and report
# the shuffled 10-fold stratified CV mean AUC of the model fit on the rest.
seed = seed + 1
print(f'seed: {seed}')
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# 'treat' is part of the selected set: it is in the base set of the forward
# rounds immediately before and after this check, so it has to be present
# here both as a feature and as a removal candidate.
search_strings = ['lda','strat','z30','trt','PC2','PC1','str2','treat','preanti']
for var in search_strings:
  # Every selected name except the one being tested for removal.
  loop_strings = [s for s in search_strings if s != var]
  # Keep every one-hot column whose name contains one of the remaining names.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in loop_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

seed: 36
Var: lda, Mean AUC: 0.6931956624181856
Var: strat, Mean AUC: 0.693210682113991
Var: z30, Mean AUC: 0.6946925815380668
Var: trt, Mean AUC: 0.6944387407629631
Var: PC2, Mean AUC: 0.6977821811309337
Var: PC1, Mean AUC: 0.6962619126011809
Var: str2, Mean AUC: 0.6955774733278337
Var: treat, Mean AUC: 0.6962830110813825
Var: preanti, Mean AUC: 0.6962803488951426


In [None]:
# ROC AUC forward round. Target to beat: 0.6972.
# Base set is the nine variables listed in base_vars; each other variable is
# added alone and scored with shuffled 10-fold stratified CV.
seed += 1
print(f'seed: {seed}')
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
model = XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    eta=0.3,
    eval_metric='auc',
    use_label_encoder=False,
)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
base_vars = ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1', 'str2', 'treat', 'preanti']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

seed: 35
Var: time, Mean AUC: 0.6977704376486474
Var: age, Mean AUC: 0.6983690418938038
Var: wtkg, Mean AUC: 0.6977809251340085
Var: hemo, Mean AUC: 0.698356025530096
Var: homo, Mean AUC: 0.697725244317809
Var: drugs, Mean AUC: 0.6976740382149658
Var: karnof, Mean AUC: 0.6985040788511203
Var: oprior, Mean AUC: 0.6984026553221062
Var: race, Mean AUC: 0.6973823874544404
Var: gender, Mean AUC: 0.6976615152911358
Var: symptom, Mean AUC: 0.6980935786400713
Var: offtrt, Mean AUC: 0.6979923761488303
Var: cd40, Mean AUC: 0.6988602477109299
Var: cd420, Mean AUC: 0.6964966719891217
Var: cd80, Mean AUC: 0.6964426795571967
Var: cd820, Mean AUC: 0.6959854700761922
Var: cd4_change, Mean AUC: 0.6963888684276406
Var: cd8_change, Mean AUC: 0.6969350217255416
Var: risk_score, Mean AUC: 0.6980780903729562
Var: trt_comb, Mean AUC: 0.6979923761488303


In [None]:
# ROC AUC backward check: leave out each of the ten selected variables in
# turn and report the shuffled 10-fold stratified CV mean AUC of the rest.
seed += 1
print(f'seed: {seed}')
y = df['infected']
model = XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    eta=0.3,
    eval_metric='auc',
    use_label_encoder=False,
)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
search_strings = ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1', 'str2', 'treat', 'preanti', 'cd40']
for var in search_strings:
  loop_strings = [name for name in search_strings if name != var]
  # Keep every one-hot column whose name contains one of the remaining names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in loop_strings)
  ]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

seed: 37
Var: lda, Mean AUC: 0.6946113692520305
Var: strat, Mean AUC: 0.6932866298672935
Var: z30, Mean AUC: 0.693778908402028
Var: trt, Mean AUC: 0.695007857155023
Var: PC2, Mean AUC: 0.6956479567945288
Var: PC1, Mean AUC: 0.6949442709672293
Var: str2, Mean AUC: 0.6955550598027924
Var: treat, Mean AUC: 0.6961271669307947
Var: preanti, Mean AUC: 0.694541919306316
Var: cd40, Mean AUC: 0.6978960699423155


In [None]:
# ROC AUC forward round. Target to beat: 0.6988.
# Base set is the ten variables listed in base_vars; each other variable is
# added alone and scored with shuffled 10-fold stratified CV.
seed += 1
print(f'seed: {seed}')
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
model = XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    eta=0.3,
    eval_metric='auc',
    use_label_encoder=False,
)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
base_vars = ['lda', 'strat', 'z30', 'trt', 'PC2', 'PC1', 'str2', 'treat', 'preanti', 'cd40']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
  print(f'Var: {var}, Mean AUC: {scores.mean()}')

seed: 38
Var: time, Mean AUC: 0.6967121540686048
Var: age, Mean AUC: 0.6973246674248476
Var: wtkg, Mean AUC: 0.6976341970103586
Var: hemo, Mean AUC: 0.6974685681888559
Var: homo, Mean AUC: 0.696613788152632
Var: drugs, Mean AUC: 0.6979044533056428
Var: karnof, Mean AUC: 0.6975828537882227
Var: oprior, Mean AUC: 0.6971003241382228
Var: race, Mean AUC: 0.6975965147277126
Var: gender, Mean AUC: 0.6972683818348915
Var: symptom, Mean AUC: 0.6977577323454249
Var: offtrt, Mean AUC: 0.6969818386497137
Var: cd420, Mean AUC: 0.6977443365947352
Var: cd80, Mean AUC: 0.6972710380860871
Var: cd820, Mean AUC: 0.6973969453779804
Var: cd4_change, Mean AUC: 0.6960009834819663
Var: cd8_change, Mean AUC: 0.6978704737425161
Var: risk_score, Mean AUC: 0.6966292302350515
Var: trt_comb, Mean AUC: 0.6969818386497137


### XGBoost Feature Selection - F1

In [None]:
# F1 forward round, starting from a preset base of ['lda', 'strat', 'z30'].
# Each other variable is added alone and scored with shuffled 10-fold
# stratified CV using the 'f1' scorer (the estimator's eval_metric stays
# 'auc').
seed += 1
print(f'seed: {seed}')
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
model = XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    eta=0.3,
    eval_metric='auc',
    use_label_encoder=False,
)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
base_vars = ['lda', 'strat', 'z30']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 41
Var: time, Mean F1: 0.28406230212241074
Var: trt, Mean F1: 0.30491439423727684
Var: age, Mean F1: 0.2882261457169545
Var: wtkg, Mean F1: 0.2941448528990762
Var: hemo, Mean F1: 0.3027925948875984
Var: homo, Mean F1: 0.29151626850609125
Var: drugs, Mean F1: 0.30307542337240484
Var: karnof, Mean F1: 0.2968049894045149
Var: oprior, Mean F1: 0.30190521302068757
Var: preanti, Mean F1: 0.30966665729188103
Var: race, Mean F1: 0.2971300466325312
Var: gender, Mean F1: 0.29219573307556923
Var: str2, Mean F1: 0.30514389447730483
Var: symptom, Mean F1: 0.29105159098856587
Var: treat, Mean F1: 0.30470619945156396
Var: offtrt, Mean F1: 0.2988472240139043
Var: cd40, Mean F1: 0.29773808580277994
Var: cd420, Mean F1: 0.2864253882081623
Var: cd80, Mean F1: 0.28573920671446923
Var: cd820, Mean F1: 0.29287426826582885
Var: cd4_change, Mean F1: 0.28627631654753805
Var: cd8_change, Mean F1: 0.28473197690364216
Var: risk_score, Mean F1: 0.29837413487352726
Var: trt_comb, Mean F1: 0.2995703665808478
V

In [None]:
# F1 forward round. Target to beat: 0.3051.
# Base set is ['lda', 'strat', 'z30', 'str2']; each other variable is added
# alone and scored with shuffled 10-fold stratified CV using the 'f1' scorer
# (the estimator's eval_metric stays 'auc').
seed += 1
print(f'seed: {seed}')
df_metrics = pd.DataFrame({'Var': df.columns.tolist()})
y = df['infected']
model = XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    eta=0.3,
    eval_metric='auc',
    use_label_encoder=False,
)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
base_vars = ['lda', 'strat', 'z30', 'str2']
excluded = base_vars + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in excluded:
    continue
  search_strings = base_vars + [var]
  # Keep every one-hot column whose name contains one of the selected names.
  filtered_columns = [
      col for col in df_onehot.columns
      if any(name in col for name in search_strings)
  ]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 42
Var: time, Mean F1: 0.29010872681540734
Var: trt, Mean F1: 0.30799893498223935
Var: age, Mean F1: 0.2971327474718144
Var: wtkg, Mean F1: 0.2972056166258118
Var: hemo, Mean F1: 0.3088515148412817
Var: homo, Mean F1: 0.3036588836322176
Var: drugs, Mean F1: 0.30904515828748724
Var: karnof, Mean F1: 0.30017751111197216
Var: oprior, Mean F1: 0.30890308100338304
Var: preanti, Mean F1: 0.3148857340703741
Var: race, Mean F1: 0.30505296464682746
Var: gender, Mean F1: 0.30309826132039697
Var: symptom, Mean F1: 0.30395058747173176
Var: treat, Mean F1: 0.3057620041072862
Var: offtrt, Mean F1: 0.30151539592160315
Var: cd40, Mean F1: 0.3036656795405239
Var: cd420, Mean F1: 0.2913785430325566
Var: cd80, Mean F1: 0.288629743330218
Var: cd820, Mean F1: 0.29865559607112313
Var: cd4_change, Mean F1: 0.2940404052719313
Var: cd8_change, Mean F1: 0.2946209278300882
Var: risk_score, Mean F1: 0.3065853761044425
Var: trt_comb, Mean F1: 0.2952381416427661
Var: PC1, Mean F1: 0.2928534138537528
Var: PC2,

In [None]:
# Try to beat 0.3090
# Forward-selection step: keep the already-chosen name fragments fixed, add
# each remaining df column in turn, and print the 10-fold CV mean F1 of an
# XGBoost classifier fit on the matching df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test, so a fragment also selects
  # every df_onehot column whose name merely contains it - confirm intended.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 43
Var: time, Mean F1: 0.29309932150497814
Var: trt, Mean F1: 0.30906406657648844
Var: age, Mean F1: 0.2976417518141948
Var: wtkg, Mean F1: 0.294237732390374
Var: hemo, Mean F1: 0.30126074154556237
Var: homo, Mean F1: 0.29882004064910234
Var: karnof, Mean F1: 0.30150664934729765
Var: oprior, Mean F1: 0.3034531981683633
Var: preanti, Mean F1: 0.3177907615974697
Var: race, Mean F1: 0.3023147950248511
Var: gender, Mean F1: 0.299532000037513
Var: symptom, Mean F1: 0.2997531865962464
Var: treat, Mean F1: 0.31114023020686277
Var: offtrt, Mean F1: 0.3012129672609289
Var: cd40, Mean F1: 0.302660386736923
Var: cd420, Mean F1: 0.2932159352932136
Var: cd80, Mean F1: 0.2889713239359864
Var: cd820, Mean F1: 0.2927638757020645
Var: cd4_change, Mean F1: 0.294112103149579
Var: cd8_change, Mean F1: 0.29451962240762
Var: risk_score, Mean F1: 0.3068328343118839
Var: trt_comb, Mean F1: 0.2995066025506605
Var: PC1, Mean F1: 0.2924649069333645
Var: PC2, Mean F1: 0.29782712360201774


In [None]:
# Try to beat 0.3177
# Forward-selection step: keep the already-chosen name fragments fixed, add
# each remaining df column in turn, and print the 10-fold CV mean F1 of an
# XGBoost classifier fit on the matching df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs','preanti']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test, so a fragment also selects
  # every df_onehot column whose name merely contains it - confirm intended.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 44
Var: time, Mean F1: 0.3160224932721401
Var: trt, Mean F1: 0.3297925204459254
Var: age, Mean F1: 0.3184407050457391
Var: wtkg, Mean F1: 0.3154148565143004
Var: hemo, Mean F1: 0.31710670913193034
Var: homo, Mean F1: 0.31534861712952555
Var: karnof, Mean F1: 0.31362033548700785
Var: oprior, Mean F1: 0.31908561260016827
Var: race, Mean F1: 0.31300761932969445
Var: gender, Mean F1: 0.316183412890802
Var: symptom, Mean F1: 0.318828236501807
Var: treat, Mean F1: 0.32190787960680334
Var: offtrt, Mean F1: 0.31884443394488216
Var: cd40, Mean F1: 0.3166330003540198
Var: cd420, Mean F1: 0.3099734042565675
Var: cd80, Mean F1: 0.31413516522129786
Var: cd820, Mean F1: 0.31753463124406256
Var: cd4_change, Mean F1: 0.31500926183199546
Var: cd8_change, Mean F1: 0.31436647168766785
Var: risk_score, Mean F1: 0.3145383299670991
Var: trt_comb, Mean F1: 0.32156863110730977
Var: PC1, Mean F1: 0.3166571329182152
Var: PC2, Mean F1: 0.319777759244896


In [None]:
# Try to beat 0.3297
# Forward-selection step: keep the already-chosen name fragments fixed, add
# each remaining df column in turn, and print the 10-fold CV mean F1 of an
# XGBoost classifier fit on the matching df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test. 'trt' is contained in the
  # names 'offtrt' and 'trt_comb', so it presumably selects their df_onehot
  # columns as well - verify against df_onehot.columns.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 45
Var: time, Mean F1: 0.3356443121662933
Var: age, Mean F1: 0.3351005903714485
Var: wtkg, Mean F1: 0.33530520147805126
Var: hemo, Mean F1: 0.334912407112621
Var: homo, Mean F1: 0.3316165786946913
Var: karnof, Mean F1: 0.33146933283374314
Var: oprior, Mean F1: 0.33325537490632406
Var: race, Mean F1: 0.3344306107709091
Var: gender, Mean F1: 0.3300282214300698
Var: symptom, Mean F1: 0.33568680654680183
Var: treat, Mean F1: 0.33827206148227756
Var: offtrt, Mean F1: 0.3338795944404566
Var: cd40, Mean F1: 0.33768928868294146
Var: cd420, Mean F1: 0.3310709922108438
Var: cd80, Mean F1: 0.3308169616147216
Var: cd820, Mean F1: 0.33791661412940016
Var: cd4_change, Mean F1: 0.33127944605399423
Var: cd8_change, Mean F1: 0.33442631330065004
Var: risk_score, Mean F1: 0.33288851649907203
Var: trt_comb, Mean F1: 0.3338795944404566
Var: PC1, Mean F1: 0.3328714442227546
Var: PC2, Mean F1: 0.33487160615608297


In [None]:
# Try to beat 0.3382
# Forward-selection step: keep the already-chosen name fragments fixed, add
# each remaining df column in turn, and print the 10-fold CV mean F1 of an
# XGBoost classifier fit on the matching df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test. 'trt' is contained in the
  # names 'offtrt' and 'trt_comb', so it presumably selects their df_onehot
  # columns as well - verify against df_onehot.columns.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 46
Var: time, Mean F1: 0.33915338487261726
Var: age, Mean F1: 0.3338901146632618
Var: wtkg, Mean F1: 0.33995957358066664
Var: hemo, Mean F1: 0.3384127159472847
Var: homo, Mean F1: 0.33359976907055966
Var: karnof, Mean F1: 0.33297591586756603
Var: oprior, Mean F1: 0.33504849422221944
Var: race, Mean F1: 0.33592628195195884
Var: gender, Mean F1: 0.33358815101803774
Var: symptom, Mean F1: 0.3339309905484072
Var: offtrt, Mean F1: 0.33702745231849185
Var: cd40, Mean F1: 0.3351150747690748
Var: cd420, Mean F1: 0.33661737772791256
Var: cd80, Mean F1: 0.3327124568932059
Var: cd820, Mean F1: 0.33618095955242117
Var: cd4_change, Mean F1: 0.33614720356507044
Var: cd8_change, Mean F1: 0.3360187863282718
Var: risk_score, Mean F1: 0.33345957374808155
Var: trt_comb, Mean F1: 0.33702745231849185
Var: PC1, Mean F1: 0.34303404998059256
Var: PC2, Mean F1: 0.33923502859535776


In [None]:
# Try to beat 0.3430
# Forward-selection step: keep the already-chosen name fragments fixed, add
# each remaining df column in turn, and print the 10-fold CV mean F1 of an
# XGBoost classifier fit on the matching df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test. 'trt' is contained in the
  # names 'offtrt' and 'trt_comb', so it presumably selects their df_onehot
  # columns as well - verify against df_onehot.columns.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 47
Var: time, Mean F1: 0.3400480495958046
Var: age, Mean F1: 0.3376886642698028
Var: wtkg, Mean F1: 0.3453327883054471
Var: hemo, Mean F1: 0.33639948492417504
Var: homo, Mean F1: 0.33669646145462845
Var: karnof, Mean F1: 0.3395919399276172
Var: oprior, Mean F1: 0.33865695078744157
Var: race, Mean F1: 0.33799609755020016
Var: gender, Mean F1: 0.3361553060176733
Var: symptom, Mean F1: 0.336267128844811
Var: offtrt, Mean F1: 0.337923864552111
Var: cd40, Mean F1: 0.33949824236520054
Var: cd420, Mean F1: 0.33709442177392607
Var: cd80, Mean F1: 0.3390707556042207
Var: cd820, Mean F1: 0.33851123150710216
Var: cd4_change, Mean F1: 0.3423148378002712
Var: cd8_change, Mean F1: 0.34108420836962083
Var: risk_score, Mean F1: 0.3373826471709952
Var: trt_comb, Mean F1: 0.337923864552111
Var: PC2, Mean F1: 0.3438349446206531


In [None]:
# Try to beat 0.3453
# Forward-selection step: keep the already-chosen name fragments fixed, add
# each remaining df column in turn, and print the 10-fold CV mean F1 of an
# XGBoost classifier fit on the matching df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test. 'trt' is contained in the
  # names 'offtrt' and 'trt_comb', so it presumably selects their df_onehot
  # columns as well - verify against df_onehot.columns.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 48
Var: time, Mean F1: 0.3417110185960762
Var: age, Mean F1: 0.34538967699368206
Var: hemo, Mean F1: 0.34600164543740497
Var: homo, Mean F1: 0.3439058909958374
Var: karnof, Mean F1: 0.3445212197605708
Var: oprior, Mean F1: 0.3462495324786539
Var: race, Mean F1: 0.3456481536411239
Var: gender, Mean F1: 0.3451277332984909
Var: symptom, Mean F1: 0.34219909581823404
Var: offtrt, Mean F1: 0.34740120985248185
Var: cd40, Mean F1: 0.3481103479428415
Var: cd420, Mean F1: 0.340491743251302
Var: cd80, Mean F1: 0.3455794328794557
Var: cd820, Mean F1: 0.3444133872301968
Var: cd4_change, Mean F1: 0.3436044982495008
Var: cd8_change, Mean F1: 0.34598147209160346
Var: risk_score, Mean F1: 0.34387776593947994
Var: trt_comb, Mean F1: 0.34740120985248185
Var: PC2, Mean F1: 0.34678263592685876


In [None]:
# Try to beat 0.3481
# Forward-selection step: keep the already-chosen name fragments fixed, add
# each remaining df column in turn, and print the 10-fold CV mean F1 of an
# XGBoost classifier fit on the matching df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test. 'trt' is contained in the
  # names 'offtrt' and 'trt_comb', so it presumably selects their df_onehot
  # columns as well - verify against df_onehot.columns.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 49
Var: time, Mean F1: 0.34826047112448577
Var: age, Mean F1: 0.34891768446138044
Var: hemo, Mean F1: 0.3456464430675448
Var: homo, Mean F1: 0.34756913329590644
Var: karnof, Mean F1: 0.347837368874606
Var: oprior, Mean F1: 0.34258002964841816
Var: race, Mean F1: 0.34509270943788317
Var: gender, Mean F1: 0.345055750434155
Var: symptom, Mean F1: 0.3464263465594868
Var: offtrt, Mean F1: 0.34392167172394905
Var: cd420, Mean F1: 0.34156816235762844
Var: cd80, Mean F1: 0.345723544777512
Var: cd820, Mean F1: 0.34680908573161245
Var: cd4_change, Mean F1: 0.3467583368567603
Var: cd8_change, Mean F1: 0.3458608903411512
Var: risk_score, Mean F1: 0.34630541726412967
Var: trt_comb, Mean F1: 0.34392167172394905
Var: PC2, Mean F1: 0.34634032782897145


In [None]:
# Try the first variable just to see if it's lda, like expected
# Single-variable baseline: with nothing pre-selected, score every df column
# on its own (10-fold CV mean F1 of an XGBoost classifier fit on the
# df_onehot columns whose names contain that column's name).
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = []  # no fragments selected yet
to_skip = search_strings + ['infected']  # only the target itself is skipped
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test, so e.g. 'trt' also matches
  # names containing it ('offtrt', 'trt_comb') - confirm intended.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 50
Var: time, Mean F1: 0.021323082372554883
Var: trt, Mean F1: 0.0
Var: age, Mean F1: 0.00385340983394138
Var: wtkg, Mean F1: 0.001142857142857143
Var: hemo, Mean F1: 0.0
Var: homo, Mean F1: 0.0
Var: drugs, Mean F1: 0.0
Var: karnof, Mean F1: 0.0
Var: oprior, Mean F1: 0.0
Var: z30, Mean F1: 0.0
Var: preanti, Mean F1: 0.018071992706447805
Var: race, Mean F1: 0.0
Var: gender, Mean F1: 0.0
Var: str2, Mean F1: 0.0
Var: strat, Mean F1: 0.0
Var: symptom, Mean F1: 0.0
Var: treat, Mean F1: 0.0
Var: offtrt, Mean F1: 0.0
Var: cd40, Mean F1: 0.06348745625976773
Var: cd420, Mean F1: 0.14324108024992765
Var: cd80, Mean F1: 0.0
Var: cd820, Mean F1: 0.0
Var: cd4_change, Mean F1: 0.0
Var: cd8_change, Mean F1: 0.0
Var: risk_score, Mean F1: 0.0
Var: trt_comb, Mean F1: 0.0
Var: PC1, Mean F1: 0.055304352576161474
Var: PC2, Mean F1: 0.0
Var: lda, Mean F1: 0.24210243458165545


In [None]:
# Try to beat 0.3489
# Forward-selection step: keep the already-chosen name fragments fixed, add
# each remaining df column in turn, and print the 10-fold CV mean F1 of an
# XGBoost classifier fit on the matching df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40','age']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test. 'trt' is contained in the
  # names 'offtrt' and 'trt_comb', so it presumably selects their df_onehot
  # columns as well - verify against df_onehot.columns.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 51
Var: time, Mean F1: 0.3463119831703704
Var: hemo, Mean F1: 0.3477808440124771
Var: homo, Mean F1: 0.3445581358793039
Var: karnof, Mean F1: 0.3456053944867275
Var: oprior, Mean F1: 0.34502005182650836
Var: race, Mean F1: 0.3441043282761847
Var: gender, Mean F1: 0.344213704288854
Var: symptom, Mean F1: 0.3482989200551232
Var: offtrt, Mean F1: 0.3447207840229329
Var: cd420, Mean F1: 0.3447066615902995
Var: cd80, Mean F1: 0.3452918085135752
Var: cd820, Mean F1: 0.34545504417284806
Var: cd4_change, Mean F1: 0.34911245773688704
Var: cd8_change, Mean F1: 0.34499664790945106
Var: risk_score, Mean F1: 0.3460466286613448
Var: trt_comb, Mean F1: 0.3447207840229329
Var: PC2, Mean F1: 0.34938325345915805


In [None]:
# Try to beat 0.3493
# Forward-selection step: keep the already-chosen name fragments fixed, add
# each remaining df column in turn, and print the 10-fold CV mean F1 of an
# XGBoost classifier fit on the matching df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40','age','PC2']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test. 'trt' is contained in the
  # names 'offtrt' and 'trt_comb', so it presumably selects their df_onehot
  # columns as well - verify against df_onehot.columns.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 52
Var: time, Mean F1: 0.348004827915118
Var: hemo, Mean F1: 0.34940996104031413
Var: homo, Mean F1: 0.34953363102684565
Var: karnof, Mean F1: 0.35066724915824116
Var: oprior, Mean F1: 0.35259764025419377
Var: race, Mean F1: 0.3527680177269453
Var: gender, Mean F1: 0.3506572279099991
Var: symptom, Mean F1: 0.34778322934285927
Var: offtrt, Mean F1: 0.3503039613723438
Var: cd420, Mean F1: 0.3507634866022863
Var: cd80, Mean F1: 0.3492822859722487
Var: cd820, Mean F1: 0.3494022168546095
Var: cd4_change, Mean F1: 0.3519772724730186
Var: cd8_change, Mean F1: 0.3483147303267208
Var: risk_score, Mean F1: 0.3508705336410769
Var: trt_comb, Mean F1: 0.3503039613723438


In [None]:
# Try to beat 0.3527
# Forward-selection step: keep the already-chosen name fragments fixed, add
# each remaining df column in turn, and print the 10-fold CV mean F1 of an
# XGBoost classifier fit on the matching df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40','age','PC2','race']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test. 'trt' is contained in the
  # names 'offtrt' and 'trt_comb', so it presumably selects their df_onehot
  # columns as well - verify against df_onehot.columns.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 53
Var: time, Mean F1: 0.3493386537760951
Var: hemo, Mean F1: 0.34915214496144953
Var: homo, Mean F1: 0.3493729727186455
Var: karnof, Mean F1: 0.3495228973305703
Var: oprior, Mean F1: 0.34687315611814235
Var: gender, Mean F1: 0.3494531385878245
Var: symptom, Mean F1: 0.3499271604449094
Var: offtrt, Mean F1: 0.3456954210373328
Var: cd420, Mean F1: 0.3457693885609546
Var: cd80, Mean F1: 0.3493619378662895
Var: cd820, Mean F1: 0.3534308353760467
Var: cd4_change, Mean F1: 0.3529260292880928
Var: cd8_change, Mean F1: 0.3493443334737132
Var: risk_score, Mean F1: 0.34758127761183455
Var: trt_comb, Mean F1: 0.3456954210373328


In [None]:
# Try removing a variable
# Backward-elimination pass: leave out each selected name fragment in turn
# and print the 10-fold CV mean F1 of the model fit on the remaining
# df_onehot columns. The printed "Var" is the fragment that was left out.
seed += 1
# Log the seed like the other selection cells do, so the run can be repeated.
print(f'seed: {seed}')
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40','age','PC2','race','cd820']
for var in search_strings:
  # Every fragment except the one being left out (the entries are unique).
  loop_strings = [s for s in search_strings if s != var]
  # NOTE(review): `s in col` is a substring test ('trt' is also contained in
  # 'offtrt' and 'trt_comb') - verify against df_onehot.columns.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in loop_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

Var: lda, Mean F1: 0.34386580200859024
Var: strat, Mean F1: 0.3390041671056921
Var: z30, Mean F1: 0.34430700929675484
Var: str2, Mean F1: 0.34603262722565
Var: drugs, Mean F1: 0.34924549745525196
Var: preanti, Mean F1: 0.3406069160267095
Var: trt, Mean F1: 0.3418857658584976
Var: treat, Mean F1: 0.3433147598813636
Var: PC1, Mean F1: 0.34391164908606153
Var: wtkg, Mean F1: 0.34676219893237165
Var: cd40, Mean F1: 0.3486119163396257
Var: age, Mean F1: 0.352968711396714
Var: PC2, Mean F1: 0.3504793194194633
Var: race, Mean F1: 0.3516931918156927
Var: cd820, Mean F1: 0.3485840330122882


In [None]:
# Removing age, since that seemed to help a bit
# Backward-elimination pass: leave out each selected name fragment in turn
# and print the 10-fold CV mean F1 of the model fit on the remaining
# df_onehot columns. The printed "Var" is the fragment that was left out.
seed += 1
print(f'seed: {seed}')
y = df['infected']
model = XGBClassifier(
  objective='binary:logistic',
  max_depth=4,
  eta=0.3,
  eval_metric='auc',
  use_label_encoder=False,
)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40','PC2','race','cd820']
for var in search_strings:
  # Every fragment except the one being left out (the entries are unique).
  loop_strings = [s for s in search_strings if s != var]
  filtered_columns = [
    col for col in df_onehot.columns
    if any(s in col for s in loop_strings)  # substring match on the column name
  ]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

Var: lda, Mean F1: 0.34235670840055205
Var: strat, Mean F1: 0.3373694506567343
Var: z30, Mean F1: 0.3420843515473437
Var: str2, Mean F1: 0.34518611463150356
Var: drugs, Mean F1: 0.34797908318993187
Var: preanti, Mean F1: 0.3393453144953017
Var: trt, Mean F1: 0.3337237386204818
Var: treat, Mean F1: 0.3456445819065201
Var: PC1, Mean F1: 0.3443323825046935
Var: wtkg, Mean F1: 0.3392069630853255
Var: cd40, Mean F1: 0.3462469439266226
Var: PC2, Mean F1: 0.34821579908815414
Var: race, Mean F1: 0.3513596673279017
Var: cd820, Mean F1: 0.3440578887546877


In [None]:
# Try to beat 0.3529
# Forward-selection step: keep the already-chosen name fragments fixed, add
# each remaining df column in turn, and print the 10-fold CV mean F1 of an
# XGBoost classifier fit on the matching df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
model = XGBClassifier(objective='binary:logistic', max_depth=4, eta=0.3, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40','PC2','race','cd820']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test. 'trt' is contained in the
  # names 'offtrt' and 'trt_comb', so it presumably selects their df_onehot
  # columns as well - verify against df_onehot.columns.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 57
Var: time, Mean F1: 0.35124336135114537
Var: age, Mean F1: 0.3517960989279701
Var: hemo, Mean F1: 0.3523244855735065
Var: homo, Mean F1: 0.350082092244067
Var: karnof, Mean F1: 0.3514617907856566
Var: oprior, Mean F1: 0.35076488598270805
Var: gender, Mean F1: 0.3488676091360362
Var: symptom, Mean F1: 0.3511322717034478
Var: offtrt, Mean F1: 0.3511232220127411
Var: cd420, Mean F1: 0.3512042451719696
Var: cd80, Mean F1: 0.35059617499584234
Var: cd4_change, Mean F1: 0.3514947980461029
Var: cd8_change, Mean F1: 0.3514971100128658
Var: risk_score, Mean F1: 0.3478224783838201
Var: trt_comb, Mean F1: 0.3511232220127411


In [None]:
# Try removing stuff with a higher max_depth
# Backward-elimination pass with deeper trees: leave out each selected name
# fragment in turn and print the 10-fold CV mean F1 of the model fit on the
# remaining df_onehot columns. The printed "Var" is the fragment left out.
seed += 1
print(f'seed: {seed}')
y = df['infected']
# NOTE(review): XGBoost documents eta's range as [0, 1]; 1.5 is outside it -
# confirm this value is intended.
model = XGBClassifier(
  objective='binary:logistic',
  max_depth=8,
  eta=1.5,
  eval_metric='auc',
  use_label_encoder=False,
)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40','PC2','race','cd820']
for var in search_strings:
  # Every fragment except the one being left out (the entries are unique).
  loop_strings = [s for s in search_strings if s != var]
  filtered_columns = [
    col for col in df_onehot.columns
    if any(s in col for s in loop_strings)  # substring match on the column name
  ]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 65
Var: lda, Mean F1: 0.3844780530154389
Var: strat, Mean F1: 0.38049611821002366
Var: z30, Mean F1: 0.38388975647551077
Var: str2, Mean F1: 0.3866011913491564
Var: drugs, Mean F1: 0.3878844141882102
Var: preanti, Mean F1: 0.38519512636572356
Var: trt, Mean F1: 0.38316400708453763
Var: treat, Mean F1: 0.3850433795416371
Var: PC1, Mean F1: 0.3890848418523386
Var: wtkg, Mean F1: 0.38661184558961725
Var: cd40, Mean F1: 0.3846994423219634
Var: PC2, Mean F1: 0.38144531270163107
Var: race, Mean F1: 0.3836413023680302
Var: cd820, Mean F1: 0.3855454349500436


In [None]:
# Try to beat 0.3941
# Forward-selection step with deeper trees: keep the already-chosen name
# fragments fixed, add each remaining df column in turn, and print the
# 10-fold CV mean F1 of an XGBoost classifier fit on the matching
# df_onehot columns.
seed += 1  # a different fold shuffle for every step
print(f'seed: {seed}')
df_metrics = pd.DataFrame(df.columns.tolist(), columns = ['Var'])
y = df['infected']
# eta was written as `01.5`; it is the same float as 1.5, spelled normally here.
# NOTE(review): XGBoost documents eta's range as [0, 1]; 1.5 is outside it -
# confirm this value is intended.
model = XGBClassifier(objective='binary:logistic', max_depth=8, eta=1.5, eval_metric='auc', use_label_encoder=False)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# Loop invariants, built once instead of being re-created on every iteration.
search_strings = ['lda','strat','z30','str2','drugs','preanti','trt','treat','PC1','wtkg','cd40','PC2','race','cd820']
to_skip = search_strings + ['infected']  # already selected, or the target itself
for var in df_metrics['Var']:
  if var in to_skip:
    continue
  candidate_strings = search_strings + [var]
  # NOTE(review): `s in col` is a substring test. 'trt' is contained in the
  # names 'offtrt' and 'trt_comb', so it presumably selects their df_onehot
  # columns as well - verify against df_onehot.columns.
  filtered_columns = [col for col in df_onehot.columns if any(s in col for s in candidate_strings)]
  X = df_onehot[filtered_columns]
  scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
  print(f'Var: {var}, Mean F1: {scores.mean()}')

seed: 66
Var: time, Mean F1: 0.3881313917063776
Var: age, Mean F1: 0.3883954830583185
Var: hemo, Mean F1: 0.38943539543266714
Var: homo, Mean F1: 0.3931883235604094
Var: karnof, Mean F1: 0.38947013908920164
Var: oprior, Mean F1: 0.39075472062730066
Var: gender, Mean F1: 0.3917219432717694
Var: symptom, Mean F1: 0.3913662885157985
Var: offtrt, Mean F1: 0.3925425121642818
Var: cd420, Mean F1: 0.38858908752673843
Var: cd80, Mean F1: 0.388616792640742
Var: cd4_change, Mean F1: 0.39081749224870815
Var: cd8_change, Mean F1: 0.38455443647368315
Var: risk_score, Mean F1: 0.39056924826273604
Var: trt_comb, Mean F1: 0.3925425121642818
