<a href="https://colab.research.google.com/github/shearere2/vcu_reu/blob/main/XGBOOST_FRAMEWORK_ETHAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [133]:
install.packages('xgboost')

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [134]:
install.packages('estimatr')

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [135]:
install.packages("dplyr")

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [136]:
library(dplyr)

In [137]:
library(xgboost)

In [138]:
library(estimatr)

# CISGENDER WOMEN SUBGROUP

### Using GPA as primary predictor

In [139]:
set.seed(2024)
data.intake <- read.csv('gender_female.csv')
rownames(data.intake) <- data.intake$record_id

In [140]:
col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

In [141]:
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

In [142]:
n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

In [143]:
params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

In [144]:
fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

[1]	train-error:0.120536	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.040179	test-error:1.000000 
[21]	train-error:0.026786	test-error:1.000000 
[31]	train-error:0.008929	test-error:1.000000 
[41]	train-error:0.004464	test-error:1.000000 
[51]	train-error:0.004464	test-error:1.000000 
[61]	train-error:0.000000	test-error:1.000000 
[71]	train-error:0.000000	test-error:1.000000 
[81]	train-error:0.000000	test-error:1.000000 
[91]	train-error:0.000000	test-error:1.000000 
[101]	train-error:0.000000	test-error:1.000000 
Stopping. Best iteration:
[1]	train-error:0.120536	test-error:1.000000



In [145]:
pred <- predict(fit_xgb, train.intake)

In [146]:
dif <- dim(data.intake)[1] - length(pred)
dif

In [147]:
for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

In [148]:
ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))

In [149]:
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)

In [150]:
summary(model)


Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)     0.8323    0.04594  18.120 7.461e-46   0.7565  0.90822 225
gpa_thresh3_0  -0.1896    0.08903  -2.129 3.433e-02  -0.3366 -0.04251 225

Multiple R-squared:  0.01854 ,	Adjusted R-squared:  0.01418 
F-statistic: 4.533 on 1 and 225 DF,  p-value: 0.03433

Using PTSD score as primary predictor

In [151]:
set.seed(2024)
data.intake <- read.csv('gender_female.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(ptsd_score = ifelse(ptsd_score >= 33, 1, 0))

col_names <- colnames(data.intake)
if ("ptsd_score" %in% col_names){
  new_col_order <- c("ptsd_score", col_names[col_names != "ptsd_score"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$ptsd_score==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ ptsd_score, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.071429	test-error:0.666667 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.053571	test-error:0.000000 
[21]	train-error:0.044643	test-error:0.000000 
[31]	train-error:0.022321	test-error:0.333333 
[41]	train-error:0.013393	test-error:0.000000 
[51]	train-error:0.004464	test-error:0.000000 
[61]	train-error:0.004464	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[5]	train-error:0.049107	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ ptsd_score, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)   0.5297    0.03231   16.39 2.875e-40   0.4763   0.5831 225
ptsd_score    0.8487    0.07343   11.56 1.457e-24   0.7274   0.9700 225

Multiple R-squared:  0.4369 ,	Adjusted R-squared:  0.4344 
F-statistic: 133.6 on 1 and 225 DF,  p-value: < 2.2e-16

Using covid_iso as primary predictor

In [152]:
median(data.intake$covid_iso)

In [153]:
set.seed(2024)
data.intake <- read.csv('gender_female.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(covid_iso = ifelse(covid_iso > 7, 1, 0))

col_names <- colnames(data.intake)
if ("covid_iso" %in% col_names){
  new_col_order <- c("covid_iso", col_names[col_names != "covid_iso"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$covid_iso==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ covid_iso, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.125000	test-error:0.333333 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.035714	test-error:0.666667 
[21]	train-error:0.008929	test-error:0.666667 
[31]	train-error:0.008929	test-error:0.666667 
[41]	train-error:0.004464	test-error:0.666667 
[51]	train-error:0.004464	test-error:0.666667 
[61]	train-error:0.000000	test-error:0.666667 
[71]	train-error:0.000000	test-error:0.666667 
[81]	train-error:0.000000	test-error:0.666667 
[91]	train-error:0.000000	test-error:0.666667 
[101]	train-error:0.000000	test-error:0.666667 
Stopping. Best iteration:
[1]	train-error:0.125000	test-error:0.333333




Call:
lm_robust(formula = mh_scale ~ covid_iso, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)   0.6738    0.05108   13.19 8.195e-30   0.5895   0.7582 225
covid_iso     0.2527    0.07873    3.21 1.522e-03   0.1227   0.3827 225

Multiple R-squared:  0.04436 ,	Adjusted R-squared:  0.04012 
F-statistic:  10.3 on 1 and 225 DF,  p-value: 0.001522

Using edu.i_count as predidctor

In [154]:
set.seed(2024)
data.intake <- read.csv('gender_female.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(edu.i_count = ifelse(edu.i_count <= 2, 1, 0))

col_names <- colnames(data.intake)
if ("edu.i_count" %in% col_names){
  new_col_order <- c("edu.i_count", col_names[col_names != "edu.i_count"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$edu.i_count==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ edu.i_count, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.075893	test-error:0.333333 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.044643	test-error:0.333333 
[21]	train-error:0.031250	test-error:0.333333 
[31]	train-error:0.022321	test-error:0.333333 
[41]	train-error:0.013393	test-error:0.333333 
[51]	train-error:0.004464	test-error:0.333333 
[61]	train-error:0.004464	test-error:0.333333 
[71]	train-error:0.004464	test-error:0.333333 
[81]	train-error:0.004464	test-error:0.333333 
[91]	train-error:0.004464	test-error:0.333333 
[101]	train-error:0.000000	test-error:0.333333 
Stopping. Best iteration:
[1]	train-error:0.075893	test-error:0.333333




Call:
lm_robust(formula = mh_scale ~ edu.i_count, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|)  CI Lower CI Upper  DF
(Intercept)   0.7062    0.05987  11.796 2.558e-25  0.607342   0.8051 225
edu.i_count   0.1270    0.07917   1.604 1.102e-01 -0.003807   0.2577 225

Multiple R-squared:  0.01061 ,	Adjusted R-squared:  0.006208 
F-statistic: 2.571 on 1 and 225 DF,  p-value: 0.1102

# Cisgender Man Subgroup

GPA as primary predictor

In [155]:
set.seed(2024)
data.intake <- read.csv('gender_male.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.188679	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.094340	test-error:0.000000 
[21]	train-error:0.075472	test-error:0.000000 
[31]	train-error:0.037736	test-error:0.000000 
[41]	train-error:0.000000	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.188679	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)    0.77227     0.1021   7.560 6.339e-10   0.6012   0.9433 52
gpa_thresh3_0  0.07611     0.2213   0.344 7.323e-01  -0.2945   0.4467 52

Multiple R-squared:  0.002269 ,	Adjusted R-squared:  -0.01692 
F-statistic: 0.1183 on 1 and 52 DF,  p-value: 0.7323

PTSD

In [156]:
set.seed(2024)
data.intake <- read.csv('gender_male.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(ptsd_score = ifelse(ptsd_score >= 33, 1, 0))

col_names <- colnames(data.intake)
if ("ptsd_score" %in% col_names){
  new_col_order <- c("ptsd_score", col_names[col_names != "ptsd_score"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$ptsd_score==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ ptsd_score, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.075472	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.056604	test-error:0.000000 
[21]	train-error:0.056604	test-error:0.000000 
[31]	train-error:0.056604	test-error:0.000000 
[41]	train-error:0.056604	test-error:0.000000 
[51]	train-error:0.037736	test-error:0.000000 
[61]	train-error:0.037736	test-error:0.000000 
[71]	train-error:0.018868	test-error:0.000000 
[81]	train-error:0.018868	test-error:0.000000 
[91]	train-error:0.018868	test-error:0.000000 
[101]	train-error:0.018868	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.075472	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ ptsd_score, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   0.4615    0.05696   8.103 8.746e-11   0.3662   0.5569 52
ptsd_score    1.1782    0.13625   8.647 1.225e-11   0.9500   1.4063 52

Multiple R-squared:  0.6542 ,	Adjusted R-squared:  0.6476 
F-statistic: 74.78 on 1 and 52 DF,  p-value: 1.225e-11

Covid_iso

In [157]:
set.seed(2024)
data.intake <- read.csv('gender_male.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(covid_iso = ifelse(covid_iso > 7, 1, 0))

col_names <- colnames(data.intake)
if ("covid_iso" %in% col_names){
  new_col_order <- c("covid_iso", col_names[col_names != "covid_iso"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$covid_iso==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ covid_iso, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.150943	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.037736	test-error:0.000000 
[21]	train-error:0.000000	test-error:0.000000 
[31]	train-error:0.000000	test-error:0.000000 
[41]	train-error:0.000000	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.150943	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ covid_iso, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
             Estimate Std. Error  t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)  0.788195     0.1396  5.64450 6.972e-07   0.5543   1.0220 52
covid_iso   -0.005881     0.1833 -0.03208 9.745e-01  -0.3129   0.3012 52

Multiple R-squared:  1.999e-05 ,	Adjusted R-squared:  -0.01921 
F-statistic: 0.001029 on 1 and 52 DF,  p-value: 0.9745

In [158]:
set.seed(2024)
data.intake <- read.csv('gender_male.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(edu.i_count = ifelse(edu.i_count <= 2, 1, 0))

col_names <- colnames(data.intake)
if ("edu.i_count" %in% col_names){
  new_col_order <- c("edu.i_count", col_names[col_names != "edu.i_count"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$edu.i_count==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ edu.i_count, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.113208	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.056604	test-error:1.000000 
[21]	train-error:0.037736	test-error:1.000000 
[31]	train-error:0.018868	test-error:1.000000 
[41]	train-error:0.018868	test-error:1.000000 
[51]	train-error:0.000000	test-error:1.000000 
[61]	train-error:0.000000	test-error:1.000000 
[71]	train-error:0.000000	test-error:1.000000 
[81]	train-error:0.000000	test-error:1.000000 
[91]	train-error:0.000000	test-error:1.000000 
[101]	train-error:0.000000	test-error:1.000000 
Stopping. Best iteration:
[1]	train-error:0.113208	test-error:1.000000




Call:
lm_robust(formula = mh_scale ~ edu.i_count, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   0.5402     0.1106   4.883 1.035e-05   0.3549   0.7254 52
edu.i_count   0.3989     0.1652   2.414 1.932e-02   0.1222   0.6757 52

Multiple R-squared:  0.08883 ,	Adjusted R-squared:  0.0713 
F-statistic: 5.829 on 1 and 52 DF,  p-value: 0.01932

# 18-20 subgroup

GPA

In [159]:
set.seed(2024)
data.intake <- read.csv('ages18_20.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.072727	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.090909	test-error:0.000000 
[21]	train-error:0.072727	test-error:0.000000 
[31]	train-error:0.072727	test-error:0.000000 
[41]	train-error:0.054545	test-error:0.000000 
[51]	train-error:0.036364	test-error:0.000000 
[61]	train-error:0.036364	test-error:0.000000 
[71]	train-error:0.036364	test-error:0.000000 
[81]	train-error:0.018182	test-error:0.000000 
[91]	train-error:0.018182	test-error:0.000000 
[101]	train-error:0.018182	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.072727	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)     0.8894    0.09546   9.317 7.854e-13   0.7297   1.0492 54
gpa_thresh3_0  -0.2728    0.24928  -1.095 2.786e-01  -0.6900   0.1443 54

Multiple R-squared:  0.02219 ,	Adjusted R-squared:  0.004079 
F-statistic: 1.198 on 1 and 54 DF,  p-value: 0.2786

ptsd

In [160]:
set.seed(2024)
data.intake <- read.csv('ages18_20.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(ptsd_score = ifelse(ptsd_score >= 33, 1, 0))

col_names <- colnames(data.intake)
if ("ptsd_score" %in% col_names){
  new_col_order <- c("ptsd_score", col_names[col_names != "ptsd_score"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$ptsd_score==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ ptsd_score, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.127273	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.090909	test-error:0.000000 
[21]	train-error:0.072727	test-error:0.000000 
[31]	train-error:0.054545	test-error:0.000000 
[41]	train-error:0.018182	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.127273	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ ptsd_score, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   0.4649    0.06553   7.095 2.886e-09   0.3553   0.5746 54
ptsd_score    0.9857    0.13515   7.294 1.373e-09   0.7596   1.2119 54

Multiple R-squared:  0.5403 ,	Adjusted R-squared:  0.5318 
F-statistic:  53.2 on 1 and 54 DF,  p-value: 1.373e-09

covid_iso

In [161]:
set.seed(2024)
data.intake <- read.csv('ages18_20.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(covid_iso = ifelse(covid_iso > 7, 1, 0))

col_names <- colnames(data.intake)
if ("covid_iso" %in% col_names){
  new_col_order <- c("covid_iso", col_names[col_names != "covid_iso"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$covid_iso==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ covid_iso, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.145455	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.090909	test-error:0.000000 
[21]	train-error:0.072727	test-error:0.000000 
[31]	train-error:0.018182	test-error:0.000000 
[41]	train-error:0.018182	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:1.000000 
[71]	train-error:0.000000	test-error:1.000000 
[81]	train-error:0.000000	test-error:1.000000 
[91]	train-error:0.000000	test-error:1.000000 
[101]	train-error:0.000000	test-error:1.000000 
Stopping. Best iteration:
[8]	train-error:0.090909	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ covid_iso, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   0.6881     0.1114   6.178 8.832e-08 0.501707   0.8745 54
covid_iso     0.2956     0.1758   1.682 9.837e-02 0.001461   0.5898 54

Multiple R-squared:  0.04901 ,	Adjusted R-squared:  0.03139 
F-statistic: 2.829 on 1 and 54 DF,  p-value: 0.09837

max.emohelp

In [162]:
set.seed(2024)
data.intake <- read.csv('ages18_20.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(max.emohelp = ifelse(max.emohelp >= 9, 1, 0))

col_names <- colnames(data.intake)
if ("max.emohelp" %in% col_names){
  new_col_order <- c("max.emohelp", col_names[col_names != "max.emohelp"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$max.emohelp==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ max.emohelp, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.036364	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.036364	test-error:0.000000 
[21]	train-error:0.036364	test-error:0.000000 
[31]	train-error:0.036364	test-error:0.000000 
[41]	train-error:0.000000	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.036364	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ max.emohelp, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   1.0020     0.1637   6.120 1.095e-07   0.7280    1.276 54
max.emohelp  -0.2014     0.1950  -1.033 3.064e-01  -0.5278    0.125 54

Multiple R-squared:  0.01882 ,	Adjusted R-squared:  0.0006496 
F-statistic: 1.066 on 1 and 54 DF,  p-value: 0.3064

#Ages 21-24

GPA

In [171]:
set.seed(2024)
data.intake <- read.csv('ages21_24.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.208791	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.043956	test-error:0.000000 
[21]	train-error:0.032967	test-error:0.000000 
[31]	train-error:0.000000	test-error:0.000000 
[41]	train-error:0.000000	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.208791	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)     0.9550    0.08725  10.946 3.169e-18   0.8100   1.1000 90
gpa_thresh3_0  -0.3766    0.12949  -2.909 4.571e-03  -0.5919  -0.1614 90

Multiple R-squared:  0.07475 ,	Adjusted R-squared:  0.06447 
F-statistic: 8.461 on 1 and 90 DF,  p-value: 0.004571

ptsd

In [172]:
set.seed(2024)
data.intake <- read.csv('ages21_24.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(ptsd_score = ifelse(ptsd_score >= 33, 1, 0))

col_names <- colnames(data.intake)
if ("ptsd_score" %in% col_names){
  new_col_order <- c("ptsd_score", col_names[col_names != "ptsd_score"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$ptsd_score==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ ptsd_score, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.109890	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.065934	test-error:0.000000 
[21]	train-error:0.032967	test-error:0.000000 
[31]	train-error:0.010989	test-error:0.000000 
[41]	train-error:0.010989	test-error:0.000000 
[51]	train-error:0.010989	test-error:0.000000 
[61]	train-error:0.010989	test-error:0.000000 
[71]	train-error:0.010989	test-error:0.000000 
[81]	train-error:0.010989	test-error:0.000000 
[91]	train-error:0.010989	test-error:0.000000 
[101]	train-error:0.010989	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.109890	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ ptsd_score, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)    0.485    0.05096   9.517 2.883e-15   0.4003   0.5697 90
ptsd_score     1.028    0.10830   9.488 3.324e-15   0.8475   1.2075 90

Multiple R-squared:  0.5498 ,	Adjusted R-squared:  0.5448 
F-statistic: 90.01 on 1 and 90 DF,  p-value: 3.324e-15

covid_iso

In [173]:
set.seed(2024)
data.intake <- read.csv('ages21_24.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(covid_iso = ifelse(covid_iso > 7, 1, 0))

col_names <- colnames(data.intake)
if ("covid_iso" %in% col_names){
  new_col_order <- c("covid_iso", col_names[col_names != "covid_iso"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$covid_iso==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ covid_iso, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.186813	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.043956	test-error:0.000000 
[21]	train-error:0.021978	test-error:0.000000 
[31]	train-error:0.010989	test-error:0.000000 
[41]	train-error:0.010989	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[7]	train-error:0.076923	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ covid_iso, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   0.7251     0.1039   6.976 4.957e-10  0.55235   0.8979 90
covid_iso     0.2052     0.1372   1.496 1.381e-01 -0.02273   0.4332 90

Multiple R-squared:  0.02481 ,	Adjusted R-squared:  0.01397 
F-statistic: 2.239 on 1 and 90 DF,  p-value: 0.1381

max.emohelp

In [174]:
set.seed(2024)
data.intake <- read.csv('ages21_24.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(max.emohelp = ifelse(max.emohelp >= 9, 1, 0))

col_names <- colnames(data.intake)
if ("max.emohelp" %in% col_names){
  new_col_order <- c("max.emohelp", col_names[col_names != "max.emohelp"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$max.emohelp==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ max.emohelp, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.032967	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.032967	test-error:0.000000 
[21]	train-error:0.010989	test-error:0.000000 
[31]	train-error:0.010989	test-error:0.000000 
[41]	train-error:0.010989	test-error:0.000000 
[51]	train-error:0.010989	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.032967	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ max.emohelp, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   0.6529     0.0824   7.924 5.848e-12  0.51600   0.7899 90
max.emohelp   0.2503     0.1217   2.056 4.263e-02  0.04802   0.4526 90

Multiple R-squared:  0.03105 ,	Adjusted R-squared:  0.02029 
F-statistic: 4.229 on 1 and 90 DF,  p-value: 0.04263

#25+

GPA

In [175]:
set.seed(2024)
data.intake <- read.csv('ages25_63.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.114286	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.085714	test-error:0.000000 
[21]	train-error:0.085714	test-error:0.000000 
[31]	train-error:0.085714	test-error:0.000000 
[41]	train-error:0.085714	test-error:0.000000 
[51]	train-error:0.085714	test-error:0.000000 
[61]	train-error:0.085714	test-error:0.000000 
[71]	train-error:0.028571	test-error:0.000000 
[81]	train-error:0.028571	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.114286	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)     0.6724    0.08301   8.100 1.911e-09   0.5320   0.8127 34
gpa_thresh3_0   0.3946    0.24495   1.611 1.164e-01  -0.0196   0.8088 34

Multiple R-squared:  0.1078 ,	Adjusted R-squared:  0.08153 
F-statistic: 2.595 on 1 and 34 DF,  p-value: 0.1164

ptsd

In [176]:
set.seed(2024)
data.intake <- read.csv('ages25_63.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(ptsd_score = ifelse(ptsd_score >= 33, 1, 0))

col_names <- colnames(data.intake)
if ("ptsd_score" %in% col_names){
  new_col_order <- c("ptsd_score", col_names[col_names != "ptsd_score"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$ptsd_score==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ ptsd_score, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.085714	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.085714	test-error:0.000000 
[21]	train-error:0.057143	test-error:0.000000 
[31]	train-error:0.028571	test-error:0.000000 
[41]	train-error:0.028571	test-error:0.000000 
[51]	train-error:0.028571	test-error:0.000000 
[61]	train-error:0.028571	test-error:0.000000 
[71]	train-error:0.028571	test-error:0.000000 
[81]	train-error:0.028571	test-error:0.000000 
[91]	train-error:0.028571	test-error:0.000000 
[101]	train-error:0.028571	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.085714	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ ptsd_score, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   0.6747    0.08454   7.981 2.671e-09   0.5318   0.8177 34
ptsd_score    0.5876    0.24014   2.447 1.972e-02   0.1816   0.9937 34

Multiple R-squared:  0.1674 ,	Adjusted R-squared:  0.1429 
F-statistic: 5.988 on 1 and 34 DF,  p-value: 0.01972

covid_iso

In [177]:
set.seed(2024)
data.intake <- read.csv('ages25_63.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(covid_iso = ifelse(covid_iso > 7, 1, 0))

col_names <- colnames(data.intake)
if ("covid_iso" %in% col_names){
  new_col_order <- c("covid_iso", col_names[col_names != "covid_iso"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$covid_iso==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ covid_iso, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.114286	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.057143	test-error:1.000000 
[21]	train-error:0.057143	test-error:1.000000 
[31]	train-error:0.057143	test-error:1.000000 
[41]	train-error:0.057143	test-error:1.000000 
[51]	train-error:0.057143	test-error:1.000000 
[61]	train-error:0.057143	test-error:1.000000 
[71]	train-error:0.057143	test-error:1.000000 
[81]	train-error:0.057143	test-error:1.000000 
[91]	train-error:0.057143	test-error:1.000000 
[101]	train-error:0.028571	test-error:1.000000 
Stopping. Best iteration:
[1]	train-error:0.114286	test-error:1.000000




Call:
lm_robust(formula = mh_scale ~ covid_iso, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   0.6407    0.09552   6.708 1.053e-07   0.4792   0.8023 34
covid_iso     0.4223    0.17899   2.359 2.419e-02   0.1196   0.7250 34

Multiple R-squared:  0.142 ,	Adjusted R-squared:  0.1168 
F-statistic: 5.566 on 1 and 34 DF,  p-value: 0.02419

max.emohelp

In [178]:
set.seed(2024)
data.intake <- read.csv('ages25_63.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(max.emohelp = ifelse(max.emohelp >= 9, 1, 0))

col_names <- colnames(data.intake)
if ("max.emohelp" %in% col_names){
  new_col_order <- c("max.emohelp", col_names[col_names != "max.emohelp"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$max.emohelp==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ max.emohelp, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.057143	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.000000	test-error:0.000000 
[21]	train-error:0.000000	test-error:0.000000 
[31]	train-error:0.000000	test-error:0.000000 
[41]	train-error:0.000000	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.057143	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ max.emohelp, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   1.0680     0.2316   4.611 5.463e-05   0.6763  1.45968 34
max.emohelp  -0.3483     0.2485  -1.402 1.701e-01  -0.7686  0.07191 34

Multiple R-squared:  0.04978 ,	Adjusted R-squared:  0.02183 
F-statistic: 1.964 on 1 and 34 DF,  p-value: 0.1701

# WHITE

GPA

In [94]:
set.seed(2024)
data.intake <- read.csv('white.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.136000	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.088000	test-error:0.000000 
[21]	train-error:0.072000	test-error:0.000000 
[31]	train-error:0.032000	test-error:0.000000 
[41]	train-error:0.024000	test-error:0.000000 
[51]	train-error:0.016000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.136000	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)     0.7263    0.05539  13.111 3.066e-25   0.6345  0.81806 125
gpa_thresh3_0  -0.1679    0.11962  -1.404 1.629e-01  -0.3661  0.03032 125

Multiple R-squared:  0.01252 ,	Adjusted R-squared:  0.004617 
F-statistic:  1.97 on 1 and 125 DF,  p-value: 0.1629

PTSD

In [95]:
set.seed(2024)
data.intake <- read.csv('white.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(ptsd_score = ifelse(ptsd_score >= 33, 1, 0))

col_names <- colnames(data.intake)
if ("ptsd_score" %in% col_names){
  new_col_order <- c("ptsd_score", col_names[col_names != "ptsd_score"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$ptsd_score==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ ptsd_score, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.088000	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.072000	test-error:0.000000 
[21]	train-error:0.056000	test-error:0.000000 
[31]	train-error:0.056000	test-error:0.000000 
[41]	train-error:0.040000	test-error:0.000000 
[51]	train-error:0.032000	test-error:0.000000 
[61]	train-error:0.024000	test-error:0.000000 
[71]	train-error:0.016000	test-error:0.000000 
[81]	train-error:0.008000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.088000	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ ptsd_score, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)   0.4809    0.03452  13.929 3.363e-27   0.4237   0.5381 125
ptsd_score    0.9614    0.10196   9.429 2.859e-16   0.7924   1.1303 125

Multiple R-squared:  0.5267 ,	Adjusted R-squared:  0.5229 
F-statistic: 88.91 on 1 and 125 DF,  p-value: 2.859e-16

Age

In [96]:
set.seed(2024)
data.intake <- read.csv('white.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(age = ifelse(age < 21, 1, 0))

col_names <- colnames(data.intake)
if ("age" %in% col_names){
  new_col_order <- c("age", col_names[col_names != "age"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$age==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ age, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.168000	test-error:0.500000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.064000	test-error:0.500000 
[21]	train-error:0.008000	test-error:0.500000 
[31]	train-error:0.008000	test-error:0.500000 
[41]	train-error:0.000000	test-error:0.500000 
[51]	train-error:0.000000	test-error:0.500000 
[61]	train-error:0.000000	test-error:0.500000 
[71]	train-error:0.000000	test-error:0.500000 
[81]	train-error:0.000000	test-error:0.500000 
[91]	train-error:0.000000	test-error:0.500000 
[101]	train-error:0.000000	test-error:0.500000 
Stopping. Best iteration:
[3]	train-error:0.120000	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ age, data = data.intake, weights = ipw, 
    alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)  0.74847    0.07434 10.0677 8.036e-18   0.6253   0.8717 125
age         -0.07789    0.10113 -0.7702 4.426e-01  -0.2455   0.0897 125

Multiple R-squared:  0.004728 ,	Adjusted R-squared:  -0.003234 
F-statistic: 0.5932 on 1 and 125 DF,  p-value: 0.4426

edu_i.count

In [97]:
set.seed(2024)
data.intake <- read.csv('white.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(edu.i_count = ifelse(edu.i_count <= 2, 1, 0))

col_names <- colnames(data.intake)
if ("edu.i_count" %in% col_names){
  new_col_order <- c("edu.i_count", col_names[col_names != "edu.i_count"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$edu.i_count==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ edu.i_count, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.072000	test-error:0.500000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.032000	test-error:0.500000 
[21]	train-error:0.016000	test-error:0.500000 
[31]	train-error:0.016000	test-error:0.500000 
[41]	train-error:0.016000	test-error:0.500000 
[51]	train-error:0.016000	test-error:0.500000 
[61]	train-error:0.008000	test-error:0.500000 
[71]	train-error:0.008000	test-error:0.500000 
[81]	train-error:0.000000	test-error:0.500000 
[91]	train-error:0.000000	test-error:0.500000 
[101]	train-error:0.000000	test-error:0.500000 
Stopping. Best iteration:
[1]	train-error:0.072000	test-error:0.500000




Call:
lm_robust(formula = mh_scale ~ edu.i_count, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)   0.5813    0.07643   7.605 5.944e-12  0.45459   0.7079 125
edu.i_count   0.1820    0.09964   1.827 7.011e-02  0.01691   0.3471 125

Multiple R-squared:  0.0245 ,	Adjusted R-squared:  0.01669 
F-statistic: 3.337 on 1 and 125 DF,  p-value: 0.07011

# ASIAN

GPA

In [98]:
set.seed(2024)
data.intake <- read.csv('asian.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.178571	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.080357	test-error:1.000000 
[21]	train-error:0.026786	test-error:0.500000 
[31]	train-error:0.008929	test-error:0.500000 
[41]	train-error:0.000000	test-error:0.500000 
[51]	train-error:0.000000	test-error:0.500000 
[61]	train-error:0.000000	test-error:0.500000 
[71]	train-error:0.000000	test-error:0.500000 
[81]	train-error:0.000000	test-error:0.500000 
[91]	train-error:0.000000	test-error:0.500000 
[101]	train-error:0.000000	test-error:0.500000 
[111]	train-error:0.000000	test-error:0.500000 
Stopping. Best iteration:
[13]	train-error:0.044643	test-error:0.500000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)     0.9326    0.07315  12.750 1.514e-23   0.8113  1.05396 112
gpa_thresh3_0  -0.2296    0.11797  -1.946 5.418e-02  -0.4252 -0.03389 112

Multiple R-squared:  0.03446 ,	Adjusted R-squared:  0.02584 
F-statistic: 3.786 on 1 and 112 DF,  p-value: 0.05418

PTSD

In [99]:
set.seed(2024)
data.intake <- read.csv('asian.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(ptsd_score = ifelse(ptsd_score >= 33, 1, 0))

col_names <- colnames(data.intake)
if ("ptsd_score" %in% col_names){
  new_col_order <- c("ptsd_score", col_names[col_names != "ptsd_score"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$ptsd_score==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ ptsd_score, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.107143	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.062500	test-error:0.000000 
[21]	train-error:0.044643	test-error:0.000000 
[31]	train-error:0.026786	test-error:0.000000 
[41]	train-error:0.026786	test-error:0.000000 
[51]	train-error:0.026786	test-error:0.000000 
[61]	train-error:0.008929	test-error:0.000000 
[71]	train-error:0.008929	test-error:0.000000 
[81]	train-error:0.008929	test-error:0.000000 
[91]	train-error:0.008929	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.107143	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ ptsd_score, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)   0.5554    0.05267  10.546 1.786e-18   0.4681   0.6428 112
ptsd_score    0.8038    0.10276   7.822 3.134e-12   0.6333   0.9742 112

Multiple R-squared:  0.3836 ,	Adjusted R-squared:  0.3781 
F-statistic: 61.18 on 1 and 112 DF,  p-value: 3.134e-12

Age

In [100]:
set.seed(2024)
data.intake <- read.csv('asian.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(age = ifelse(age < 21, 1, 0))

col_names <- colnames(data.intake)
if ("age" %in% col_names){
  new_col_order <- c("age", col_names[col_names != "age"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$age==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ age, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.187500	test-error:0.500000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.071429	test-error:0.000000 
[21]	train-error:0.035714	test-error:0.500000 
[31]	train-error:0.008929	test-error:0.500000 
[41]	train-error:0.000000	test-error:0.500000 
[51]	train-error:0.000000	test-error:0.500000 
[61]	train-error:0.000000	test-error:0.500000 
[71]	train-error:0.000000	test-error:0.500000 
[81]	train-error:0.000000	test-error:0.500000 
[91]	train-error:0.000000	test-error:0.500000 
[101]	train-error:0.000000	test-error:0.500000 
Stopping. Best iteration:
[2]	train-error:0.142857	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ age, data = data.intake, weights = ipw, 
    alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)  0.90589     0.0900 10.0652 2.321e-17   0.7566  1.05516 112
age         -0.09956     0.1193 -0.8349 4.056e-01  -0.2973  0.09823 112

Multiple R-squared:  0.006271 ,	Adjusted R-squared:  -0.002602 
F-statistic: 0.697 on 1 and 112 DF,  p-value: 0.4056

edu_i.count

In [101]:
set.seed(2024)
data.intake <- read.csv('asian.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(edu.i_count = ifelse(edu.i_count <= 2, 1, 0))

col_names <- colnames(data.intake)
if ("edu.i_count" %in% col_names){
  new_col_order <- c("edu.i_count", col_names[col_names != "edu.i_count"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$edu.i_count==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ edu.i_count, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.142857	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.035714	test-error:0.000000 
[21]	train-error:0.026786	test-error:0.000000 
[31]	train-error:0.017857	test-error:0.000000 
[41]	train-error:0.008929	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.142857	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ edu.i_count, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)   0.7573    0.08923   8.488 9.895e-14  0.60935   0.9053 112
edu.i_count   0.1514    0.11822   1.281 2.030e-01 -0.04469   0.3475 112

Multiple R-squared:  0.0138 ,	Adjusted R-squared:  0.005 
F-statistic:  1.64 on 1 and 112 DF,  p-value: 0.203

# RACE OTHER

GPA

In [102]:
set.seed(2024)
data.intake <- read.csv('race_other.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.150943	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.150943	test-error:1.000000 
[21]	train-error:0.113208	test-error:0.000000 
[31]	train-error:0.094340	test-error:0.000000 
[41]	train-error:0.037736	test-error:0.000000 
[51]	train-error:0.018868	test-error:0.000000 
[61]	train-error:0.018868	test-error:0.000000 
[71]	train-error:0.018868	test-error:0.000000 
[81]	train-error:0.000000	test-error:1.000000 
[91]	train-error:0.000000	test-error:1.000000 
[101]	train-error:0.000000	test-error:1.000000 
Stopping. Best iteration:
[7]	train-error:0.132075	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)    0.82820    0.09776  8.4715 2.307e-11   0.6645   0.9919 52
gpa_thresh3_0  0.05455    0.25288  0.2157 8.301e-01  -0.3690   0.4780 52

Multiple R-squared:  0.001247 ,	Adjusted R-squared:  -0.01796 
F-statistic: 0.04652 on 1 and 52 DF,  p-value: 0.8301

PTSD

In [103]:
set.seed(2024)
data.intake <- read.csv('race_other.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(ptsd_score = ifelse(ptsd_score >= 33, 1, 0))

col_names <- colnames(data.intake)
if ("ptsd_score" %in% col_names){
  new_col_order <- c("ptsd_score", col_names[col_names != "ptsd_score"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$ptsd_score==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ ptsd_score, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.094340	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.018868	test-error:1.000000 
[21]	train-error:0.018868	test-error:1.000000 
[31]	train-error:0.000000	test-error:1.000000 
[41]	train-error:0.000000	test-error:1.000000 
[51]	train-error:0.000000	test-error:1.000000 
[61]	train-error:0.000000	test-error:1.000000 
[71]	train-error:0.000000	test-error:1.000000 
[81]	train-error:0.000000	test-error:1.000000 
[91]	train-error:0.000000	test-error:1.000000 
[101]	train-error:0.000000	test-error:1.000000 
Stopping. Best iteration:
[1]	train-error:0.094340	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ ptsd_score, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   0.5190     0.0755   6.874 7.874e-09   0.3925   0.6454 52
ptsd_score    0.9432     0.1372   6.875 7.842e-09   0.7134   1.1729 52

Multiple R-squared:  0.4914 ,	Adjusted R-squared:  0.4816 
F-statistic: 47.26 on 1 and 52 DF,  p-value: 7.842e-09

Age

In [104]:
set.seed(2024)
data.intake <- read.csv('race_other.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(age = ifelse(age < 21, 1, 0))

col_names <- colnames(data.intake)
if ("age" %in% col_names){
  new_col_order <- c("age", col_names[col_names != "age"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$age==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ age, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.150943	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.037736	test-error:0.000000 
[21]	train-error:0.000000	test-error:0.000000 
[31]	train-error:0.000000	test-error:0.000000 
[41]	train-error:0.000000	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.150943	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ age, data = data.intake, weights = ipw, 
    alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept) 0.828656     0.1575 5.26259 2.731e-06   0.5650   1.0924 52
age         0.007848     0.1871 0.04194 9.667e-01  -0.3056   0.3212 52

Multiple R-squared:  3.707e-05 ,	Adjusted R-squared:  -0.01919 
F-statistic: 0.001759 on 1 and 52 DF,  p-value: 0.9667

edu_i.count

In [105]:
set.seed(2024)
data.intake <- read.csv('race_other.csv')
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL
data.intake <- data.intake %>%
  mutate(edu.i_count = ifelse(edu.i_count <= 2, 1, 0))

col_names <- colnames(data.intake)
if ("edu.i_count" %in% col_names){
  new_col_order <- c("edu.i_count", col_names[col_names != "edu.i_count"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$edu.i_count==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ edu.i_count, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.094340	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.056604	test-error:0.000000 
[21]	train-error:0.018868	test-error:0.000000 
[31]	train-error:0.018868	test-error:0.000000 
[41]	train-error:0.000000	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.094340	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ edu.i_count, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)   0.7442     0.1123  6.6247 1.963e-08   0.5561   0.9324 52
edu.i_count   0.1349     0.1672  0.8065 4.236e-01  -0.1452   0.4149 52

Multiple R-squared:  0.0108 ,	Adjusted R-squared:  -0.008221 
F-statistic: 0.6505 on 1 and 52 DF,  p-value: 0.4236

#TESTING GPA WITH GREATER THAN 3.0 = 1 INSTEAD OF 0

In [3]:
install.packages('dplyr')
install.packages('estimatr')
install.packages('xgboost')

library(dplyr)
library(estimatr)
library(xgboost)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘Formula’, ‘Rcpp’, ‘RcppEigen’


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘xgboost’


The following object is masked from ‘package:dplyr’:

    slice




In [7]:
set.seed(2024)
data.intake <- read.csv('numeric_GPA3_0_first.csv')
data.intake$gpa_thresh3_0 = 1 - data.intake$gpa_thresh3_0
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.133562	test-error:0.333333 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.061644	test-error:0.333333 
[21]	train-error:0.023973	test-error:0.333333 
[31]	train-error:0.013699	test-error:0.333333 
[41]	train-error:0.003425	test-error:0.333333 
[51]	train-error:0.000000	test-error:0.333333 
[61]	train-error:0.000000	test-error:0.333333 
[71]	train-error:0.000000	test-error:0.333333 
[81]	train-error:0.000000	test-error:0.333333 
[91]	train-error:0.000000	test-error:0.333333 
[101]	train-error:0.000000	test-error:0.333333 
Stopping. Best iteration:
[1]	train-error:0.133562	test-error:0.333333




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)     0.6769    0.06827   9.915 3.589e-20 0.564276   0.7896 293
gpa_thresh3_0   0.1377    0.07960   1.729 8.481e-02 0.006305   0.2690 293

Multiple R-squared:  0.009545 ,	Adjusted R-squared:  0.006165 
F-statistic:  2.99 on 1 and 293 DF,  p-value: 0.08481

In [8]:
set.seed(2024)
data.intake <- read.csv('ages18_20.csv')
data.intake$gpa_thresh3_0 = 1 - data.intake$gpa_thresh3_0
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.072727	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.090909	test-error:0.000000 
[21]	train-error:0.072727	test-error:0.000000 
[31]	train-error:0.072727	test-error:0.000000 
[41]	train-error:0.054545	test-error:0.000000 
[51]	train-error:0.036364	test-error:0.000000 
[61]	train-error:0.036364	test-error:0.000000 
[71]	train-error:0.036364	test-error:0.000000 
[81]	train-error:0.018182	test-error:0.000000 
[91]	train-error:0.018182	test-error:0.000000 
[101]	train-error:0.018182	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.072727	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
(Intercept)     0.6166     0.2303   2.678 0.009804   0.2312    1.002 54
gpa_thresh3_0   0.2728     0.2493   1.095 0.278583  -0.1443    0.690 54

Multiple R-squared:  0.02219 ,	Adjusted R-squared:  0.004079 
F-statistic: 1.198 on 1 and 54 DF,  p-value: 0.2786

In [9]:
set.seed(2024)
data.intake <- read.csv('ages21_24.csv')
data.intake$gpa_thresh3_0 = 1 - data.intake$gpa_thresh3_0
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.208791	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.043956	test-error:0.000000 
[21]	train-error:0.032967	test-error:0.000000 
[31]	train-error:0.000000	test-error:0.000000 
[41]	train-error:0.000000	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[3]	train-error:0.120879	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)     0.5771    0.09595   6.014 3.820e-08   0.4176   0.7365 90
gpa_thresh3_0   0.3751    0.12920   2.904 4.641e-03   0.1604   0.5899 90

Multiple R-squared:  0.07633 ,	Adjusted R-squared:  0.06607 
F-statistic:  8.43 on 1 and 90 DF,  p-value: 0.004641

In [18]:
set.seed(2024)
data.intake <- read.csv('ages25_63.csv')
data.intake$gpa_thresh3_0 = 1 - data.intake$gpa_thresh3_0
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.114286	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.085714	test-error:0.000000 
[21]	train-error:0.085714	test-error:0.000000 
[31]	train-error:0.085714	test-error:0.000000 
[41]	train-error:0.085714	test-error:0.000000 
[51]	train-error:0.085714	test-error:0.000000 
[61]	train-error:0.085714	test-error:0.000000 
[71]	train-error:0.028571	test-error:0.000000 
[81]	train-error:0.028571	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.114286	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)     1.0670     0.2305   4.630 0.0000516   0.6773   1.4567 34
gpa_thresh3_0  -0.3946     0.2450  -1.611 0.1164445  -0.8088   0.0196 34

Multiple R-squared:  0.1078 ,	Adjusted R-squared:  0.08153 
F-statistic: 2.595 on 1 and 34 DF,  p-value: 0.1164

In [19]:
set.seed(2024)
data.intake <- read.csv('asian.csv')
data.intake$gpa_thresh3_0 = 1 - data.intake$gpa_thresh3_0
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.196429	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.080357	test-error:1.000000 
[21]	train-error:0.026786	test-error:0.500000 
[31]	train-error:0.008929	test-error:0.500000 
[41]	train-error:0.000000	test-error:0.500000 
[51]	train-error:0.000000	test-error:0.500000 
[61]	train-error:0.000000	test-error:0.500000 
[71]	train-error:0.000000	test-error:0.500000 
[81]	train-error:0.000000	test-error:0.500000 
[91]	train-error:0.000000	test-error:0.500000 
[101]	train-error:0.000000	test-error:0.500000 
[111]	train-error:0.000000	test-error:0.500000 
Stopping. Best iteration:
[13]	train-error:0.044643	test-error:0.500000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)     0.7031    0.09256   7.596 9.931e-12  0.54957   0.8566 112
gpa_thresh3_0   0.2296    0.11797   1.946 5.418e-02  0.03389   0.4252 112

Multiple R-squared:  0.03446 ,	Adjusted R-squared:  0.02584 
F-statistic: 3.786 on 1 and 112 DF,  p-value: 0.05418

In [20]:
set.seed(2024)
data.intake <- read.csv('white.csv')
data.intake$gpa_thresh3_0 = 1 - data.intake$gpa_thresh3_0
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.136000	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.072000	test-error:0.000000 
[21]	train-error:0.072000	test-error:0.000000 
[31]	train-error:0.040000	test-error:0.000000 
[41]	train-error:0.016000	test-error:0.000000 
[51]	train-error:0.008000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.136000	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)     0.5584     0.1060   5.267 5.877e-07  0.38267   0.7341 125
gpa_thresh3_0   0.1679     0.1196   1.404 1.629e-01 -0.03032   0.3661 125

Multiple R-squared:  0.01252 ,	Adjusted R-squared:  0.004617 
F-statistic:  1.97 on 1 and 125 DF,  p-value: 0.1629

In [21]:
set.seed(2024)
data.intake <- read.csv('race_other.csv')
data.intake$gpa_thresh3_0 = 1 - data.intake$gpa_thresh3_0
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.169811	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.150943	test-error:1.000000 
[21]	train-error:0.113208	test-error:0.000000 
[31]	train-error:0.094340	test-error:0.000000 
[41]	train-error:0.037736	test-error:0.000000 
[51]	train-error:0.018868	test-error:0.000000 
[61]	train-error:0.018868	test-error:0.000000 
[71]	train-error:0.018868	test-error:0.000000 
[81]	train-error:0.000000	test-error:1.000000 
[91]	train-error:0.000000	test-error:1.000000 
[101]	train-error:0.000000	test-error:1.000000 
Stopping. Best iteration:
[7]	train-error:0.132075	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)    0.88274     0.2332  3.7850 0.0003994   0.4922    1.273 52
gpa_thresh3_0 -0.05455     0.2529 -0.2157 0.8300716  -0.4780    0.369 52

Multiple R-squared:  0.001247 ,	Adjusted R-squared:  -0.01796 
F-statistic: 0.04652 on 1 and 52 DF,  p-value: 0.8301

In [22]:
set.seed(2024)
data.intake <- read.csv('gender_female.csv')
data.intake$gpa_thresh3_0 = 1 - data.intake$gpa_thresh3_0
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.120536	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.040179	test-error:1.000000 
[21]	train-error:0.026786	test-error:1.000000 
[31]	train-error:0.008929	test-error:1.000000 
[41]	train-error:0.004464	test-error:1.000000 
[51]	train-error:0.004464	test-error:1.000000 
[61]	train-error:0.000000	test-error:1.000000 
[71]	train-error:0.000000	test-error:1.000000 
[81]	train-error:0.000000	test-error:1.000000 
[91]	train-error:0.000000	test-error:1.000000 
[101]	train-error:0.000000	test-error:1.000000 
Stopping. Best iteration:
[1]	train-error:0.120536	test-error:1.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)     0.6428    0.07627   8.428 4.206e-15  0.51682   0.7688 225
gpa_thresh3_0   0.1896    0.08903   2.129 3.433e-02  0.04251   0.3366 225

Multiple R-squared:  0.01854 ,	Adjusted R-squared:  0.01418 
F-statistic: 4.533 on 1 and 225 DF,  p-value: 0.03433

In [23]:
set.seed(2024)
data.intake <- read.csv('gender_male.csv')
data.intake$gpa_thresh3_0 = 1 - data.intake$gpa_thresh3_0
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.188679	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.094340	test-error:0.000000 
[21]	train-error:0.075472	test-error:0.000000 
[31]	train-error:0.037736	test-error:0.000000 
[41]	train-error:0.000000	test-error:0.000000 
[51]	train-error:0.000000	test-error:0.000000 
[61]	train-error:0.000000	test-error:0.000000 
[71]	train-error:0.000000	test-error:0.000000 
[81]	train-error:0.000000	test-error:0.000000 
[91]	train-error:0.000000	test-error:0.000000 
[101]	train-error:0.000000	test-error:0.000000 
Stopping. Best iteration:
[1]	train-error:0.188679	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
(Intercept)    0.84838     0.1963   4.322  0.00007   0.5197   1.1771 52
gpa_thresh3_0 -0.07611     0.2213  -0.344  0.73227  -0.4467   0.2945 52

Multiple R-squared:  0.002269 ,	Adjusted R-squared:  -0.01692 
F-statistic: 0.1183 on 1 and 52 DF,  p-value: 0.7323

In [24]:
set.seed(2024)
data.intake <- read.csv('highptsd_score.csv')
data.intake$gpa_thresh3_0 = 1 - data.intake$gpa_thresh3_0
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.147727	test-error:1.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.034091	test-error:0.000000 
[21]	train-error:0.045455	test-error:0.000000 
[31]	train-error:0.034091	test-error:0.000000 
[41]	train-error:0.022727	test-error:1.000000 
[51]	train-error:0.022727	test-error:1.000000 
[61]	train-error:0.011364	test-error:1.000000 
[71]	train-error:0.011364	test-error:1.000000 
[81]	train-error:0.011364	test-error:1.000000 
[91]	train-error:0.000000	test-error:1.000000 
[101]	train-error:0.000000	test-error:1.000000 
Stopping. Best iteration:
[2]	train-error:0.090909	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper DF
(Intercept)     1.2096     0.1051  11.510 3.589e-19  1.03486   1.3843 87
gpa_thresh3_0   0.2476     0.1234   2.008 4.780e-02  0.04255   0.4527 87

Multiple R-squared:  0.03716 ,	Adjusted R-squared:  0.0261 
F-statistic:  4.03 on 1 and 87 DF,  p-value: 0.0478

In [25]:
set.seed(2024)
data.intake <- read.csv('lowptsd_score.csv')
data.intake$gpa_thresh3_0 = 1 - data.intake$gpa_thresh3_0
rownames(data.intake) <- data.intake$record_id
data.intake$record_id <- NULL

col_names <- colnames(data.intake)
if ("gpa_thresh3_0" %in% col_names){
  new_col_order <- c("gpa_thresh3_0", col_names[col_names != "gpa_thresh3_0"])
  data.intake <- data.intake[, new_col_order]}

n <- nrow(data.intake)
train_rows <- sample(1:n, 0.99 * n, replace = F)
train.intake <- xgb.DMatrix(data.matrix(data.intake[train_rows, -1]),
                            label = data.intake[train_rows, 1])
test.intake <- xgb.DMatrix(data.matrix(data.intake[-train_rows, -1]),
                            label = data.intake[-train_rows, 1])

params <- list(
  objective = "binary:logistic",
  learning_rate = 0.05,
  subsample = 0.9,
  colsample_bynode = 1,
  max_depth = 10,
  scale_pos_weight = 1
)

fit_xgb <- xgb.train(
  params,
  data = train.intake,
  watchlist = list(train = train.intake, test = test.intake),
  eval.metric = "error",
  early_stopping_rounds = 100,
  print_every_n = 10,
  nrounds = 10000
)

pred <- predict(fit_xgb, train.intake)
dif <- dim(data.intake)[1] - length(pred)

for (x in 1:dif) {
  pred <- c(pred, mean(pred))
}

ipw <- ifelse(data.intake$gpa_thresh3_0==1, 1/pred, 1/(1-pred))
model <- lm_robust(mh_scale ~ gpa_thresh3_0, data=data.intake, weights=ipw, alpha=0.1)
summary(model)

[1]	train-error:0.123153	test-error:0.000000 
Multiple eval metrics are present. Will use test_error for early stopping.
Will train until test_error hasn't improved in 100 rounds.

[11]	train-error:0.064039	test-error:0.333333 
[21]	train-error:0.039409	test-error:0.333333 
[31]	train-error:0.014778	test-error:0.000000 
[41]	train-error:0.014778	test-error:0.333333 
[51]	train-error:0.009852	test-error:0.333333 
[61]	train-error:0.000000	test-error:0.333333 
[71]	train-error:0.000000	test-error:0.333333 
[81]	train-error:0.000000	test-error:0.333333 
[91]	train-error:0.000000	test-error:0.333333 
[101]	train-error:0.000000	test-error:0.333333 
Stopping. Best iteration:
[1]	train-error:0.123153	test-error:0.000000




Call:
lm_robust(formula = mh_scale ~ gpa_thresh3_0, data = data.intake, 
    weights = ipw, alpha = 0.1)

Weighted, Standard error type:  HC2 

Coefficients:
              Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper  DF
(Intercept)    0.48945    0.06771  7.2285 9.601e-12   0.3776   0.6013 204
gpa_thresh3_0  0.03063    0.07398  0.4141 6.793e-01  -0.0916   0.1529 204

Multiple R-squared:  0.001114 ,	Adjusted R-squared:  -0.003783 
F-statistic: 0.1715 on 1 and 204 DF,  p-value: 0.6793