In [1]:
import numpy as np
import os
import pandas as pd
import pyodbc
import time
import scipy.stats as stats
from datetime import datetime
from tqdm import tqdm
import sys
import gc
import pickle
from joblib import dump, load
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import *
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *

In [2]:
dataset = 'ccae_3yrs'
path = '/data2/processed_datasets/ak4885/psychosis_schizophrenia_prediction'
data_path = f'{path}/raw_data_{dataset}'
int_path = f'{path}/intermediate_data_{dataset}'
dataset_prefix = '9_26_ccae_2dx_fullhistory_'

model_path = 'models/'
logreg_path = 'ccae_logreg_psychosis_1yr'
xgb_path = 'ccae_xgb_psychosis_1yr'

In [3]:
with open(f'{int_path}/{dataset_prefix}du_nontemporalbaseline_atpsychosis_colnames', "rb") as fp:
    data_columns = pickle.load(fp)
data_df = pd.read_csv(f'{int_path}/{dataset_prefix}nontemporalbaseline_data_atpsychosis.csv')
df_split = pd.read_csv(f'{int_path}/tvt_split_2dx.csv')
df_split = df_split.loc[df_split['person_id'].isin(data_df['person_id'])]
df_pop = pd.read_csv(f'{data_path}/population_2dx.csv', parse_dates = ['psychosis_diagnosis_date', 'scz_diagnosis_date', 'cohort_start_date'])

In [4]:
# limit to people with at least 1 year of obs
count_visits = pd.read_csv(f'{int_path}/hcu_visit_counts.csv', parse_dates = ['first_visit'])
df_pop = df_pop.merge(count_visits[['person_id', 'first_visit']], how = 'left', on = 'person_id')
df_pop['time_until_psychosis'] = (df_pop['psychosis_diagnosis_date']-df_pop['first_visit']).dt.days
df_pop['time_until_psychosis']
print(len(df_pop), len(df_split))
df_pop = df_pop.loc[df_pop['time_until_psychosis']>=365]
df_split = df_split.loc[df_split['person_id'].isin(df_pop['person_id'])]
print(len(df_pop), len(df_split))

120741 113154
92733 92733


In [5]:
data_df.set_index('person_id', inplace=True)
df_pop.set_index('person_id', inplace=True)

In [6]:
data_scaler = StandardScaler()

train_pids = df_split.loc[df_split['split']=='train', 'person_id']
X_train = data_df.loc[train_pids, data_columns]
X_train = data_scaler.fit_transform(X_train)
y_train = df_pop.loc[train_pids, 'sz_flag']

val_pids = df_split.loc[df_split['split']=='val', 'person_id']
X_val = data_df.loc[val_pids, data_columns]
X_val = data_scaler.transform(X_val)
y_val = df_pop.loc[val_pids, 'sz_flag']

test_pids = df_split.loc[df_split['split']=='test', 'person_id']
X_test = data_df.loc[test_pids, data_columns]
X_test = data_scaler.transform(X_test)
y_test = df_pop.loc[test_pids, 'sz_flag']

In [7]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(64864, 1416)
(9261, 1416)
(18608, 1416)


In [8]:
# grid_search XGBoost
xgb_clf = XGBClassifier(seed=3)
xgb_params = {'max_depth': [3,4,5], 'n_estimators': [200,300]}

grid_xgboost = GridSearchCV(estimator = xgb_clf,
    param_grid = xgb_params,
    scoring = 'roc_auc',
    n_jobs = 3,
    cv = 5,
    verbose = 3)

grid_xgboost.fit(X_train, y_train)

with open(f'{model_path}/{xgb_path}/xgboost.pkl','wb') as f:
    pickle.dump(grid_xgboost.best_estimator_,f)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [9]:
val_probs = grid_xgboost.best_estimator_.predict_proba(X_val)[:,1]
print(len(val_probs))
val_output = pd.DataFrame(val_pids, columns = ['person_id'])
val_output['y_pred'] = val_probs
val_output = val_output.merge(df_pop['sz_flag'], how = 'inner', left_on = 'person_id', right_index=True)
val_output.rename(columns={'sz_flag': 'y_true'}, inplace=True)
val_output.to_csv(f'{model_path}/{xgb_path}/val_outputs.csv')
print(len(val_output))

test_probs = grid_xgboost.best_estimator_.predict_proba(X_test)[:,1]
print(len(test_probs))
test_output = pd.DataFrame(test_pids, columns = ['person_id'])
test_output['y_pred'] = test_probs
test_output = test_output.merge(df_pop['sz_flag'], how = 'inner', left_on = 'person_id', right_index=True)
test_output.rename(columns={'sz_flag': 'y_true'}, inplace=True)
test_output.to_csv(f'{model_path}/{xgb_path}/test_outputs.csv')
print(len(test_output))

9261
9261
18608
18608


In [10]:
print(roc_auc_score(val_output['y_true'], val_output['y_pred']))
print(roc_auc_score(test_output['y_true'], test_output['y_pred']))

0.6572545577312625
0.6290221415660023


In [11]:
# grid_search Logistic Regression
lr_clf = LogisticRegression()
lr_params = {'penalty': ['l1','l2'], 'C': [0.01, 0.1, 1, 10, 100]}

grid_lr = GridSearchCV(estimator = lr_clf,
    param_grid = lr_params,
    scoring = 'roc_auc',
    n_jobs = 3,
    cv = 5,
    verbose = 3)

grid_lr.fit(X_train, y_train)
with open(f'{model_path}/{logreg_path}/logreg_model.pkl','wb') as f:
    pickle.dump(grid_lr.best_estimator_,f)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [12]:
val_probs = grid_lr.best_estimator_.predict_proba(X_val)[:,1]
print(len(val_probs))
val_output = pd.DataFrame(val_pids, columns = ['person_id'])
val_output['y_pred'] = val_probs
val_output = val_output.merge(df_pop['sz_flag'], how = 'inner', left_on = 'person_id', right_index=True)
val_output.rename(columns={'sz_flag': 'y_true'}, inplace=True)
val_output.to_csv(f'{model_path}/{logreg_path}/val_outputs.csv')
print(len(val_output))

test_probs = grid_lr.best_estimator_.predict_proba(X_test)[:,1]
print(len(test_probs))
test_output = pd.DataFrame(test_pids, columns = ['person_id'])
test_output['y_pred'] = test_probs
test_output = test_output.merge(df_pop['sz_flag'], how = 'inner', left_on = 'person_id', right_index=True)
test_output.rename(columns={'sz_flag': 'y_true'}, inplace=True)
test_output.to_csv(f'{model_path}/{logreg_path}/test_outputs.csv')
print(len(test_output))

9261
9261
18608
18608


In [13]:
print(roc_auc_score(val_output['y_true'], val_output['y_pred']))
print(roc_auc_score(test_output['y_true'], test_output['y_pred']))

0.6029388551279166
0.6058296521615014
