# SDOH Prediction Pipeline 

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd 

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import transformers
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

import os
from tqdm import trange
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support


In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Load data

In [4]:
# Location of the pre-processed SDOH spreadsheet.
va_path = "/sdoh_dataset_processed.xlsx"
# Read the first worksheet of the workbook into a DataFrame.
va_df = pd.read_excel(io=va_path, sheet_name=0)

In [5]:
# How many rows fall under each education-level code before any filtering.
va_df["edu_level_composite"].value_counts()

 1    3247
 2    1756
 3    1552
 4     958
-1      67
Name: edu_level_composite, dtype: int64

In [6]:
# Restrict the frame to rows usable for modelling:
#   1. the education level is not the -1 code,
#   2. the open_response text is present (not NaN),
#   3. the first remaining row is discarded.
has_known_edu = va_df["edu_level_composite"] != -1
has_response = va_df["open_response"].notna()

# NOTE(review): .iloc[1:] is positional and applied after the filters, and because
# va_df is reassigned from itself, re-running this cell drops one more row each time.
# Confirm which row is meant to be removed and run this cell once from a fresh load.
va_df = va_df[has_known_edu & has_response].iloc[1:]

In [7]:
# Coerce the education-level column to a numeric dtype so it can serve as the target.
va_df["edu_level_composite"] = pd.to_numeric(va_df["edu_level_composite"])

# Model inputs (free-text responses) and targets (education level), row-aligned.
texts, labels = va_df["open_response"], va_df["edu_level_composite"]

In [8]:
# Split the data into train, development and test portions in two stages,
# both seeded with random_state=2.

# Stage 1: hold out 10% of all examples as the test set.
(rest_texts, test_texts,
 rest_labels, test_labels) = train_test_split(
    texts, labels, test_size=0.1, random_state=2
)

# Stage 2: hold out 10% of the remainder as the development set;
# what is left is the training set.
(train_texts, dev_texts,
 train_labels, dev_labels) = train_test_split(
    rest_texts, rest_labels, test_size=0.1, random_state=2
)

In [9]:
# Report the number of examples in each split, one line per split.
print(
    f"Train size: {len(train_texts)}",
    f"Dev size: {len(dev_texts)}",
    f"Test size: {len(test_texts)}",
    sep="\n",
)

Train size: 5535
Dev size: 615
Test size: 684


In [10]:
# Map each distinct education-level label to a contiguous 0-based class index.
# The labels are sorted explicitly: a set has no guaranteed iteration order, so
# building the mapping straight from set(labels) could assign different indices
# on a different run, interpreter or label set.
target_names = sorted(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}
print(label2idx)

{1: 0, 2: 1, 3: 2, 4: 3}


In [11]:
# Class balance of the training split: number of examples per education-level label.
train_labels.value_counts()

1    2440
2    1241
3    1153
4     701
Name: edu_level_composite, dtype: int64

## Baseline

As a baseline model we use logistic regression on TF-IDF bag-of-words features, tuning the regularization parameter `C` with 5-fold cross-validation.

In [12]:
# Baseline model: token counts -> TF-IDF weighting -> one-vs-rest logistic regression.
pipeline = Pipeline(steps=[
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("lr", LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=100000)),
])

# Candidate values of C (inverse regularization strength) for the "lr" step.
parameters = {"lr__C": [0.01, 0.1, 0.5, 1, 2, 5, 10, 100, 1000]}

# 5-fold cross-validated grid search, fitted on the training split only.
best_classifier = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=5, verbose=1)
best_classifier.fit(train_texts, train_labels)

# Score the selected model on the held-out test split.
best_predictions = best_classifier.predict(test_texts)

# Accuracy = fraction of test examples whose predicted label equals the true label.
baseline_accuracy = (best_predictions == test_labels).mean()
print("Baseline accuracy:", baseline_accuracy)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  1.1min finished


Baseline accuracy: 0.4926900584795322


In [13]:
# Per-class precision, recall and F1 of the baseline on the test split.
baseline_report = classification_report(y_true=test_labels, y_pred=best_predictions)
print(baseline_report)

              precision    recall  f1-score   support

           1       0.53      0.84      0.65       319
           2       0.41      0.37      0.39       151
           3       0.32      0.09      0.14       132
           4       0.11      0.01      0.02        82

    accuracy                           0.49       684
   macro avg       0.34      0.33      0.30       684
weighted avg       0.42      0.49      0.42       684

