In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import random
import pickle

# Seed every RNG the script touches so runs are repeatable.
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it here
# does not change hashing in the current process; it is only inherited by
# child processes started afterwards.
os.environ["PYTHONHASHSEED"] = str(seed)

# Define data path
DATA_PATH = "../HW3_RNN-lib/data"


def _load_train_pickle(name):
    """Unpickle ``train/<name>.pkl`` under DATA_PATH and close the file.

    The original code passed ``open(...)`` straight to ``pickle.load`` and
    never closed the handles; the ``with`` block releases each one promptly.
    Unpickling can execute arbitrary code, so only point DATA_PATH at
    trusted files.
    """
    with open(os.path.join(DATA_PATH, f"train/{name}.pkl"), "rb") as handle:
        return pickle.load(handle)


# Load data
pids = _load_train_pickle("pids")
vids = _load_train_pickle("vids")
hfs = _load_train_pickle("hfs")
seqs = _load_train_pickle("seqs")
types = _load_train_pickle("types")
rtypes = _load_train_pickle("rtypes")

# Width of the multi-hot visit vectors: one column per entry in `types`.
input_size = len(types)

def pad_and_encode_sequence(sequence, max_visits, max_seq_length):
    """Turn a list of visits into a fixed-size multi-hot matrix.

    Args:
        sequence: Sliceable sequence of visits; each visit is an iterable of
            integer code indices.
        max_visits: Number of rows in the result. Only the first
            ``max_visits`` visits are encoded; shorter sequences leave the
            trailing rows as zeros.
        max_seq_length: Number of columns in the result, i.e. the size of
            the code index space (despite the name, not a sequence length).

    Returns:
        A float ``numpy`` array of shape ``(max_visits, max_seq_length)``
        where entry ``[v, c]`` is 1 if code ``c`` occurs in visit ``v``.
    """
    encoded = np.zeros((max_visits, max_seq_length))
    kept_visits = sequence[:max_visits]
    for visit_idx, visit_codes in enumerate(kept_visits):
        row = encoded[visit_idx]  # view into `encoded`, so writes land in place
        for code in visit_codes:
            row[code] = 1
    return encoded

# Encode every patient for the 12-month observation window. The window is
# expressed as a visit count: each sequence is cut or zero-padded to 12 visits.
max_visits_12 = 12
seqs_12 = np.array(
    [pad_and_encode_sequence(patient, max_visits_12, input_size) for patient in seqs]
)

# Same encoding for the 18-month observation window (18 visits per patient).
max_visits_18 = 18
seqs_18 = np.array(
    [pad_and_encode_sequence(patient, max_visits_18, input_size) for patient in seqs]
)

def train_and_evaluate_models(train_seqs, test_seqs, train_hfs, test_hfs):
    """Train an RNN and four scikit-learn baselines; return their test ROC AUCs.

    The RNN consumes the inputs as ``(samples, time steps, features)``; the
    scikit-learn models see each sample flattened to a single feature vector.

    Args:
        train_seqs: 3-D NumPy array of training inputs.
        test_seqs: 3-D NumPy array of held-out inputs with the same feature
            width as ``train_seqs``.
        train_hfs: Binary labels aligned with ``train_seqs``.
        test_hfs: Binary labels aligned with ``test_seqs``.

    Returns:
        Tuple ``(rnn_auc, lr_auc, mlp_auc, svm_auc, knn_auc)`` of ROC AUC
        scores computed on the test split.
    """

    class RNNModel(nn.Module):
        """Single-layer ``nn.RNN`` with a linear head on the last time step."""

        def __init__(self, input_size, hidden_size, output_size):
            super().__init__()
            self.hidden_size = hidden_size
            self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            # Let nn.RNN create its default zero initial state. The previous
            # explicit torch.zeros(...) was always allocated on the CPU and
            # raised a device-mismatch error when the model ran on CUDA.
            out, _ = self.rnn(x)
            # NOTE(review): this reads the output at the final time step; if
            # inputs are zero-padded at the end, that step may be padding.
            return self.fc(out[:, -1, :])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    hidden_size = 128
    output_size = 1
    learning_rate = 0.001
    num_epochs = 100

    train_seqs_tensor = torch.tensor(train_seqs, dtype=torch.float).to(device)
    train_hfs_tensor = torch.tensor(train_hfs, dtype=torch.float).unsqueeze(1).to(device)
    test_seqs_tensor = torch.tensor(test_seqs, dtype=torch.float).to(device)

    # Take the feature width from the data itself so the function does not
    # depend on a module-level `input_size` matching the arrays passed in.
    feature_dim = train_seqs_tensor.shape[-1]

    rnn_model = RNNModel(feature_dim, hidden_size, output_size).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(rnn_model.parameters(), lr=learning_rate)

    # Full-batch training: one optimizer step per epoch over all samples.
    rnn_model.train()
    for _ in range(num_epochs):
        optimizer.zero_grad()
        predictions = rnn_model(train_seqs_tensor)
        loss = criterion(predictions, train_hfs_tensor)
        loss.backward()
        optimizer.step()

    # Flatten once; every scikit-learn model below reuses these 2-D views.
    train_flat = train_seqs.reshape(len(train_seqs), -1)
    test_flat = test_seqs.reshape(len(test_seqs), -1)

    # Order matters: it defines the order of the returned AUCs.
    baselines = (
        LogisticRegression(),
        MLPClassifier(hidden_layer_sizes=(128,), random_state=seed),
        SVC(probability=True, random_state=seed),
        KNeighborsClassifier(),
    )
    for model in baselines:
        model.fit(train_flat, train_hfs)

    # Score the RNN without tracking gradients, and move the result to the
    # CPU before converting: .numpy() fails on CUDA tensors.
    rnn_model.eval()
    with torch.no_grad():
        test_logits = rnn_model(test_seqs_tensor)
    rnn_scores = torch.sigmoid(test_logits).squeeze(1).cpu().numpy()
    rnn_auc = roc_auc_score(test_hfs, rnn_scores)

    # Column 1 of predict_proba is the probability of the positive class.
    lr_auc, mlp_auc, svm_auc, knn_auc = (
        roc_auc_score(test_hfs, model.predict_proba(test_flat)[:, 1])
        for model in baselines
    )

    return rnn_auc, lr_auc, mlp_auc, svm_auc, knn_auc

# Hold out 20% of the patients for testing. Both windows use the same
# random_state and the same labels, one split call per window.
split_options = {"test_size": 0.2, "random_state": seed}
train_seqs_12, test_seqs_12, train_hfs_12, test_hfs_12 = train_test_split(
    seqs_12, hfs, **split_options
)
train_seqs_18, test_seqs_18, train_hfs_18, test_hfs_18 = train_test_split(
    seqs_18, hfs, **split_options
)

# Display names, in the order train_and_evaluate_models returns its AUCs.
model_labels = ("RNN", "Logistic Regression", "MLP", "SVM", "KNN")

aucs_12 = train_and_evaluate_models(train_seqs_12, test_seqs_12, train_hfs_12, test_hfs_12)
print("Results for 12-month observation window:")
for model_label, model_auc in zip(model_labels, aucs_12):
    print(f"{model_label} AUC: {model_auc:.3f}")

aucs_18 = train_and_evaluate_models(train_seqs_18, test_seqs_18, train_hfs_18, test_hfs_18)
print("Results for 18-month observation window:")
for model_label, model_auc in zip(model_labels, aucs_18):
    print(f"{model_label} AUC: {model_auc:.3f}")

