In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem (this only works inside Colab).
drive.mount('/content/drive')
# Folder on the mounted drive.
# NOTE(review): in the cells visible here nothing reads from `data_path` and
# nothing defines `df`, which the next cell uses -- the data-loading cell
# appears to be missing; confirm before relying on Restart & Run All.
data_path = '/content/drive/MyDrive/Colab Notebooks/Data Mining/Data'

In [None]:
# Separate the feature matrix from the two target columns.
# The targets are selected by name (the same names used to drop them from X)
# rather than by position, so the split no longer depends on `y_sum` and
# `y_binary` happening to be the last two columns of `df`.
TARGET_COLUMNS = ['y_sum', 'y_binary']
X = df.drop(columns=TARGET_COLUMNS)
y_binary = df['y_binary']
y_sum = df['y_sum']

In [None]:
def get_binary_columns(df):
    """Return the names of the columns of ``df`` that hold exactly two distinct values.

    Distinct values are counted with ``Series.unique()``, so a missing value
    counts as a value of its own. Names are returned in column order.
    """
    return [name for name in df.columns if len(df[name].unique()) == 2]

# Two-valued columns are passed through unscaled later on; every other
# column is treated as continuous.
binary_columns = get_binary_columns(X)
cont_columns = X.columns[~X.columns.isin(binary_columns)].tolist()

In [None]:
# Put the binary columns first and the continuous ones after them, then build
# one frame per target with the target appended as the last column.
X = X.loc[:, binary_columns + cont_columns]
df_binary = pd.concat([X, y_binary], axis="columns")
df_sum = pd.concat([X, y_sum], axis="columns")

In [None]:
# Five shuffled folds stratified on the binary target; each entry of `split`
# is (fold number, (train indices, held-out indices)).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
split = [
    (fold_no, index_pair)
    for fold_no, index_pair in enumerate(skf.split(df_binary, df_binary['y_binary']))
]

In [None]:
# Keep only the held-out indices of every fold, keyed by fold number.
folds = {fold_no: held_out for fold_no, (_, held_out) in split}
folds

{0: array([    1,     5,     7, ..., 64414, 64415, 64417]),
 1: array([    0,     8,    21, ..., 64404, 64412, 64419]),
 2: array([    2,    11,    13, ..., 64405, 64409, 64416]),
 3: array([    3,     6,    22, ..., 64408, 64410, 64421]),
 4: array([    4,     9,    10, ..., 64403, 64418, 64420])}

In [None]:
# Folds 0-2 form the training set, fold 3 is validation and fold 4 is test.
train_idx = np.concatenate([folds[k] for k in (0, 1, 2)])
val_idx, test_idx = folds[3], folds[4]

In [None]:
# Number of rows in the train / validation / test partitions.
tuple(map(len, (train_idx, val_idx, test_idx)))

(38654, 12884, 12884)

In [None]:
# Slice the features and both targets by position with the same index arrays,
# so the rows of every partition stay aligned.
_partitions = (train_idx, val_idx, test_idx)

X_train, X_val, X_test = (X.iloc[rows] for rows in _partitions)

y_binary_train, y_binary_val, y_binary_test = (
    y_binary.iloc[rows] for rows in _partitions
)

y_sum_train, y_sum_val, y_sum_test = (
    y_sum.iloc[rows] for rows in _partitions
)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardise the continuous columns and pass the binary columns through.

    Parameters
    ----------
    bin_col : list
        Column labels that are copied to the output unchanged.
    cont_col : list
        Column labels that are standardised with a ``StandardScaler``.

    Notes
    -----
    ``transform`` returns a NumPy array (not a DataFrame) in which the binary
    columns come first, followed by the scaled continuous columns.
    """

    def __init__(self,bin_col,cont_col):
        self.scaler = StandardScaler()
        self.bin_col = bin_col
        self.cont_col = cont_col

    def fit(self, X, y=None):
        """Learn the scaling statistics from the continuous columns of ``X``.

        ``y`` is accepted and ignored so the transformer follows the
        scikit-learn ``fit(X, y=None)`` convention; without it,
        ``fit_transform(X, y)`` and ``Pipeline.fit(X, y)`` raise ``TypeError``.
        """
        self.scaler.fit(X[self.cont_col])
        return self

    def transform(self, X):
        """Return ``[binary columns | scaled continuous columns]`` as an array."""
        X_tail = self.scaler.transform(X[self.cont_col])
        return np.concatenate((X[self.bin_col],X_tail), axis=1)

In [None]:
# Fit the scaler on the training rows only, then apply the same fitted
# scaler to all three partitions.
# Note: the partition names are rebound from DataFrames to NumPy arrays here,
# so this cell cannot be re-run without first re-running the slicing cell.
scale = CustomScaler(bin_col=binary_columns, cont_col=cont_columns)
scale.fit(X_train)
X_train, X_val, X_test = (
    scale.transform(part) for part in (X_train, X_val, X_test)
)

In [None]:
from scipy.stats import pointbiserialr

# Point-biserial correlation between one indicator column and the binary
# target; the p-value returned alongside it is discarded.
corr, _ = pointbiserialr(df.loc[:, 'isolation_Always'], df.loc[:, 'y_binary'])
corr

0.10041265061078342

In [None]:
#@title MLP
import torch # torch

from torchvision import transforms # transform is used for data pre-processing
from torch.utils import data as Data
from torch.utils.data import Dataset # for dataset construction
from torch.utils.data import DataLoader

import torch.optim as optim

import numpy as np

import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

In [None]:
#@title MLP
# Feature matrices become float32 tensors.
XTrain, XVal, XTest = (
    torch.from_numpy(features).to(torch.float32)
    for features in (X_train, X_val, X_test)
)

# Targets keep whatever dtype their pandas Series carry.
yBinaryTrain, yBinaryVal, yBinaryTest = (
    torch.from_numpy(labels.values)
    for labels in (y_binary_train, y_binary_val, y_binary_test)
)

ySumTrain, ySumVal, ySumTest = (
    torch.from_numpy(labels.values)
    for labels in (y_sum_train, y_sum_val, y_sum_test)
)

In [None]:
#@title MLP
class MyDataset(Dataset):
  """Dataset that pairs a feature container with a label container.

  Item ``idx`` is the tuple ``(X[idx], y[idx])``; the length is ``len(X)``.
  """

  def __init__(self, X, y):
    self.X, self.y = X, y

  def __len__(self):
    # The feature container defines the dataset size.
    return len(self.X)

  def __getitem__(self, idx):
    features = self.X[idx]
    target = self.y[idx]
    return features, target

In [None]:
#@title MLP
# Wrap each partition (features + binary target) in a Dataset.
train_dataset = MyDataset(XTrain, yBinaryTrain)
val_dataset = MyDataset(XVal, yBinaryVal)
test_dataset = MyDataset(XTest, yBinaryTest)

BATCH_SIZE = 64

# Only the training data is shuffled. Evaluation loaders keep a fixed order:
# shuffling them does not help, and because metrics are averaged per batch it
# makes the reported validation loss vary slightly from run to run (the
# contents of the short final batch change).
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
#@title MLP
class SimpleMLP(nn.Module):
    """Five-layer fully connected network with a single output unit.

    Layer widths: in_features -> 256 -> 128 -> 64 -> 32 -> 1, with ReLU after
    every layer except the last (the output is returned without activation).

    Parameters
    ----------
    in_features : int, optional
        Number of input features. When omitted, the width of the notebook's
        global ``XTrain`` tensor is used, which keeps ``SimpleMLP()`` working
        exactly as before while letting the model be built without that
        global being defined.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the input layer from the
            # training tensor defined earlier in the notebook.
            in_features = XTrain.shape[1]
        self.fc1 = nn.Linear(in_features, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, 1)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to a ``(batch, 1)`` tensor."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        return self.fc5(x)

In [None]:
#@title MLP
# Objective: mean squared error between the network's single output and the
# target. (A class-weighted nn.CrossEntropyLoss was tried earlier and dropped.)
loss = nn.MSELoss()

model = SimpleMLP()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
#@title MLP
# Train for a fixed number of epochs and report train/validation accuracy.
#
# Fixes relative to the previous version of this cell:
#   * The network emits one value per row with shape (batch, 1); argmax over
#     that size-1 axis is always 0, so every prediction used to be class 0.
#     Predictions are now obtained by thresholding the output.
#   * The output is squeezed to (batch,) and the integer labels are cast to
#     float before the loss, so the loss compares each row with its own label
#     instead of broadcasting (batch, 1) against (batch,).
#   * The accuracy histories are created once, not reset every epoch.
#   * Validation runs under torch.no_grad().
#   * Loop variables no longer reuse the names X / y, which used to overwrite
#     the notebook's feature DataFrame `X`.
N_EPOCHS = 25
# The loss configured for this notebook is a squared error against 0/1
# labels, so 0.5 is the midpoint between the two target values.
THRESHOLD = 0.5

train_acc, val_acc = [], []
for epoch in range(N_EPOCHS):
  model.train()
  true_labs, pred_labs = [], []
  for xb, yb in train_loader:
    optimizer.zero_grad()
    outputs = model(xb).squeeze(1)

    loss_value = loss(outputs, yb.float())
    loss_value.backward()
    optimizer.step()

    y_pred = (outputs.detach() > THRESHOLD).long().cpu().numpy()
    y_true = yb.cpu().numpy()

    true_labs.extend(y_true)
    pred_labs.extend(y_pred)

  train_acc.append(accuracy_score(true_labs, pred_labs))
  print(f'Epoch: [{epoch+1}/{N_EPOCHS}]: training accuracy is {train_acc[-1]:.4f}')

  model.eval()
  true_labs, pred_labs = [], []
  with torch.no_grad():
    for xb, yb in val_loader:
      outputs = model(xb).squeeze(1)

      y_pred = (outputs > THRESHOLD).long().cpu().numpy()
      y_true = yb.cpu().numpy()
      true_labs.extend(y_true)
      pred_labs.extend(y_pred)

  val_acc.append(accuracy_score(true_labs, pred_labs))
  print(f'Epoch: [{epoch+1}/{N_EPOCHS}]: validation accuracy is {val_acc[-1]:.4f}')


In [None]:
#@title MLP
# Train for a fixed number of epochs and record the mean per-batch loss on the
# training and validation sets.
#
# Fixes relative to the previous version of this cell:
#   * The model output has shape (batch, 1) while the labels have shape
#     (batch,). Passing both to the loss broadcast them against each other,
#     so the reported loss was not the per-row error. The output is now
#     squeezed to (batch,) first.
#   * Validation runs under torch.no_grad().
#   * Loop variables no longer reuse the names X / y, which used to overwrite
#     the notebook's feature DataFrame `X`.
N_EPOCHS = 25

train_losses, val_losses = [], []
for epoch in range(N_EPOCHS):
  model.train()

  batch_loss = []
  for xb, yb in train_loader:
    optimizer.zero_grad()
    outputs = model(xb).squeeze(1)

    loss_value = loss(outputs, yb.float())
    loss_value.backward()
    optimizer.step()

    batch_loss.append(loss_value.item())

  train_losses.append(np.mean(batch_loss))
  print(f'Epoch: [{epoch+1}/{N_EPOCHS}]: training loss is {train_losses[-1]:.4f}')

  model.eval()
  batch_loss = []
  with torch.no_grad():
    for xb, yb in val_loader:
      outputs = model(xb).squeeze(1)
      batch_loss.append(loss(outputs, yb.float()).item())

  val_losses.append(np.mean(batch_loss))
  print(f'Epoch: [{epoch+1}/{N_EPOCHS}]: validation loss is {val_losses[-1]:.4f}')


In [None]:
from scipy.stats import pointbiserialr

# Point-biserial correlation of every binary feature with the binary target.
target_col = 'y_binary'
test = df[binary_columns + [target_col]]

correlations = {}
for col in test.columns:
    # Skip the target itself. The previous filter compared against 'Output',
    # a name that never occurs, so the target was correlated with itself and
    # showed up in the results with a coefficient of ~1.0.
    if col != target_col:
        corr, _ = pointbiserialr(test[col], test[target_col])
        correlations[col] = corr

print("Point-biserial correlation coefficients with respect to the output variable:")
print(correlations)

Point-biserial correlation coefficients with respect to the output variable:
{'SEXVAR': 0.0027894498801620355, 'EXERANY2': -0.10869831603557942, 'CVDINFR4': 0.07869198792734275, 'Florida': 0.018758690488365662, 'Idaho': 0.002054278443513995, 'Indiana': -0.002663846520399179, 'Maine': -0.010461701711950046, 'Nevada': 0.010980135925191495, 'Oregon': 0.007697308292232545, 'Rhode Island': 0.004419414932534988, 'South Carolina': 0.0023890075344300404, 'Utah': 0.006775737268180428, 'Vermont': -0.009447287811401363, 'Virginia': -0.010807795628769844, 'Wisconsin': -0.011334906279258976, 'Insur_CHIP': 0.0008228613441700797, 'Insur_Employer/Union': -0.08703139318722194, 'Insur_Gvmt': 0.004258007818376025, 'Insur_Indian': 0.004307208487760126, 'Insur_Medicaid': 0.06238468041047344, 'Insur_Medicare': 0.040596015915157116, 'Insur_Medigap': 0.005333850303486451, 'Insur_Military': 0.026203625321734474, 'Insur_None': 0.008764488792069968, 'Insur_Private Plan': -0.02301076567650768, 'Insur_State': 0.01

In [None]:
# Features whose correlation with the target exceeds +0.1. The threshold is
# one-sided: negative correlations are not listed, whatever their magnitude.
[(name, coef) for name, coef in correlations.items() if coef > 0.1]

[('_DRDXAR2', 0.11913869347332429),
 ('ADDEPEV3', 0.21083728520985387),
 ('DEAF', 0.11680459950197934),
 ('BLIND', 0.12749620517928487),
 ('DECIDE', 0.406855759523034),
 ('DIFFWALK', 0.20694250666589867),
 ('SDHBILLS', 0.1299956912653079),
 ('SDHTRNSP', 0.14001149210034108),
 ('Dissatisfied', 0.1426202434411109),
 ('Unable to work', 0.16300993892383603),
 ('emotional_support_Rarely', 0.11218089052671495),
 ('emotional_support_Sometimes', 0.10655993025162175),
 ('isolation_Always', 0.10041265061078342),
 ('isolation_Sometimes', 0.10695614355648322),
 ('y_binary', 0.9999999999999665)]

In [None]:
#@title MLP
# Collect true and predicted labels on the test set for the confusion matrix.
#
# The network emits one value per row with shape (batch, 1); argmax over that
# size-1 axis is always 0, so every row used to be predicted as class 0.
# Predictions are now obtained by thresholding the squeezed output at 0.5,
# the midpoint of the 0/1 targets the squared-error loss was trained against.
# Inference also runs in eval mode and without gradient tracking, and the loop
# variables no longer overwrite the notebook's feature DataFrame `X`.
model.eval()
true_labs, pred_labs = [], []
with torch.no_grad():
  for xb, yb in test_loader:
    outputs = model(xb).squeeze(1)
    y_pred = (outputs > 0.5).long().cpu().numpy()
    y_true = yb.cpu().numpy()
    true_labs.extend(y_true)
    pred_labs.extend(y_pred)



In [None]:
#@title MLP
import seaborn as sns
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap
# (rows are true labels, columns are predicted labels).
conf_matrix = confusion_matrix(true_labs, pred_labs)

sns.set(font_scale=1.2)  # Adjust font size if needed
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted 0', 'Predicted 1'],
    yticklabels=['Actual 0', 'Actual 1'],
    ax=ax,
)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()