## Imports

In [None]:
# Mount Google Drive at /content/drive so the dataset CSV read later in the
# notebook is reachable (only works inside Google Colab).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import copy
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.tree import DecisionTreeRegressor

## Decision Tree

In [None]:
class TreeNode:
    """A single node of the fitted tree, holding the training rows that reached it.

    Attributes:
        best_split: for an internal node the ``(feature_idx, threshold)`` pair
            used to route rows; for a leaf the predicted class.
        X, y: the training samples and labels that reached this node.
        left_node, right_node: child nodes (``None`` when absent).
        left_prob, right_prob: fraction of this node's samples sent to each child.
        isLeaf: ``True`` when the node is terminal.
    """

    def __init__(self, best_split, X, y, left_node=None, right_node=None,
                 left_prob=0, right_prob=0, isLeaf=False):
        # Data that reached the node.
        self.X, self.y = X, y

        # Routing rule (or predicted class for a leaf) and node kind.
        self.best_split = best_split
        self.isLeaf = isLeaf

        # Children and the share of samples each one received.
        self.left_node, self.right_node = left_node, right_node
        self.left_prob, self.right_prob = left_prob, right_prob


class DecisionTree:
    """Gini-based binary classification tree that can predict with missing features.

    ``fit`` grows the tree from a DataFrame and estimates a per-feature
    importance with random walks from the root.  ``predict`` accepts a
    DataFrame that lacks some of the training columns: whenever a split needs
    an absent feature, the branch is chosen by a vote among (up to three)
    available features that correlate most with the missing feature and with
    the labels at that node.

    Labels must be non-negative integers because class counts are computed
    with ``np.bincount``.
    """

    def __init__(self, max_depth=None, randomWalkIter=100):
        """Store the hyper-parameters.

        Args:
            max_depth: maximum tree depth; ``None`` grows until nodes are pure
                or cannot be split.
            randomWalkIter: number of random walks used to estimate feature
                importance after fitting.
        """
        self.max_depth = max_depth
        self.randomWalkIter = randomWalkIter

    def calculate_feature_importance(self):
        """Fill ``self.importance_dict`` with the average per-walk visit share of each feature."""
        n = self.randomWalkIter
        self.importance_dict = dict.fromkeys(self.train_features, 0)
        if n <= 0:
            # No walks requested: keep all importances at zero instead of
            # dividing by zero.
            return

        for _ in range(n):
            self.run_random_walk()

        for name in self.importance_dict:
            self.importance_dict[name] /= n

    def run_random_walk(self):
        """Walk once from the root to a leaf and accumulate feature visit shares.

        At each internal node the child is drawn with probability equal to the
        fraction of training samples that went to it.  Every feature's share of
        the visited splits is added to ``self.importance_dict``.
        """
        visits = dict.fromkeys(self.importance_dict, 0)
        steps = 0
        # The walk only reads the tree, so no (expensive) deep copy is needed.
        node = self.treeNode
        while not node.isLeaf:
            visits[self.train_features[node.best_split[0]]] += 1
            steps += 1
            left, right = node.left_node, node.right_node
            if left is not None and right is not None:
                node = random.choices(
                    [left, right], weights=[node.left_prob, node.right_prob]
                )[0]
            elif left is None and right is None:
                break
            else:
                node = left if left is not None else right

        if steps == 0:
            # Root is a leaf: no split was visited, nothing to accumulate.
            return
        for name in self.importance_dict:
            self.importance_dict[name] += visits[name] / steps

    def get_max_corr(self, X_train, y_train, feat):
        """Rank the available features by |corr with ``feat``| + |corr with the labels|.

        Args:
            X_train: DataFrame of the samples at a node (all training columns).
            y_train: labels of those samples, index-aligned with ``X_train``.
            feat: name of the feature that is missing at prediction time.

        Returns:
            List of ``(feature_name, score)`` sorted by descending score.  Only
            features present in ``self.prediction_feature_list`` are ranked.
            Undefined correlations (e.g. a constant column) count as 0 so the
            ordering is always well defined.
        """
        keep = [
            col for col in X_train.columns
            if col in self.prediction_feature_list or col == feat
        ]
        frame = X_train[keep].copy()

        # Pick a column name for the labels that cannot clash with a feature.
        target_col = "label"
        while target_col in frame.columns:
            target_col += "_"
        frame[target_col] = y_train

        # Pairwise correlations do not depend on the other columns, so one
        # matrix yields both the feature and the label correlations.
        corr = frame.corr().abs()
        feat_corr = corr[feat].drop([feat, target_col]).fillna(0)
        label_corr = corr[target_col].drop([feat, target_col]).fillna(0)
        scores = feat_corr + label_corr

        return sorted(scores.items(), key=lambda item: item[1], reverse=True)

    def fit(self, X, y):
        """Grow the tree on ``X``/``y`` and compute feature importances.

        Args:
            X: DataFrame of numeric features.
            y: Series of non-negative integer class labels.

        Raises:
            ValueError: if ``X`` has no rows.
        """
        if len(X) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset.")
        self.train_features = list(X.columns)
        self.X_train = X.copy()
        self.y_train = y.copy()
        self.tree, self.treeNode = self._grow_tree(X.to_numpy(), y.to_numpy())
        self.calculate_feature_importance()

    def get_best_split(self, feature, X, y=None):
        """Return the best ``(feature_idx, threshold)`` split on a single feature.

        Args:
            feature: name of the feature (must be in ``self.train_features``).
            X: 2-D NumPy array of samples, columns ordered as the training data.
            y: labels of the rows in ``X``.  When omitted, the labels stored by
                ``fit`` are used, which is only valid if ``X`` is the full
                training matrix.

        Returns:
            ``(feature_idx, threshold)`` or ``None`` when the feature cannot
            separate the rows (e.g. it is constant).

        Raises:
            ValueError: if ``y`` is omitted and no matching labels are stored.
        """
        if y is None:
            y = getattr(self, "y_train", None)
            if y is None or len(y) != len(X):
                raise ValueError(
                    "get_best_split needs the labels `y` of the rows in `X`."
                )
        y = np.asarray(y)
        feature_idx = self.train_features.index(feature)
        _, threshold, _, _ = self._best_split_on_feature(X, y, feature_idx)
        if threshold is None:
            return None
        return (feature_idx, threshold)

    def _best_split_on_feature(self, X, y, feature_idx):
        """Scan all thresholds of one column and return the lowest-Gini split.

        Returns:
            ``(gini, threshold, left_indices, right_indices)``; when no
            threshold separates the rows the result is
            ``(np.inf, None, None, None)``.  Ties keep the smallest threshold.
        """
        column = X[:, feature_idx]
        best = (np.inf, None, None, None)
        for threshold in np.unique(column):
            left_indices = np.flatnonzero(column <= threshold)
            right_indices = np.flatnonzero(column > threshold)
            if len(left_indices) == 0 or len(right_indices) == 0:
                continue

            gini = self._gini_impurity(y[left_indices], y[right_indices])
            if gini < best[0]:
                best = (gini, threshold, left_indices, right_indices)
        return best

    def _make_leaf(self, X, y):
        """Build a leaf predicting the majority class of ``y``."""
        final_class = int(np.bincount(y).argmax())
        return final_class, TreeNode(final_class, X=X, y=y, isLeaf=True)

    def _grow_tree(self, X, y, depth=0):
        """Recursively grow the tree.

        Returns:
            ``(tree, node)`` where ``tree`` is either an ``int`` class (leaf)
            or a nested tuple ``((feature_idx, threshold), left, right)``, and
            ``node`` is the matching ``TreeNode``.
        """
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(np.unique(y)) == 1:
            return self._make_leaf(X, y)

        best_gini = np.inf
        best_split = None
        best_left_indices = None
        best_right_indices = None
        # Strict '<' keeps the first feature (and lowest threshold) on ties.
        for feature_idx in range(X.shape[1]):
            gini, threshold, left_indices, right_indices = (
                self._best_split_on_feature(X, y, feature_idx)
            )
            if gini < best_gini:
                best_gini = gini
                best_split = (feature_idx, threshold)
                best_left_indices = left_indices
                best_right_indices = right_indices

        if best_split is None:
            # No feature can separate the rows (all columns constant).
            return self._make_leaf(X, y)

        left_subtree, left_node = self._grow_tree(
            X[best_left_indices], y[best_left_indices], depth + 1
        )
        right_subtree, right_node = self._grow_tree(
            X[best_right_indices], y[best_right_indices], depth + 1
        )

        curr_node = TreeNode(
            best_split,
            X=X,
            y=y,
            left_node=left_node,
            right_node=right_node,
            left_prob=len(best_left_indices) / len(X),
            right_prob=len(best_right_indices) / len(X),
        )
        return (best_split, left_subtree, right_subtree), curr_node

    def _gini_impurity(self, left_y, right_y):
        """Return the size-weighted Gini impurity of a two-way split.

        An empty side contributes zero impurity instead of dividing by zero.
        """
        n_left, n_right = len(left_y), len(right_y)
        total = n_left + n_right
        if total == 0:
            return 0.0

        def side_gini(labels):
            if len(labels) == 0:
                return 0.0
            proportions = np.bincount(labels) / len(labels)
            return 1.0 - float(np.sum(proportions ** 2))

        return (n_left / total) * side_gini(left_y) + (n_right / total) * side_gini(right_y)

    def get_inp_count(self, inp, X_train):
        """Count training rows lying within 1/20 of each column's range of ``inp``.

        Args:
            inp: single-row DataFrame; its columns must be present in
                ``X_train`` and in ``self.prediction_feature_list``.
            X_train: DataFrame of training rows.

        Returns:
            Number of rows whose distance to ``inp`` is at most
            ``(max - min) / 20`` in every column of ``inp``.
        """
        cols = list(inp.columns)
        available = [
            col for col in X_train.columns if col in self.prediction_feature_list
        ]
        data = X_train[available][cols]
        if len(data) == 0:
            return 0

        # Column ranges are loop-invariant, so compute them once.
        tolerance = ((data.max() - data.min()) / 20).to_numpy()
        query = inp[cols].iloc[0].to_numpy()
        diffs = np.abs(data.to_numpy() - query)
        rejected = (diffs > tolerance).any(axis=1)
        return int((~rejected).sum())

    def get_counts(self, inp, left_data, right_data, correlated_features, tolerance=0.05):
        """Count, on each side of a split, the training rows that resemble ``inp``.

        A row resembles ``inp`` when, in every retained column, its absolute
        difference to ``inp`` divided by that side's column range is at most
        ``tolerance``.  Only columns in ``self.prediction_feature_list`` are
        retained.

        Args:
            inp: 1-D array of query values, ordered like the retained columns.
            left_data, right_data: DataFrames holding the two partitions.
            correlated_features: unused; kept for backward compatibility.
            tolerance: maximum range-normalised distance per column.

        Returns:
            ``(left_count + 1, right_count + 1)``; the +1 keeps both counts
            strictly positive.
        """
        keep = [
            col for col in left_data.columns if col in self.prediction_feature_list
        ]

        def count_close(frame):
            frame = frame[keep]
            span = frame.max().to_numpy() - frame.min().to_numpy() + 1e-10
            scaled = np.abs(frame - inp) / span
            return int((scaled <= tolerance).all(axis=1).sum())

        # Each side is compared against its own rows and its own ranges.
        return count_close(left_data) + 1, count_close(right_data) + 1

    def predict(self, X):
        """Predict a class for every row of ``X``.

        ``X`` may omit columns that were present during ``fit``; columns are
        matched by name.
        """
        self.prediction_feature_list = list(X.columns)
        inp_cols = self.prediction_feature_list
        return np.array([
            self._predict_single(x, self.tree, self.treeNode, inp_cols)[0]
            for x in tqdm(X.to_numpy())
        ])

    def _predict_single(self, x, tree, treeNode, inp_cols):
        """Route one sample down the tree.

        Args:
            x: 1-D array of feature values ordered like ``inp_cols``.
            tree: nested tuple / int representation produced by ``_grow_tree``.
            treeNode: the ``TreeNode`` matching ``tree``.
            inp_cols: column names of ``x``; expected to equal
                ``self.prediction_feature_list``.

        Returns:
            ``(predicted_class, leaf_node)``.
        """
        while not (isinstance(tree, int) or treeNode.isLeaf):
            feature_idx, threshold = tree[0]
            best_feature_name = self.train_features[feature_idx]

            if best_feature_name in self.prediction_feature_list:
                go_left = x[inp_cols.index(best_feature_name)] <= threshold
            else:
                go_left = self._missing_feature_goes_left(
                    x, treeNode, best_feature_name, inp_cols
                )

            if go_left:
                tree, treeNode = tree[1], treeNode.left_node
            else:
                tree, treeNode = tree[2], treeNode.right_node
        return tree, treeNode

    def _missing_feature_goes_left(self, x, treeNode, missing_feature, inp_cols):
        """Decide the branch at ``treeNode`` when its split feature is absent from ``x``.

        The (up to) three available features ranked highest by
        ``get_max_corr`` each re-split the node's training rows; every side is
        scored by how many of its rows resemble ``x``, weighted by the
        feature's importance and by the node's original branch probability.
        """
        X_t = pd.DataFrame(treeNode.X, columns=self.train_features)
        y_t = pd.Series(treeNode.y)
        correlated_features = self.get_max_corr(X_t, y_t, missing_feature)
        position = {name: idx for idx, name in enumerate(inp_cols)}

        left_weighted_sum = 0
        right_weighted_sum = 0
        normalization_factor = 0
        for feat, _ in correlated_features[:3]:
            # The split must be evaluated against this node's own labels.
            split = self.get_best_split(feat, treeNode.X, treeNode.y)
            if split is None:
                continue
            _, feat_threshold = split
            left_data = X_t[X_t[feat] <= feat_threshold].drop(feat, axis=1)
            right_data = X_t[X_t[feat] > feat_threshold].drop(feat, axis=1)
            # Build the query by column name so it lines up with the training
            # columns even when the prediction frame orders them differently.
            query = np.array(
                [x[position[col]] for col in left_data.columns if col in position]
            )
            left_count, right_count = self.get_counts(
                query, left_data, right_data, correlated_features
            )
            weight = self.importance_dict[feat]
            left_weighted_sum += weight * left_count * treeNode.left_prob
            right_weighted_sum += weight * right_count * treeNode.right_prob
            normalization_factor += weight

        if normalization_factor == 0:
            # No informative feature voted: follow the branch that received
            # more training samples rather than always going left.
            return treeNode.left_prob >= treeNode.right_prob

        left_dec = left_weighted_sum / (normalization_factor + 1)
        right_dec = right_weighted_sum / (normalization_factor + 1)
        return left_dec >= right_dec




In [None]:
def run_smaller_model(X_train, X_test, y_train, y_test):
    """Fit a fresh depth-5 ``DecisionTree`` on the given columns and print its report.

    Used as a baseline: the tree is trained only on the features that are
    passed in, then evaluated on ``X_test`` against ``y_test``.
    """
    reduced_tree = DecisionTree(max_depth=5)
    reduced_tree.fit(X_train, y_train)
    report = classification_report(y_test, reduced_tree.predict(X_test), digits=4)
    print(report)

In [None]:
def run_imputation_model(model, X_train, X_test, y_train, y_test, dropped_columns):
    """Impute the dropped test columns with helper trees, then score ``model``.

    For each name in ``dropped_columns`` (in order) a depth-5 ``DecisionTree``
    is trained to predict that column from the columns available so far, and
    its predictions are written into a copy of ``X_test``.  The completed test
    set is passed to ``model.predict`` and a classification report is printed.

    Args:
        model: fitted estimator exposing ``predict``.
        X_train: training features, including the dropped columns.
        X_test: test features without the dropped columns.
        y_train: unused; kept for a uniform call signature.
        y_test: true labels for the report.
        dropped_columns: names of the columns to impute; not modified.
    """
    X_test_final = X_test.copy()
    # Work on a copy so the caller's list is not emptied as a side effect.
    remaining = list(dropped_columns)
    while remaining:
        target = remaining[0]
        X_train_temp = X_train.drop(remaining, axis=1)
        y_train_temp = X_train[target]

        # DecisionTree counts classes with np.bincount, which needs
        # non-negative labels.  Shift a new Series (never X_train itself) and
        # undo the shift on the predictions so imputed values stay on the
        # column's original scale.
        lowest = y_train_temp.min()
        offset = -lowest if lowest < 0 else 0

        clf = DecisionTree(max_depth=5)
        clf.fit(X_train_temp, y_train_temp + offset)
        # Present the test columns in the same order used for fitting.
        aligned_test = X_test_final[list(X_train_temp.columns)]
        X_test_final[target] = clf.predict(aligned_test) - offset
        remaining.pop(0)

    predictions = model.predict(X_test_final)
    print(classification_report(y_test, predictions, digits=4))


In [None]:
def run_imputation_sklearn(model, X_train, X_test, y_train, y_test, dropped_columns):
    """Impute the dropped test columns with sklearn regression trees, then score ``model``.

    For each name in ``dropped_columns`` (in order) a depth-3
    ``DecisionTreeRegressor`` is trained to predict that column from the
    columns available so far, and its predictions are written into a copy of
    ``X_test``.  The completed test set is passed to ``model.predict`` and a
    classification report is printed.

    Args:
        model: fitted estimator exposing ``predict``.
        X_train: training features, including the dropped columns.
        X_test: test features without the dropped columns.
        y_train: unused; kept for a uniform call signature.
        y_test: true labels for the report.
        dropped_columns: names of the columns to impute; not modified.
    """
    X_test_final = X_test.copy()
    # Work on a copy so the caller's list is not emptied as a side effect.
    remaining = list(dropped_columns)
    while remaining:
        target = remaining[0]
        X_train_temp = X_train.drop(remaining, axis=1)
        # A regression tree handles negative targets, so the column is used
        # unshifted: imputed values stay on the original scale and X_train is
        # never modified in place.
        y_train_temp = X_train[target]

        clf = DecisionTreeRegressor(max_depth=3)
        clf.fit(X_train_temp, y_train_temp)
        # sklearn requires the same column order at predict time as at fit
        # time; previously imputed columns sit at the end of X_test_final.
        aligned_test = X_test_final[list(X_train_temp.columns)]
        X_test_final[target] = clf.predict(aligned_test)
        remaining.pop(0)

    predictions = model.predict(X_test_final)
    print(classification_report(y_test, predictions, digits=4))


In [None]:
def run_imputation_diabetes(model, X_train, X_test, y_train, y_test, dropped_columns):
    """Impute dropped columns of the diabetes data, then score ``model``.

    ``"diag_2"`` is imputed with a depth-3 ``DecisionTreeRegressor``; every
    other column with a depth-3 ``DecisionTree``.  Columns are imputed in the
    order given, each from the columns available so far, into a copy of
    ``X_test``.  The completed test set is passed to ``model.predict`` and a
    classification report is printed.

    Args:
        model: fitted estimator exposing ``predict``.
        X_train: training features, including the dropped columns.
        X_test: test features without the dropped columns.
        y_train: unused; kept for a uniform call signature.
        y_test: true labels for the report.
        dropped_columns: names of the columns to impute; not modified.
    """
    X_test_final = X_test.copy()
    # Work on a copy so the caller's list is not emptied as a side effect.
    remaining = list(dropped_columns)
    while remaining:
        target = remaining[0]
        X_train_temp = X_train.drop(remaining, axis=1)
        y_train_temp = X_train[target]

        if target == "diag_2":
            clf = DecisionTreeRegressor(max_depth=3)
            # Regression targets may be negative; no shift is needed.
            offset = 0
        else:
            clf = DecisionTree(max_depth=3)
            # DecisionTree counts classes with np.bincount, which needs
            # non-negative labels; the shift is undone after predicting.
            lowest = y_train_temp.min()
            offset = -lowest if lowest < 0 else 0

        # `y_train_temp + offset` builds a new Series, so X_train is untouched.
        clf.fit(X_train_temp, y_train_temp + offset)
        # Present the test columns in the same order used for fitting
        # (previously imputed columns sit at the end of X_test_final).
        aligned_test = X_test_final[list(X_train_temp.columns)]
        X_test_final[target] = clf.predict(aligned_test) - offset
        remaining.pop(0)

    predictions = model.predict(X_test_final)
    print(classification_report(y_test, predictions, digits=4))

# Predictions

In [None]:
def print_tree(train_features, node):
    """Print the split feature of every internal node in pre-order (node, left, right).

    Args:
        train_features: feature names indexed by the split's feature index.
        node: root ``TreeNode`` of the (sub)tree to print; leaves print nothing.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if current.isLeaf:
            continue
        print(train_features[current.best_split[0]])
        # Push right first so the left subtree is emitted before the right one.
        pending.append(current.right_node)
        pending.append(current.left_node)

## Diabetes

In [None]:
# Load the raw dataset from Drive (the next cell reads the same file again
# before preprocessing it).
df = pd.read_csv("/content/drive/MyDrive/diabetic_data.csv")

In [None]:
# Load the dataset and name the target column.
df = pd.read_csv("/content/drive/MyDrive/diabetic_data.csv")
label = 'readmitted'

# Remove the listed columns, then every row that still contains a NaN.
df = df.drop(["weight", 'patient_nbr', 'encounter_id', 'max_glu_serum', 'A1Cresult'] , axis = 1)
df = df.dropna()


# Integer-encode every object-dtype column (the encoder is refitted per
# column) and shift the codes by one so they start at 1.
# NOTE(review): this also encodes the target if it is object-typed; the
# reports further down show classes 1-3.
le = LabelEncoder()
for column in df.select_dtypes(include='object'):
  df[column] = le.fit_transform(df[column])
  df[column] += 1


# Separate features from the target.
X = df.drop(label, axis=1)
y = df[label]

# Reproducible 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.2, random_state=42)

In [None]:
# Display the column labels that remain after preprocessing.
df.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'payer_code', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [None]:
# Fit the reference tree (depth 7) on all training features; the cells below
# reuse this fitted clf.
clf = DecisionTree(max_depth=7)
clf.fit(X_train, y_train)

In [None]:
# Keep only the first 1000 test rows and their labels for evaluation.
# NOTE(review): presumably to bound prediction time on the slow
# missing-feature path — confirm.
X_test = X_test.iloc[:1000, :]
y_test = y_test[:1000]

In [None]:
# Inspect the distinct encoded values of the medical_specialty column.
df.medical_specialty.unique()

array([39,  1, 20, 13,  5, 64, 29, 14, 62, 21, 30, 49, 10, 53, 66, 24, 26,
       35, 17, 32, 63, 37, 11, 73, 51, 43, 22,  4, 55, 40, 52, 47, 15, 27,
       42, 68, 70, 69, 28, 67, 38, 45, 19,  3, 57,  2, 65, 41, 36,  8, 60,
       71, 31, 50, 61, 46, 16, 48, 25, 72, 54, 34,  9, 59, 58, 18, 33,  6,
       44, 23, 12,  7, 56])

In [None]:
# List the split feature of each internal node of the fitted tree.
print_tree(clf.train_features, clf.treeNode)

number_inpatient
number_diagnoses
admission_source_id
number_outpatient
admission_source_id
num_procedures
num_medications
num_lab_procedures
insulin
discharge_disposition_id
discharge_disposition_id
num_procedures
time_in_hospital
age
race
age
diag_1
diag_2
age
number_emergency
admission_type_id
num_medications
admission_type_id
diag_3
number_emergency
insulin
diabetesMed
admission_type_id
number_emergency
diag_3
discharge_disposition_id
age
discharge_disposition_id
discharge_disposition_id
number_outpatient
number_emergency
diabetesMed
discharge_disposition_id
age
number_diagnoses
diag_2
medical_specialty
num_lab_procedures
discharge_disposition_id
age
payer_code
age
diag_1
num_medications
discharge_disposition_id
diag_3
medical_specialty
glimepiride
num_lab_procedures
gender
payer_code
admission_type_id
discharge_disposition_id
diag_1
medical_specialty
discharge_disposition_id
admission_type_id
number_inpatient
discharge_disposition_id
admission_source_id
admission_source_id
num_lab

### Original Model

In [None]:
# Reference result: the fitted tree evaluated with every feature present.
predictions  = clf.predict(X_test)
print(classification_report(y_test, predictions, digits=4))

100%|██████████| 1000/1000 [00:06<00:00, 150.77it/s]

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000       108
           2     0.4534    0.3040    0.3639       352
           3     0.5887    0.8296    0.6887       540

    accuracy                         0.5550      1000
   macro avg     0.3474    0.3779    0.3509      1000
weighted avg     0.4775    0.5550    0.5000      1000






### Binary

In [None]:
# Remove "diabetesMed" from the test rows only; clf was fitted with it, so
# predict() has to use its missing-feature branch wherever a split needs it.
X_test_temp = X_test.drop("diabetesMed", axis=1)
predictions  = clf.predict(X_test_temp.reset_index(drop=True))

# NOTE(review): classification_report is already imported at the top of the
# notebook; this re-import is redundant.
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, digits=4))

100%|██████████| 1000/1000 [02:04<00:00,  8.03it/s]

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000       108
           2     0.4534    0.3040    0.3639       352
           3     0.5887    0.8296    0.6887       540

    accuracy                         0.5550      1000
   macro avg     0.3474    0.3779    0.3509      1000
weighted avg     0.4775    0.5550    0.5000      1000






In [None]:
# Baseline 1: retrain a depth-5 tree without "diabetesMed" in train and test.
run_smaller_model(X_train.drop("diabetesMed", axis=1), X_test.drop("diabetesMed", axis=1), y_train, y_test)

100%|██████████| 1000/1000 [00:04<00:00, 218.73it/s]

              precision    recall  f1-score   support

           1     0.3333    0.0093    0.0180       108
           2     0.4585    0.2983    0.3614       352
           3     0.5859    0.8333    0.6881       540

    accuracy                         0.5560      1000
   macro avg     0.4593    0.3803    0.3558      1000
weighted avg     0.5138    0.5560    0.5007      1000






In [None]:
# Baseline 2: impute "diabetesMed" with a helper tree, then predict with clf.
run_imputation_model(clf, X_train, X_test.drop("diabetesMed", axis=1), y_train, y_test, ["diabetesMed"])

100%|██████████| 1000/1000 [00:02<00:00, 368.25it/s]
100%|██████████| 1000/1000 [00:06<00:00, 153.92it/s]

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000       108
           2     0.4534    0.3040    0.3639       352
           3     0.5887    0.8296    0.6887       540

    accuracy                         0.5550      1000
   macro avg     0.3474    0.3779    0.3509      1000
weighted avg     0.4775    0.5550    0.5000      1000






### Ordinal

In [None]:
# Remove "num_lab_procedures" from the test rows only and predict with the
# tree that was fitted with it.
X_test_temp = X_test.drop("num_lab_procedures", axis=1)
predictions  = clf.predict(X_test_temp.reset_index(drop=True))

# NOTE(review): redundant re-import; already imported at the top of the notebook.
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, digits=4))

100%|██████████| 1000/1000 [00:42<00:00, 23.55it/s]

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000       108
           2     0.4484    0.3210    0.3742       352
           3     0.5919    0.8167    0.6864       540

    accuracy                         0.5540      1000
   macro avg     0.3468    0.3792    0.3535      1000
weighted avg     0.4775    0.5540    0.5024      1000






In [None]:
# Baseline 1: retrain a depth-5 tree without "num_lab_procedures".
run_smaller_model(X_train.drop("num_lab_procedures", axis=1), X_test.drop("num_lab_procedures", axis=1), y_train, y_test)

100%|██████████| 1000/1000 [00:06<00:00, 160.96it/s]

              precision    recall  f1-score   support

           1     0.3333    0.0093    0.0180       108
           2     0.4839    0.2557    0.3346       352
           3     0.5832    0.8759    0.7002       540

    accuracy                         0.5640      1000
   macro avg     0.4668    0.3803    0.3509      1000
weighted avg     0.5213    0.5640    0.4978      1000






In [None]:
# Baseline 2: impute "num_lab_procedures" with a helper tree, then predict with clf.
run_imputation_model(clf, X_train, X_test.drop("num_lab_procedures", axis=1), y_train, y_test, ["num_lab_procedures"])

100%|██████████| 1000/1000 [00:05<00:00, 185.15it/s]
100%|██████████| 1000/1000 [00:06<00:00, 147.00it/s]

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000       108
           2     0.4569    0.3011    0.3630       352
           3     0.5856    0.8296    0.6866       540

    accuracy                         0.5540      1000
   macro avg     0.3475    0.3769    0.3499      1000
weighted avg     0.4771    0.5540    0.4985      1000






### Numeric

In [None]:
# Remove "diag_2" from the test rows only and predict with the tree that was
# fitted with it.
X_test_temp = X_test.drop("diag_2", axis=1)
predictions  = clf.predict(X_test_temp.reset_index(drop=True))

# NOTE(review): redundant re-import; already imported at the top of the notebook.
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, digits=4))

100%|██████████| 1000/1000 [00:15<00:00, 66.16it/s]


              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000       108
           2     0.4576    0.3068    0.3673       352
           3     0.5892    0.8315    0.6897       540

    accuracy                         0.5570      1000
   macro avg     0.3490    0.3794    0.3524      1000
weighted avg     0.4793    0.5570    0.5017      1000



In [None]:
# Baseline 1: retrain a depth-5 tree without "diag_2".
run_smaller_model(X_train.drop("diag_2", axis=1), X_test.drop("diag_2", axis=1), y_train, y_test)

100%|██████████| 1000/1000 [00:04<00:00, 217.80it/s]

              precision    recall  f1-score   support

           1     0.3333    0.0093    0.0180       108
           2     0.4585    0.2983    0.3614       352
           3     0.5859    0.8333    0.6881       540

    accuracy                         0.5560      1000
   macro avg     0.4593    0.3803    0.3558      1000
weighted avg     0.5138    0.5560    0.5007      1000






In [None]:
# Baseline 2: impute "diag_2" with an sklearn regression tree, then predict with clf.
run_imputation_sklearn(clf, X_train, X_test.drop("diag_2", axis=1), y_train, y_test, ["diag_2"])

100%|██████████| 1000/1000 [00:11<00:00, 87.19it/s]

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000       108
           2     0.4557    0.3068    0.3667       352
           3     0.5887    0.8296    0.6887       540

    accuracy                         0.5560      1000
   macro avg     0.3481    0.3788    0.3518      1000
weighted avg     0.4783    0.5560    0.5010      1000






### Combined

In [None]:
# Remove all three columns from the test rows at once and predict with the
# tree that was fitted with them.
X_test_temp = X_test.drop(["diag_2", "num_lab_procedures", "diabetesMed"], axis=1)
predictions  = clf.predict(X_test_temp.reset_index(drop=True))

# NOTE(review): redundant re-import; already imported at the top of the notebook.
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, digits=4))

100%|██████████| 1000/1000 [03:46<00:00,  4.41it/s]

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000       108
           2     0.4524    0.3239    0.3775       352
           3     0.5925    0.8185    0.6874       540

    accuracy                         0.5560      1000
   macro avg     0.3483    0.3808    0.3550      1000
weighted avg     0.4792    0.5560    0.5041      1000






In [None]:
# Baseline 1: retrain a depth-5 tree without any of the three columns.
run_smaller_model(X_train.drop(["diag_2", "num_lab_procedures", "diabetesMed"], axis=1), X_test.drop(["diag_2", "num_lab_procedures", "diabetesMed"], axis=1), y_train, y_test)

100%|██████████| 1000/1000 [00:04<00:00, 218.70it/s]

              precision    recall  f1-score   support

           1     0.3333    0.0093    0.0180       108
           2     0.4839    0.2557    0.3346       352
           3     0.5832    0.8759    0.7002       540

    accuracy                         0.5640      1000
   macro avg     0.4668    0.3803    0.3509      1000
weighted avg     0.5213    0.5640    0.4978      1000






In [None]:
# Baseline 2: impute the three columns one after another (in the listed
# order), then predict with clf.
run_imputation_diabetes(clf, X_train, X_test.drop(["diag_2", "num_lab_procedures", "diabetesMed"], axis=1), y_train, y_test, ["diag_2", "num_lab_procedures", "diabetesMed"])

100%|██████████| 1000/1000 [00:02<00:00, 373.32it/s]
100%|██████████| 1000/1000 [00:01<00:00, 585.83it/s]
100%|██████████| 1000/1000 [00:07<00:00, 127.55it/s]

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000       108
           2     0.4534    0.3040    0.3639       352
           3     0.5840    0.8241    0.6836       540

    accuracy                         0.5520      1000
   macro avg     0.3458    0.3760    0.3492      1000
weighted avg     0.4749    0.5520    0.4972      1000




