## Imports

In [None]:
# Colab-only: mount Google Drive at /content/drive (the dataset is read from there later).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import copy
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from tqdm import tqdm

## Decision Tree

In [None]:
class TreeNode:
  """One node of a fitted tree.

  For an internal node `best_split` is the `(feature_index, threshold)` pair
  used to route samples; for a leaf (`isLeaf=True`) it holds the predicted
  class instead. `X` / `y` are the training rows and labels that reached the
  node, and `left_prob` / `right_prob` are the fractions of those rows sent
  to each child.
  """

  def __init__(self, best_split, X, y, left_node = None, right_node = None, left_prob = 0, right_prob = 0, isLeaf = False):
    # Split rule (internal node) or predicted class (leaf).
    self.best_split, self.isLeaf = best_split, isLeaf

    # Children and the share of this node's samples routed to each of them.
    self.left_node, self.right_node = left_node, right_node
    self.left_prob, self.right_prob = left_prob, right_prob

    # Training data that reached this node.
    self.X, self.y = X, y


class DecisionTree:
    """Gini-based binary classification tree that can predict with missing columns.

    The tree is grown greedily on integer class labels (``np.bincount`` is used
    for class counts, so labels must be non-negative integers). Every node keeps
    the training rows that reached it. When a split feature is absent from the
    frame passed to :meth:`predict`, the branch is chosen from the features that
    are present: the ones most correlated with the missing feature and the label
    vote, weighted by a random-walk feature importance.
    """

    def __init__(self, max_depth=None, randomWalkIter = 100):
        """Store hyper-parameters.

        Args:
            max_depth: maximum tree depth; ``None`` grows until nodes are pure
                or cannot be split.
            randomWalkIter: number of random root-to-leaf walks used to
                estimate feature importance after fitting.
        """
        self.max_depth = max_depth
        self.randomWalkIter = randomWalkIter

    def calculate_feature_importance(self):
        """Fill ``self.importance_dict`` with the average per-walk visit share of each feature."""
        n = self.randomWalkIter
        self.importance_dict = {name: 0 for name in self.train_features}

        # With no walks requested there is nothing to average; keep all zeros
        # instead of dividing by zero.
        if n <= 0:
            return

        for _ in range(n):
            self.run_random_walk()

        for name in self.importance_dict:
            self.importance_dict[name] /= n

    def run_random_walk(self):
        """Walk once from the root to a leaf and accumulate feature visit shares.

        At each internal node a child is drawn with probability proportional to
        the fraction of training samples it received. Each feature's share of
        the steps taken is added to ``self.importance_dict``.
        """
        visits = dict.fromkeys(self.importance_dict, 0)
        steps = 0

        # The walk only reads the tree, so no copy of the (data-carrying) nodes
        # is needed.
        node = self.treeNode
        while node is not None and not node.isLeaf:
            visits[self.train_features[node.best_split[0]]] += 1
            steps += 1
            left, right = node.left_node, node.right_node
            if left is not None and right is not None:
                node = random.choices([left, right], weights=[node.left_prob, node.right_prob])[0]
            else:
                # Defensive: follow whichever child exists.
                node = left if left is not None else right

        # A tree whose root is a leaf has no splits to attribute importance to.
        if steps == 0:
            return

        for name in self.importance_dict:
            self.importance_dict[name] += visits[name] / steps

    def get_max_corr(self, X_train, y_train, feat):
        """Rank the available features by correlation with ``feat`` and the label.

        Args:
            X_train: DataFrame of training rows (here: the rows of one node).
            y_train: labels for those rows, in the same row order.
            feat: name of the feature that is missing at prediction time.

        Returns:
            List of ``(feature_name, score)`` sorted by descending score, where
            score = |corr(feature, feat)| + |corr(feature, label)|. Only
            features present in ``self.prediction_feature_list`` are ranked.
        """
        keep = [col for col in X_train.columns
                if col in self.prediction_feature_list or col == feat]
        df = X_train[keep].copy()

        # Constant columns yield NaN correlations; score them as 0 so that the
        # sort below stays well defined (NaN breaks ordering comparisons).
        feat_corr = df.corr()[feat].abs().fillna(0.0)

        df = df.drop(feat, axis=1)
        # Attach labels by position so a differing index cannot misalign them.
        df['label'] = np.asarray(y_train)
        label_corr = df.corr()['label'].abs().fillna(0.0)

        final_dict = {name: feat_corr[name] + label_corr[name]
                      for name in df.columns if name != 'label'}
        return sorted(final_dict.items(), key=lambda item: item[1], reverse=True)

    def fit(self, X, y):
        """Grow the tree on DataFrame ``X`` / Series ``y`` and compute feature importance."""
        self.train_features = list(X.columns)
        self.X_train = X.copy()
        self.y_train = y.copy()
        self.tree, self.treeNode = self._grow_tree(X.to_numpy(), y.to_numpy())
        self.calculate_feature_importance()

    def _best_threshold(self, X, y, feature_idx):
        """Return ``(gini, threshold, left_indices, right_indices)`` of the best split on one column.

        Thresholds are tried in ascending order and the first one reaching the
        minimum weighted Gini wins. ``gini`` is ``np.inf`` (and the other
        entries ``None``) when no threshold separates the rows.
        """
        column = X[:, feature_idx]
        best_gini = np.inf
        best_threshold = None
        best_left_indices = None
        best_right_indices = None

        for threshold in np.unique(column):
            left_indices = np.where(column <= threshold)[0]
            right_indices = np.where(column > threshold)[0]

            if len(left_indices) == 0 or len(right_indices) == 0:
                continue

            gini = self._gini_impurity(y[left_indices], y[right_indices])
            if gini < best_gini:
                best_gini = gini
                best_threshold = threshold
                best_left_indices = left_indices
                best_right_indices = right_indices

        return best_gini, best_threshold, best_left_indices, best_right_indices

    def get_best_split(self, feature, X, y=None):
        """Find the best threshold for a single named feature.

        Args:
            feature: feature name (must be in ``self.train_features``).
            X: 2-D array of rows whose columns follow ``self.train_features``.
            y: labels for the rows of ``X``. If omitted, the labels stored by
                :meth:`fit` are used, which is only valid when ``X`` is the
                full training matrix.

        Returns:
            ``(feature_index, threshold)`` or ``None`` if the feature cannot
            split the rows.

        Raises:
            ValueError: if no labels are available or their length differs
                from the number of rows in ``X``.
        """
        if y is None:
            stored = getattr(self, "y_train", None)
            if stored is None:
                raise ValueError("get_best_split needs labels: pass y or call fit first")
            y = stored
        y = np.asarray(y)
        if len(y) != len(X):
            raise ValueError("X and y must contain the same number of rows")

        feature_idx = self.train_features.index(feature)
        _, threshold, _, _ = self._best_threshold(X, y, feature_idx)
        if threshold is None:
            return None
        return (feature_idx, threshold)

    def _make_leaf(self, X, y):
        """Return the ``(class, TreeNode)`` pair for a leaf predicting the majority class."""
        final_class = int(np.bincount(y).argmax())
        return final_class, TreeNode(final_class, X = X, y = y, isLeaf=True)

    def _grow_tree(self, X, y, depth=0):
        """Recursively grow the tree.

        Returns:
            ``(tree, node)`` where ``tree`` is either an ``int`` class (leaf) or
            the nested tuple ``((feature_index, threshold), left, right)``, and
            ``node`` is the matching ``TreeNode``.
        """
        num_samples, num_features = X.shape
        num_classes = len(np.unique(y))

        if (self.max_depth is not None and depth >= self.max_depth) or num_classes == 1:
            return self._make_leaf(X, y)

        best_gini = np.inf
        best_split = None
        best_left_indices = None
        best_right_indices = None

        # Strict '<' keeps the first feature (and its first threshold) that
        # reaches the minimum, matching an exhaustive nested scan.
        for feature_idx in range(num_features):
            gini, threshold, left_indices, right_indices = self._best_threshold(X, y, feature_idx)
            if gini < best_gini:
                best_gini = gini
                best_split = (feature_idx, threshold)
                best_left_indices = left_indices
                best_right_indices = right_indices

        # No column separates the rows (all rows identical): stop here.
        if best_split is None:
            return self._make_leaf(X, y)

        left_subtree, leftNode = self._grow_tree(X[best_left_indices], y[best_left_indices], depth + 1)
        right_subtree, rightNode = self._grow_tree(X[best_right_indices], y[best_right_indices], depth + 1)

        currNode = TreeNode(
            best_split,
            X = X,
            y = y,
            left_node = leftNode,
            right_node = rightNode,
            left_prob = len(best_left_indices) / num_samples,
            right_prob = len(best_right_indices) / num_samples,
        )

        return (best_split, left_subtree, right_subtree), currNode

    def _gini_impurity(self, left_y, right_y):
        """Return the size-weighted Gini impurity of a two-way split."""
        total = len(left_y) + len(right_y)
        p_left = len(left_y) / total
        p_right = len(right_y) / total
        gini_left = 1 - sum((np.bincount(left_y) / len(left_y))**2)
        gini_right = 1 - sum((np.bincount(right_y) / len(right_y))**2)
        return p_left * gini_left + p_right * gini_right

    def get_inp_count(self, inp, X_train):
        """Count training rows that lie close to a single input row.

        A row matches when, for every column of ``inp``, its value differs from
        the input by at most 1/20 of that column's range in ``X_train``.

        Args:
            inp: one-row DataFrame holding the input sample.
            X_train: DataFrame of candidate rows; columns that are not in
                ``self.prediction_feature_list`` are ignored.

        Returns:
            Number of matching rows as an ``int``.
        """
        X_train = X_train.reset_index(drop = True)
        kept = [col for col in X_train.columns if col in self.prediction_feature_list]
        X_train = X_train[kept]

        if len(X_train) == 0:
            return 0
        cols = list(inp.columns)
        if not cols:
            # No constraints: every row matches.
            return len(X_train)

        values = X_train[cols].to_numpy()
        # The per-column tolerance is loop-invariant, so compute it once.
        tolerance = (values.max(axis=0) - values.min(axis=0)) / 20
        target = inp[cols].iloc[0].to_numpy()
        too_far = np.abs(values - target) > tolerance
        return int((~too_far.any(axis=1)).sum())

    def get_counts(self, inp, left_data, right_data, correlated_features, tolerance=0.05):
        """Count rows near ``inp`` on each side of a candidate split.

        Differences are scaled by each side's own per-column range; a row
        counts when every scaled difference is within ``tolerance``.

        Args:
            inp: 1-D array with the input values, ordered like the columns that
                remain in ``left_data`` / ``right_data`` after columns outside
                ``self.prediction_feature_list`` are dropped.
            left_data: DataFrame of rows on the left side of the split.
            right_data: DataFrame of rows on the right side of the split.
            correlated_features: unused; kept for interface compatibility.
            tolerance: maximum scaled per-column distance for a match.

        Returns:
            ``(left_count + 1, right_count + 1)``; the +1 keeps both counts
            strictly positive.
        """
        left_data = left_data.reset_index(drop = True)
        right_data = right_data.reset_index(drop = True)
        drop_cols = [col for col in left_data.columns
                     if col not in self.prediction_feature_list]
        left_data = left_data.drop(drop_cols, axis=1)
        right_data = right_data.drop(drop_cols, axis=1)

        def count_near(data):
            # 1e-10 avoids division by zero for constant columns.
            span = data.max().to_numpy() - data.min().to_numpy() + 1e-10
            scaled_diff = np.abs(data - inp) / span
            return (scaled_diff <= tolerance).all(axis=1).sum()

        # Each side is compared against its own rows (the left side previously
        # reused the right-hand rows).
        left_count = count_near(left_data)
        right_count = count_near(right_data)

        return left_count + 1, right_count + 1

    def predict(self, X):
        """Predict a class for every row of DataFrame ``X``.

        ``X`` may lack some training columns; those splits are resolved from
        the columns that are present. Progress is shown with ``tqdm``.
        """
        self.prediction_feature_list = list(X.columns)
        inp_cols = list(X.columns)
        return np.array([
            self._predict_single(x, self.tree, self.treeNode, inp_cols)[0]
            for x in tqdm(X.to_numpy())
        ])

    def _goes_left_without_feature(self, x, treeNode, missing_feature, inp_cols):
        """Decide the branch at ``treeNode`` when its split feature is not in the input.

        The (up to) three available features most correlated with the missing
        feature and the label each propose a split of the node's rows; the
        number of node rows near ``x`` on each side, weighted by feature
        importance and the node's branch probabilities, forms the vote.

        Returns:
            ``True`` to descend left, ``False`` to descend right.
        """
        X_t = pd.DataFrame(treeNode.X, columns = self.train_features)
        y_t = pd.Series(treeNode.y)
        correlated_features = self.get_max_corr(X_t, y_t, missing_feature)

        left_weighted_sum = 0
        right_weighted_sum = 0
        normalization_factor = 0
        for feat, _ in correlated_features[:3]:
            # Use this node's own labels for the surrogate split.
            split = self.get_best_split(feat, treeNode.X, treeNode.y)
            if split is None:
                continue
            _, split_threshold = split

            left_data = X_t[X_t[feat] <= split_threshold].reset_index(drop=True)
            right_data = X_t[X_t[feat] > split_threshold].reset_index(drop=True)

            # Compare on all available features except the surrogate itself.
            # Assumes the input columns keep the training column order.
            dropped_x = list(x)
            dropped_x.pop(inp_cols.index(feat))
            left_count, right_count = self.get_counts(
                np.array(dropped_x),
                left_data.drop(feat, axis = 1),
                right_data.drop(feat, axis = 1),
                correlated_features,
            )

            weight = self.importance_dict[feat]
            left_weighted_sum += weight * left_count * treeNode.left_prob
            right_weighted_sum += weight * right_count * treeNode.right_prob
            normalization_factor += weight

        left_dec = left_weighted_sum / (normalization_factor + 1)
        right_dec = right_weighted_sum / (normalization_factor + 1)
        return left_dec >= right_dec

    def _predict_single(self, x, tree, treeNode, inp_cols):
        """Route one sample down the tree.

        Args:
            x: 1-D array of feature values ordered like ``inp_cols``.
            tree: nested-tuple (or int leaf) representation of the subtree.
            treeNode: ``TreeNode`` matching ``tree``.
            inp_cols: column names of the prediction frame.

        Returns:
            ``(predicted_class, leaf_node)``.
        """
        if isinstance(tree, int) or treeNode.isLeaf:
            return tree, treeNode

        feature_idx, threshold = tree[0]
        best_feature_name = self.train_features[feature_idx]

        if best_feature_name in self.prediction_feature_list:
            # Read the value by position instead of building a DataFrame per node.
            value = x[self.prediction_feature_list.index(best_feature_name)]
            go_left = value <= threshold
        else:
            go_left = self._goes_left_without_feature(x, treeNode, best_feature_name, inp_cols)

        if go_left:
            return self._predict_single(x, tree[1], treeNode.left_node, inp_cols)
        return self._predict_single(x, tree[2], treeNode.right_node, inp_cols)




In [None]:
def run_smaller_model(X_train, X_test, y_train, y_test):
  """Train a fresh depth-5 DecisionTree on the given columns and print its test report."""
  reduced_tree = DecisionTree(max_depth=5)
  reduced_tree.fit(X_train, y_train)
  report = classification_report(y_test, reduced_tree.predict(X_test), digits=4)
  print(report)

In [None]:
def run_imputation_model(model, X_train, X_test, y_train, y_test, dropped_columns):
  """Impute missing test columns with DecisionTree models, then evaluate `model`.

  The columns in `dropped_columns` are imputed one after another; each
  imputer is trained on the training columns that are not still missing, so
  later imputers can use earlier imputed columns.

  Args:
    model: fitted model exposing `predict(DataFrame)`.
    X_train: training features, including the dropped columns.
    X_test: test features without the dropped columns.
    y_train: unused; kept for a uniform signature.
    y_test: true labels for the printed report.
    dropped_columns: names of the columns to impute, in imputation order.
      The list is not modified.
  """
  X_test_final = X_test.copy()
  columns = list(dropped_columns)  # work on a copy; never empty the caller's list

  for position, target in enumerate(columns):
    still_missing = columns[position:]
    clf = DecisionTree(max_depth=5)

    # DecisionTree counts classes with np.bincount, which needs non-negative
    # integers. Shift into a new Series (so X_train is left untouched) and
    # remember the offset so predictions can be mapped back.
    y_train_temp = X_train[target]
    offset = 0
    lowest = y_train_temp.min()
    if lowest < 0:
      offset = abs(lowest)
      y_train_temp = y_train_temp + offset

    X_train_temp = X_train.drop(still_missing, axis=1)
    clf.fit(X_train_temp, y_train_temp)

    # Feed the imputer the same columns, in the same order, it was fitted on,
    # and undo the shift so the imputed values are on the original scale.
    X_test_final[target] = clf.predict(X_test_final[X_train_temp.columns]) - offset

  predictions = model.predict(X_test_final)
  print(classification_report(y_test, predictions, digits=4))


In [None]:
from sklearn.tree import DecisionTreeRegressor
def run_imputation_sklearn(model, X_train, X_test, y_train, y_test, dropped_columns):
    """Impute missing test columns with sklearn regression trees, then evaluate `model`.

    Args:
        model: fitted model exposing `predict(DataFrame)`.
        X_train: training features, including the dropped columns.
        X_test: test features without the dropped columns.
        y_train: unused; kept for a uniform signature.
        y_test: true labels for the printed report.
        dropped_columns: names of the columns to impute, in imputation order.
            The list is not modified.
    """
    X_test_final = X_test.copy()
    columns = list(dropped_columns)  # work on a copy; never empty the caller's list

    for position, target in enumerate(columns):
        still_missing = columns[position:]
        regressor = DecisionTreeRegressor(max_depth=3)

        # A regressor accepts negative targets, so the column is used as is.
        # (Shifting it without shifting predictions back would offset every
        # imputed value, and shifting in place would alter X_train.)
        X_train_temp = X_train.drop(still_missing, axis=1)
        regressor.fit(X_train_temp, X_train[target])

        # Previously imputed columns are appended at the end of X_test_final;
        # reorder to the fitted column order before predicting.
        X_test_final[target] = regressor.predict(X_test_final[X_train_temp.columns])

    predictions = model.predict(X_test_final)
    print(classification_report(y_test, predictions, digits=4))


In [None]:


def run_imputation_banking(model, X_train, X_test, y_train, y_test, dropped_columns):
    """Impute missing banking columns, then evaluate `model`.

    "balance" is imputed with a sklearn regression tree; every other column
    is imputed with the custom DecisionTree classifier.

    Args:
        model: fitted model exposing `predict(DataFrame)`.
        X_train: training features, including the dropped columns.
        X_test: test features without the dropped columns.
        y_train: unused; kept for a uniform signature.
        y_test: true labels for the printed report.
        dropped_columns: names of the columns to impute, in imputation order.
            The list is not modified.
    """
    X_test_final = X_test.copy()
    columns = list(dropped_columns)  # work on a copy; never empty the caller's list

    for position, target in enumerate(columns):
        still_missing = columns[position:]
        y_train_temp = X_train[target]
        offset = 0

        if target == "balance":
            clf = DecisionTreeRegressor(max_depth=3)
        else:
            clf = DecisionTree(max_depth=5)
            # The custom tree needs non-negative integer labels. Shift into a
            # new Series (X_train stays untouched) and remember the offset so
            # predictions can be mapped back to the original scale.
            lowest = y_train_temp.min()
            if lowest < 0:
                offset = abs(lowest)
                y_train_temp = y_train_temp + offset

        X_train_temp = X_train.drop(still_missing, axis=1)
        clf.fit(X_train_temp, y_train_temp)

        # Predict on the fitted columns in the fitted order, then undo the shift.
        X_test_final[target] = clf.predict(X_test_final[X_train_temp.columns]) - offset

    predictions = model.predict(X_test_final)
    print(classification_report(y_test, predictions, digits=4))

# Predictions

In [None]:
def print_tree(train_features, node):
  """Print the split feature of every internal node in pre-order (node, left, right)."""
  pending = [node]
  while pending:
    current = pending.pop()
    if current.isLeaf:
      continue
    print(train_features[current.best_split[0]])
    # Push right first so the left subtree is printed before the right one.
    pending.append(current.right_node)
    pending.append(current.left_node)

## Banking

In [None]:
# Load bank-full.csv from the mounted Drive; fields are semicolon-separated.
df = pd.read_csv("/content/drive/MyDrive/bank-full.csv", sep=";")

In [None]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [None]:
# Integer-encode every text column, including the target "y".
le = LabelEncoder()
text_cols = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "poutcome", "y"]
label = "y"

# `le` is refitted for each column, so after the loop it only holds the
# mapping of the last column. Codes are shifted to start at 1 instead of 0.
for col in text_cols:
  df[col] = le.fit_transform(df[col])
  df[col] += 1

In [None]:
# Preview the data after label encoding.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,5,2,3,1,2143,2,1,3,5,9,261,1,-1,0,4,1
1,44,10,3,2,1,29,2,1,3,5,9,151,1,-1,0,4,1
2,33,3,2,2,1,2,2,2,3,5,9,76,1,-1,0,4,1
3,47,2,2,4,1,1506,2,1,3,5,9,92,1,-1,0,4,1
4,33,12,3,4,1,1,1,1,3,5,9,198,1,-1,0,4,1


In [None]:
# Separate features from the target column.
X = df.drop(label, axis=1)
y = df[label]

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.2, random_state=42)

In [None]:
# Reference model: depth-6 tree trained on all features.
clf = DecisionTree(max_depth=6)
clf.fit(X_train, y_train)

In [None]:
# Evaluate on the first 1000 test rows only
# (presumably to keep the slow missing-feature prediction path tractable).
X_test = X_test.iloc[:1000, :]
y_test = y_test[:1000]

In [None]:
# Inspect the encoded values of the "contact" column.
df.contact.unique()

array([3, 1, 2])

In [None]:
# List the split feature of every internal node of the reference tree.
print_tree(clf.train_features, clf.treeNode)

duration
month
age
pdays
duration
month
contact
poutcome
housing
duration
duration
duration
previous
duration
poutcome
duration
age
duration
duration
marital
job
day
month
duration
duration
day
duration
poutcome
poutcome
duration
pdays
duration
contact
duration
campaign
previous
balance
housing
pdays
day
duration
balance
balance
duration
duration
balance
contact
month
pdays
duration
day
balance
age
balance
day
month
balance
age
education
age


### Original model

In [None]:
# Baseline: reference tree evaluated with every feature present.
predictions  = clf.predict(X_test)
print(classification_report(y_test, predictions, digits=4))

100%|██████████| 1000/1000 [00:04<00:00, 248.88it/s]

              precision    recall  f1-score   support

           1     0.9211    0.9622    0.9412       873
           2     0.6250    0.4331    0.5116       127

    accuracy                         0.8950      1000
   macro avg     0.7730    0.6976    0.7264      1000
weighted avg     0.8835    0.8950    0.8866      1000






In [None]:
# Inspect the encoded values of the "housing" column.
df["housing"].unique()

array([2, 1])

### Binary

In [None]:
# Remove "housing" at prediction time only; the reference tree resolves
# splits on the absent column from the remaining features.
X_test_temp = X_test.drop("housing", axis=1)
# predictions  = clf.predict(X_test_temp)
predictions  = clf.predict(X_test_temp.reset_index(drop=True))
print(classification_report(y_test, predictions, digits=4))

100%|██████████| 1000/1000 [02:52<00:00,  5.78it/s]

              precision    recall  f1-score   support

           1     0.9229    0.9599    0.9410       873
           2     0.6196    0.4488    0.5205       127

    accuracy                         0.8950      1000
   macro avg     0.7712    0.7044    0.7308      1000
weighted avg     0.8844    0.8950    0.8876      1000






In [None]:
# Comparison: retrain a depth-5 tree without "housing".
run_smaller_model(X_train.drop("housing", axis=1), X_test.drop("housing", axis=1), y_train, y_test)

100%|██████████| 1000/1000 [00:02<00:00, 374.79it/s]

              precision    recall  f1-score   support

           1     0.9186    0.9565    0.9371       873
           2     0.5824    0.4173    0.4862       127

    accuracy                         0.8880      1000
   macro avg     0.7505    0.6869    0.7117      1000
weighted avg     0.8759    0.8880    0.8799      1000






In [None]:
# Comparison: impute "housing" with a DecisionTree, then use the reference tree.
run_imputation_model(clf, X_train, X_test.drop("housing", axis=1), y_train, y_test, ["housing"])

100%|██████████| 1000/1000 [00:02<00:00, 382.58it/s]
100%|██████████| 1000/1000 [00:03<00:00, 297.48it/s]

              precision    recall  f1-score   support

           1     0.9221    0.9622    0.9417       873
           2     0.6292    0.4409    0.5185       127

    accuracy                         0.8960      1000
   macro avg     0.7756    0.7016    0.7301      1000
weighted avg     0.8849    0.8960    0.8880      1000






### Numeric

In [None]:
# Remove "balance" at prediction time only and let the reference tree cope.
X_test_temp = X_test.drop("balance", axis=1)
predictions  = clf.predict(X_test_temp.reset_index(drop=True))
print(classification_report(y_test, predictions, digits=4))

100%|██████████| 1000/1000 [00:15<00:00, 65.54it/s]

              precision    recall  f1-score   support

           1     0.9211    0.9622    0.9412       873
           2     0.6250    0.4331    0.5116       127

    accuracy                         0.8950      1000
   macro avg     0.7730    0.6976    0.7264      1000
weighted avg     0.8835    0.8950    0.8866      1000






In [None]:
# Comparison: retrain a depth-5 tree without "balance".
run_smaller_model(X_train.drop("balance", axis=1), X_test.drop("balance", axis=1), y_train, y_test)

100%|██████████| 1000/1000 [00:02<00:00, 373.94it/s]

              precision    recall  f1-score   support

           1     0.9254    0.9519    0.9385       873
           2     0.5882    0.4724    0.5240       127

    accuracy                         0.8910      1000
   macro avg     0.7568    0.7122    0.7312      1000
weighted avg     0.8826    0.8910    0.8858      1000






In [None]:
# Comparison: impute "balance" with a regression tree, then use the reference tree.
run_imputation_sklearn(clf, X_train, X_test.drop("balance", axis=1), y_train, y_test, ["balance"])

100%|██████████| 1000/1000 [00:02<00:00, 338.99it/s]

              precision    recall  f1-score   support

           1     0.9211    0.9622    0.9412       873
           2     0.6250    0.4331    0.5116       127

    accuracy                         0.8950      1000
   macro avg     0.7730    0.6976    0.7264      1000
weighted avg     0.8835    0.8950    0.8866      1000






### Ordinal

In [None]:
# Remove "contact" at prediction time only and let the reference tree cope.
X_test_temp = X_test.drop("contact", axis=1)
predictions  = clf.predict(X_test_temp.reset_index(drop=True))
print(classification_report(y_test, predictions, digits=4))


  0%|          | 0/1000 [00:00<?, ?it/s][A
  1%|          | 10/1000 [00:01<01:57,  8.40it/s][A
  1%|▏         | 13/1000 [00:01<01:32, 10.64it/s][A
  2%|▏         | 18/1000 [00:01<01:04, 15.14it/s][A
  2%|▏         | 22/1000 [00:01<01:06, 14.71it/s][A
  3%|▎         | 27/1000 [00:01<00:50, 19.35it/s][A
  4%|▍         | 38/1000 [00:02<00:46, 20.73it/s][A
  4%|▍         | 42/1000 [00:02<00:42, 22.70it/s][A
  5%|▍         | 49/1000 [00:02<00:50, 18.95it/s][A
  5%|▌         | 52/1000 [00:03<01:12, 13.03it/s][A
  5%|▌         | 54/1000 [00:03<01:23, 11.35it/s][A
  6%|▌         | 59/1000 [00:03<01:02, 15.07it/s][A
  6%|▌         | 62/1000 [00:04<01:02, 15.10it/s][A
  6%|▋         | 65/1000 [00:04<01:33, 10.02it/s][A
  7%|▋         | 68/1000 [00:05<01:38,  9.49it/s][A
  7%|▋         | 73/1000 [00:05<01:09, 13.28it/s][A
  8%|▊         | 82/1000 [00:05<01:05, 14.02it/s][A
  8%|▊         | 84/1000 [00:06<01:36,  9.50it/s][A
  9%|▊         | 86/1000 [00:06<01:49,  8.33it/s][A
 

              precision    recall  f1-score   support

           1     0.9256    0.9542    0.9397       873
           2     0.6000    0.4724    0.5286       127

    accuracy                         0.8930      1000
   macro avg     0.7628    0.7133    0.7341      1000
weighted avg     0.8842    0.8930    0.8875      1000






In [None]:
# Comparison: retrain a depth-5 tree without "contact".
run_smaller_model(X_train.drop("contact", axis=1), X_test.drop("contact", axis=1), y_train, y_test)


  0%|          | 0/1000 [00:00<?, ?it/s][A
  4%|▍         | 41/1000 [00:00<00:02, 409.82it/s][A
  8%|▊         | 82/1000 [00:00<00:02, 388.82it/s][A
 12%|█▏        | 121/1000 [00:00<00:02, 362.80it/s][A
 16%|█▌        | 158/1000 [00:00<00:02, 350.80it/s][A
 19%|█▉        | 194/1000 [00:00<00:02, 346.58it/s][A
 23%|██▎       | 229/1000 [00:00<00:02, 330.83it/s][A
 27%|██▋       | 266/1000 [00:00<00:02, 341.05it/s][A
 30%|███       | 301/1000 [00:00<00:02, 324.71it/s][A
 34%|███▎      | 337/1000 [00:00<00:01, 332.71it/s][A
 37%|███▋      | 372/1000 [00:01<00:01, 336.20it/s][A
 41%|████      | 412/1000 [00:01<00:01, 352.65it/s][A
 45%|████▍     | 448/1000 [00:01<00:01, 351.00it/s][A
 49%|████▊     | 487/1000 [00:01<00:01, 362.32it/s][A
 52%|█████▎    | 525/1000 [00:01<00:01, 364.70it/s][A
 56%|█████▌    | 562/1000 [00:01<00:01, 347.94it/s][A
 60%|█████▉    | 597/1000 [00:01<00:01, 334.90it/s][A
 63%|██████▎   | 632/1000 [00:01<00:01, 337.05it/s][A
 67%|██████▋   | 667/1

              precision    recall  f1-score   support

           1     0.9156    0.9565    0.9356       873
           2     0.5682    0.3937    0.4651       127

    accuracy                         0.8850      1000
   macro avg     0.7419    0.6751    0.7003      1000
weighted avg     0.8715    0.8850    0.8758      1000






In [None]:
# Comparison: impute "contact" with a DecisionTree, then use the reference tree.
run_imputation_model(clf, X_train, X_test.drop("contact", axis=1), y_train, y_test, ["contact"])


  0%|          | 0/1000 [00:00<?, ?it/s][A
  4%|▍         | 38/1000 [00:00<00:02, 373.30it/s][A
  8%|▊         | 76/1000 [00:00<00:02, 359.20it/s][A
 11%|█▏        | 114/1000 [00:00<00:02, 368.32it/s][A
 15%|█▌        | 151/1000 [00:00<00:02, 363.71it/s][A
 19%|█▉        | 188/1000 [00:00<00:02, 354.82it/s][A
 22%|██▎       | 225/1000 [00:00<00:02, 358.70it/s][A
 26%|██▌       | 261/1000 [00:00<00:02, 348.63it/s][A
 30%|██▉       | 297/1000 [00:00<00:01, 351.66it/s][A
 33%|███▎      | 333/1000 [00:00<00:01, 345.74it/s][A
 37%|███▋      | 369/1000 [00:01<00:01, 347.43it/s][A
 41%|████      | 407/1000 [00:01<00:01, 355.33it/s][A
 44%|████▍     | 443/1000 [00:01<00:01, 354.38it/s][A
 48%|████▊     | 480/1000 [00:01<00:01, 358.87it/s][A
 52%|█████▏    | 516/1000 [00:01<00:01, 358.80it/s][A
 55%|█████▌    | 552/1000 [00:01<00:01, 356.63it/s][A
 59%|█████▉    | 588/1000 [00:01<00:01, 355.85it/s][A
 62%|██████▎   | 625/1000 [00:01<00:01, 357.23it/s][A
 66%|██████▌   | 661/1

              precision    recall  f1-score   support

           1     0.9171    0.9633    0.9397       873
           2     0.6145    0.4016    0.4857       127

    accuracy                         0.8920      1000
   macro avg     0.7658    0.6825    0.7127      1000
weighted avg     0.8787    0.8920    0.8820      1000






### Combined

In [None]:
# Remove all three columns at prediction time and let the reference tree cope.
X_test_temp = X_test.drop(["housing", "balance", "contact"], axis=1)
predictions  = clf.predict(X_test_temp.reset_index(drop=True))
print(classification_report(y_test, predictions, digits=4))


  0%|          | 0/1000 [00:00<?, ?it/s][A
  0%|          | 4/1000 [00:02<11:42,  1.42it/s][A
  1%|          | 10/1000 [00:03<05:23,  3.06it/s][A
  1%|          | 12/1000 [00:05<08:30,  1.93it/s][A
  1%|▏         | 13/1000 [00:06<07:34,  2.17it/s][A
  1%|▏         | 14/1000 [00:07<11:12,  1.47it/s][A
  2%|▏         | 15/1000 [00:08<12:54,  1.27it/s][A
  2%|▏         | 17/1000 [00:10<11:45,  1.39it/s][A
  2%|▏         | 20/1000 [00:11<10:11,  1.60it/s][A
  2%|▏         | 22/1000 [00:11<07:53,  2.07it/s][A
  2%|▏         | 23/1000 [00:13<11:26,  1.42it/s][A
  3%|▎         | 27/1000 [00:13<06:00,  2.70it/s][A
  3%|▎         | 31/1000 [00:15<06:07,  2.63it/s][A
  4%|▍         | 38/1000 [00:16<03:38,  4.40it/s][A
  4%|▍         | 45/1000 [00:17<03:19,  4.80it/s][A
  5%|▍         | 49/1000 [00:17<02:56,  5.39it/s][A
  5%|▌         | 51/1000 [00:18<02:56,  5.36it/s][A
  5%|▌         | 54/1000 [00:18<02:27,  6.40it/s][A
  6%|▌         | 60/1000 [00:18<01:35,  9.79it/s][A
  

              precision    recall  f1-score   support

           1     0.9295    0.9519    0.9406       873
           2     0.6038    0.5039    0.5494       127

    accuracy                         0.8950      1000
   macro avg     0.7667    0.7279    0.7450      1000
weighted avg     0.8882    0.8950    0.8909      1000






In [None]:
# Comparison: retrain a depth-5 tree without the three columns.
run_smaller_model(X_train.drop(["housing", "balance", "contact"], axis=1), X_test.drop(["housing", "balance", "contact"], axis=1), y_train, y_test)


  0%|          | 0/1000 [00:00<?, ?it/s][A
  2%|▏         | 24/1000 [00:00<00:04, 233.75it/s][A
  5%|▍         | 48/1000 [00:00<00:04, 218.21it/s][A
  7%|▋         | 71/1000 [00:00<00:04, 221.66it/s][A
 11%|█         | 108/1000 [00:00<00:03, 277.13it/s][A
 14%|█▍        | 144/1000 [00:00<00:02, 305.66it/s][A
 18%|█▊        | 178/1000 [00:00<00:02, 316.90it/s][A
 21%|██        | 210/1000 [00:00<00:02, 313.77it/s][A
 24%|██▍       | 242/1000 [00:00<00:02, 300.49it/s][A
 28%|██▊       | 280/1000 [00:00<00:02, 322.15it/s][A
 32%|███▏      | 320/1000 [00:01<00:01, 341.94it/s][A
 36%|███▌      | 355/1000 [00:01<00:01, 336.08it/s][A
 39%|███▉      | 390/1000 [00:01<00:01, 340.02it/s][A
 43%|████▎     | 427/1000 [00:01<00:01, 347.70it/s][A
 46%|████▌     | 462/1000 [00:01<00:01, 341.23it/s][A
 50%|█████     | 500/1000 [00:01<00:01, 350.66it/s][A
 54%|█████▍    | 539/1000 [00:01<00:01, 361.28it/s][A
 58%|█████▊    | 577/1000 [00:01<00:01, 366.68it/s][A
 61%|██████▏   | 614/10

              precision    recall  f1-score   support

           1     0.9138    0.9588    0.9357       873
           2     0.5714    0.3780    0.4550       127

    accuracy                         0.8850      1000
   macro avg     0.7426    0.6684    0.6953      1000
weighted avg     0.8703    0.8850    0.8747      1000






In [None]:
# Comparison: impute the three columns in the listed order, then use the reference tree.
run_imputation_banking(clf, X_train, X_test.drop(["balance", "housing", "contact"], axis=1), y_train, y_test, ["balance", "housing" , "contact"])

100%|██████████| 1000/1000 [00:02<00:00, 466.67it/s]
100%|██████████| 1000/1000 [00:03<00:00, 323.33it/s]
100%|██████████| 1000/1000 [00:02<00:00, 366.18it/s]

              precision    recall  f1-score   support

           1     0.9181    0.9633    0.9402       873
           2     0.6190    0.4094    0.4929       127

    accuracy                         0.8930      1000
   macro avg     0.7686    0.6864    0.7165      1000
weighted avg     0.8801    0.8930    0.8834      1000




