<a href="https://colab.research.google.com/github/rajdeepbanerjee-git/JNCLectures_Intro_to_ML/blob/main/Week8/Lec8_RegTree_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
# NOTE(review): this silences every warning for the rest of the notebook, so
# library warnings raised by later cells will not be shown — consider narrowing
# the filter to the specific categories that are known to be harmless.
warnings.filterwarnings("ignore")

Data reference: [Hitters baseball data on Kaggle](https://www.kaggle.com/datasets/mathchi/hitters-baseball-data)

In [None]:
# Read the Hitters data and keep the two predictors (Years, Hits) plus the response (Salary)
data = pd.read_csv("Hitters.csv")
columns_needed = ["Years", "Hits", "Salary"]
data_req = data[columns_needed]
data_req.head()

Unnamed: 0,Years,Hits,Salary
0,1,66,
1,14,81,475.0
2,3,130,480.0
3,11,141,500.0
4,2,87,91.5


In [None]:
# remove all rows with any NA value; a new frame is built instead of mutating in place,
# because data_req was selected out of `data` and in-place edits on such a selection
# are the pattern pandas reports as SettingWithCopyWarning
# it is important to reset the index, as these indices are used below to separate out regions
data_req = data_req.dropna(how = "any").reset_index(drop = True)

In [None]:
# Confirm that the index is a contiguous RangeIndex again after the rows with NA were dropped
data_req.index

RangeIndex(start=0, stop=263, step=1)

In [None]:
# Predictors stay a DataFrame (regions are tracked through its index);
# the response becomes a plain NumPy array that is addressed by those index values
feature_columns = ['Years', 'Hits']
X = data_req[feature_columns]
y = data_req['Salary'].to_numpy(copy = True)
type(X), type(y)

(pandas.core.frame.DataFrame, numpy.ndarray)

In [None]:
# For simplicity, any region will be defined by a df or a np array of selected indices
# Step 1: check that the number of observations in the region is at least the minimum region size (observations_per_region, set to 30 below); if yes proceed to step 2, else stop
# Step 2: for every feature (j = 1, ..., p) and every candidate cutpoint (s = 1, ..., k), calculate the total loss for the pair (j, s)
# Step 3: choose the (j, s) for which the loss is minimal
# Step 4: split the region into a left and a right region at that (j, s)
# Go back to step 1 for each of the new regions

In [None]:
# returning cutpoints as deciles
def get_cutpoints(region_as_df, feature_name):
  ''' Return the candidate cut points for one feature of a region.

  The candidates are the 0%, 10%, ..., 90% quantiles of the feature, with
  repeated values removed and every value that is not strictly above the
  feature minimum discarded: a cut at the minimum would send no observation
  to the "< cut" side, so it can never produce a real split.

  Parameters:
    region_as_df: DataFrame holding the observations of the region
    feature_name: name of the column to compute cut points for

  Returns:
    list of cut points in ascending order. If no quantile lies above the
    minimum (e.g. the feature is constant in this region) the de-duplicated
    quantiles are returned instead, so the list is not emptied by the filter.
  '''
  feature_values = region_as_df[feature_name]
  deciles = feature_values.quantile([i/10 for i in range(10)])

  # dict.fromkeys removes repeats while keeping the ascending order of the quantiles
  unique_cuts = list(dict.fromkeys(deciles))

  lowest_value = feature_values.min()
  usable_cuts = [cut for cut in unique_cuts if cut > lowest_value]

  return usable_cuts if usable_cuts else unique_cuts

def calc_loss(left_region, right_region, target = None):
  ''' Total squared-error loss of a split.

  Each region is scored by the sum of squared deviations of its responses
  from the region mean; the two scores are added.

  Parameters:
    left_region: list of positions into the response array for the left region
    right_region: list of positions into the response array for the right region
    target: optional response array; when omitted the notebook-level array `y` is used

  Returns:
    the summed squared error of both regions. An empty region contributes 0.
  '''
  responses = y if target is None else np.asarray(target)

  def _sum_squared_error(region):
    # an empty region has no mean; skip it explicitly instead of letting
    # np.mean emit a RuntimeWarning on an empty slice
    if len(region) == 0:
      return 0.0
    region_values = responses[region]
    return np.sum((region_values - np.mean(region_values))**2)

  total_loss = _sum_squared_error(left_region) + _sum_squared_error(right_region)

  return total_loss


In [None]:
def get_min_js(region_as_df):
  """ Find the best (j, s) combination for a region.

  Every column of the region is tried as the splitting feature j, and every
  value returned by get_cutpoints for that column is tried as the cut point s.
  Observations with feature < s go left, the rest go right, and the split is
  scored with calc_loss.

  Returns a dict with the keys "j" (column position), "feature" (column name),
  "s" (cut point) and "loss" for the candidate with the smallest loss.
  """

  frame = region_as_df.copy()
  candidates = []

  for col_pos, col_name in enumerate(frame.columns):
    for threshold in get_cutpoints(region_as_df = frame, feature_name = col_name):
      # index labels of the observations falling on each side of the cut
      below = frame[col_name] < threshold
      at_or_above = frame[col_name] >= threshold
      split_loss = calc_loss(left_region = list(frame[below].index),
                             right_region = list(frame[at_or_above].index))
      candidates.append({"j": col_pos,
                         "feature": col_name,
                         "s": threshold,
                         "loss": split_loss})

  candidates_df = pd.DataFrame(candidates)
  best_position = candidates_df['loss'].argmin()

  return candidates_df.iloc[best_position].to_dict()

In [None]:
def get_regions(region_as_df, choice):
  ''' Split a region in two at the chosen (j, s).

  `choice` is a dict whose 'feature' entry names the splitting column and whose
  's' entry is the cut point. Rows with feature < s form the left region, rows
  with feature >= s form the right region; both keep their original index labels.

  Returns the tuple (left region, right region).
  '''
  split_column = region_as_df[choice['feature']]
  threshold = choice['s']

  goes_left = split_column < threshold
  goes_right = split_column >= threshold

  return region_as_df[goes_left], region_as_df[goes_right]

In [None]:
# Root of the tree: the region is the complete feature table
region = X.copy()
# a child region holding fewer observations than this is not split any further
observations_per_region = 30

# best (j, s) for the root region
min_js = get_min_js(region_as_df = region)

# the two child regions produced by that split
region_left, region_right = get_regions(region_as_df = region, choice = min_js)

# start the log of splits with the root split and the index labels of its children
results = [{"start_region": "all",
            "split_details": min_js,
            "region_left_indices": region_left.index.tolist(),
            "region_right_indices": region_right.index.tolist()}]


In [None]:
# Split the left child of the root, but only if it still holds enough observations
if len(region_left) >= observations_per_region:
  region = region_left.copy()

  # best (j, s) inside the left child
  min_js = get_min_js(region_as_df = region)

  # grandchildren on the left side of the tree
  region_left_left, region_left_right = get_regions(region_as_df = region, choice = min_js)

  # log the split together with the index labels of both grandchildren
  results.append({"start_region": "left",
                  "split_details": min_js,
                  "region_left_indices": region_left_left.index.tolist(),
                  "region_right_indices": region_left_right.index.tolist()})



In [None]:
# Split the right child of the root, but only if it still holds enough observations
if len(region_right) >= observations_per_region:
  region = region_right.copy()

  # best (j, s) inside the right child
  min_js = get_min_js(region_as_df = region)

  # grandchildren on the right side of the tree
  region_right_left, region_right_right = get_regions(region_as_df = region, choice = min_js)

  # log the split together with the index labels of both grandchildren
  results.append({"start_region": "right",
                  "split_details": min_js,
                  "region_left_indices": region_right_left.index.tolist(),
                  "region_right_indices": region_right_right.index.tolist()})


173


In [None]:
# One line per recorded split: the chosen (j, s) with its loss, then the sizes of the
# left and right regions. A plain loop is used because print returns None, so wrapping
# it in a list comprehension only builds (and displays) a list of None values.
for result in results:
  print(result["split_details"], len(result["region_left_indices"]), len(result["region_right_indices"]))

{'j': 0, 'feature': 'Years', 's': 5.0, 'loss': 40162623.58303067} 90 173
{'j': 0, 'feature': 'Years', 's': 4.0, 'loss': 6375188.202370053} 62 28
{'j': 1, 'feature': 'Hits', 's': 113.0, 'loss': 24157797.71410497} 86 87


[None, None, None]

In [None]:
# The tree now has four leaves: region_left_left, region_left_right, region_right_left, region_right_right
# The prediction of a leaf is simply the mean response of the observations inside it.
# results[1] is read as the split of the left child and results[2] as the split of the
# right child, i.e. this assumes both child splits above were actually performed.
leaf_sides = ("region_left_indices", "region_right_indices")
pred_reg_l_l, pred_reg_l_r = (np.mean(y[results[1][side]]) for side in leaf_sides)
pred_reg_r_l, pred_reg_r_r = (np.mean(y[results[2][side]]) for side in leaf_sides)
print(f" region_l_l: {pred_reg_l_l}, \n region_l_r: {pred_reg_l_r}, \n region_r_l: {pred_reg_r_l}, \n region_r_r: {pred_reg_r_r}")

 region_l_l: 181.3682741935484, 
 region_l_r: 324.2857142857143, 
 region_r_l: 464.8546627906977, 
 region_r_r: 926.9675057471266


In [None]:
# for prediction you have to trace the path through the tree

def reg_tree_prediction(data_point):
  """Print the predicted salary for one observation by tracing it through the tree.

  data_point is an indexable sequence (numpy array, list, ...) whose element 0
  is Years and element 1 is Hits. The thresholds 5 and 4 (Years) and 113 (Hits)
  are hard-coded and must match the split points of the fitted tree; the printed
  value is the mean salary of the leaf the observation falls into.

  Every split sends "feature < s" to the left and "feature >= s" to the right,
  so Years == 4 belongs to the left-right leaf (4 <= Years < 5).
  Inputs that match no branch (e.g. NaN) print "input out of bounds".
  """
  years = data_point[0]
  if years < 4:
    print(f"Salary: {pred_reg_l_l}")
  elif years >= 4 and years < 5:
    # lower bound is inclusive, mirroring the ">= s" side of the training split
    print(f"Salary: {pred_reg_l_r}")
  elif years >= 5 and data_point[1] < 113:
    print(f"Salary: {pred_reg_r_l}")
  elif years >= 5 and data_point[1] >= 113:
    print(f"Salary: {pred_reg_r_r}")
  else:
    print("input out of bounds")




In [None]:
# Summary statistics of the two predictors (Years, Hits)
X.describe()

Unnamed: 0,Years,Hits
count,263.0,263.0
mean,7.311787,107.828897
std,4.793616,45.125326
min,1.0,1.0
25%,4.0,71.5
50%,6.0,103.0
75%,10.0,141.5
max,24.0,238.0


In [None]:
# Years = 6 and Hits = 20 satisfy the "Years >= 5 and Hits < 113" branch of reg_tree_prediction
reg_tree_prediction([6, 20])

Salary: 464.8546627906977
