In [1]:
# import required libraries

import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Load the income dataset (the file shared on eLearning) from the mounted Drive.
# NOTE(review): the path is relative, so this presumably relies on the working
# directory being /content (the parent of the Drive mount point) — confirm.
income_data=pd.read_csv("drive/My Drive/Income_Data.csv")

In [5]:
# Show the column names of the loaded frame
income_data.columns

Index(['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience',
       'Salary'],
      dtype='object')

In [105]:
def best_split(data, target, feature, Loss_Function):
  """Find the single threshold on `feature` that best predicts `target`.

  Every midpoint between two consecutive distinct values of `feature` is tried
  as a threshold. For each one, rows with `feature <= threshold` are predicted
  by the mean target of that side, the remaining rows by the mean target of the
  other side, and the resulting loss is recorded.

  Rows where either `target` or `feature` is missing are ignored: they cannot
  be assigned to a side, and a missing feature value would otherwise turn into
  a NaN threshold whose NaN loss can hide the real minimum.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame containing at least the `target` and `feature` columns.
  target : str
      Name of the numeric column to predict.
  feature : str
      Name of the numeric column to split on.
  Loss_Function : str
      "MSE" selects the mean squared error; any other value selects the mean
      absolute error (callers in this notebook pass "MAE").

  Returns
  -------
  pandas.DataFrame
      Columns "Threshold" and "Loss", holding the row(s) with the minimal loss
      (more than one row if several thresholds tie). The index is the position
      of the threshold among the sorted candidates. The frame is empty when the
      feature has fewer than two distinct non-missing values, because no split
      exists in that case.
  """
  # Work on plain arrays once instead of copying the frame for every threshold.
  clean = data[[target, feature]].dropna()
  x = clean[feature].to_numpy()
  y = clean[target].to_numpy(dtype=float)

  # np.unique returns the distinct values already sorted.
  unique_values = np.unique(x)
  if len(unique_values) < 2:
    return pd.DataFrame(columns=["Threshold", "Loss"], dtype=float)

  # Candidate thresholds: midpoints between neighbouring distinct values, so
  # both sides of every split contain at least one row.
  thresholds = (unique_values[:-1] + unique_values[1:]) / 2

  records = []
  for threshold in thresholds:
    is_left = x <= threshold
    # Each side is predicted by its own mean target value.
    y_hat = np.where(is_left, y[is_left].mean(), y[~is_left].mean())
    residual = y - y_hat

    if Loss_Function == "MSE":
      loss = np.power(residual, 2).mean()
    else:
      loss = np.abs(residual).mean()

    records.append((threshold, loss))

  # Build the result in one go; this also gives float columns rather than the
  # object dtype produced by filling an empty frame cell by cell.
  Losses = pd.DataFrame(records, columns=["Threshold", "Loss"])

  return Losses[Losses.Loss == Losses.Loss.min()]

In [106]:
# to estimate time it takes to run the model
import time

In [103]:
# Time one best_split search: MAE loss, splitting Salary on "Years of Experience".
# perf_counter is a monotonic, high-resolution clock meant for measuring short
# intervals; time.time() follows the wall clock, which can be adjusted mid-run.
start_time = time.perf_counter()
best_split(income_data, "Salary", "Years of Experience", "MAE")
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 0.22896981239318848 seconds ---


In [102]:
# Time one best_split search: MSE loss, splitting Salary on "Age".
# perf_counter is a monotonic, high-resolution clock meant for measuring short
# intervals; time.time() follows the wall clock, which can be adjusted mid-run.
start_time = time.perf_counter()
best_split(income_data, "Salary", "Age", "MSE")
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 0.18227481842041016 seconds ---


# Run a decision tree (DT) using scikit-learn

In [83]:
from sklearn.tree import DecisionTreeRegressor

In [91]:
# Before modelling with scikit-learn, missing values are removed here (the
# alternative would be to impute them): keep only rows where the target and
# both predictors are present. income_data itself is left untouched.
modeling_data = income_data.dropna(subset=["Salary", "Age", "Years of Experience"])

# Display the (rows, columns) that remain after filtering
modeling_data.shape

(6702, 6)

In [104]:
# Fit a decision tree limited to depth 1, i.e. a single split on one of the
# two predictors. fit() returns the estimator itself, so DT_model and
# regressor refer to the same fitted object.
regressor = DecisionTreeRegressor(max_depth=1, random_state=0)
DT_model = regressor.fit(
    X=modeling_data[["Age", "Years of Experience"]],
    y=modeling_data["Salary"],
)

In [96]:
from sklearn import tree
# Render the fitted tree (split rule and leaf values) as indented plain text.
# No feature names are passed, so predictors are labelled by position
# (feature_0, feature_1, ...) in the order of the columns used for fitting.
text_representation = tree.export_text(regressor)
print(text_representation)

|--- feature_1 <= 5.50
|   |--- value: [69054.88]
|--- feature_1 >  5.50
|   |--- value: [148679.00]

