In [15]:
pip install graphviz

Collecting graphviz
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading graphviz-0.21-py3-none-any.whl (47 kB)
   ---------------------------------------- 0.0/47.3 kB ? eta -:--:--
   ---------------------------------- ----- 41.0/47.3 kB 991.0 kB/s eta 0:00:01
   ---------------------------------------- 47.3/47.3 kB 788.5 kB/s eta 0:00:00
Installing collected packages: graphviz
Successfully installed graphviz-0.21
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_recall_curve
import json
import numpy as np 
import graphviz

In [5]:
# Getting data from the JSON file
#
# Each record is expected to look like:
#   {"input": {"miles_traveled": ..., "trip_duration_days": ..., "total_receipts_amount": ...},
#    "expected_output": ...}
CASES_PATH = "public_cases.json"

# Fail loudly if the file is missing or malformed. Printing the error and
# carrying on with an empty list would only surface later, far from the cause.
try:
    with open(CASES_PATH, 'r') as f:
        contents = json.load(f)
except (OSError, json.JSONDecodeError) as e:
    raise RuntimeError(f"Could not load cases from {CASES_PATH!r}: {e}") from e

# Named `case_inputs` (not `input`) so the built-in input() is not shadowed.
# Indexing with [] raises a KeyError naming the missing field on a bad record.
case_inputs = [item["input"] for item in contents]

miles = [item.get("miles_traveled") for item in case_inputs]
days = [item.get("trip_duration_days") for item in case_inputs]
amount = [item.get("total_receipts_amount") for item in case_inputs]

# Regression targets, in the same order as the rows of the feature frame.
output = [item["expected_output"] for item in contents]

# Convert data to data frame
data = {
    "miles_traveled": miles,
    "trip_duration_days": days,
    "total_receipts_amount": amount
}

dataFrame_input = pd.DataFrame(data)

print(dataFrame_input)
# Preview the targets instead of dumping the whole list into the cell output.
print(f"{len(output)} expected outputs; first 10: {output[:10]}")

     miles_traveled  trip_duration_days  total_receipts_amount
0              93.0                   3                   1.42
1              55.0                   1                   3.60
2              47.0                   1                  17.97
3              13.0                   2                   4.67
4              88.0                   3                   5.78
..              ...                 ...                    ...
995          1082.0                   1                1809.49
996           636.0                  11                2238.97
997           370.0                   6                 315.09
998           413.0                   8                 222.83
999           399.0                   3                 141.39

[1000 rows x 3 columns]
[364.51, 126.06, 128.91, 203.52, 380.37, 158.35, 320.12, 199.68, 464.07, 359.1, 356.17, 366.87, 204.58, 430.86, 195.14, 117.24, 179.06, 120.65, 234.2, 325.56, 574.1, 1443.96, 869, 1030.41, 1654.62, 621.12, 1139.94, 1624

In [3]:
# Hold out a quarter of the cases for testing; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    dataFrame_input,
    output,
    test_size=0.25,
    random_state=42,
)

### Final Trees

In [4]:
# Depth-6 regression tree grown with the squared-error split criterion.
# fit() returns the estimator itself, so construction and training are chained.
regressor_se = DecisionTreeRegressor(
    random_state=42,
    max_depth=6,
    criterion='squared_error',
).fit(X_train, Y_train)

# Predictions on the held-out test rows, used by the evaluation cells.
Y_pred_se = regressor_se.predict(X_test)

In [5]:
# Depth-6 regression tree grown with the Friedman-MSE split criterion.
# fit() returns the estimator itself, so construction and training are chained.
regressor_f = DecisionTreeRegressor(
    random_state=42,
    max_depth=6,
    criterion='friedman_mse',
).fit(X_train, Y_train)

# Predictions on the held-out test rows, used by the evaluation cells.
Y_pred_f = regressor_f.predict(X_test)

In [6]:
# Regression tree grown with the absolute-error split criterion.
# Note this one is allowed depth 8, unlike the depth-6 trees in the other cells.
regressor_ae = DecisionTreeRegressor(
    random_state=42,
    max_depth=8,
    criterion='absolute_error',
).fit(X_train, Y_train)

# Predictions on the held-out test rows, used by the evaluation cells.
Y_pred_ae = regressor_ae.predict(X_test)

In [7]:
# Depth-6 regression tree grown with the Poisson-deviance split criterion.
# fit() returns the estimator itself, so construction and training are chained.
regressor_p = DecisionTreeRegressor(
    random_state=42,
    max_depth=6,
    criterion='poisson',
).fit(X_train, Y_train)

# Predictions on the held-out test rows, used by the evaluation cells.
Y_pred_p = regressor_p.predict(X_test)

Cross Validation Strategies 


In [None]:
# K-Fold Cross Validation
# Each entry is [held-out predictions, display name, fitted estimator].
# Only the name and the estimator are used in this cell.
models = [[Y_pred_ae, 'Absolute Error', regressor_ae], [Y_pred_f, 'Friedman MSE', regressor_f], [Y_pred_p, 'Poisson', regressor_p], [Y_pred_se, 'Squared Error', regressor_se]]

# One splitter shared by every model, so all of them are scored on the same folds.
# No shuffle is requested, so the folds follow the row order of the data.
k_folds = KFold(n_splits = 5)

for _, name, regressor in models:
    print(f"K Fold Cross Validation for model: {name}")
    # No `scoring` is passed, so each fold is scored with the estimator's own
    # default score method (R^2 for scikit-learn regressors).
    scores = cross_val_score(regressor, X = dataFrame_input, y = output, cv = k_folds)
    print("Cross Validation Scores: ", scores)
    print("Average CV Score: ", scores.mean())
    print("Number of CV Scores used in Average: ", len(scores))
    print("\n")

K Fold Cross Validation for model: Absolute Error
Cross Validation Scores:  [0.90265693 0.85197457 0.88733391 0.78135345 0.84858625]
Average CV Score:  0.8543810234214838
Number of CV Scores used in Average:  5


K Fold Cross Validation for model: Friedman MSE
Cross Validation Scores:  [0.90481347 0.84576235 0.88057398 0.78651226 0.87482673]
Average CV Score:  0.8584977610342299
Number of CV Scores used in Average:  5


K Fold Cross Validation for model: Poisson
Cross Validation Scores:  [0.88650373 0.8501377  0.85508663 0.77535614 0.87848288]
Average CV Score:  0.8491134155875832
Number of CV Scores used in Average:  5


K Fold Cross Validation for model: Squared Error
Cross Validation Scores:  [0.90481347 0.84576235 0.88066365 0.78651226 0.87482673]
Average CV Score:  0.8585156950701277
Number of CV Scores used in Average:  5




Multiple Evaluation Metrics

In [43]:
# Each entry is [held-out predictions, display name, fitted estimator].
models = [[Y_pred_ae, 'Absolute Error', regressor_ae], [Y_pred_f, 'Friedman MSE', regressor_f], [Y_pred_p, 'Poisson', regressor_p], [Y_pred_se, 'Squared Error', regressor_se]]

# Column names of the training frame, so exported tree nodes show real feature
# names instead of generic positional labels.
feature_names = list(X_train.columns)

for predictions, name, regressor in models:
    # Write the fitted tree to "<name>.dot" in the working directory.
    # export_graphviz returns None when out_file is given, so nothing is captured.
    export_graphviz(regressor, out_file = (name + '.dot'), feature_names = feature_names)
    print(f"Results for model: {name}")
    mse = mean_squared_error(Y_test, predictions)
    mae = mean_absolute_error(Y_test, predictions)
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Mean Absolute Error: {mae:.2f}")
    # Root mean squared error: same units as the target, unlike the raw MSE.
    print(f"RMSE: {np.sqrt(mse):.2f}")
    print("\n")

Results for model: Absolute Error
Mean Squared Error: 17709.36
Mean Absolute Error: 92.50
RSME: 133.08


Results for model: Friedman MSE
Mean Squared Error: 17295.57
Mean Absolute Error: 93.40
RSME: 131.51


Results for model: Poisson
Mean Squared Error: 18296.22
Mean Absolute Error: 100.12
RSME: 135.26


Results for model: Squared Error
Mean Squared Error: 17331.49
Mean Absolute Error: 93.67
RSME: 131.65




To view a tree, paste the contents of its generated .dot file into http://webgraphviz.com/