In [1]:
# Load the public cases from JSON and build the descriptive-feature / target frames
import json
from pathlib import Path

import pandas as pd

# Built with pathlib so the relative path resolves on every OS; a literal
# "..\\public_cases.json" is only a path on Windows.
CASES_PATH = Path("..") / "public_cases.json"

# Fail fast with context instead of printing the error and continuing with an
# empty list: every later cell needs real rows, so an empty `contents` would
# only move the failure to a less obvious place.
try:
    with open(CASES_PATH, "r", encoding="utf-8") as f:
        contents = json.load(f)
except (OSError, json.JSONDecodeError) as e:
    raise RuntimeError(f"Could not load cases from {CASES_PATH}") from e

if not contents:
    raise ValueError(f"{CASES_PATH} contains no cases")

# Per-case dict of raw inputs. Deliberately not named `input`, which would
# shadow the builtin for the rest of the notebook.
case_inputs = [item["input"] for item in contents]
miles = [case.get("miles_traveled") for case in case_inputs]
days = [case.get("trip_duration_days") for case in case_inputs]
amount = [case.get("total_receipts_amount") for case in case_inputs]

# Target: the "expected_output" entry of every case, one row per case
target_feat = pd.DataFrame([item.get("expected_output") for item in contents])

data = {
    "miles_traveled": miles,
    "trip_duration_days": days,
    "total_receipts_amount": amount
}

# Add derived features to the descriptive feature data frame.
# EPS is added to each denominator so a zero denominator yields a large finite
# ratio instead of a division by zero.
EPS = 1e-6
LONG_TRIP_MIN_DAYS = 5  # trips lasting at least this many days get long_trip_flag = 1

# .assign builds the frame in one expression (re-running the cell is idempotent)
# and appends the new columns in the order listed here.
descriptive_feats = pd.DataFrame(data).assign(
    cost_per_mile=lambda d: d["total_receipts_amount"] / (d["miles_traveled"] + EPS),
    cost_per_day=lambda d: d["total_receipts_amount"] / (d["trip_duration_days"] + EPS),
    miles_per_day=lambda d: d["miles_traveled"] / (d["trip_duration_days"] + EPS),
    long_trip_flag=lambda d: (d["trip_duration_days"] >= LONG_TRIP_MIN_DAYS).astype(int),
)

In [2]:
# Hold out 25% of the cases for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    descriptive_feats,
    target_feat,
    random_state=42,
    test_size=0.25,
)

# Flatten the training target to a 1-D array; y_test is left as a DataFrame
y_train = y_train.to_numpy().ravel()

In [3]:
# Build min-max-scaled and standardized copies of the train / test features
from sklearn.preprocessing import MinMaxScaler, StandardScaler


def _as_labelled_frame(scaled_values, like):
    """Wrap a scaler's array output in a DataFrame carrying ``like``'s columns and index."""
    return pd.DataFrame(scaled_values, columns=list(like.columns), index=like.index)


# Min-max scaling: fitted on the training split only, then applied to the test split
mm = MinMaxScaler()
X_train_mm = _as_labelled_frame(mm.fit_transform(X_train), X_train)
X_test_mm = _as_labelled_frame(mm.transform(X_test), X_test)

# Standard scaling: same fit-on-train / transform-test procedure
std = StandardScaler()
X_train_std = _as_labelled_frame(std.fit_transform(X_train), X_train)
X_test_std = _as_labelled_frame(std.transform(X_test), X_test)

In [4]:
# Fit one gradient boosting regressor per feature variant (raw, standardized, min-max)

from sklearn.ensemble import GradientBoostingRegressor

# Identical hyper-parameters for all three models, stated once so the
# comparison only varies the feature scaling.
GB_PARAMS = {
    "learning_rate": 0.05,
    "max_depth": 3,
    "n_estimators": 200,
    "subsample": 0.7,
    "random_state": 42,
}

gb_model, gb_model_std, gb_model_mm = (
    GradientBoostingRegressor(**GB_PARAMS) for _ in range(3)
)

# No normalization
gb_model.fit(X_train, y_train)

# Standard normalization
gb_model_std.fit(X_train_std, y_train)

# Min-Max normalization
gb_model_mm.fit(X_train_mm, y_train)

In [5]:
# Score each model on its matching test frame and report the best technique per metric
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score

y_pred = gb_model.predict(X_test)
y_pred_std = gb_model_std.predict(X_test_std)
y_pred_mm = gb_model_mm.predict(X_test_mm)

predictions = {
    "No Normalization": y_pred,
    "Standardization": y_pred_std,
    "Min-Max Scaling": y_pred_mm
}

# Loop invariants, computed once: the relative-error tolerance and the flat
# array of true targets used for the "within threshold" accuracy.
threshold = 0.05
y_true = y_test.to_numpy().flatten()

# Each holds [technique name, metric value] for the best technique seen so far
best_mae = []
best_r2 = []
best_accuracy = []

# Find the best normalization technique based on different evaluation metrics
for pred, y_hat in predictions.items():
    print(pred)
    mae = mean_absolute_error(y_test, y_hat)
    r_squared = r2_score(y_test, y_hat)

    # Share of predictions whose relative error is at most `threshold`.
    # NOTE: a target of exactly 0 makes the ratio inf/nan, which compares as
    # "not within threshold".
    within_threshold = np.abs((y_true - y_hat) / y_true) <= threshold
    accuracy_within_threshold = np.mean(within_threshold) * 100

    print(f"Mean Absolute Error: {mae}")
    print(f"R-squared: {r_squared}")
    print(f"Accuracy within {threshold*100}% threshold: {accuracy_within_threshold}%")
    print("\n")

    # Keep the best result per metric together with its technique. Comparisons
    # are strict, so on a tie the technique evaluated first is kept.
    if not best_accuracy or accuracy_within_threshold > best_accuracy[1]:
        best_accuracy = [pred, accuracy_within_threshold]
    if not best_mae or mae < best_mae[1]:
        best_mae = [pred, mae]
    if not best_r2 or r_squared > best_r2[1]:
        best_r2 = [pred, r_squared]

print("Best normalization technique: ")
print(best_accuracy, "\n", best_mae, "\n", best_r2)


No Normalization
Mean Absolute Error: 71.7968341470425
R-squared: 0.9430233497368724
Accuracy within 5.0% threshold: 56.39999999999999%


Standardization
Mean Absolute Error: 71.46645998599479
R-squared: 0.9432848766904267
Accuracy within 5.0% threshold: 58.8%


Min-Max Scaling
Mean Absolute Error: 71.4634093960824
R-squared: 0.9432922532935596
Accuracy within 5.0% threshold: 58.8%


Best normalization technique: 
['Standardization', np.float64(58.8)] 
 ['Min-Max Scaling', np.float64(71.4634093960824)] 
 ['Min-Max Scaling', 0.9432922532935596]


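Technique used: No Normalization — the three variants differ only marginally (MAE 71.80 vs. 71.46, R² ≈ 0.943 for all), so the model trained on the raw features is the one saved below and no scaler has to be stored with it.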

In [6]:
# Save the model trained on the un-normalized features
from pathlib import Path

import joblib

# Built with pathlib so the separator is correct on every OS; a literal
# '..\\artifacts\\...' would create one oddly named file in the working
# directory on non-Windows systems.
MODEL_PATH = Path("..") / "artifacts" / "finalModelgb.pkl"

# Create the target directory if it is missing so the dump cannot fail on it
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# Passed as str; the call stays the last expression so the cell output is
# still the list of written file names.
joblib.dump(gb_model, str(MODEL_PATH))

['..\\artifacts\\finalModelgb.pkl']