In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import warnings
# Suppress every warning of category UserWarning for the rest of the session,
# so cells below will not display them.
warnings.filterwarnings("ignore", category=UserWarning)

In [9]:
import json




# Read the public cases file and parse it as JSON.
with open('public_cases.json', 'r') as cases_file:
    data_json = json.load(cases_file)

# Tabulate the parsed records; the rendered frame has an `input` column
# (a dict per row) and an `expected_output` column.
df = pd.DataFrame(data=data_json)

In [3]:
# Show the loaded cases: each row holds an `input` dict and its `expected_output`.
df

Unnamed: 0,input,expected_output
0,"{'trip_duration_days': 3, 'miles_traveled': 93...",364.51
1,"{'trip_duration_days': 1, 'miles_traveled': 55...",126.06
2,"{'trip_duration_days': 1, 'miles_traveled': 47...",128.91
3,"{'trip_duration_days': 2, 'miles_traveled': 13...",203.52
4,"{'trip_duration_days': 3, 'miles_traveled': 88...",380.37
...,...,...
995,"{'trip_duration_days': 1, 'miles_traveled': 10...",446.94
996,"{'trip_duration_days': 11, 'miles_traveled': 6...",1699.94
997,"{'trip_duration_days': 6, 'miles_traveled': 37...",946.39
998,"{'trip_duration_days': 8, 'miles_traveled': 41...",802.95


In [10]:
# --- Feature Engineering ---
def feature_engineering(df):
    """Return a copy of ``df`` with per-day ratios and rule flags added.

    The input frame is left untouched, so re-running the calling cell (or
    inspecting the raw frame afterwards) always sees the original columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``days``, ``miles`` and ``receipts``.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column plus:

        * ``miles_per_day``    - ``miles / days`` (a ``days`` of 0 is treated as 1)
        * ``spending_per_day`` - ``receipts / days`` (same zero handling)
        * ``sweet_spot``       - 1 when ``days == 5``, ``miles_per_day`` is in
          [180, 220] and ``spending_per_day <= 100``; otherwise 0
        * ``vacation_penalty`` - 1 when ``days >= 8`` and
          ``spending_per_day > 90``; otherwise 0
    """
    # Work on a copy: the original assigned columns straight into the caller's frame.
    out = df.copy()

    # Basic features. Only an exact 0 is replaced, to avoid division by zero.
    safe_days = out['days'].replace(0, 1)
    out['miles_per_day'] = out['miles'] / safe_days
    out['spending_per_day'] = out['receipts'] / safe_days

    # Flag for "sweet spot combo" (Kevin's insight); `between` is inclusive on both ends.
    out['sweet_spot'] = (
        (out['days'] == 5)
        & out['miles_per_day'].between(180, 220)
        & (out['spending_per_day'] <= 100)
    ).astype(int)

    # Flag for "vacation penalty"
    out['vacation_penalty'] = (
        (out['days'] >= 8) & (out['spending_per_day'] > 90)
    ).astype(int)

    # More flags based on interview insights can be added here.

    return out

# --- Synthetic inputs for demonstration ---
# Seed NumPy's global generator first so the three draws below repeat on every run.
np.random.seed(42)
n_samples = 1000
days = np.random.randint(low=1, high=15, size=n_samples)        # integers 1..14 (high is exclusive)
miles = np.random.randint(low=10, high=1000, size=n_samples)    # integers 10..999
receipts = np.random.uniform(low=0, high=1500, size=n_samples)  # floats in [0, 1500)


In [11]:

# Collect the three synthetic arrays into one frame, one row per simulated trip.
data = pd.DataFrame(dict(days=days, miles=miles, receipts=receipts))

# Simulated target: a mock of the legacy business logic, with quirks and noise.
def legacy_system_simulation(row):
    """Compute a mock reimbursement for a single trip.

    ``row`` is anything indexable by ``'days'``, ``'miles'`` and ``'receipts'``
    (for example a DataFrame row). The amount starts at
    ``50 * days + 0.5 * miles + 0.7 * receipts``, is scaled by 1.15 for a
    "sweet spot" trip or by 0.85 for a "vacation penalty" trip, and then has
    Gaussian noise (mean 0, standard deviation 25) added from NumPy's global
    random generator.
    """
    n_days = row['days']
    divisor = max(n_days, 1)  # per-day ratios treat fewer than one day as one day
    daily_miles = row['miles'] / divisor
    daily_spend = row['receipts'] / divisor

    amount = 50 * n_days + 0.5 * row['miles'] + 0.7 * row['receipts']

    # "Sweet spot" bonus: exactly 5 days, 180-220 miles per day, at most 100 spent per day.
    in_sweet_spot = n_days == 5 and 180 <= daily_miles <= 220 and daily_spend <= 100
    if in_sweet_spot:
        amount *= 1.15

    # "Vacation penalty": 8 or more days with more than 90 spent per day.
    on_vacation = n_days >= 8 and daily_spend > 90
    if on_vacation:
        amount *= 0.85

    # Noise mimicking randomness/bugs.
    return amount + np.random.normal(0, 25)

# Apply the mock legacy formula to each row to produce the regression target.
data['reimbursement'] = data.apply(legacy_system_simulation, axis='columns')

In [6]:



# --- Build the feature matrix and target ---
# NOTE(review): this cell fits on the synthetic `data` frame, not on a frame
# read from disk — confirm that is intended.
data = feature_engineering(data)

features = [
    'days', 'miles', 'receipts',            # raw trip inputs
    'miles_per_day', 'spending_per_day',    # per-day ratios
    'sweet_spot', 'vacation_penalty',       # rule flags
]
X, y = data[features], data['reimbursement']

# --- Hold out 20% of the rows for evaluation (fixed seed) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Fit the gradient boosting regressor ---
model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)
model.fit(X_train, y_train)

# --- Score on the held-out rows ---
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on test data: ${mae:.2f}")

# --- Example prediction ---



Mean Absolute Error on test data: $36.29


In [12]:
# Serialize the fitted `model` to 'reimbursement_model.pkl' in the working directory.
import joblib

joblib.dump(value=model, filename='reimbursement_model.pkl')


['reimbursement_model.pkl']

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split