In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import joblib
import pickle
import os
# Load processed data
df = pd.read_csv(r"../Data/processed_wallet_data.csv")
print("Dataset shape:", df.shape)
df.sample(5)


Dataset shape: (10000, 16)


Unnamed: 0,User_ID,Month,Year,total_kwh,avg_kwh,total_cost,avg_cost,avg_wallet_balance,avg_session_duration,avg_cost_efficiency,peak_hour_ratio,City,Vehicle_Type,Subscription_Type,Payment_Mode,Charger_Type
6894,U06895,1,2025,5.58,5.58,52.01,52.01,855.9,60,9.320789,0,Bangalore,4W,Basic,Wallet,Superfast
2639,U02640,3,2025,7.73,7.73,85.03,85.03,1703.34,60,11.0,0,Ahmedabad,2W,Basic,Credit Card,Superfast
3442,U03443,8,2025,46.07,46.07,485.12,485.12,1352.83,60,10.530063,0,Bangalore,2W,Basic,Credit Card,Superfast
5862,U05863,2,2024,48.97,48.97,443.67,443.67,1318.23,60,9.060037,0,Ahmedabad,2W,Pay-as-you-go,UPI,Fast
9293,U09294,9,2025,16.41,16.41,167.55,167.55,1564.21,60,10.210238,0,Delhi,2W,Premium,Debit Card,Fast


In [24]:
# Step 2: Feature Preparation (Fixed, Enriched & Safe)
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

target = 'total_cost'   # predicting monthly total spend

# Drop mathematically linked or leaky features
drop_cols = ['avg_cost', 'total_cost', 'avg_cost_efficiency', 'total_kwh', 'avg_kwh']
drop_cols = [c for c in drop_cols if c in df.columns]

categorical_cols = ['City', 'Vehicle_Type', 'Subscription_Type',
                    'Payment_Mode', 'Charger_Type', 'Month', 'Year', 'season']

# Build base features and target
X = df.drop(columns=['User_ID', target] + drop_cols)
y = df[target]

# -------------- ENRICH FEATURES HERE -------------- #
X['sessions_per_user_month'] = df.groupby(['User_ID', 'Month', 'Year'])['Month'].transform('count')
X['cost_per_kwh_est'] = df['avg_cost_efficiency'] * df['peak_hour_ratio'].fillna(0)
X['wallet_to_cost_ratio'] = df['avg_wallet_balance'] / (df['avg_cost'] + 1e-6)
X['vehicle_encoded'] = df['Vehicle_Type']
X['subscription_encoded'] = df['Subscription_Type']
# --------------------------------------------------- #

# Replace inf and NaN
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)

# Encode all object/categorical columns safely
le_dict = {}
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        le_dict[col] = le

# Scale numerical columns
scaler = StandardScaler()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

print("Feature preparation and enrichment complete.")
print("Final feature shape:", X.shape)
print("Target variable:", target)


Feature preparation and enrichment complete.
Final feature shape: (10000, 15)
Target variable: total_cost


In [25]:
# Step 3: Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

Train size: (8000, 15) Test size: (2000, 15)


In [26]:
X_train

Unnamed: 0,Month,Year,avg_wallet_balance,avg_session_duration,peak_hour_ratio,City,Vehicle_Type,Subscription_Type,Payment_Mode,Charger_Type,sessions_per_user_month,cost_per_kwh_est,wallet_to_cost_ratio,vehicle_encoded,subscription_encoded
9254,0.133166,0.982749,-1.347091,0.0,-0.512471,-0.225991,-1.169913,-1.219375,-1.328435,1.230953,0.0,-0.500862,-0.637796,-1.169913,-1.219375
1561,-0.447712,0.982749,-1.343413,0.0,-0.512471,-1.100737,0.952571,1.224752,0.449806,0.003926,0.0,-0.500862,-0.639087,0.952571,1.224752
1670,-1.319029,0.982749,-0.200985,0.0,-0.512471,1.523502,-1.169913,1.224752,1.338927,0.003926,0.0,-0.500862,-0.256106,-1.169913,1.224752
6087,-0.738151,0.982749,-0.605147,0.0,-0.512471,-0.225991,-1.169913,1.224752,0.449806,-1.223100,0.0,-0.500862,-0.386605,-1.169913,1.224752
6669,0.423605,-1.017554,0.862129,0.0,1.951331,-1.538110,0.952571,-1.219375,-1.328435,-1.223100,0.0,2.166305,-0.417645,0.952571,-1.219375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,-0.447712,-1.017554,0.096180,0.0,1.951331,-1.100737,-1.169913,0.002689,0.449806,1.230953,0.0,1.958041,0.192510,-1.169913,0.002689
5191,-1.609468,0.982749,-1.197831,0.0,-0.512471,-1.538110,0.952571,0.002689,0.449806,-1.223100,0.0,-0.500862,-0.597076,0.952571,0.002689
5390,1.294923,-1.017554,1.212207,0.0,-0.512471,-1.100737,-1.169913,1.224752,0.449806,0.003926,0.0,-0.500862,1.585590,-1.169913,1.224752
860,0.423605,0.982749,-1.219073,0.0,1.951331,-1.538110,-1.169913,0.002689,-0.439315,0.003926,0.0,2.290744,-0.635054,-1.169913,0.002689


In [27]:
# Step 4: Train Models (Safe version)

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np

# Ensure clean, numeric data
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)

# Split again if needed (in case you modified X)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define models
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(
        n_estimators=150,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        verbosity=0,
        n_jobs=-1
    )
}

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Basic metrics
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    mape = mean_absolute_percentage_error(y_test, preds)

    results[name] = {"RMSE": rmse, "R2": r2, "MAPE": mape}
    print(f"{name}: RMSE={rmse:.2f}, R²={r2:.3f}, MAPE={mape:.3f}")



Training LinearRegression...
LinearRegression: RMSE=102.68, R²=0.404, MAPE=0.704

Training RandomForest...
RandomForest: RMSE=4.58, R²=0.999, MAPE=0.012

Training XGBoost...
XGBoost: RMSE=7.52, R²=0.997, MAPE=0.019


In [28]:
results_df = pd.DataFrame(results).T.sort_values("RMSE")
display(results_df)

best_model_name = results_df.index[0]
print(f"\n✅ Best Performing Model: {best_model_name}")

best_model = models[best_model_name]

# Ensure artifacts directory exists
artifacts_dir = os.path.join("..", "artifacts")
os.makedirs(artifacts_dir, exist_ok=True)

# Paths must match main.py
model_path = os.path.join(artifacts_dir, "RandomForest_wallet_model.pkl")
scaler_path = os.path.join(artifacts_dir, "scaler.pkl")

# Save model with pickle
with open(model_path, "wb") as f:
    pickle.dump(best_model, f)

# Save scaler used to transform features
with open(scaler_path, "wb") as f:
    pickle.dump(scaler, f)

print("Saved model to:", model_path)
print("Saved scaler to:", scaler_path)
# --- NEW: save encoders and column info for inference ---

encoders_path = os.path.join(artifacts_dir, "label_encoders.pkl")
numeric_cols_path = os.path.join(artifacts_dir, "numeric_cols.pkl")
feature_cols_path = os.path.join(artifacts_dir, "feature_columns.pkl")

with open(encoders_path, "wb") as f:
    pickle.dump(le_dict, f)

with open(numeric_cols_path, "wb") as f:
    pickle.dump(list(numeric_cols), f)

with open(feature_cols_path, "wb") as f:
    pickle.dump(list(X.columns), f)

print("Saved label encoders to:", encoders_path)
print("Saved numeric cols to:", numeric_cols_path)
print("Saved feature columns to:", feature_cols_path)

Unnamed: 0,RMSE,R2,MAPE
RandomForest,4.581218,0.998813,0.012167
XGBoost,7.522683,0.996799,0.0194
LinearRegression,102.683767,0.403513,0.703888



✅ Best Performing Model: RandomForest
Saved model to: ..\artifacts\RandomForest_wallet_model.pkl
Saved scaler to: ..\artifacts\scaler.pkl
Saved label encoders to: ..\artifacts\label_encoders.pkl
Saved numeric cols to: ..\artifacts\numeric_cols.pkl
Saved feature columns to: ..\artifacts\feature_columns.pkl


In [29]:
# Step 6: Make Sample Predictions

sample = X_test.sample(5, random_state=1)
preds = best_model.predict(sample)
pd.DataFrame({
    "Predicted_Spend": preds.round(2),
    "Actual_Spend": y_test.loc[sample.index].values.round(2)
})


Unnamed: 0,Predicted_Spend,Actual_Spend
0,293.78,293.06
1,43.52,44.27
2,241.63,239.73
3,371.19,369.15
4,103.91,104.38


In [30]:
# Example: training code (you already did something similar earlier) 
X = df.drop(['total_cost'], axis=1)   # all columns except 'target'
X_encoded = pd.get_dummies(X)     # or some encoder
scaler.fit(X_encoded)
model.fit(scaler.transform(X_encoded), df['total_cost'])

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [31]:
# 1. Construct a one-row DataFrame with SAME column names as original X
sample_dict = {
    "tenure": 12,
    "year": 2024,
    "feature3": 6.12,
    "feature4": 6.12,
    "feature5": 72.46,
    "feature6": 72.46,
    "feature7": 336.08,
    "feature8": 60,
    "feature9": 11.83986928,
    "feature10": 0,
    "City": "Hyderabad",
    "VehicleType": "2W",
    "PlanType": "Basic",
    "PaymentMethod": "UPI",
    "SpeedTier": "Superfast",
}

X_new = pd.DataFrame([sample_dict])   # shape (1, n_features)

# 2. Apply the SAME preprocessing as training
# Example if you used pd.get_dummies on full training data:
X_new_encoded = pd.get_dummies(X_new)

# Align columns with training columns
X_new_encoded = X_new_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# 3. Scale and predict
values_scaled = scaler.transform(X_new_encoded)
prediction = model.predict(values_scaled)
prediction

array([14.605866], dtype=float32)