In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [22]:
# Ask scikit-learn to return pandas DataFrames (instead of NumPy arrays) from
# transformers for the rest of the session, so column names survive each step.
from sklearn import set_config
set_config(transform_output='pandas')

In [23]:
# Load the modelling dataset. The relative path means 'final_data.csv' must be
# in the notebook's current working directory.
df = pd.read_csv('final_data.csv')

In [24]:
# Eyeball ten randomly drawn rows (the draw is unseeded, so rows vary per run).
df.sample(n=10)

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
61614,New Zealand,India,Napier,56,69,8,6.588235,36.0,160
29640,England,New Zealand,Auckland,59,83,8,9.567568,48.0,184
71121,Bangladesh,Pakistan,Lahore,141,36,8,10.071429,49.0,196
51786,Zimbabwe,Australia,Harare,98,37,7,7.084337,39.0,151
66931,West Indies,India,Kolkata,53,50,5,4.542857,19.0,109
10275,Pakistan,New Zealand,Christchurch,114,14,5,6.45283,39.0,130
13951,Sri Lanka,England,Southampton,105,36,5,7.5,30.0,140
32485,India,Australia,Durban,132,28,7,8.608696,66.0,188
71068,Sri Lanka,New Zealand,Auckland,117,12,2,6.5,42.0,142
37768,Sri Lanka,Bangladesh,Colombo,111,23,5,6.865979,47.0,159


In [25]:
# Count missing values in each column.
df.isnull().sum()

Unnamed: 0,0
batting_team,0
bowling_team,0
city,0
current_score,0
balls_left,0
wickets_left,0
crr,0
last_five,0
runs_x,0


In [26]:
# Work on an independent (deep) copy so the loaded frame is left untouched.
df_temp = df.copy(deep=True)

In [27]:
# (rows, columns) of the working copy.
df_temp.shape

(71885, 9)

In [28]:
# Separate the regression target ('runs_x') from the predictor columns.
y = df_temp['runs_x']
X = df_temp.drop('runs_x', axis=1)

In [29]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [30]:
# (rows, columns) of the training predictors after the split.
X_train.shape

(57508, 8)

In [31]:
# List the predictor names; they are grouped by type in the next cell.
X_train.columns

Index(['batting_team', 'bowling_team', 'city', 'current_score', 'balls_left',
       'wickets_left', 'crr', 'last_five'],
      dtype='object')

In [32]:
# Predictor groups, split by how the preprocessing should treat them.

# Nominal categoricals (no inherent order).
nominal_cat_cols = [
    'batting_team',
    'bowling_team',
    'city',
]

# Numeric columns.
num_cols = [
    'current_score',
    'balls_left',
    'wickets_left',
    'crr',
    'last_five',
]

In [33]:
# Column transformer: one-hot encode the nominal categoricals and pass every
# other column through unchanged.
#   * drop='first' removes one dummy level per categorical feature.
#   * handle_unknown='ignore' stops transform()/predict() from raising when a
#     team or city that was absent from the training split appears later
#     (e.g. when the pickled pipeline is reused). Such a value is encoded as
#     all zeros and scikit-learn emits a warning; with drop='first' that is the
#     same encoding as the dropped level.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'nominal_encode',
            OneHotEncoder(
                drop='first',
                sparse_output=False,
                handle_unknown='ignore',
            ),
            nominal_cat_cols,
        ),
    ],
    remainder='passthrough',
)

# Return a DataFrame from this transformer so column names are preserved.
preprocessor.set_output(transform='pandas')

In [34]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

In [35]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import ExtraTreesRegressor, StackingRegressor

In [36]:
# Fixed hyper-parameters for the two base learners and the meta learner.
# NOTE(review): the values are presumably the result of an earlier tuning run
# that is not part of this notebook — confirm before changing them.
best_xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.2,
    'max_depth': 12,
    'random_state': 1,
}

best_gb_params = {
    'n_estimators': 184,
    'learning_rate': 0.2327591798889126,
    'max_depth': 17,
    # Seeded like the other estimators: gradient boosting permutes features at
    # every split, so without a seed two fits on the same data can differ.
    'random_state': 1,
}

# Base learners of the stack.
best_xgb = XGBRegressor(**best_xgb_params)
best_gb = GradientBoostingRegressor(**best_gb_params)

# Meta learner that combines the base learners' predictions.
best_meta = ExtraTreesRegressor(n_estimators=62, max_depth=9, random_state=42)

In [37]:
# Stack the two boosted models; the meta learner is trained on their
# 5-fold cross-validated predictions, using all available cores.
stacking_reg = StackingRegressor(
    final_estimator=best_meta,
    estimators=[("xgb", best_xgb), ("gb", best_gb)],
    n_jobs=-1,
    cv=5,
)

In [38]:
# End-to-end estimator: encode -> standardise -> stacked regressor.
processing_pipeline = Pipeline(
    [
        ("preprocess", preprocessor),
        ("scaling", StandardScaler()),
        ("model", stacking_reg),
    ]
)

# Display the pipeline diagram.
processing_pipeline

In [39]:
# Fit preprocessing and the stacked model on the training split only.
# The FutureWarning printed below is about how ColumnTransformer will store
# its 'remainder' columns from scikit-learn 1.7 onwards.
processing_pipeline.fit(X_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [40]:
# Predict the target for the held-out rows.
y_pred = processing_pipeline.predict(X_test)



In [41]:
from sklearn.metrics import mean_absolute_error, r2_score

# Report hold-out R2 and mean absolute error, one per line.
print(
    f"The R2 score is {r2_score(y_test,y_pred)}",
    f"The MAE error is {mean_absolute_error(y_test,y_pred)}",
    sep="\n",
)

The R2 score is 0.9868831388777716
The MAE error is 1.7406933241748026


In [49]:
# Predict on both splits (train first, then test) to compare fit vs. generalisation.
y_train_pred, y_test_pred = (
    processing_pipeline.predict(features) for features in (X_train, X_test)
)

# Collect MAE and R2 for each split; insertion order is kept for printing and
# for the JSON dump.
metrics = dict(
    train_mae=mean_absolute_error(y_train, y_train_pred),
    test_mae=mean_absolute_error(y_test, y_test_pred),
    train_r2=r2_score(y_train, y_train_pred),
    test_r2=r2_score(y_test, y_test_pred),
)

# One "name: value" line per metric, rounded to four decimals.
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")



train_mae: 0.2208
test_mae: 1.7407
train_r2: 0.9999
test_r2: 0.9869




In [48]:
import pickle

# Serialise the fitted pipeline to disk. The context manager guarantees the
# file is flushed and closed even if pickling fails; the previous bare open()
# left the handle to be closed whenever garbage collection got to it.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(processing_pipeline, model_file)

In [50]:
import json
import os

# Make sure the output folder exists (no error if it is already there).
os.makedirs("outputs/metrics", exist_ok=True)

# Write the metrics dict as indented JSON.
with open("outputs/metrics/metrics.json", "w") as metrics_file:
    json.dump(metrics, metrics_file, indent=4)

print("✅ Metrics saved to outputs/metrics/metrics.json")

✅ Metrics saved to outputs/metrics/metrics.json


In [55]:
import joblib
import pickle

# Write the fitted pipeline with joblib; the call returns the list of files
# written, which the notebook displays.
# NOTE(review): "Pipe.pkl" differs from the 'pipe.pkl' written by the pickle
# cell only by letter case, so on a case-insensitive filesystem this call
# replaces that file — confirm which artefact consumers load.
joblib.dump(value=processing_pipeline, filename="Pipe.pkl")

['Pipe.pkl']

In [56]:
# Print the scikit-learn version used in this session, for reference alongside
# the saved pipeline files.
import sklearn
print(sklearn.__version__)

1.6.1
