In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames from transform()
# (global setting for every estimator created in this notebook).
set_config(transform_output='pandas')

In [None]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('final_data.csv')

In [None]:
# Preview ten random rows. The seed is fixed so the displayed sample is the same
# on every Restart & Run All (an unseeded sample changes on each execution).
df.sample(10, random_state=1)

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
66717,West Indies,England,London,208,0,2,10.4,53.0,208
24611,Australia,South Africa,Johannesburg,169,11,5,9.302752,41.0,196
53733,Bangladesh,Sri Lanka,Colombo,130,20,5,7.8,45.0,155
54076,Pakistan,South Africa,Centurion,52,83,9,8.432432,44.0,195
13552,Pakistan,England,Karachi,93,53,10,8.328358,40.0,166
29585,Sri Lanka,Pakistan,London,48,95,10,11.52,46.0,150
7600,England,Pakistan,Karachi,129,41,7,9.797468,48.0,221
62998,Bangladesh,Pakistan,Lahore,52,70,10,6.24,36.0,141
38345,Pakistan,India,Mirpur,40,74,5,5.217391,22.0,83
33032,England,Pakistan,Manchester,80,59,8,7.868852,54.0,131


In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
batting_team,0
bowling_team,0
city,0
current_score,0
balls_left,0
wickets_left,0
crr,0
last_five,0
runs_x,0


In [None]:
# Copy the loaded frame; the following cells build X and y from `df_temp`, not `df`.
df_temp = df.copy()

In [None]:
# (rows, columns) of the working copy.
df_temp.shape

(71885, 9)

In [None]:
# Separate the target column (`runs_x`) from the predictor columns.
y = df_temp["runs_x"]
X = df_temp.drop(columns="runs_x")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is row-wise and random. If several rows come from the
# same innings (they would then share one `runs_x` value), near-duplicates of a
# test row can end up in the training set and inflate the test scores — confirm,
# and consider a group-aware split if a match identifier is available.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# (rows, columns) of the training features.
X_train.shape

(57508, 8)

In [None]:
# Feature column names available in the training split.
X_train.columns

Index(['batting_team', 'bowling_team', 'city', 'current_score', 'balls_left',
       'wickets_left', 'crr', 'last_five'],
      dtype='object')

In [None]:
# Numeric feature names (not referenced by the cells visible here).
num_cols = [
    "current_score",
    "balls_left",
    "wickets_left",
    "crr",
    "last_five",
]

# Unordered categorical feature names; these are the columns that get one-hot encoded.
nominal_cat_cols = [
    "batting_team",
    "bowling_team",
    "city",
]

In [None]:
# Column transformer: one-hot encode the nominal columns and pass every other
# column through unchanged (remainder='passthrough').
# - drop='first' removes one dummy per feature so the dummies are not perfectly
#   collinear.
# - handle_unknown='ignore' makes transform() emit all-zero dummies (with a
#   warning) for a team/city that was not seen during fit instead of raising.
#   Without it a category missing from a training fold breaks cross-validation,
#   and an unseen value breaks inference. Note that an all-zero row is the same
#   encoding as the dropped first category.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'nominal_encode',
            OneHotEncoder(
                drop='first',
                sparse_output=False,  # dense output is required for pandas output
                handle_unknown='ignore',
            ),
            nominal_cat_cols,
        ),
    ],
    remainder='passthrough',
)

# Return DataFrames from transform(); as the last expression of the cell this
# also displays the transformer.
preprocessor.set_output(transform='pandas')

In [None]:
# Baseline pipeline: encode -> standardise -> ordinary least squares.
# The scaler runs on everything the preprocess step emits, i.e. the one-hot
# dummies as well as the passed-through numeric columns.
processing_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("scaling", StandardScaler()),
        ("model", LinearRegression()),
    ]
)

# Display the (unfitted) pipeline.
processing_pipeline

In [None]:
# Fit all pipeline steps on the training split (the fitted pipeline is displayed).
processing_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Predict the target for the held-out features. `processing_pipeline` is rebound
# several times in this notebook; this uses whichever pipeline was assigned last.
y_pred = processing_pipeline.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Report R^2 and mean absolute error of the current `y_pred` on the test split.
print(
    f"The R2 score is {r2_score(y_test, y_pred)}",
    f"The MAE error is {mean_absolute_error(y_test, y_pred)}",
    sep="\n",
)

The R2 score is 0.6934866127993256
The MAE error is 13.666049544326134


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current pipeline on the training split only.
# The scorer returns negated MAE (higher is better), hence the sign flip below.
scores = cross_val_score(
    estimator=processing_pipeline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,  # use all available cores
)

print(f"The cross-validation MAE is {-scores.mean()}")

The cross-validation MAE is 13.531806051643311


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

In [None]:
# Rebind `processing_pipeline` to an XGBoost regressor behind the same
# encode -> standardise front end (the shared `preprocessor` object is reused).
processing_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("scaling", StandardScaler()),
        (
            "model",
            XGBRegressor(
                n_estimators=1000,
                learning_rate=0.2,
                max_depth=12,
                random_state=1,
            ),
        ),
    ]
)

# Display the (unfitted) pipeline.
processing_pipeline

In [None]:
# Fit all pipeline steps on the training split (the fitted pipeline is displayed).
processing_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Overwrite `y_pred` with test-set predictions from the pipeline currently bound
# to `processing_pipeline`.
y_pred = processing_pipeline.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Report R^2 and mean absolute error of the current `y_pred` on the test split.
print(
    f"The R2 score is {r2_score(y_test, y_pred)}",
    f"The MAE error is {mean_absolute_error(y_test, y_pred)}",
    sep="\n",
)

The R2 score is 0.9848944544792175
The MAE error is 1.9422072172164917


In [None]:
# Rebind `processing_pipeline` to a gradient-boosting regressor behind the same
# encode -> standardise front end.
# random_state is fixed so that refitting reproduces the same trees and scores;
# without it the estimator draws a fresh seed on every fit.
# NOTE(review): the non-round hyperparameter values presumably come from an
# automated search — confirm and record their provenance.
processing_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("scaling", StandardScaler()),
        (
            "model",
            GradientBoostingRegressor(
                n_estimators=184,
                learning_rate=0.2327591798889126,
                max_depth=17,
                random_state=1,
            ),
        ),
    ]
)

# Display the (unfitted) pipeline.
processing_pipeline

In [None]:
# Fit all pipeline steps on the training split (the fitted pipeline is displayed).
processing_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Overwrite `y_pred` with test-set predictions from the pipeline currently bound
# to `processing_pipeline`.
y_pred = processing_pipeline.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Report R^2 and mean absolute error of the current `y_pred` on the test split.
print(
    f"The R2 score is {r2_score(y_test, y_pred)}",
    f"The MAE error is {mean_absolute_error(y_test, y_pred)}",
    sep="\n",
)

The R2 score is 0.9822164896939378
The MAE error is 1.9246871734484794


In [21]:
from sklearn.ensemble import RandomForestRegressor
# Rebind `processing_pipeline` to a random-forest regressor behind the same
# encode -> standardise front end.
# random_state is fixed because each tree is grown on a random sub-sample of the
# rows (max_samples < 1 with the default bootstrap), so an unseeded forest gives
# different scores on every run.
# NOTE(review): the non-round hyperparameter values presumably come from an
# automated search — confirm and record their provenance.
processing_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("scaling", StandardScaler()),
        (
            "model",
            RandomForestRegressor(
                n_estimators=201,
                max_depth=13,
                max_features=None,
                min_samples_split=2,
                min_samples_leaf=1,
                max_samples=0.7136902216842659,
                random_state=1,
            ),
        ),
    ]
)

# Display the (unfitted) pipeline.
processing_pipeline

In [22]:
# Fit all pipeline steps on the training split (the fitted pipeline is displayed).
processing_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [23]:
# Overwrite `y_pred` with test-set predictions from the pipeline currently bound
# to `processing_pipeline`.
y_pred = processing_pipeline.predict(X_test)

In [24]:
# Report R^2 and mean absolute error of the current `y_pred` on the test split.
# `r2_score` / `mean_absolute_error` come from the imports in earlier cells.
print(
    f"The R2 score is {r2_score(y_test, y_pred)}",
    f"The MAE error is {mean_absolute_error(y_test, y_pred)}",
    sep="\n",
)

The R2 score is 0.8427221310656158
The MAE error is 9.508533578020497
