In [3]:
import pandas as pd 
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [4]:
# Location of the IPL CSV used throughout this notebook (fetched over HTTP).
url = 'https://raw.githubusercontent.com/sharonkadamandla/IPL_game_prediction/main/ipl_data.csv'

# Read the whole file into a DataFrame; later cells transform `df` step by step.
df = pd.read_csv(filepath_or_buffer=url)

In [5]:
# Show the first five rows as a sanity check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


#### Understanding data

In [6]:
# Frame-level reports: each entry is (heading, value) and is printed in order.
overview = (
    ("\nData types:", df.dtypes),
    ("\nMissing values:", df.isna().sum()),
    ("\nSummary statistics for numerical columns:", df.describe()),
)
for heading, report in overview:
    print(heading)
    print(report)

# Frequency table for every object-typed (string) column.
print("\nValue counts for categorical columns:")
for name, series in df.select_dtypes(include=['object']).items():
    print(f"\n{name}:")
    print(series.value_counts())

# Number of rows that are exact duplicates of an earlier row.
print("\nDuplicate rows:")
print(df.duplicated().sum())


Data types:
mid                 int64
date               object
venue              object
bat_team           object
bowl_team          object
batsman            object
bowler             object
runs                int64
wickets             int64
overs             float64
runs_last_5         int64
wickets_last_5      int64
striker             int64
non-striker         int64
total               int64
dtype: object

Missing values:
mid               0
date              0
venue             0
bat_team          0
bowl_team         0
batsman           0
bowler            0
runs              0
wickets           0
overs             0
runs_last_5       0
wickets_last_5    0
striker           0
non-striker       0
total             0
dtype: int64

Summary statistics for numerical columns:
                mid          runs       wickets         overs   runs_last_5  \
count  76014.000000  76014.000000  76014.000000  76014.000000  76014.000000   
mean     308.627740     74.889349      2.415844     

#### Data Cleaning 
The dataset contains no missing (null) values, so no imputation or row removal is needed.

#### Data Processing

In [7]:
# Dropping columns that are not used as model features.
# NOTE: this cell rebinds `df`; re-running it without re-running the load cell
# raises KeyError because the columns are already gone.
irrelevant = ['mid', 'date', 'venue','batsman', 'bowler', 'striker', 'non-striker']
df = df.drop(columns=irrelevant)

# Dropping teams: keep only rows where BOTH sides are in this fixed list.
const_teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
              'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
              'Delhi Daredevils', 'Sunrisers Hyderabad']
both_known = df['bat_team'].isin(const_teams) & df['bowl_team'].isin(const_teams)

# Keep only rows recorded at or after the 5.0-over mark.
# NOTE(review): presumably so the `runs_last_5` / `wickets_last_5` features
# cover a full five-over window -- confirm the intent of this threshold.
enough_overs = df['overs'] >= 5.0

# .copy() detaches the filtered rows from the original frame so that later
# column assignments (e.g. label encoding) do not raise SettingWithCopyWarning.
df = df[both_known & enough_overs].copy()

#### Encoding

In [8]:
# Label encoding: replace the team names in both team columns with integer codes.
# NOTE(review): the same LabelEncoder instance is re-fitted for each column, so
# after the loop it only remembers the classes seen in 'bowl_team'.
label_encoder = LabelEncoder()
for col in ['bat_team', 'bowl_team']:
    df[col] = label_encoder.fit_transform(df[col])

# One-hot encoding of the columns at positions 0 and 1; every other column is
# passed through unchanged and appended after the encoded columns.
# sparse_threshold=0 forces a dense ndarray. With the default threshold (0.3)
# ColumnTransformer may return a scipy sparse matrix, and np.array() on a sparse
# matrix produces a 0-d object array instead of a 2-D numeric array.
columnTransformer = ColumnTransformer(
    [('encoder', OneHotEncoder(), [0, 1])],
    remainder='passthrough',
    sparse_threshold=0,
)
ipl_df = np.array(columnTransformer.fit_transform(df))

In [9]:
# Team names in the order the one-hot columns are laid out (alphabetical).
team_names = [
    'Chennai Super Kings',
    'Delhi Daredevils',
    'Kings XI Punjab',
    'Kolkata Knight Riders',
    'Mumbai Indians',
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
]

# Columns that passed through the transformer untouched, in their original order.
passthrough_names = ['runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'total']

# Final layout: batting one-hots, bowling one-hots, then the numeric columns.
cols = (
    [f'batting_team_{team}' for team in team_names]
    + [f'bowling_team_{team}' for team in team_names]
    + passthrough_names
)

# Wrap the encoded array back into a labelled DataFrame.
df = pd.DataFrame(data=ipl_df, columns=cols)

In [10]:
# Assign columns: every column except 'total' is a feature; 'total' is the target.
x = df.drop(columns=['total'])
y = df['total']

# Split data 80/20. A fixed random_state makes the shuffle -- and therefore
# every score reported below -- reproducible across kernel restarts.
# NOTE(review): rows are shuffled individually; if several rows come from the
# same innings (sharing one 'total'), they can land in both train and test,
# which would inflate test scores. Consider a per-match split -- confirm.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.20, shuffle=True, random_state=42
)
print(f"Training Set : {train_x.shape}\nTesting Set : {test_x.shape}")

Training Set : (32086, 21)
Testing Set : (8022, 21)


# Models

In [11]:
# Registry of test scores, keyed by a short model name; filled in by the cells below.
models = {}

#### Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression

# Linear regression model.
linreg = LinearRegression()

# Train Model
linreg.fit(train_x, train_y)

# Evaluate Model: .score() is the R^2 of the predictions, scaled here to a percentage.
train_pct_linreg = linreg.score(train_x, train_y) * 100
test_pct_linreg = linreg.score(test_x, test_y) * 100

# String forms are kept under their original names (and in `models`) for
# compatibility; call float() on them before comparing values numerically.
train_score_linreg = str(train_pct_linreg)
test_score_linreg = str(test_pct_linreg)

# Format numerically: slicing the string form to 5 characters misreports
# negative values and values printed in scientific notation.
print(f'Train Score : {train_pct_linreg:.2f}%\nTest Score : {test_pct_linreg:.2f}%')
models["linreg"] = test_score_linreg

Train Score : 66.02%
Test Score : 65.47%


#### SVM

In [13]:
# Support vector regressor with default hyper-parameters.
svm = SVR()

# Train Model
svm.fit(train_x, train_y)

# Evaluate Model: .score() is the R^2 of the predictions, scaled here to a percentage.
train_pct_svm = svm.score(train_x, train_y) * 100
test_pct_svm = svm.score(test_x, test_y) * 100

# String forms are kept under their original names (and in `models`) for
# compatibility; call float() on them before comparing values numerically.
train_score_svm = str(train_pct_svm)
test_score_svm = str(test_pct_svm)

# Format numerically: slicing the string form to 5 characters misreports
# negative values and values printed in scientific notation.
print(f'Train Score : {train_pct_svm:.2f}%\nTest Score : {test_pct_svm:.2f}%')
models["svm"] = test_score_svm

Train Score : 57.64%
Test Score : 57.22%


#### Decision Tree Regressor

In [14]:
# Decision tree regressor; random_state pins the tie-breaking between equally
# good splits so the fitted tree and its scores are reproducible.
tree = DecisionTreeRegressor(random_state=42)

# Train Model
tree.fit(train_x, train_y)

# Evaluate Model: .score() is the R^2 of the predictions, scaled here to a percentage.
train_pct_tree = tree.score(train_x, train_y) * 100
test_pct_tree = tree.score(test_x, test_y) * 100

# String forms are kept under their original names (and in `models`) for
# compatibility; call float() on them before comparing values numerically.
train_score_tree = str(train_pct_tree)
test_score_tree = str(test_pct_tree)

# Format numerically: slicing the string form to 5 characters misreports
# negative values and values printed in scientific notation.
print(f'Train Score : {train_pct_tree:.2f}%\nTest Score : {test_pct_tree:.2f}%')
models["tree"] = test_score_tree

Train Score : 99.98%
Test Score : 85.50%


#### XGBoost Regression

In [15]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters; the seed is
# set explicitly so the run is reproducible and consistent with the other models.
xgb = XGBRegressor(random_state=42)

# Train Model
xgb.fit(train_x, train_y)

# Evaluate Model: .score() is the R^2 of the predictions, scaled here to a percentage.
train_pct_xgb = xgb.score(train_x, train_y) * 100
test_pct_xgb = xgb.score(test_x, test_y) * 100

# String forms are kept under their original names (and in `models`) for
# compatibility; call float() on them before comparing values numerically.
train_score_xgb = str(train_pct_xgb)
test_score_xgb = str(test_pct_xgb)

# Format numerically: slicing the string form to 5 characters misreports
# negative values and values printed in scientific notation.
print(f'Train Score : {train_pct_xgb:.2f}%\nTest Score : {test_pct_xgb:.2f}%')
models["xgb"] = test_score_xgb

#### Random Forest Regression

In [16]:
# Random forest regressor; random_state fixes the bootstrap samples and feature
# sub-sampling so the fitted (and later pickled) model is reproducible.
forest = RandomForestRegressor(random_state=42)

# Train Model
forest.fit(train_x, train_y)

# Evaluate Model: .score() is the R^2 of the predictions, scaled here to a percentage.
train_pct_forest = forest.score(train_x, train_y) * 100
test_pct_forest = forest.score(test_x, test_y) * 100

# String forms are kept under their original names (and in `models`) for
# compatibility; call float() on them before comparing values numerically.
train_score_forest = str(train_pct_forest)
test_score_forest = str(test_pct_forest)

# Format numerically: slicing the string form to 5 characters misreports
# negative values and values printed in scientific notation.
print(f'Train Score : {train_pct_forest:.2f}%\nTest Score : {test_pct_forest:.2f}%')
models["forest"] = test_score_forest

Train Score : 99.06%
Test Score : 93.20%


In [18]:
# Persist the fitted random forest to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code.
with open("rf_model.pkl", "wb") as model_file:
    pickle.dump(forest, model_file)