In [21]:
import numpy as np
import pandas as pd
from datetime import datetime

In [22]:
# Location of the IPL dataset (one row per delivery, as the preview below shows).
IPL_CSV_URL = "https://raw.githubusercontent.com/anujvyas/IPL-First-Innings-Score-Prediction-Deployment/master/ipl.csv"

# Load the raw records and preview the first five rows to confirm the available columns.
ipl = pd.read_csv(IPL_CSV_URL)
ipl.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


# Data Cleaning

In [23]:
# Clean the loaded frame as one non-mutating chain. Nothing is modified in place,
# so the cell can be executed again without raising and without pandas'
# SettingWithCopyWarning (which the previous column assignment on a filtered frame triggered).

# Columns that are not used as model features
unused_columns = ['mid', 'venue', 'batsman', 'bowler', 'striker', 'non-striker']

# Keeping the currently playing teams
current_teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
                    'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
                    'Delhi Daredevils', 'Sunrisers Hyderabad']

ipl = (
    ipl
    # errors='ignore' makes the drop a no-op if the columns are already gone (cell re-run)
    .drop(columns=unused_columns, errors='ignore')
    # Keep only rows where both sides are currently playing teams
    .loc[lambda df: df['bat_team'].isin(current_teams) & df['bowl_team'].isin(current_teams)]
    # Removing first 5 overs data from each match
    .loc[lambda df: df['overs'] >= 5.0]
    # Vectorised string -> datetime conversion; leaves an already-converted column untouched
    .assign(date=lambda df: pd.to_datetime(df['date'], format='%Y-%m-%d'))
)

# Converting categorical features into numbers (one indicator column per team and role)
encoded_ipl = pd.get_dummies(data=ipl, columns=['bat_team', 'bowl_team'])

In [4]:
# Lift the column display limit so the complete feature order (needed later to
# build prediction input) is visible in the rendered row.
pd.options.display.max_columns = None

# Column order of encoded_ipl: date, runs, wickets, overs, runs_last_5, wickets_last_5, total,
# followed by the bat_team_* and then the bowl_team_* indicator columns, each ordered
# CSK, DD, KXIP, KKR, MI, RR, RCB, SRH.
encoded_ipl.head(1)

Unnamed: 0,date,runs,wickets,overs,runs_last_5,wickets_last_5,total,bat_team_Chennai Super Kings,bat_team_Delhi Daredevils,bat_team_Kings XI Punjab,bat_team_Kolkata Knight Riders,bat_team_Mumbai Indians,bat_team_Rajasthan Royals,bat_team_Royal Challengers Bangalore,bat_team_Sunrisers Hyderabad,bowl_team_Chennai Super Kings,bowl_team_Delhi Daredevils,bowl_team_Kings XI Punjab,bowl_team_Kolkata Knight Riders,bowl_team_Mumbai Indians,bowl_team_Rajasthan Royals,bowl_team_Royal Challengers Bangalore,bowl_team_Sunrisers Hyderabad
32,2008-04-18,61,0,5.1,59,0,222,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0


In [5]:
# Extract the calendar year of every record to see which seasons the data covers
match_years = encoded_ipl['date'].dt.year
match_years

32       2008
33       2008
34       2008
35       2008
36       2008
         ... 
75884    2017
75885    2017
75886    2017
75887    2017
75888    2017
Name: date, Length: 40108, dtype: int64

In [24]:
# Time-based split: seasons up to and including 2015 (2008-2015) are used for
# training, the later seasons (2016-2017) for testing.
season = encoded_ipl['date'].dt.year
in_train_years = season <= 2015
in_test_years = season > 2015

trainset = encoded_ipl[in_train_years]
testset = encoded_ipl[in_test_years]

# 'total' is the prediction target; every other column stays on the feature side
X_train, y_train = trainset.drop(columns='total'), trainset['total']
X_test, y_test = testset.drop(columns='total'), testset['total']

In [25]:
# Now date is of no use in determining score. It was just used for filtering the records.
# The frames are rebound instead of mutated with inplace=True, and errors='ignore'
# turns the drop into a no-op when 'date' is already gone, so re-running this cell
# no longer raises a KeyError.
X_train = X_train.drop(columns='date', errors='ignore')
X_test = X_test.drop(columns='date', errors='ignore')

In [8]:
# Preview the first five rows of the training features
X_train.head(n=5)

Unnamed: 0,runs,wickets,overs,runs_last_5,wickets_last_5,bat_team_Chennai Super Kings,bat_team_Delhi Daredevils,bat_team_Kings XI Punjab,bat_team_Kolkata Knight Riders,bat_team_Mumbai Indians,bat_team_Rajasthan Royals,bat_team_Royal Challengers Bangalore,bat_team_Sunrisers Hyderabad,bowl_team_Chennai Super Kings,bowl_team_Delhi Daredevils,bowl_team_Kings XI Punjab,bowl_team_Kolkata Knight Riders,bowl_team_Mumbai Indians,bowl_team_Rajasthan Royals,bowl_team_Royal Challengers Bangalore,bowl_team_Sunrisers Hyderabad
32,61,0,5.1,59,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
33,61,1,5.2,59,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
34,61,1,5.3,59,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
35,61,1,5.4,59,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
36,61,1,5.5,58,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0


In [32]:
# Linear Regression Model: fit an ordinary least-squares regressor on the training features
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(X_train, y_train)
# Display the fitted estimator as the cell output
model

LinearRegression()

In [33]:
# Testing Linear Regression Model: predict the totals for the held-out test features
y_pred = model.predict(X=X_test)

In [34]:
# Evaluating the Linear Regression Model with the explained variance score.
# The best possible score is 1; lower values are worse.
# https://scikit-learn.org/stable/modules/model_evaluation.html#explained-variance-score
from sklearn import metrics

linear_explained_variance = metrics.explained_variance_score(y_true=y_test, y_pred=y_pred)
print(linear_explained_variance)

0.7136486554350284


In [12]:
# Random Forest Regression Model
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor(n_estimators=200,random_state=0)
model.fit(X_train,y_train)

RandomForestRegressor(n_estimators=200, random_state=0)

In [13]:
# Testing Random Forest Regression Model
y_pred=model.predict(X_test)

In [14]:
# Evaluating the Random Forest Model
from sklearn import metrics
print(metrics.explained_variance_score(y_test,y_pred))

0.6384274820394653


# The Linear Regression model (explained variance ≈ 0.71) performs better than the Random Forest Regression model (≈ 0.64)

In [41]:
# Predicting using sample data
# Match situation to score: CSK batting against DD, 32/1 after 6.5 overs.
runs=32
wickets=1
overs=6.5
runs_in_last_5_overs=20
wickets_in_last_5_overs=1
bat_team='Chennai Super Kings'
bowl_team='Delhi Daredevils'

# Start from an all-zero row keyed by the training columns, then fill in the known
# values by name. This removes the dependence on a hand-maintained one-hot ordering.
temp_data=dict.fromkeys(X_train.columns,0)
match_state={
    'runs':runs,
    'wickets':wickets,
    'overs':overs,
    'runs_last_5':runs_in_last_5_overs,
    'wickets_last_5':wickets_in_last_5_overs,
    'bat_team_'+bat_team:1,
    'bowl_team_'+bowl_team:1,
}
# Fail loudly on a misspelt team or feature instead of silently predicting from zeros
unknown_features=set(match_state)-set(temp_data)
assert not unknown_features, f"not a training feature: {sorted(unknown_features)}"
temp_data.update(match_state)

# A one-row DataFrame with the training column names/order, so the model receives
# input in the same form it was fitted on (avoids scikit-learn's feature-name warning).
final_data=pd.DataFrame([temp_data],columns=X_train.columns)

# predict() returns a one-element array; index it before int() because converting a
# non-scalar array directly to int is deprecated in NumPy.
print(int(model.predict(final_data)[0]))

151


In [35]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
# The `project` handle created here is used below to save the trained model to project storage.
# Credentials are read from the environment so they never have to be pasted into the notebook;
# when the variables are unset the values fall back to empty strings.
import os
from project_lib import Project
project = Project(project_id=os.environ.get('PROJECT_ID', ''),
                  project_access_token=os.environ.get('PROJECT_ACCESS_TOKEN', ''))
pc = project.project_context

In [36]:
# Storing the model for later use
import joblib

# joblib.dump writes the pickle to local disk and returns a list of the file names it
# wrote - not the serialized bytes. Passing that return value to save_data would upload
# the file *name* as the file content, so the pickle is read back and its bytes uploaded.
local_model_path = joblib.dump(model,'linear_model_prediction.pkl')[0]
with open(local_model_path,'rb') as model_file:
    model_bytes = model_file.read()

# Kept as the last top-level expression so the storage response is shown as the cell output
project.save_data(file_name='linear_model.pkl',data=model_bytes)

{'file_name': 'linear_model.pkl',
 'message': 'File saved to project storage.',
 'bucket_name': 'ibmhc2021-donotdelete-pr-b4rupmtzwrnsav',
 'asset_id': '3856ca31-1547-42ad-b8c0-4535ef1249d8'}