In [4]:
# Importing essential libraries
import pandas as pd

In [5]:
# Loading the datasets
df = pd.read_csv('ipl.csv')
df.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


In [7]:
# -- Data Cleaning --
unwanted_col = ['mid','batsman','bowler','striker','non-striker']
df.drop(labels=unwanted_col,axis=1,inplace=True)

In [11]:
# Keeping only Consistent Teams
regular_team = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
                    'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
                    'Delhi Daredevils', 'Sunrisers Hyderabad']
df = df[(df['bat_team'].isin(regular_team)) & (df['bowl_team'].isin(regular_team))]

In [13]:
df = df[df['overs'] >= 5.0]

In [15]:
# Converting date column into datetime format
from datetime import datetime
df['date'] = df['date'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d'))

In [16]:
# Data Preprocessing - Converting Categorical features using onehotencoder method (pd.get_dummies)
encoded_df = pd.get_dummies(data=df, columns=['bat_team','bowl_team'])

In [18]:
# Rearranging the columns
encoded_df = encoded_df[['date', 'bat_team_Chennai Super Kings', 'bat_team_Delhi Daredevils', 'bat_team_Kings XI Punjab',
              'bat_team_Kolkata Knight Riders', 'bat_team_Mumbai Indians', 'bat_team_Rajasthan Royals',
              'bat_team_Royal Challengers Bangalore', 'bat_team_Sunrisers Hyderabad',
              'bowl_team_Chennai Super Kings', 'bowl_team_Delhi Daredevils', 'bowl_team_Kings XI Punjab',
              'bowl_team_Kolkata Knight Riders', 'bowl_team_Mumbai Indians', 'bowl_team_Rajasthan Royals',
              'bowl_team_Royal Challengers Bangalore', 'bowl_team_Sunrisers Hyderabad',
              'overs', 'runs', 'wickets', 'runs_last_5', 'wickets_last_5', 'total']]

In [20]:
# Splitting the data into train and test set
X_train = encoded_df.drop(labels='total', axis=1)[encoded_df['date'].dt.year <= 2016]
X_test = encoded_df.drop(labels='total', axis=1)[encoded_df['date'].dt.year >= 2017]

In [21]:
y_train = encoded_df[encoded_df['date'].dt.year <= 2016]['total'].values
y_test = encoded_df[encoded_df['date'].dt.year >= 2017]['total'].values

In [22]:
# Removing the 'date' column
X_train.drop(labels='date', axis=True, inplace=True)
X_test.drop(labels='date', axis=True, inplace=True)

In [48]:
# Model Building
# Randomforest Regressor
from sklearn.ensemble import RandomForestRegressor
Rand_reg = RandomForestRegressor(n_estimators=300,max_depth=2)
Rand_reg.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [49]:
y_pred = Rand_reg.predict(X_test)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)

656.7358059430283

In [39]:
# Above accuracy is not much good
# So we change the model Try Ridge Regression or Lasso Regression
# First check out Ridge
from sklearn.linear_model import Ridge,Lasso
ridge_reg = Ridge()
ridge_reg.fit(X_train,y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [41]:
y_pred = ridge_reg.predict(X_test)
mean_squared_error(y_test, y_pred)

251.00850756690315

In [65]:
# Lasso 
lasso_reg = Lasso()
lasso_reg.fit(X_train,y_train)
y_pred = lasso_reg.predict(X_test)
mean_squared_error(y_test, y_pred)

262.3797366400709

In [66]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
y_pred = lin_reg.predict(X_test)
mean_squared_error(y_test, y_pred)

251.32310304517708

In [50]:
# Above the models ridge regression model is given better accuracy than other model
# So We choose Ridge Regression for this Project

In [55]:
# Creating a pickle file for the classifier
import pickle
filename = 'first-innings-score-rr-model.pkl'
rr_pickle = pickle.dump(ridge_reg, open(filename, 'wb'))