In [119]:
%matplotlib inline

In [157]:
# importing essential libraries
import pandas as pd
import numpy as np 
import pickle

In [121]:
# Loading the dataset
df = pd.read_csv('/content/IPL Data/ipl.csv')
df.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1.0,0.0,0.1,1.0,0.0,0.0,0.0,222.0
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1.0,0.0,0.2,1.0,0.0,0.0,0.0,222.0
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2.0,0.0,0.2,2.0,0.0,0.0,0.0,222.0
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2.0,0.0,0.3,2.0,0.0,0.0,0.0,222.0
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2.0,0.0,0.4,2.0,0.0,0.0,0.0,222.0


In [122]:
# --- ]Data Cleaning ---
# Removing unwanted columns
columns_to_remove = ['mid', 'venue', 'batsman', 'bowler', 'striker', 'non-striker']
df.drop(labels = columns_to_remove, axis = 1, inplace = True)

In [123]:
# Keeping only consistent teams
consistent_teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals', 
                    'Mumbau Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
                    'Delhi Daredevils', 'Sunrisers Hyderabad']

In [124]:
df = df[(df['bat_team'].isin(consistent_teams)) & (df['bowl_team'].isin(consistent_teams))]

In [125]:
df

Unnamed: 0,date,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total
0,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,1.0,0.0,0.1,1.0,0.0,222.0
1,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,1.0,0.0,0.2,1.0,0.0,222.0
2,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,2.0,0.0,0.2,2.0,0.0,222.0
3,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,2.0,0.0,0.3,2.0,0.0,222.0
4,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,2.0,0.0,0.4,2.0,0.0,222.0
...,...,...,...,...,...,...,...,...,...
43121,2013-04-28,Chennai Super Kings,Kolkata Knight Riders,40.0,0.0,3.6,40.0,0.0,200.0
43122,2013-04-28,Chennai Super Kings,Kolkata Knight Riders,40.0,0.0,4.1,40.0,0.0,200.0
43123,2013-04-28,Chennai Super Kings,Kolkata Knight Riders,44.0,0.0,4.2,44.0,0.0,200.0
43124,2013-04-28,Chennai Super Kings,Kolkata Knight Riders,48.0,0.0,4.3,48.0,0.0,200.0


In [126]:
# Removing the first 5 overs data in every match
df = df[df['overs']>=5.0]

In [127]:
# Converting the column 'date' from string into datetime object

from datetime import datetime
df['date'] = df['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [128]:
# --- Data Preprocessing --- 
# Converting categorical features using OneHotEncoding method 
encoded_df = pd.get_dummies(data=df, columns=['bat_team', 'bowl_team'])
encoded_df.columns

Index(['date', 'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5',
       'total', 'bat_team_Chennai Super Kings', 'bat_team_Delhi Daredevils',
       'bat_team_Kings XI Punjab', 'bat_team_Kolkata Knight Riders',
       'bat_team_Rajasthan Royals', 'bat_team_Royal Challengers Bangalore',
       'bat_team_Sunrisers Hyderabad', 'bowl_team_Chennai Super Kings',
       'bowl_team_Delhi Daredevils', 'bowl_team_Kings XI Punjab',
       'bowl_team_Kolkata Knight Riders', 'bowl_team_Rajasthan Royals',
       'bowl_team_Royal Challengers Bangalore',
       'bowl_team_Sunrisers Hyderabad'],
      dtype='object')

In [129]:
# Rearranging the columns 

encoded_df = encoded_df[['date', 'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5',
       'total', 'bat_team_Chennai Super Kings', 'bat_team_Delhi Daredevils',
       'bat_team_Kings XI Punjab', 'bat_team_Kolkata Knight Riders',
       'bat_team_Rajasthan Royals', 'bat_team_Royal Challengers Bangalore',
       'bat_team_Sunrisers Hyderabad', 'bowl_team_Chennai Super Kings',
       'bowl_team_Delhi Daredevils', 'bowl_team_Kings XI Punjab',
       'bowl_team_Kolkata Knight Riders', 'bowl_team_Rajasthan Royals',
       'bowl_team_Royal Challengers Bangalore',
       'bowl_team_Sunrisers Hyderabad']]

In [130]:
encoded_df.shape

(14897, 21)

In [133]:
# splitting the data into train and test set 
y = encoded_df.iloc[:, -1]
X = encoded_df.iloc[:, :-1]

In [136]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [137]:
# Removing the 'date' column
X_train.drop(labels = 'date', axis = True, inplace = True)
X_test.drop(labels = 'date', axis = True, inplace = True)

In [138]:
# --- Model Building --- 
# Linear Regression Model 

from sklearn.linear_model import LinearRegression 
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [139]:
y_pred = regressor.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

In [140]:
mean_squared_error(y_test, y_pred)

5.518427820463073e-31

In [142]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, GridSearchCV, StratifiedKFold
models = [('Linear Regression', LinearRegression()),
          ('Gradient Boosting', GradientBoostingRegressor()),
          ('Random Forest', RandomForestRegressor())]

In [144]:
for model in models:
  reg = model[1]
  reg.fit(X_train, y_train)
  pred = reg.predict(X_test)
  print(model[0])
  print('R2: ', r2_score(y_test, pred))
  print('RMSE: ', np.sqrt(mean_squared_error(y_test, pred)))

Linear Regression
R2:  1.0
RMSE:  7.428612131793578e-16
Gradient Boosting
R2:  0.924954943048525
RMSE:  0.04290900873261453
Random Forest
R2:  0.9999439219736087
RMSE:  0.0011729616047315866


In [152]:
# K - Nearest Neighbors Regressor
from sklearn.neighbors import KNeighborsRegressor

In [151]:
params = {
    'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    'weights': ['distance', 'uniform'],
    'p': [1, 2] # p = 1 compute manhatten distance, p = 2 compute euclidean distance
}

In [153]:
knn = KNeighborsRegressor()
rs = GridSearchCV(estimator = knn, param_grid = params, cv = 5, n_jobs = -1, scoring = 'neg_mean_squared_error')
rs.fit(X_train, y_train)
print(rs.best_estimator_)

KNeighborsRegressor(n_neighbors=3, p=1, weights='distance')


In [154]:
knn = rs.best_estimator_
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

In [156]:
print('-'*30)
r2 = r2_score(y_test, pred)
print('R2: ', r2)
err = np.sqrt(mean_squared_error(y_test, pred))
print('Root Mean Squared Error: ', err)

knn_reg = ('KNN', r2, err)

------------------------------
R2:  0.9671447899569202
Root Mean Squared Error:  0.02839157987928781


In [158]:
# Creating a pickle file for the classifier
filename = 'first-innings-score-lr-model.pkl'
pickle.dump(knn, open(filename, 'wb'))

In [161]:
# %cd /content/drive/MyDrive/Mlops/IPL-First-Innings-Score-Prediction
# !pip freeze > requirements.txt