In [None]:
# Kaggle exposes the attached datasets read-only under the "../input/" directory.
# Running this cell (Run button or Shift+Enter) prints the full path of every file found there.

import os

for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# Up to 20GB written to the current directory (/kaggle/working/) is kept as output when a version is created with "Save & Run All".
# Temporary files can go to /kaggle/temp/, but they are not saved beyond the current session.

## Importing the libraries

In [None]:
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the IPL dataset CSV from the Kaggle input directory into a DataFrame
train_df = pd.read_csv('/kaggle/input/ipl-dataset/ipl.csv')

In [None]:
# Summarise the columns, their dtypes and non-null counts of the freshly loaded data
train_df.info()

In [None]:
# Preview the first rows of the freshly loaded data
train_df.head()

## Data Preprocessing

In [None]:
# Drop the columns that are not used as model inputs
columns_to_remove = ['mid', 'striker', 'non-striker', 'bowler', 'batsman']
train_df = train_df.drop(columns=columns_to_remove)

In [None]:
# Re-check the column list after dropping the unwanted columns
train_df.info()

In [None]:
# List the distinct batting teams present in the data
train_df['bat_team'].unique()

In [None]:
# List the distinct venues present in the data
train_df['venue'].unique()

In [None]:
# Number of rows recorded at each venue.
# size() counts the rows of every group directly, so no helper column has to be
# added to train_df (a constant 'count' column would otherwise stay in the frame
# and end up among the model features).
train_df.groupby('venue').size()

In [None]:
# Restrict the data to rows where both teams are main teams and the venue is a main venue
main_teams = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]

main_venues = [
    'M Chinnaswamy Stadium',
    'Eden Gardens',
    'Feroz Shah Kotla',
    'MA Chidambaram Stadium, Chepauk',
    'Punjab Cricket Association Stadium, Mohali',
    'Wankhede Stadium',
    'Sawai Mansingh Stadium',
    'Rajiv Gandhi International Stadium, Uppal',
]

# One boolean mask per condition; a row is kept only when all three hold
batting_ok = train_df['bat_team'].isin(main_teams)
bowling_ok = train_df['bowl_team'].isin(main_teams)
venue_ok = train_df['venue'].isin(main_venues)
train_df = train_df[batting_ok & bowling_ok & venue_ok]

In [None]:
# Discard the rows from the first 5 overs: too early in the innings for a prediction.
# .copy() makes train_df an independent frame, so the column assignments in the
# following cells do not write to a filtered slice (pandas SettingWithCopyWarning).
train_df = train_df[train_df['overs'] >= 5.0].copy()

In [None]:
# Convert the date strings to datetime values with pandas' vectorised parser
# instead of calling datetime.strptime once per row.
# NOTE(review): the format expects day-month-year text (e.g. 18-04-2008) — confirm against the CSV.
train_df['date'] = pd.to_datetime(train_df['date'], format='%d-%m-%Y')

In [None]:
# One-hot encode the categorical columns (both teams and the venue)
categorical_columns = ['bat_team', 'bowl_team', 'venue']
train_df = pd.get_dummies(train_df, columns=categorical_columns)

In [None]:
# Preview the frame after one-hot encoding
train_df.head()

In [None]:
# Standardise the numeric match-state columns.
# The scaler is fitted on the training seasons only (year <= 2016, the same cut-off
# the train/test split uses) so that the mean/variance of the held-out seasons
# does not leak into training; the fitted transform is then applied to every row.
sc = StandardScaler()
columns_to_scale = ['runs','wickets','overs','runs_last_5','wickets_last_5']
is_train_season = train_df['date'].dt.year <= 2016
sc.fit(train_df.loc[is_train_season, columns_to_scale])
train_df[columns_to_scale] = sc.transform(train_df[columns_to_scale])

In [None]:
# Chronological feature split: seasons up to 2016 for training, 2017 onwards for testing
features = train_df.drop(columns='total')
season = train_df['date'].dt.year
x_train = features[season <= 2016]
x_test = features[season >= 2017]

In [None]:
# Target arrays ('total') for the same chronological split as the features
match_year = train_df['date'].dt.year
y_train = train_df.loc[match_year <= 2016, 'total'].values
y_test = train_df.loc[match_year >= 2017, 'total'].values

In [None]:
# 'date' was only needed to split by season; remove it from the feature matrices.
# columns= names the axis explicitly (axis=True only worked because True == 1),
# and reassigning avoids an in-place drop on frames that are slices of train_df.
x_train = x_train.drop(columns='date')
x_test = x_test.drop(columns='date')

## Model Building and Hyperparameter Tuning

In [None]:
# Base Lasso estimator for the grid search.
# random_state pins the coefficient-update order used with selection='random'
# (the setting the parameter grid uses), so repeated runs give the same result.
lasso = Lasso(random_state=42)

In [None]:
# Exhaustive 5-fold grid search over the Lasso settings, scored by negative mean squared error
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10],
    'max_iter': [1000, 10000],
    'tol': [0.0001, 0.001],
    'selection': ['random'],
}
lasso_regressor = GridSearchCV(
    estimator=lasso,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
lasso_regressor.fit(x_train, y_train)

In [None]:
# Best parameter combination and its mean cross-validated score (negative MSE), one per line
print(lasso_regressor.best_params_, lasso_regressor.best_score_, sep='\n')

In [None]:
# Predict totals for the held-out test features with the tuned Lasso search object
prediction_lasso = lasso_regressor.predict(x_test)

In [None]:
# Hyperparameter search space for the random forest regressor.
# (RandomizedSearchCV is imported with the other libraries in the import cell.)

# Number of trees in the forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=600, num=3)]

# Number of features to consider at every split.
# 1.0 means "all features": for a regressor this is what the former 'auto'
# option did, and 'auto' is rejected by scikit-learn >= 1.3.
max_features = [1.0, 'sqrt']

# Maximum number of levels in a tree (None = grow until the leaves are pure/minimal)
max_depth = [int(x) for x in np.linspace(5, 15, num=5)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4, 6]

# Whether each tree is trained on a bootstrap sample or on the full training set
bootstrap = [True, False]

# Collect everything into the grid that the randomized search samples from
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Base model to tune. random_state seeds the forest itself, so that the search
# result is reproducible (the search's own random_state only fixes which
# parameter combinations are sampled).
rf = RandomForestRegressor(random_state=42)

# Randomized search: 100 sampled parameter combinations, 3-fold cross validation,
# all available cores (n_jobs=-1)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=0,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model
rf_random.fit(x_train, y_train)

In [None]:
# Show the parameter combination the randomized search selected
print(rf_random.best_params_)

In [None]:
# Predict totals for the held-out test features with the tuned random forest.
# The test matrix is named x_test (lower case); X_test is never defined.
prediction_rf = rf_random.predict(x_test)

## Saving the Model

In [None]:
# Save the fitted Lasso grid-search object (a regressor) to a pickle file.
# The with-block guarantees the file is flushed and closed after the dump.
# NOTE(review): the model was trained on features scaled by the StandardScaler `sc`,
# which is not saved here — whoever loads this pickle needs the same scaling.
filename = 'Batting-score-LassoReg-model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(lasso_regressor, model_file)