### Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

### Importing the Dataset

In [2]:
df = pd.read_csv('Data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 872450 entries, 0 to 872449
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   match_id                872450 non-null  int64  
 1   season                  872450 non-null  object 
 2   start_date              872450 non-null  object 
 3   venue                   872450 non-null  object 
 4   innings                 872450 non-null  int64  
 5   ball                    872450 non-null  float64
 6   batting_team            872450 non-null  object 
 7   bowling_team            872450 non-null  object 
 8   striker                 872450 non-null  object 
 9   non_striker             872450 non-null  object 
 10  bowler                  872450 non-null  object 
 11  runs_off_bat            872450 non-null  int64  
 12  extras                  872450 non-null  int64  
 13  wides                   36803 non-null   float64
 14  noballs             

### Feature Engineering

In [4]:
df['batting_team'].unique()

array(['Australia', 'Sri Lanka', 'Hong Kong', 'Ireland', 'Zimbabwe',
       'India', 'Bangladesh', 'New Zealand', 'South Africa', 'England',
       'West Indies', 'Pakistan', 'Scotland', 'Oman', 'Netherlands',
       'United Arab Emirates', 'Papua New Guinea', 'ICC World XI',
       'Thailand', 'Uganda', 'Malaysia', 'Botswana', 'Lesotho', 'Malawi',
       'Namibia', 'Sierra Leone', 'Mozambique', 'Nepal', 'China',
       'Kuwait', 'Philippines', 'Vanuatu', 'United States of America',
       'Germany', 'Italy', 'Kenya', 'Nigeria', 'Tanzania', 'Rwanda',
       'Japan', 'Indonesia', 'Fiji', 'Samoa', 'Canada', 'Ghana',
       'Guernsey', 'Denmark', 'Norway', 'Jersey', 'Maldives', 'Mali',
       'Singapore', 'Qatar', 'South Korea', 'Bermuda', 'Cayman Islands',
       'Portugal', 'Spain', 'Gibraltar', 'Bhutan', 'Saudi Arabia',
       'Bahrain', 'Iran', 'Austria', 'Belgium', 'Luxembourg',
       'Czech Republic', 'Isle of Man', 'Bulgaria', 'Romania', 'Greece',
       'Serbia', 'Malta', 'France

In [5]:
consistent_teams = ['Australia', 'Bangladesh', 'India', 'New Zealand', 'South Africa', 'West Indies', 'England', 'Sri Lanka', 'Pakistan']

In [6]:
df = df[df['batting_team'].isin(consistent_teams)]

In [7]:
df = df[df['bowling_team'].isin(consistent_teams)]

In [8]:
df = df.sort_values(['match_id', 'innings'], ascending= [True, True])

In [9]:
df['overs'] = df['ball']

In [10]:
df['ball'] = df['ball'].astype(str)

In [11]:
df['over'] = df['ball'].str[0:-1]

In [12]:
df['over'] = df['over'].str.replace('.', '')

In [13]:
df['ball'] = df['ball'].str[-1]

In [14]:
df['ball'] = df['ball'].astype(int)

In [15]:
df['over'] = df['over'].astype(int)

In [16]:
df['total_runs'] = df['runs_off_bat']+df['extras']

In [17]:
df = df[['match_id', 'innings', 'batting_team', 'bowling_team', 'overs', 'ball', 'over', 'total_runs', 'player_dismissed']]

In [18]:
df = df.rename(columns= {'match_id': 'id'})

In [19]:
df = df.replace(np.nan, 0)

### Total Runs scored in Innings

In [20]:
df['total'] = df.groupby(['id', 'innings'])['total_runs'].transform('sum')

### Runs Scored till Current Ball

In [21]:
df['total_score'] = df.groupby(['id', 'innings'])['total_runs'].apply(lambda x:x.cumsum())

### Runs Scored in Previous 30 Balls

In [22]:
tmp = df.groupby(['id', 'innings'])['total_runs'].rolling(min_periods= 1, window= 30).sum().reset_index()

In [23]:
df['prev_30_runs'] = tmp['total_runs'].to_list()

### Wickets fallen in previous 30 Balls

In [24]:
df['player_dismissed'] = np.where(df['player_dismissed']== 0,0,1)

In [25]:
tmp = df.groupby(['id', 'innings'])['player_dismissed'].rolling(min_periods= 1, window= 30).sum().reset_index()

df['prev_30_wickets'] = tmp['player_dismissed'].to_list()

### Wickets fallen till Current Ball

In [26]:
df['total_wickets'] = df.groupby(['id', 'innings'])['player_dismissed'].apply(lambda x:x.cumsum())

### Dot Balls in previous 30 Balls

In [27]:
df['prev_30_dot_balls'] = df['total_runs']
df['prev_30_dot_balls'] = np.where(df['prev_30_dot_balls']== 0,1,0)

tmp = df.groupby(['id', 'innings'])['prev_30_dot_balls'].rolling(min_periods= 1, window= 30).sum().reset_index()

df['prev_30_dot_balls'] = tmp['prev_30_dot_balls'].to_list()

### Boundaries in previous 30 Balls

In [28]:
df['prev_30_boundaries'] = df['total_runs']
df['prev_30_boundaries'] = np.where(df['prev_30_boundaries']>3,1,0)

tmp = df.groupby(['id', 'innings'])['prev_30_boundaries'].rolling(min_periods= 1, window= 30).sum().reset_index()

df['prev_30_boundaries'] = tmp['prev_30_boundaries'].to_list()

### Run Rate till Current Ball

In [29]:
df['total_balls'] = df['over']*6+df['ball']

In [30]:
df['run_rate'] = 6*(df['total_score'])/df['total_balls']

### Run Rate in previous 30 Balls

In [31]:
tmp = df.groupby(['id', 'innings'])['run_rate'].rolling(min_periods= 1, window= 30).sum().reset_index()
df['prev_30_run_rate'] = tmp['run_rate'].to_list()

In [32]:
convert_dict = {'prev_30_runs': int,
                'prev_30_wickets': int,
                'prev_30_dot_balls': int,
                'prev_30_boundaries': int,
                'run_rate': int,
                'prev_30_run_rate': int}

df= df.astype(convert_dict)

In [33]:
df.to_csv('DATA_FEATURE.csv', index= None)

### ------------------------------------------------------------------------------------------------------------------------------------------------

### Loading the Dataset

In [34]:
df = pd.read_csv('DATA_FEATURE.csv')

In [35]:
df = pd.get_dummies(data= df, columns= ['batting_team', 'bowling_team'])

In [36]:
df=df[['id',  'batting_team_Australia', 'batting_team_Bangladesh', 'batting_team_England', 'batting_team_India', 
       'batting_team_New Zealand', 'batting_team_Pakistan', 'batting_team_South Africa', 'batting_team_Sri Lanka',
       'batting_team_West Indies', 'bowling_team_Australia','bowling_team_Bangladesh', 'bowling_team_England',
       'bowling_team_India','bowling_team_New Zealand', 'bowling_team_Pakistan', 'bowling_team_South Africa',
       'bowling_team_Sri Lanka', 'bowling_team_West Indies', 'overs', 'total_score','total_wickets', 'run_rate', 'prev_30_runs',
       'prev_30_wickets', 'prev_30_dot_balls','prev_30_boundaries', 'prev_30_run_rate','total']]

### Train-Test Split

In [37]:
X = df.drop(labels= ['total', 'id'], axis= 1)
y = df['total'].values

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42,stratify=y)

In [39]:
X_train= X_train.values
X_test=X_test.values
X_train=np.asarray(X_train).astype(np.float32)
X_test=np.asarray(X_test).astype(np.float32)

In [40]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(228090, 27) (76030, 27) (228090,) (76030,)


### Training Linear Regression Model

In [41]:
LR_model = LinearRegression()
LR_model.fit(X_train,y_train)

LinearRegression()

In [42]:
from sklearn.metrics import mean_absolute_error
pred = LR_model.predict(X_test)
mean_absolute_error(y_test, pred)

18.972521109168945

### Training Ridge Model (with Alpha= 1)

In [43]:
ridge = Ridge(alpha=1,max_iter=500)
ridge.fit(X_train,y_train)

Ridge(alpha=1, max_iter=500)

In [44]:
pr = ridge.predict(X_test)
mean_absolute_error(y_test, pr)

18.972508913376576

### Training Ridge Model (with Alpha= 5)

In [45]:
ridge5 = Ridge(alpha=5,max_iter=500)
ridge5.fit(X_train,y_train)

Ridge(alpha=5, max_iter=500)

In [46]:
prediction = ridge5.predict(X_test)
mean_absolute_error(y_test, prediction)

18.9724787914673

### Training Lasso Model

In [47]:
lasso = Lasso(alpha=0.01,max_iter=500)
lasso.fit(X_train,y_train)

Lasso(alpha=0.01, max_iter=500)

In [48]:
pd = lasso.predict(X_test)
mean_absolute_error(y_test, pd)

18.967863887720135

### Score Prediction

In [49]:
def score_prediction(Bat_team, Bowl_team, overs, total_score, total_wickets, run_rate, prev_runs_30, prev_wickets_30, prev_30_dot_balls, prev_30_boundaries, prev_30_run_rate):
    
    temp_array= list()
    
    if Bat_team == 'AUS':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
        
    elif Bat_team == 'BAN':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
        
    elif Bat_team == 'IND':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
        
    elif Bat_team == 'ENG':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
        
    elif Bat_team == 'NZ':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
        
    elif Bat_team == 'SL':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
        
    elif Bat_team == 'SA':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
        
    elif Bat_team == 'WI':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
        
    elif Bat_team == 'PAK':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]
        
    if Bowl_team == 'AUS':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
        
    elif Bowl_team == 'BAN':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
        
    elif Bowl_team == 'IND':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
        
    elif Bowl_team == 'ENG':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
        
    elif Bowl_team == 'NZ':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
        
    elif Bowl_team == 'SL':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
        
    elif Bowl_team == 'SA':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
        
    elif Bowl_team == 'WI':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
        
    elif Bowl_team == 'PAK':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]
    
    temp_array = temp_array + [overs, total_score, total_wickets, run_rate, prev_runs_30, prev_wickets_30, prev_30_dot_balls, prev_30_boundaries, prev_30_run_rate]
    
    data = np.array([temp_array])
    
    my_prediction = int(lasso.predict(data))
    
    print('Predicted Score: ', my_prediction)
    print('Predicted Range: ', my_prediction - 20, 'to', my_prediction + 20)

In [50]:
Bat_team = 'AUS'

Bowl_team = 'IND'

overs = 10.2

total_score = 165

total_wickets = 1

run_rate = 10

prev_runs_30 = 45

prev_wickets_30 = 0

prev_30_dot_balls = 4

prev_30_boundaries = 10

prev_30_run_rate = 9 

score_prediction(Bat_team, Bowl_team, overs, total_score, total_wickets, run_rate, prev_runs_30, prev_wickets_30, prev_30_dot_balls, prev_30_boundaries, prev_30_run_rate)

Predicted Score:  239
Predicted Range:  219 to 259


#### 1. From the above given condition the predicted Score came to 239 and the predicted Range came to 219 to 259.  
#### 2.  I added the Run-Rate feature to the model with also building 2 extra models apart from Linear Regression and after comparison between these 3 models I used the Lasso model during the prediction process