In [63]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from statistics import stdev
import math
import scipy
import numpy as np

In [2]:
#Ignoring XGBoost warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

#Ignoring SciKit-Learn warnings
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
data = pd.read_csv('training.csv')
data.head()

Unnamed: 0,player_id,player_name,runs_scored,wickets,runs_conceded,catches,stumpings,match_date,opposition,match_id
0,1,Pat Cummins,DNB,3,28,0,0,19 Oct 2011,v South Africa Centurion,1
1,2,Steve Smith,DNB,-,-,0,0,19 Oct 2011,v South Africa Centurion,1
2,10,Mitch Marsh,8*,1,19,1,0,19 Oct 2011,v South Africa Centurion,1
3,13,David Warner,20,-,-,0,0,19 Oct 2011,v South Africa Centurion,1
4,1,Pat Cummins,11*,1,73,0,0,23 Oct 2011,v South Africa Gqeberha,2


In [4]:
data.dtypes

player_id         int64
player_name      object
runs_scored      object
wickets          object
runs_conceded    object
catches          object
stumpings        object
match_date       object
opposition       object
match_id          int64
dtype: object

In [5]:
data['match_date'] = pd.to_datetime(data['match_date'])
data.head()

Unnamed: 0,player_id,player_name,runs_scored,wickets,runs_conceded,catches,stumpings,match_date,opposition,match_id
0,1,Pat Cummins,DNB,3,28,0,0,2011-10-19,v South Africa Centurion,1
1,2,Steve Smith,DNB,-,-,0,0,2011-10-19,v South Africa Centurion,1
2,10,Mitch Marsh,8*,1,19,1,0,2011-10-19,v South Africa Centurion,1
3,13,David Warner,20,-,-,0,0,2011-10-19,v South Africa Centurion,1
4,1,Pat Cummins,11*,1,73,0,0,2011-10-23,v South Africa Gqeberha,2


In [6]:
data['match_id'].nunique()

557

In [7]:
data['player_id'].nunique()

30

In [8]:
data['out_notout_dnb'] = data['runs_scored'].apply(lambda x: 1 if x[-1] == '*' else (2 if x in ('DNB','TDNB') else 0))
data.head(5)

Unnamed: 0,player_id,player_name,runs_scored,wickets,runs_conceded,catches,stumpings,match_date,opposition,match_id,out_notout_dnb
0,1,Pat Cummins,DNB,3,28,0,0,2011-10-19,v South Africa Centurion,1,2
1,2,Steve Smith,DNB,-,-,0,0,2011-10-19,v South Africa Centurion,1,2
2,10,Mitch Marsh,8*,1,19,1,0,2011-10-19,v South Africa Centurion,1,1
3,13,David Warner,20,-,-,0,0,2011-10-19,v South Africa Centurion,1,0
4,1,Pat Cummins,11*,1,73,0,0,2011-10-23,v South Africa Gqeberha,2,1


In [9]:
data['runs_scored'] = data['runs_scored'].apply(lambda x : 0 if x in ('DNB','TDNB') else int(x.replace('*','')))
data['runs_scored'] = data['runs_scored'].astype('int')


In [10]:
data['weekday'] = data.match_date.dt.weekday
data['year'] = data.match_date.dt.year
data['day'] = data.match_date.dt.day
data['week_number'] = data.match_date.dt.isocalendar().week


In [11]:
data['wickets'] = data['wickets'].apply(lambda x: 0 if x == '-' else int(x))
data['runs_conceded'] = data['runs_conceded'].apply(lambda x: 0 if x == '-' else int(x))
data['catches'] = data['catches'].apply(lambda x: 0 if x == '-' else int(x))
data['stumpings'] = data['stumpings'].apply(lambda x: 0 if x == '-' else int(x))
data.head()

Unnamed: 0,player_id,player_name,runs_scored,wickets,runs_conceded,catches,stumpings,match_date,opposition,match_id,out_notout_dnb,weekday,year,day,week_number
0,1,Pat Cummins,0,3,28,0,0,2011-10-19,v South Africa Centurion,1,2,2,2011,19,42
1,2,Steve Smith,0,0,0,0,0,2011-10-19,v South Africa Centurion,1,2,2,2011,19,42
2,10,Mitch Marsh,8,1,19,1,0,2011-10-19,v South Africa Centurion,1,1,2,2011,19,42
3,13,David Warner,20,0,0,0,0,2011-10-19,v South Africa Centurion,1,0,2,2011,19,42
4,1,Pat Cummins,11,1,73,0,0,2011-10-23,v South Africa Gqeberha,2,1,6,2011,23,42


In [12]:
data['opposition'] = data['opposition'].apply(lambda x: ' '.join(x.split(' ')[1:]))
data.sort_values('match_date', inplace = True)

In [13]:
data.head()

Unnamed: 0,player_id,player_name,runs_scored,wickets,runs_conceded,catches,stumpings,match_date,opposition,match_id,out_notout_dnb,weekday,year,day,week_number
1380,16,Rohit Sharma,0,0,0,1,0,2007-06-23,Ireland Belfast,244,2,5,2007,23,25
1381,16,Rohit Sharma,8,0,3,1,0,2007-06-26,South Africa Belfast,245,0,1,2007,26,26
1382,16,Rohit Sharma,1,0,0,0,0,2007-10-05,Australia Hyderabad,246,0,4,2007,5,40
1383,16,Rohit Sharma,52,0,0,1,0,2007-11-18,Pakistan Jaipur,247,0,6,2007,18,46
1384,16,Rohit Sharma,29,0,0,0,0,2008-02-03,Australia Brisbane,248,0,6,2008,3,5


In [14]:
# dummy_opp = pd.get_dummies(data['opposition'],drop_first = True,dtype = int, prefix = 'opp')
# new_data = pd.concat([data,dummy_opp], axis =1)
# new_data.columns = [col.lower().replace(' ','_') for col in new_data.columns]
# new_data.drop(columns = ['opposition'],axis = 1, inplace = True)
# new_data.shape


In [15]:
players_list = list(set(data['player_id']))
final_data = pd.DataFrame()
for player_name in players_list:
    player_data = data[data['player_id'] == player_name]
    player_data.sort_values('match_date',inplace = True)
    player_data['next_runs'] = player_data['runs_scored'].shift(-1)
    player_data['next_wickets'] = player_data['wickets'].shift(-1)
    final_data = pd.concat([final_data, player_data])
    final_data.reset_index(drop = True,inplace = True)
    # print(final_data.tail())


In [18]:
oppositions_list = ['South Africa', 'Ireland', 'England', 'India', 'New Zealand', 'Scotland', 'Pakistan', 'Bangladesh', 'Afghanistan', 
                    'West Indies', 'Sri Lanka', 'Zimbabwe', 'Kenya', 'Canada', 'Australia', 'Hong Kong', 'U.A.E.', 'Nepal','Netherlands']
final_data['opposition_team'] = final_data['opposition'].apply(lambda x : x.split()[0] if x.split()[0] in oppositions_list else ' '.join(x.split()[0:2]))
final_data['venue'] = final_data['opposition'].apply(lambda x: x.replace(x.split()[0],'') if x.split()[0] in oppositions_list else x.replace(' '.join(x.split()[0:2]),''))
final_data.drop(columns= ['opposition','match_date'],inplace = True)

In [19]:
final_data.head()

Unnamed: 0,player_id,player_name,runs_scored,wickets,runs_conceded,catches,stumpings,match_id,out_notout_dnb,weekday,year,day,week_number,next_runs,next_wickets,opposition_team,venue
0,1,Pat Cummins,0,3,28,0,0,1,2,2,2011,19,42,11.0,1.0,South Africa,Centurion
1,1,Pat Cummins,11,1,73,0,0,2,1,6,2011,23,42,6.0,1.0,South Africa,Gqeberha
2,1,Pat Cummins,6,1,49,0,0,3,1,4,2011,28,43,0.0,1.0,South Africa,Durban
3,1,Pat Cummins,0,1,11,0,0,4,2,5,2012,23,25,4.0,1.0,Ireland,Belfast
4,1,Pat Cummins,4,1,53,0,0,5,0,4,2012,29,26,1.0,2.0,England,Lord's


In [20]:
final_data['opposition_team'] = final_data.opposition_team.astype('category').cat.codes
final_data['venue'] = final_data.venue.astype('category').cat.codes


In [92]:
final_data_bat = final_data[['player_id', 'player_name', 'runs_scored', 'catches', 'stumpings', 'match_id', 'out_notout_dnb', 
 'weekday', 'year', 'day', 'week_number', 'next_runs', 'opposition_team', 'venue']]
final_data_bowl = final_data[['player_id', 'player_name', 'runs_scored', 'wickets', 'runs_conceded', 'catches', 
 'match_id', 'out_notout_dnb', 'weekday', 'year', 'day', 'week_number', 'next_wickets', 'opposition_team', 'venue']]

In [93]:
models = pd.DataFrame()
models1 = pd.DataFrame()
players = list(set(final_data['player_name']))

for player in players:
    # print(player)
    player_data_bat = final_data_bat[final_data_bat['player_name']==player]
    # print(player_data_bat.shape)

    player_data_bat = player_data_bat.dropna()
    y_runs = player_data_bat['next_runs']
    X_bat = player_data_bat.drop(columns = ['next_runs'], axis = 1)
    X_runs = X_bat[X_bat.columns[2:]]
    # print(X_runs.head())
    # print(y_runs.head())
    X_train_runs, X_test_runs, y_train_runs,y_test_runs = train_test_split(X_runs, y_runs, random_state=42)
    ridge_runs = pd.DataFrame()
    
    for j in range(0, 101):
        points_runs = Ridge(alpha=j).fit(X_train_runs, y_train_runs)
        ridge_df_runs = pd.DataFrame({'Alpha': pd.Series(j), 'Train': pd.Series(points_runs.score(X_train_runs, y_train_runs)),
                                       'Test': pd.Series(points_runs.score( X_test_runs, y_test_runs))})
        ridge_runs = pd.concat([ridge_runs,ridge_df_runs])

    # print(ridge_runs.head())
    ridge_runs['Average'] = ridge_runs[['Train', 'Test']].mean(axis=1)

    try:
        # Find the alpha value with the highest average score
        k_runs = ridge_runs[ridge_runs['Average'] == ridge_runs['Average'].max()]['Alpha'][0]
        k_runs = k_runs.head(1)[0]
    except:
        k_runs = ridge_runs[ridge_runs['Average'] == ridge_runs['Average'].max()]['Alpha'][0]

    # Train the model with the best alpha value
    next_runs = Ridge(alpha=k_runs)
    next_runs.fit(X_train_runs, y_train_runs)
    sd_next_runs = stdev(X_train_runs['runs_scored'].astype('float'))

    latest = X_bat.tail(1)
    # print(latest)
    latest['next_runs'] = next_runs.predict(latest[latest.columns[2:]])
    latest['next_runs_ll_95'], latest['next_runs_ul_95'] = latest['next_runs'] - scipy.stats.norm.ppf(.95) * ( 
        sd_next_runs / math.sqrt(len(X_train_runs))), latest['next_runs'] + scipy.stats.norm.ppf(.95) * ( sd_next_runs / math.sqrt(len(X_train_runs)))

    models = pd.concat([models,latest])

for player in players:

    player_data_bowl = final_data_bowl[final_data_bowl['player_name']==player]
    # print(player_data_bat.shape)

    player_data_bowl = player_data_bowl.dropna()
    y_bowl = player_data_bowl['next_wickets']
    X_wicket = player_data_bowl.drop(columns = ['next_wickets'], axis = 1)
    X_bowl = X_wicket[X_wicket.columns[2:]]
    # print(X_runs.head())
    # print(y_runs.head())
    X_train_bowl, X_test_bowl, y_train_bowl,y_test_bowl = train_test_split(X_bowl, y_bowl, random_state=42)
    ridge_bowl = pd.DataFrame()
    
    for j in range(0, 101):
        points_bowl = Ridge(alpha=j).fit(X_train_bowl, y_train_bowl)
        ridge_df_bowl = pd.DataFrame({'Alpha': pd.Series(j), 'Train': pd.Series(points_bowl.score(X_train_bowl, y_train_bowl)),
                                       'Test': pd.Series(points_bowl.score( X_test_bowl, y_test_bowl))})
        ridge_bowl = pd.concat([ridge_bowl,ridge_df_bowl])

    # print(ridge_bowl.head())
    ridge_bowl['Average'] = ridge_bowl[['Train', 'Test']].mean(axis=1)

    try:
        # Find the alpha value with the highest average score
        k_bowl = ridge_bowl[ridge_bowl['Average'] == ridge_bowl['Average'].max()]['Alpha'][0]
        k_bowl = k_bowl.head(1)[0]
    except:
        k_bowl = ridge_bowl[ridge_bowl['Average'] == ridge_bowl['Average'].max()]['Alpha'][0]

    # Train the model with the best alpha value
    next_wickets = Ridge(alpha=k_bowl)
    next_wickets.fit(X_train_bowl, y_train_bowl)
    sd_next_wickets = stdev(X_train_bowl['wickets'].astype('float'))



    latest = X_wicket.tail(1)
    # print(latest)
    latest['next_wickets'] = next_wickets.predict(latest[latest.columns[2:]])
    latest['next_wickets_ll_95'], latest['next_wickets_ul_95'] = latest['next_wickets'] - scipy.stats.norm.ppf(.95) * ( 
        sd_next_wickets / math.sqrt(len(X_train_bowl))), latest['next_wickets'] + scipy.stats.norm.ppf(.95) * ( sd_next_wickets / math.sqrt(len(X_train_bowl)))
    
    models1 = pd.concat([models1,latest])

In [122]:
ridge_bowl

Unnamed: 0,Alpha,Train,Test,Average
0,0,0.31415,0.015234,0.164692
0,1,0.314133,0.016384,0.165258
0,2,0.31409,0.017377,0.165734
0,3,0.314029,0.018247,0.166138
0,4,0.313955,0.019018,0.166486
0,5,0.313871,0.019708,0.16679
0,6,0.313782,0.020332,0.167057
0,7,0.313688,0.0209,0.167294
0,8,0.313592,0.021421,0.167507
0,9,0.313493,0.021903,0.167698


In [94]:
models.head()

Unnamed: 0,player_id,player_name,runs_scored,catches,stumpings,match_id,out_notout_dnb,weekday,year,day,week_number,opposition_team,venue,next_runs,next_runs_ll_95,next_runs_ul_95
1773,19,Virat Kohli,0,1,0,458,2,6,2023,17,37,15,24,42.602228,37.700663,47.503793
299,4,Josh Inglis,45,1,0,76,0,4,2023,22,38,6,58,5.620138,-11.729755,22.97003
506,9,Travis Head,38,0,0,181,0,1,2023,12,37,14,68,56.022773,47.191392,64.854155
2403,27,Mohammed Shami,0,0,0,76,2,4,2023,22,38,1,58,3.211596,2.28484,4.138352
374,7,Cameron Green,19,0,0,162,0,6,2023,24,38,6,43,11.656605,1.524956,21.788254


In [95]:
models1.head()

Unnamed: 0,player_id,player_name,runs_scored,wickets,runs_conceded,catches,match_id,out_notout_dnb,weekday,year,day,week_number,opposition_team,venue,next_wickets,next_wickets_ll_95,next_wickets_ul_95
1773,19,Virat Kohli,0,0,0,1,458,2,6,2023,17,37,15,24,-0.009562,-0.023064,0.003939
299,4,Josh Inglis,45,0,0,1,76,0,4,2023,22,38,6,58,0.0,0.0,0.0
506,9,Travis Head,38,2,39,0,181,0,1,2023,12,37,14,68,0.081532,-0.092755,0.255819
2403,27,Mohammed Shami,0,5,51,0,76,2,4,2023,22,38,1,58,1.508494,1.238359,1.778628
374,7,Cameron Green,19,2,103,0,162,0,6,2023,24,38,6,43,0.344068,-0.257667,0.945802


In [113]:
prediction = pd.merge(models[['player_id','next_runs','next_runs_ll_95','next_runs_ul_95']],models1[['player_id','next_wickets','next_wickets_ll_95','next_wickets_ul_95']], how = 'inner', on = ['player_id'] )

In [114]:
prediction['next_runs'] = round(prediction['next_runs'], 0)
prediction['next_runs_ll_95'] = round(prediction['next_runs_ll_95'], 0)
prediction['next_runs_ul_95'] = round(prediction['next_runs_ul_95'], 0)

prediction['next_wickets'] = round(prediction['next_wickets'], 0)
prediction['next_wickets_ll_95'] = round(prediction['next_wickets_ll_95'], 0)
prediction['next_wickets_ul_95'] = round(prediction['next_wickets_ul_95'], 0)


prediction['next_wickets'] = np.where(
    prediction['next_wickets'] < 0,
    0,
    prediction['next_wickets']
)
prediction['next_wickets_ll_95'] = np.where(
    prediction['next_wickets_ll_95'] < 0,
    0,
    prediction['next_wickets_ll_95']
)
prediction['next_wickets_ul_95'] = np.where(
    prediction['next_wickets_ul_95'] < 0,
    0,
    prediction['next_wickets_ul_95']

)

prediction['next_runs'] = np.where(
    prediction['next_runs'] < 0,
    0,
    prediction['next_runs']
)
prediction['next_runs_ll_95'] = np.where(
    prediction['next_runs_ll_95'] < 0,
    0,
    prediction['next_runs_ll_95']
)
prediction['next_runs_ul_95'] = np.where(
    prediction['next_runs_ul_95'] < 0,
    0,
    prediction['next_runs_ul_95']

)

In [115]:
prediction

Unnamed: 0,player_id,next_runs,next_runs_ll_95,next_runs_ul_95,next_wickets,next_wickets_ll_95,next_wickets_ul_95
0,19,43.0,38.0,48.0,-0.0,-0.0,0.0
1,4,6.0,0.0,23.0,0.0,0.0,0.0
2,9,56.0,47.0,65.0,0.0,-0.0,0.0
3,27,3.0,2.0,4.0,2.0,1.0,2.0
4,7,12.0,2.0,22.0,0.0,-0.0,1.0
5,5,27.0,20.0,35.0,1.0,0.0,1.0
6,25,3.0,2.0,3.0,1.0,0.0,2.0
7,11,33.0,28.0,38.0,1.0,1.0,1.0
8,15,10.0,9.0,12.0,2.0,1.0,2.0
9,6,31.0,20.0,41.0,-0.0,-0.0,-0.0


In [118]:
prediction.rename({'next_runs':'runs','next_wickets':'wickets'},axis = 1,inplace = True)
submission = prediction[['player_id','runs','wickets']].sort_values('player_id').reset_index(drop = True)

In [121]:
submission.to_csv('submission.csv',index = False)