In [170]:
import numpy as np
import pandas as pd
from imutils import paths
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import model_selection
import random

In [82]:
dataPaths = list(paths.list_files('.\ipl_csv2'))

dataPaths.pop(len(dataPaths) - 2)
dataPaths.pop(len(dataPaths) - 1)

def match_history(p_team, opp_team, num):
    req_df = []
    
    for match_file in dataPaths:
        try:
            df = pd.read_csv(match_file, index_col='match_id')
            df = preprocess(df)

            inn1 = df[df['innings'] == 1]
            inn2 = df[df['innings'] == 2]

            if (inn1.iloc[1]['batting_team'] == p_team or p_team == -1):
                if (inn1.iloc[1]['bowling_team'] == opp_team or opp_team == -1):
                    if len(req_df) < num or num == -1:
                        req_df.append(df)
            elif (inn2.iloc[1]['batting_team'] == p_team or p_team == -1):
                if (inn2.iloc[1]['bowling_team'] == opp_team or opp_team == -1):
                    if len(req_df) < num or num == -1:
                        req_df.append(df)

        except Exception as e:
            print(e)

    return req_df


def preprocess(df):
    pp_df = df[df['ball'] <= 6.1]
    pp_df = pp_df[pp_df['innings'] <= 2]

    pp_df['total_runs'] = pp_df['runs_off_bat'] + pp_df['extras']

    return pp_df

In [83]:
p_team = "Chennai Super Kings"
opp_team = "Kings XI Punjab"
df_list = match_history(p_team, -1, -1)

single positional indexer is out-of-bounds
single positional indexer is out-of-bounds


In [68]:
import csv

def read_files(fileName, n_features):
    dictionary = {}
    with open(fileName, mode='r') as infile:
        reader = csv.reader(infile)

        for row in reader:
            for i in range(1, n_features + 1):
                dictionary.setdefault(row[0], []).append(row[i])
    
    return dictionary
            

In [94]:
stad_rating = read_files('stadium_rating.csv', 1)

for k in stad_rating:
    stad_rating[k] = float(stad_rating[k][0])

print(stad_rating)

{'Rajiv Gandhi International Stadium, Uppal': 45.1875, 'Maharashtra Cricket Association Stadium': 47.285714285714285, 'Saurashtra Cricket Association Stadium': 54.9, 'Holkar Cricket Stadium': 51.611111111111114, 'M.Chinnaswamy Stadium': 45.72435897435897, 'Wankhede Stadium': 45.29054054054054, 'Eden Gardens': 46.103896103896105, 'M Chinnaswamy Stadium': 45.72435897435897, 'Feroz Shah Kotla': 47.57823129251701, 'Punjab Cricket Association IS Bindra Stadium, Mohali': 48.098214285714285, 'Green Park': 54.375, 'Punjab Cricket Association IS Bindra Stadium': 48.098214285714285, 'Rajiv Gandhi International Stadium': 45.1875, 'MA Chidambaram Stadium': 46.21186440677966, 'Sawai Mansingh Stadium': 44.88297872340426, 'Arun Jaitley Stadium': 47.57823129251701, 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium': 40.73076923076923, 'Sheikh Zayed Stadium': 43.310344827586206, 'Dubai International Cricket Stadium': 44.45454545454545, 'Sharjah Cricket Stadium': 46.916666666666664, 'MA Chidambaram S

In [96]:
batsmen_record = read_files('batsmen_record.csv', 2)

for k in batsmen_record:
    for (i, p) in enumerate(batsmen_record[k]):
        batsmen_record[k][i] = [int(s) for s in p[1:-1].split(',')]

In [97]:
bowlers_record = read_files('bowlers_record.csv', 4)

for k in bowlers_record:
    for (i, p) in enumerate(bowlers_record[k]):
        bowlers_record[k][i] = [int(s) for s in p[1:-1].split(',')]

In [98]:
def get_venue_score(df):
    venue = df.iloc[1]['venue']
    return stad_rating[venue]

def get_batsmen_scores(df):
    u_bat = list(set(df['striker'].unique()) | set(df['non_striker'].unique()))

    rating = []

    for batsman in u_bat:
        rating.append(np.mean(batsmen_record[batsman][0]))
    
    return rating
    
def get_bowlers_scores(df):

    u_bowlers = list(df['bowler'].unique())

    rating = []

    for (i,bowler) in enumerate(u_bowlers, 0):
        rating.append(random.uniform(0, 1))
    
    return rating

def get_wickets(df):
    return len(get_batsmen_scores(df)) - 2

In [99]:
X = []
Y = []

for df in df_list:
    match_id = df.index[0]

    #innings for which we are going to use for training (when the team is batting)
    inn = df.loc[(df['batting_team'] == p_team)]['innings'].iloc[0] 
    df_train_inn = df.loc[(df['innings'] == inn)]

    runs_x = df_train_inn['total_runs'].sum() #runs of training innings
    runs_target = df.loc[(df['innings'] == 3 - inn)]['total_runs'].sum() #runs of other innings



    #convert the df into usable training data with required features
    # to add bowler rating: 'bowlers_rating': [numpy.array(get_bowlers_scores(df_train_inn))] 
    temp = pd.DataFrame({'venue': [get_venue_score(df)], 'batsmen_rating': [np.sum(np.array(get_batsmen_scores(df_train_inn)))], 'wickets': [get_wickets(df_train_inn)], 'other_runs': [runs_target], 'chasing': [0 if inn == 1 else 1]})

    X.append(temp.values.tolist()[0])
    Y.append(runs_x)

In [124]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2, random_state=42)

print(train_x, train_y)
print(test_x, test_y)

[[38.5, 48.26905480140774, 1.0, 65.0, 0.0], [40.73076923076923, 30.888888888888886, 0.0, 42.0, 1.0], [47.57823129251701, 24.614718614718612, 0.0, 40.0, 1.0], [45.72435897435897, 51.74051573426574, 1.0, 46.0, 0.0], [44.45454545454545, 41.33694083694083, 2.0, 37.0, 1.0], [46.21186440677966, 35.95479368932039, 0.0, 53.0, 1.0], [41.911764705882355, 35.95479368932039, 0.0, 33.0, 0.0], [46.21186440677966, 47.1781512605042, 2.0, 36.0, 0.0], [44.45454545454545, 30.888888888888886, 0.0, 47.0, 1.0], [47.57823129251701, 32.12127096204766, 0.0, 57.0, 0.0], [46.21186440677966, 67.3657562561653, 3.0, 35.0, 0.0], [44.88297872340426, 32.12127096204766, 0.0, 24.0, 0.0], [43.310344827586206, 48.00909673012892, 1.0, 24.0, 0.0], [48.098214285714285, 52.263262304039, 2.0, 79.0, 1.0], [44.88297872340426, 32.12127096204766, 0.0, 48.0, 0.0], [46.21186440677966, 32.12127096204766, 0.0, 56.0, 0.0], [43.310344827586206, 36.02380261248186, 0.0, 66.0, 0.0], [47.57823129251701, 28.942857142857143, 0.0, 54.0, 1.0], 

In [216]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [221]:
linear = LinearRegression()
bayesian = BayesianRidge()
dtree = DecisionTreeRegressor(random_state = 0) 
rforest = RandomForestRegressor(n_estimators = 100, random_state = 0)
lasso = Lasso(alpha =0.0005, random_state=1)
ENet = ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

models = [linear,bayesian,dtree,rforest, lasso, ENet, KRR, GBoost, model_lgb]

In [222]:
def get_predictions(model):
    model.fit(train_x, train_y)
    return model.predict(test_x)

In [223]:
for model in models:
    y_pred = get_predictions(model)
    print(mean_squared_error(y_pred, test_y, squared=False))

11.831544359919674
12.0214588937326
16.557140118054473
13.677879648054615
11.83154659064275
11.831618336760235
13.001279482961074
15.28276426371131
14.587990323528206


In [210]:
#polynomial regression
polynomial_features= PolynomialFeatures(degree=2)
poly_reg = LinearRegression()
poly_reg.fit(polynomial_features.fit_transform(train_x), train_y)
y_pred_poly = poly_reg.predict(polynomial_features.fit_transform(test_x))
print(mean_squared_error(y_pred_poly, test_y, squared=False))

11.831544359919675
