In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lars
from sklearn.linear_model import LassoLars
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import PassiveAggressiveRegressor

# Robust regression is interested in fitting a regression model in the presence of corrupt data: 
# either outliers, or error in the model.
# Scikit-learn provides 3 robust regression estimators: RANSAC, Theil Sen and HuberRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import TheilSenRegressor

# TO BE WORKED ON : Polynomial regression: extending linear models with basis functions

from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.svm import NuSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
#from xgboost.sklearn import XGBRegressor
#from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor

# Public import path; the private `sklearn.neighbors.nearest_centroid` module is not importable in newer scikit-learn.
from sklearn.neighbors import NearestCentroid
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error,r2_score
from math import sqrt

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
def getRegressors():
    """Build the fixed line-up of regressors compared in this notebook.

    Returns
    -------
    tuple of (list, list)
        The unfitted estimator instances and, in the same order, the short
        display names used as column labels for their predictions.
    """
    # (display name, unfitted estimator) pairs. List position matters:
    # later cells pick fitted models by index (e.g. 0 = Linear, 8 = KNN).
    lineup = [
        ('Linear', LinearRegression()),
        ('RF', RandomForestRegressor(n_estimators=50, max_depth=3,
                                     min_samples_split=15, min_samples_leaf=10)),
        ('ExtraTrees', ExtraTreesRegressor(n_estimators=10, min_samples_split=10,
                                           min_samples_leaf=10, random_state=0)),
        ('GradBoost', GradientBoostingRegressor()),
        ('Lasso', Lasso()),
        ('E-net', ElasticNet()),
        ('Ridge', Ridge()),
        ('Adaboost', AdaBoostRegressor()),
        ('KNN', KNeighborsRegressor(n_neighbors=10, weights='uniform', algorithm='auto',
                                    leaf_size=30, p=2, metric='minkowski',
                                    metric_params=None, n_jobs=1)),
    ]

    Regressors = [estimator for _, estimator in lineup]
    RegList = [label for label, _ in lineup]
    return Regressors, RegList

def predictAll(Regressors, X_train, y_train, X_test,y_test,RegList):
    """Fit each regressor, then score it on the hold-out set and with 5-fold CV.

    Parameters
    ----------
    Regressors : list
        Unfitted estimators exposing ``fit``/``predict``; they are fitted in place.
    X_train, y_train
        Training features and target.
    X_test, y_test
        Hold-out features and target.
    RegList : list of str
        Column label for each regressor, in the same order as ``Regressors``.

    Returns
    -------
    scr : pandas.DataFrame
        Rounded hold-out predictions, one column per entry of ``RegList``.
    fitted_model : list
        The fitted estimators (the same objects that were passed in).
    compare : pandas.DataFrame
        Columns ``Model``, ``RMS``, ``R2`` and ``CV`` (square root of the mean
        5-fold CV MSE on the training data), sorted by ascending ``RMS``.

    Raises
    ------
    ValueError
        If there are more regressors than labels in ``RegList``.
    """
    # Fail before any (slow) fitting if a model would end up without a column label.
    if len(Regressors) > len(RegList):
        raise ValueError('RegList must provide a label for every regressor')

    # np.asarray(...).ravel() accepts both Series and arrays; Series.ravel() is
    # deprecated in recent pandas releases.
    y_train = np.asarray(y_train).ravel()

    predictions = []
    cvscores = []
    models = []
    fitted_model = []
    RMS = []
    R2 = []
    for regressor in Regressors:
        model_name = type(regressor).__name__
        models.append(model_name)
        print(' Model', model_name)

        regressor.fit(X_train, y_train)
        fitted_model.append(regressor)

        # Predictions are rounded to whole runs; the hold-out metrics below are
        # computed on these rounded values.
        prediction = np.round(regressor.predict(X_test))
        predictions.append(prediction)
        RMS.append(sqrt(mean_squared_error(y_test, prediction)))
        R2.append(r2_score(y_test, prediction))

        # cross_val_score fits clones, so the estimator fitted above is untouched.
        fold_scores = cross_val_score(regressor, X_train, y_train, cv=5,
                                      scoring="neg_mean_squared_error")
        cvscores.append(np.sqrt(-fold_scores.mean()))

    # Build the prediction table in one go instead of assigning into the columns
    # of a zero-row frame, which does not reliably enlarge the frame.
    scr = pd.DataFrame(
        {label: np.ravel(pred) for label, pred in zip(RegList, predictions)},
        columns=RegList,
    )

    compare = pd.DataFrame(list(zip(models, RMS, R2, cvscores)),
                           columns=['Model', 'RMS', 'R2', 'CV'])
    return scr, fitted_model, compare.sort_values(by='RMS')

#Display the table of predictions
def display_result_runs(models, Reglist):
    """Tabulate each model's rounded prediction for a grid of 15-over scores.

    Parameters
    ----------
    models : list
        Fitted single-feature models exposing ``predict``.
    Reglist : list of str
        Column label for each model, in the same order as ``models``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the input scores 100, 105, ..., 140, with one column of
        rounded predictions per label in ``Reglist``.

    Raises
    ------
    ValueError
        If there are more models than labels in ``Reglist``.
    """
    if len(models) > len(Reglist):
        raise ValueError('Reglist must provide a label for every model')

    idx = np.arange(100, 145, 5)
    grid = idx.reshape(-1, 1)  # one row per score, single feature column

    # Collect the columns first and build the frame once; assigning into the
    # columns of a zero-row frame does not reliably enlarge it.
    columns = {}
    for label, model in zip(Reglist, models):
        columns[label] = np.round(np.ravel(model.predict(grid)))

    return pd.DataFrame(columns, index=idx, columns=Reglist)

## Load Data of IPL

In [None]:
matchdata=pd.read_csv('/Data/match_scores.csv',index_col=0)
matchdata.head()

In [None]:
matchdata.describe()

In [None]:
matchdata.Innings_No.value_counts()

In [None]:
### Innings 3 and 4 are super overs - keep only the two regular innings (1 and 2)
matchdata=matchdata[(matchdata['Innings_No']==1) | (matchdata['Innings_No']==2) ]

### Compare Innings 1 and 2 Scores

In [None]:
fig,ax=plt.subplots(1,2,figsize=(12,4))
ax0=ax[0]
ax1=ax[1]

# Keep only innings that have gone till the 20th over.
# NOTE(review): presumably Ball_id_15 >= 90 / Ball_id_20 >= 115 mark the ball count
# reached by overs 15 / 20 - confirm against the data dictionary.
# This mask used to be computed and then immediately overwritten, so it was never applied.
cond = (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=115)

data = matchdata[cond & (matchdata['Innings_No']==1)]
no_of_innings = data.shape[0]

sns.scatterplot(x='RUNS_15',y='RUNS_20', data=data,  ax=ax0)
ax0.set_title('Innings 1 # Matches '+str(no_of_innings))


data = matchdata[cond & (matchdata['Innings_No']==2)]
no_of_innings = data.shape[0]

sns.scatterplot(x='RUNS_15',y='RUNS_20', data=data, color='r', ax=ax1)
ax1.set_title('Innings 2 # Matches '+str(no_of_innings))

In [None]:
fig,ax=plt.subplots(1,2,figsize=(12,4))

# One panel per innings: overlay the 15-over and 20-over score distributions
# for innings that satisfy the ball-count thresholds.
for panel, innings_no in zip(ax, (1, 2)):
    cond = (matchdata['Innings_No']==innings_no) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=115)
    data = matchdata[cond]

    sns.distplot(data['RUNS_15'], color='b', label='15 overs', ax=panel)
    sns.distplot(data['RUNS_20'], color='r', label='20 overs', ax=panel)

    panel.set_title('Innings ' + str(innings_no) + ' - Score Distribution : 15 and 20 overs')
    panel.legend()

plt.show()
#sns.barplot(data = data, y='RUNS_15', x='Innings_No',palette='Paired_r')

In [None]:
# Overlayed Scatter
cond = (matchdata['Innings_No']>=1) & (matchdata['Innings_No']<=2) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=115)
data = matchdata[cond]
sns.scatterplot(x='RUNS_15',y='RUNS_20', data=data, hue='Innings_No')
plt.title('Innings 1 and 2 Scores - Overlayed')
plt.legend(bbox_to_anchor=(1.1, 1.05))
plt.show()

### Check out the distribution of scores of  first Innings Only - Scatter Plot

In [None]:
# Set the style BEFORE creating the axes: seaborn styles only apply to
# figures created after the call.
sns.set_style('white')
fig,ax=plt.subplots(figsize=(12,6))

cond = (matchdata['Innings_No']==1) &  (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=115)
data = matchdata[cond]


sns.scatterplot(x='RUNS_15',y='RUNS_20', data=data, ax=ax)
no_of_innings = data.shape[0]
ax.set(title ='All IPL Matches (1st Innings) - No of Innings : '+str(no_of_innings))
plt.show()

### Check out the distribution of scores after 20 overs - Distribution Plot

In [None]:
### Check out the distribution of first-innings final scores - Distribution Plot

cond = (matchdata['Innings_No']==1) &  (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=115)
data = matchdata[cond]


sns.distplot(data['RUNS_20'], kde=False)

Avg=np.round(data['RUNS_20'].median(),2)

xp=[Avg]*10
yp=list(range(0,100,10))
plt.plot(xp, yp, linewidth=3, color='r', ls='-', label='Median:'+str(Avg))

plt.title('First Innings Final Scores After 20 Overs: Median ' +str(Avg))
plt.legend()
plt.show()


### Create a multiplier feature

In [None]:
matchdata['Mult']= matchdata ['RUNS_20']/matchdata['RUNS_15']

In [None]:
cond = (matchdata['Innings_No']==1) &  (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=115)
data = matchdata[cond]

sns.distplot(data['Mult'], kde=False)

Avg=np.round(data['Mult'].median(),2)

xp=[Avg]*10
yp=list(range(0,100,10))
plt.plot(xp, yp, linewidth=3, color='r', ls='-', label='Median:'+str(Avg))

plt.title(' Acceleration Between 15 to 20 Overs: Median ' +str(Avg))
plt.legend()
plt.show()

### Observation: Teams multiply their score by 1.2x to 1.8x with a median around 1.44x

##  Model 1:   Predict Score of Over 20 Based on Score of Over 15

In [None]:
### Data set head
cond = (matchdata['Innings_No']==1) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=120 )
matchdata[cond].head()

In [None]:
# Load X and y
cond = (matchdata['Innings_No']==1) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=120 )
# Load the features to a variable X
X = matchdata[cond][['RUNS_15']]

# Load the dependent variable to y
y = matchdata[cond]['RUNS_20']


In [None]:
y.head()

In [None]:
X.shape, y.shape

In [None]:
# Let's build a linear model

In [None]:
from sklearn import linear_model

In [None]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [None]:
X_train.shape, X_test.shape

In [None]:
lr = linear_model.LinearRegression()

In [None]:
%%time
lr.fit(X_train,y_train)

In [None]:
lr.coef_ , lr.intercept_

In [None]:
runs=200
lr.predict([[runs]])[0]

In [None]:
predictions = lr.predict(X_test)

predictions[:5]

In [None]:
X_test[:5]

In [None]:
y_test[:5]

In [None]:
predictions[:5]

In [None]:
from sklearn import linear_model
# Let's create an instance of the LinearRegression model


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

lr = linear_model.LinearRegression()

# Training the model on our train dataset
lr.fit(X_train,y_train)



# Getting predictions from the model 
#X_test=X_test.values.reshape(-1,1)

predictions = lr.predict(X_test)

from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R2:', metrics.r2_score(y_test, predictions))

In [None]:
# What will be the predicted score at 100 runs in 15 overs?

runs=100
lr.predict([[runs]])[0]

In [None]:
## Plot the Model 

In [None]:
# do a prediction for 15th over score in a range of 60 to 200
xx=[]
yy=[]
for runs in range (60,200,5):
    scr = np.round(lr.predict([[runs]])[0])
    #print(runs, scr)
    xx.append(runs)
    yy.append(scr)
    
    
cond = (matchdata['Innings_No']==1) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=120 )
data = matchdata[cond]
# plot the actual 
plt.scatter(data['RUNS_15'],data['RUNS_20'])
plt.plot(xx, yy, c='r', lw=3)
plt.title('Linear Regression')

In [None]:
lr.intercept_ , lr.coef_[0]

In [None]:
pd.DataFrame({'Score_15':xx, 'Final Score':yy})[8:18]

In [None]:
plt.plot(xx[8:], yy[8:], c='r', lw=3, marker='s', markeredgecolor ='g', markerfacecolor='b' )
plt.grid()
plt.title('Predictions')

### Try Multiple Algorithms for The Univariate Model

In [None]:
m1_regressors, m1_list = getRegressors()
scr, fitted, res = predictAll(m1_regressors, X_train, y_train, X_test,y_test,m1_list)


In [None]:
sample_results = display_result_runs(fitted,m1_list)
sample_results

In [None]:
## Plot the different models
sample_results.plot(figsize=(9,6))
plt.legend(bbox_to_anchor=(1.1, 1.05))
plt.title(' Comparing Model Predictions')
plt.show()

In [None]:
# Compare results
res

In [None]:
# Predicted Scores across models
scr.sample(5)

### This model is oversimplified - doesn't take into account the wickets !

# Model 2  - Score along with Wickets Remaining

### Load the runs and wickets data

In [None]:
matchdata=pd.read_csv('/Data/IPL.csv', index_col=0)
# Ignore super overs

matchdata = matchdata [ (matchdata['Innings_No']==1)  | (matchdata['Innings_No']==2) ]

In [None]:
matchdata.head()

In [None]:
### How many wickets fall by 15th over?
cond = (matchdata['Innings_No']==1 ) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=115 )
data = matchdata[cond]

sns.distplot(data['Bowler_Wicket_15'], kde=False, color='darkred')
plt.title('Distribution of Wickets at 15th over : No of matches '+str(data.shape[0]))
plt.show()

### Create a multiplier feature

In [None]:
matchdata['Mult']= matchdata ['RUNS_20']/matchdata['RUNS_15']

In [None]:
## Display Avg  Runs_20  at each wicket level
cond = (matchdata['Innings_No']==1 ) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=120 )

data = matchdata[cond]
g=sns.catplot(data=data,kind = 'box',x='Bowler_Wicket_15',y='RUNS_20')
plt.title(' Runs scored in 20 overs')

#(g.set_axis_labels("", "Survival Rate")
plt.show()

In [None]:
## Display Avg  Multiplier  at each wicket level
cond = (matchdata['Innings_No']==1 ) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=120 )

data = matchdata[cond]
sns.catplot(data=data,kind = 'box',x='Bowler_Wicket_15',y='Mult')

In [None]:
fig,ax=plt.subplots(figsize=(12,6))
cond = (matchdata['Innings_No']==1 ) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=115 )

data = matchdata[cond]
sns.scatterplot(data=data,x='RUNS_15',y='RUNS_20', hue='Bowler_Wicket_15',size='Bowler_Wicket_15',
               sizes=(10,200),  ax=ax, palette='coolwarm')

In [None]:
## Scatter Diag per wicket

cond = (matchdata['Innings_No']==1 ) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=115 )

data = matchdata[cond]
sns.lmplot(data=data,x='RUNS_15',y='RUNS_20', col='Bowler_Wicket_15', col_wrap=4 , palette='Blues', sharex=False, sharey=False)
plt.ylim(0,200)
plt.show()

In [None]:
# Draw on a fresh figure: the `ax` left over from an earlier cell belongs to a
# figure that has already been rendered, so plotting onto it shows nothing here.
fig,ax=plt.subplots(figsize=(12,6))
data = matchdata[cond]
sns.lineplot(x='RUNS_15',y='RUNS_20', hue='Bowler_Wicket_15', data=data, ax=ax, palette='Blues_r')
plt.show()

In [None]:
ax=sns.catplot(y='RUNS_15', x='Bowler_Wicket_15', data=data, kind='swarm',aspect=2.5, 
            label='Runs after 15 overs', legend=True)
ax.set(title=' Runs Distribution after 15 Overs', xlabel='Wickets', ylabel='Runs in 15 overs')
plt.show()

In [None]:
ax=sns.catplot(y='RUNS_20', x='Bowler_Wicket_15', data=data, kind='swarm',aspect=2.5, 
            label='Runs after 20 overs', legend=True)
ax.set(title=' Runs Distribution after 20 Overs', xlabel='Wickets', ylabel='Runs in 20 overs')
plt.show()

In [None]:
matchdata['multiplier']=np.round(matchdata['RUNS_20']/matchdata['RUNS_15']-1,1)

In [None]:
ax=sns.catplot(x='Bowler_Wicket_15', y='multiplier', data=matchdata, kind='violin', aspect=2)
plt.show()

In [None]:
ax=sns.catplot(y='multiplier', x='Bowler_Wicket_15', data=matchdata, kind='swarm',aspect=2.5, 
            hue='Innings_No')
ax.set(title=' % Increase in 16-20 Overs', xlabel='Wickets', ylabel='Multiplier')
plt.show()

In [None]:
ax=sns.catplot(y='multiplier', x='Bowler_Wicket_15',  kind='box', col='Innings_No', data = matchdata,
              sharey=False)
#ax.set_xticklabels(labels=[50,100,150,200])
#ax.set(title=' Runs Distribution after 20 Overs', xlabel='Runs in 15 Overs', ylabel='Runs in 20 overs')
plt.show()

### EDA Observations:

For Innings 2, multipliers are really low when 0 or 1 wickets have fallen - this could mean that if the run chase goes well in the first 15 overs, with few wickets lost, the sense of urgency is lost, as the chasing team just needs to win

Hence the first innings data is better for estimating 20th over score from 15th over score





### Model 2 : Set up X and Y variables

In [None]:
# Data for a three-dimensional line
cond = (matchdata['Innings_No']==1) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=120 )
from mpl_toolkits import mplot3d
fig = plt.figure(figsize=(12,6))
ax = plt.axes(projection='3d')

xdata = matchdata[cond]['Bowler_Wicket_15']

ydata = matchdata[cond]['RUNS_15']
zdata = matchdata[cond]['RUNS_20']
ax.scatter(xdata, ydata, zdata, c=zdata, cmap='Paired_r', linewidth=0.5);
plt.title('Final Score vs 15th over score')


In [None]:
cond = (matchdata['Innings_No']==1) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=120 )
# Load the features to a variable X
X = matchdata[cond][['Bowler_Wicket_15', 'RUNS_15']]

# Load the dependent variable to y
y = matchdata[cond]['RUNS_20']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

# Some models need scaled data.
# Fit the scaler on the training split only, so that no statistics of the
# hold-out rows leak into the features the models are trained on.
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train)
X_train_s=scaler.transform(X_train)
X_test_s=scaler.transform(X_test)


In [None]:
matchdata[cond][['Bowler_Wicket_15', 'RUNS_15','RUNS_20']].head()

In [None]:
#X_test.head()
X.shape

In [None]:
fig,ax = plt.subplots(figsize=(9,6))
hdf=matchdata[cond][['Bowler_Wicket_15', 'RUNS_15','RUNS_20']]
hdf.columns=['Wicket','15ov','Final']
sns.heatmap(data=hdf.corr(), cmap='Spectral',annot=True,ax=ax, square=True, linewidths=.5, cbar=False)

plt.title("Heatmap of Correlations")
plt.show()

In [None]:
hdf.corr()

In [None]:
#Linear Model
from sklearn import linear_model
# Let's create an instance of the LinearRegression model
lr = linear_model.LinearRegression()

# Training the model on our train dataset
lr.fit(X_train,y_train)
predictions = lr.predict(X_test)

from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R2:',metrics.r2_score(y_test, predictions))

In [None]:
def display_result(model):
    """Tabulate rounded predictions of a (wicket, score) model.

    ``model.predict`` is called once per cell with a 1x2 array
    ``[[wicket, score]]`` for scores 100, 105, ..., 145 and wickets 1..9.

    Returns
    -------
    pandas.DataFrame
        One row per score; column ``SCORE`` followed by columns ``'1'``..``'9'``
        holding the rounded prediction for that wicket count.
    """
    wickets = range(1, 10)
    rows = [
        [score] + [round(model.predict(np.array([[w, score]]))[0]) for w in wickets]
        for score in range(100, 150, 5)
    ]
    return pd.DataFrame(rows, columns=['SCORE'] + [str(w) for w in wickets])
        

In [None]:
display_result(lr)

In [None]:
sg =SGDRegressor( max_iter=5 , tol=None)
# Training the model on our train dataset
#Use scaled values
sg.fit(X_train_s,y_train)
predictions = sg.predict(X_test_s)

print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R2:',metrics.r2_score(y_test, predictions))


In [None]:
rf=RandomForestRegressor(max_depth=3, random_state=1, min_samples_leaf=10, min_samples_split=10)

# Training the model on our train dataset
rf.fit(X_train,y_train)
predictions = rf.predict(X_test)

print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R2:',metrics.r2_score(y_test, predictions))


In [None]:
svr=SVR(C=100)

svr.fit(X_train_s,y_train)
predictions = svr.predict(X_test_s)

print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R2:',metrics.r2_score(y_test, predictions))


In [None]:
model = KNeighborsRegressor(n_neighbors=10)

# Training the model on our train dataset
model.fit(X_train,y_train)
predictions = model.predict(X_test)

from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R2:',metrics.r2_score(y_test, predictions))


In [None]:
display_result(model)

### Polynomial Reg 

In [None]:
X_train.shape

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Interaction-only degree-2 expansion of the (wicket, runs) features
poly = PolynomialFeatures(degree=2,interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
# The transformer is fitted on the training data only; everything else is just transformed.
X_test_poly = poly.transform(X_test)
# fit_intercept=False because the polynomial expansion already includes a bias column
lrpoly = LinearRegression(fit_intercept=False)
lrpoly.fit(X_train_poly,y_train)
p = lrpoly.predict(X_test_poly)


print('MAE:', metrics.mean_absolute_error(y_test, p))
print('MSE:', metrics.mean_squared_error(y_test, p))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, p)))
print('R2:',metrics.r2_score(y_test, p))
table =[]

# Predicted final score for each (15-over score, wickets lost) combination
for score in range(100,150,5):
    l=[score]
    for wicket in range(1,10):
        # Same column names/order as the training frame, so the fitted transformer is reused as-is
        sample = pd.DataFrame([[wicket, score]], columns=X_train.columns)
        pred = lrpoly.predict(poly.transform(sample))
        l.append(round(pred[0]))
    table.append(l)

result = pd.DataFrame(table, columns=['SCORE','1','2','3','4','5','6','7','8','9'])
result

In [None]:
## Build multiple models
from keras import models
from keras import layers

def predictANN(X,y):
    """Train a small dense network on (X, y) and print hold-out metrics.

    The data is split 80/20 (random_state=42), the features are standardised,
    a 596-20-1 ReLU network is trained for 80 epochs with RMSprop on MSE, and
    MAE / MSE / RMSE / R-squared on the hold-out split are printed.

    Parameters
    ----------
    X : feature matrix
    y : target values

    Returns
    -------
    None
        Results are only printed.
    """
    # Split first, then fit the scaler on the training rows only, so the
    # hold-out rows do not influence the scaling seen during training.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = models.Sequential()
    model.add(layers.Dense(596, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(layers.Dense(20, activation='relu'))
    model.add(layers.Dense(1))  # single linear output for regression

    model.compile(optimizer='rmsprop',
              loss='mse',
              metrics=['mae'])

    # NOTE(review): no seed is set for the network, so the printed metrics
    # will likely differ between runs - confirm whether that is acceptable.
    model.fit(X_train, y_train, epochs=80, batch_size=16, verbose=0)

    predictions = model.predict(X_test)

    print('MAE:', metrics.mean_absolute_error(y_test, predictions))
    print('MSE:', metrics.mean_squared_error(y_test, predictions))
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
    print('R-squared' , metrics.r2_score(y_test, predictions))

    return


def predictAll_model2(Regressors, X_train, y_train, X_test,y_test,RegList):
    """Fit each regressor, then score it on the hold-out set and with 5-fold CV.

    Same evaluation as ``predictAll`` but without the table of predictions.

    Parameters
    ----------
    Regressors : list
        Unfitted estimators exposing ``fit``/``predict``; they are fitted in place.
    X_train, y_train
        Training features and target.
    X_test, y_test
        Hold-out features and target.
    RegList : list of str
        Accepted for signature parity with ``predictAll``; not used here.

    Returns
    -------
    fitted_model : list
        The fitted estimators (the same objects that were passed in).
    compare : pandas.DataFrame
        Columns ``Model``, ``RMS``, ``R2`` and ``CV`` (square root of the mean
        5-fold CV MSE on the training data), sorted by ascending ``RMS``.
    """
    # np.asarray(...).ravel() accepts both Series and arrays; Series.ravel() is
    # deprecated in recent pandas releases.
    y_train = np.asarray(y_train).ravel()

    cvscores = []
    models = []
    fitted_model = []
    RMS = []
    R2 = []
    for regressor in Regressors:
        model_name = type(regressor).__name__
        models.append(model_name)
        print(' Model', model_name)

        regressor.fit(X_train, y_train)
        fitted_model.append(regressor)

        # Hold-out metrics are computed on predictions rounded to whole runs.
        prediction = np.round(regressor.predict(X_test))
        RMS.append(sqrt(mean_squared_error(y_test, prediction)))
        R2.append(r2_score(y_test, prediction))

        # cross_val_score fits clones, so the estimator fitted above is untouched.
        fold_scores = cross_val_score(regressor, X_train, y_train, cv=5,
                                      scoring="neg_mean_squared_error")
        cvscores.append(np.sqrt(-fold_scores.mean()))

    compare = pd.DataFrame(list(zip(models, RMS, R2, cvscores)),
                           columns=['Model', 'RMS', 'R2', 'CV'])
    return fitted_model, compare.sort_values(by='RMS')



In [None]:
X.shape, y.shape

In [None]:
predictANN(X,y)

In [None]:
m2_reg, m2_list = getRegressors()

In [None]:
fitted_2, res_2=predictAll_model2(m2_reg, X_train, y_train, X_test,y_test,m2_list)

In [None]:
res_2

In [None]:
#Linear
display_result(fitted_2[0])

In [None]:
#KNN
display_result(fitted_2[8])

### Observations of Model 2 Performance

1. KNN - Since we do not have enough data, especially for 8 or 9 wickets, KNN gives over-optimistic results
2. The penalty for losing wickets is best captured by the Polynomial Model, but it is not very different from the Linear Model at low scores (100/9 goes to 134/135, but 145/9 goes to 177/181)


## Model 3 : Add a Feature - Wicket Squared is added for all Models (giving benefit of Polynomial nature to all models)

In [None]:
cond = (matchdata['Innings_No']==1) & (matchdata['Ball_id_15']>=90 ) & (matchdata['Ball_id_20']>=120 )
# Load the features to a variable X1.
# Take an explicit copy so that adding a column below modifies an independent
# frame rather than a slice of matchdata (avoids SettingWithCopyWarning).
X1 = matchdata.loc[cond, ['Bowler_Wicket_15', 'RUNS_15']].copy()

# Load the dependent variable to y
y = matchdata.loc[cond, 'RUNS_20']
X1['Bowler_Wicket_15_Sq']=X1['Bowler_Wicket_15']*X1['Bowler_Wicket_15']
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.20, random_state=1)

# Some models need scaled data.
# Fit the scaler on the training split only, so that no statistics of the
# hold-out rows leak into the features the models are trained on.
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train)
X_train_s=scaler.transform(X_train)
X_test_s=scaler.transform(X_test)

In [None]:
# Let's create an instance of the LinearRegression model
lr = linear_model.LinearRegression()

# Training the model on our train dataset
lr.fit(X_train,y_train)
predictions = lr.predict(X_test)

from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R2:', metrics.r2_score(y_test, predictions))


In [None]:
def display_result_wktsq(model):
    """Tabulate rounded predictions of a (wicket, score, wicket squared) model.

    ``model.predict`` is called once per cell with a 1x3 array
    ``[[wicket, score, wicket*wicket]]`` for scores 100, 105, ..., 145 and
    wickets 1..9.

    Returns
    -------
    pandas.DataFrame
        One row per score; column ``SCORE`` followed by columns ``'1'``..``'9'``
        holding the rounded prediction for that wicket count.
    """
    wickets = range(1, 10)
    rows = [
        [score] + [round(model.predict(np.array([[w, score, w * w]]))[0]) for w in wickets]
        for score in range(100, 150, 5)
    ]
    return pd.DataFrame(rows, columns=['SCORE'] + [str(w) for w in wickets])


In [None]:
display_result_wktsq(lr)

In [None]:
# Try KNN


model = KNeighborsRegressor(n_neighbors=10)

# Training the model on our train dataset
model.fit(X_train,y_train)
predictions = model.predict(X_test)

from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R2:', metrics.r2_score(y_test, predictions))

In [None]:
display_result_wktsq(model)

In [None]:
# Build Multiple Models for Model 3

In [None]:
# Use fresh estimator instances for Model 3: predictAll_model2 fits the objects it
# is given in place, so reusing m2_reg would overwrite the Model 2 fits in fitted_2.
m3_reg, m3_list = getRegressors()
fitted_3, res_3=predictAll_model2(m3_reg, X_train, y_train, X_test,y_test,m3_list)

In [None]:
res_3

In [None]:
#Elastic 
display_result_wktsq(fitted_3[5])

In [None]:
#KNN
display_result_wktsq(fitted_3[8])

### Observations of Model3 

The penalty for losing wickets is better captured by Model 3

In Elastic Net, 100/9 after 15 overs is going to 126 in 20 overs; 145/9 is going to 174



### Conclusion :

####  1. The IPL data has few innings that lasted the full 20 overs after 8 or 9 wickets had fallen within 15 overs - so the models are not able to capture this scenario

#### 2.  The acceleration of scores from the 15th to the 20th over depends heavily on the playing conditions, which this model doesn't capture


### Additional Work

In [None]:
### Tuning RF

In [None]:
%%time
param_grid = {
              "max_depth" : [ 3,  2],
              "min_samples_leaf" : [ 5,  10, 20],
              "min_samples_split" : [  10,  15],
              "n_estimators": [100, 50,  10],
              # None = consider all features at each split. This is what "auto" meant for
              # regressors; the "auto" alias is rejected by current scikit-learn.
              "max_features" : [None, "sqrt"]}

from sklearn.model_selection import GridSearchCV, cross_val_score

# Fixed seed so the search picks the same parameters on every run
rf = RandomForestRegressor(random_state=1)

gsRF = GridSearchCV(rf,param_grid = param_grid, cv=5, scoring="explained_variance", n_jobs= 4, verbose = 1)

# NOTE(review): X here is the last feature matrix bound to the name `X`
# (the two-column Model 2 frame, not the Model 3 frame X1) - confirm this is intended.
gsRF.fit(X, y)

rf_best = gsRF.best_estimator_

print(gsRF.best_params_)

In [None]:
print(gsRF.best_params_)

In [None]:
# Random Forest: index 1 in the getRegressors() line-up, fitted on the Model 3 features.
# (`Regressors` only exists inside getRegressors(), so it cannot be referenced here.)
display_result_wktsq(fitted_3[1])