In [1]:
import numpy as np
import pandas as pd
import glob
import os
from random import shuffle

import torch
import torch.utils.data as data_utils
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_sequence,pad_packed_sequence,pad_sequence
torch.__version__

'1.3.0'

In [2]:
# # Data loading and processing cell
# path = r'odi_csv/'
# all_files = glob.glob(path + "/*.csv")
# shuffle(all_files)
# innings2 = []
# target=[]
# for filename in all_files:
#     df = pd.read_csv(filename,usecols=[0])
#     skips = df.loc[ 'info' , : ].shape[0]

#     df = pd.read_csv(filename,nrows=skips,skiprows=1,header=None)
#     df = df.drop(columns=0).set_index(df.columns[1])
#     winteam=None
#     if 'winner' in df.index:
#         winteam = df.loc['winner',:].values[0]
    
#     df = pd.read_csv(filename,skiprows=skips+1,header=None)
    
#     df2 = df[df.columns[[1,2,7,8]]].set_index(df.columns[1]).drop(index=1)
#     if df2.shape[0]>0:
#         innings2.append(df2)
#         i2team = df[df.columns[[1,3]]].set_index(df.columns[1]).drop(index=1).values[0,0]
#         if (i2team==winteam):
#             target.append(1)
#         else:
#             target.append(0)

# # len(innings2) = 1556

# # Data loading and processing cell
path = r'odi_csv/'
all_files = glob.glob(path + "/*.csv")
shuffle(all_files)
innings2 = []
target=[]
for filename in all_files:
    df = pd.read_csv(filename,usecols=[0])
    skips = df.loc[ 'info' , : ].shape[0]

    df = pd.read_csv(filename,nrows=skips,skiprows=1,header=None)
    df = df.drop(columns=0).set_index(df.columns[1])
    winteam=None
    if 'winner' in df.index:
        winteam = df.loc['winner',:].values[0]
    
    df = pd.read_csv(filename,skiprows=skips+1,header=None)
    
    df2 = df[df.columns[[1,2,7,8]]].set_index(df.columns[1]).drop(index=1)
    df2[9] = (df2[7]+df2[8]).cumsum()
    df3 = df2[df2.columns[[0,3]]]

    if df3.shape[0]>0:
        innings2.append(df3)
        i2team = df[df.columns[[1,3]]].set_index(df.columns[1]).drop(index=1).values[0,0]
        if (i2team==winteam):
            target.append(1)
        else:
            target.append(0)

In [3]:
# convert to torch tensor cell
features=[]
for i in range(len(innings2)):
    features.append(torch.tensor(innings2[i].values))


# convert to fixed length sequence
X = pad_sequence(features,batch_first=True,padding_value=-1)
y = torch.tensor(target)

# test train split
split = int(len(X) * 0.8)
X_train=X[:split]
X_test =X[split:]
y_train=y[:split]
y_test =y[split:]

print(X_train.size())
print(X_test.size())

torch.Size([1244, 321, 2])
torch.Size([312, 321, 2])


In [4]:
# LSTM model
class matchRNN(nn.Module):
    def __init__(self,insize,hsize,outsize):
        super(matchRNN,self).__init__()
        
        self.insize=insize
        self.hsize=hsize
        self.outsize = outsize
        
        # lstm cell
        self.lstm_cell = nn.LSTMCell(input_size=insize, hidden_size=hsize)
        self.fc_out = nn.Linear(in_features=hsize, out_features=outsize)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self,feat):
#         feat = torch.tensor(feat[np.newaxis,:],dtype=torch.float32)
        batch_size = feat.size(0)
        
        # init the hidden and cell states to zeros
        hidden_state = torch.zeros((batch_size, self.hsize))
        cell_state = torch.zeros((batch_size, self.hsize))
        for t in range(feat.size(1)):

            # for the first time step (if input is different)
            if t == 0:
                hidden_state, cell_state = self.lstm_cell(feat[:,t,:].view(batch_size,-1).float(), (hidden_state, cell_state))
                
            # for the 2nd+ time step
            else:
                hidden_state, cell_state = self.lstm_cell(feat[:,t,:].view(batch_size,-1).float(), (hidden_state, cell_state))
            
        out = self.fc_out(hidden_state)

    
        return out

In [5]:
# evaluation cell

def X_generator(X,j):
    X1,_ = pad_packed_sequence(pack_sequence(X[:,:j,:],enforce_sorted = False), batch_first=True, padding_value=-1, total_length=321)
    return X1

def evalfunc(model,j):
    global X_train,X_test,y_train,y_test,batch_size
    model.eval()
    # train
    X = X_generator(X_train,j)
    y = y_train
    corr=0
    num_batches = X.size(0) // batch_size

    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = (batch_idx+1) * batch_size
        op = model(X[start_idx:end_idx])
        maxval,maxidx = torch.max(op,1)
        corr+= np.sum((maxidx==y[start_idx:end_idx]).numpy())
    total=num_batches*batch_size
    train_acc = corr / total
    
    # test
    X = X_generator(X_test,j)
    y = y_test
    corr=0
    num_batches = X.size(0) // batch_size

    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = (batch_idx+1) * batch_size
        op = model(X[start_idx:end_idx])
        maxval,maxidx = torch.max(op,1)
        corr+= np.sum((maxidx==y[start_idx:end_idx]).numpy())
    total=num_batches*batch_size
    test_acc = corr / total
    print('j = {}, Train Acc = {}, Test Acc = {}'.format(j,train_acc,test_acc))

In [6]:
# training cell

insize=X_train.size(2)
hsize=64
outsize=2    #binary classification
model = matchRNN(insize,hsize,outsize)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
batch_size=16
num_batches = int(X_train.size(0) / batch_size)

# train iterations
for epoch in range(201):
    epoch_loss=0
    model.train()
    for batch_idx in range(num_batches):
        start_idx = batch_idx * batch_size
        end_idx = (batch_idx+1) * batch_size
        model.zero_grad()
        op = model(X_train[start_idx:end_idx])
        loss = loss_function(op.contiguous().view(-1,outsize), y_train[start_idx:end_idx].contiguous().view(-1))
        loss.backward()
        epoch_loss+=loss.data.item()
        optimizer.step()
    print('Epoch = {}, loss = {}'.format(epoch,epoch_loss))
    if epoch%10==0:
        for j in [321,250,200]:
            evalfunc(model,j)
    if epoch%50==0:
        torch.save(model.state_dict(), './models/cric_prediction_cumsum.pth')


torch.save(model.state_dict(), './models/cric_prediction_cumsum.pth')


Epoch = 0, loss = 53.020735919475555
j = 321, Train Acc = 0.538961038961039, Test Acc = 0.5953947368421053
j = 250, Train Acc = 0.5016233766233766, Test Acc = 0.5263157894736842
j = 200, Train Acc = 0.5016233766233766, Test Acc = 0.5263157894736842
Epoch = 1, loss = 52.71233743429184
Epoch = 2, loss = 52.63966912031174
Epoch = 3, loss = 52.61727178096771
Epoch = 4, loss = 52.60352408885956
Epoch = 5, loss = 52.59258794784546
Epoch = 6, loss = 52.583276987075806
Epoch = 7, loss = 52.57576113939285
Epoch = 8, loss = 52.561668276786804
Epoch = 9, loss = 52.57353091239929
Epoch = 10, loss = 52.55537927150726
j = 321, Train Acc = 0.5487012987012987, Test Acc = 0.6217105263157895
j = 250, Train Acc = 0.5016233766233766, Test Acc = 0.5263157894736842
j = 200, Train Acc = 0.5016233766233766, Test Acc = 0.5263157894736842
Epoch = 11, loss = 52.55022597312927
Epoch = 12, loss = 52.549009680747986
Epoch = 13, loss = 52.541397750377655
Epoch = 14, loss = 52.53628325462341
Epoch = 15, loss = 52.530

KeyboardInterrupt: 

In [None]:
# # attempt to make total balls as 300

# for index, row in df3.iterrows():
#     ball = row[2]-int(row[2])
#     if ball>=0.7:
#         do_nothing=0
        
# #     if ball>=0.7:
        
    

In [None]:
# Result with non cummulative runs

# 0    53.2790207862854
# 1    52.57516276836395
# 2    52.45734006166458
# 3    52.40315908193588
# 4    52.34970372915268
# 5    52.26073855161667
# 6    52.17638784646988
# 7    51.58316457271576
# 8    43.30328445136547
# 9    57.61210838705301
# 10    52.53858804702759
# 11    52.343645334243774
# 12    52.28505802154541
# 13    52.24510443210602
# 14    52.179391503334045
# 15    52.06208062171936
# 16    51.87605047225952
# 17    51.643471002578735
# 18    50.84223812818527
# 19    47.4653697013855
# 20    52.84381580352783
# 21    52.31451594829559
# 22    51.987657487392426
# 23    50.6322928071022
# 24    41.78655853867531
# 25    38.371080085635185
# 26    30.5227283090353
# 27    25.608611315488815
# 28    24.244553975760937
# 29    24.53299780935049
# 30    19.940633855760098
# 31    19.072165571153164
# 32    18.833250992000103
# 33    18.282961204648018
# 34    17.492209024727345
# 35    16.072133218869567
# 36    15.94692567922175
# 37    15.637733343988657
# 38    15.691517168655992
# 39    17.08754900470376
# 40    15.882059352472425
# 41    16.03504695557058
# 42    15.608288820832968
# 43    15.260794967412949
# 44    14.797165306285024
# 45    14.357600182294846
# 46    14.169182622805238
# 47    14.309206511825323
# 48    14.251552063971758
# 49    14.086593249812722
# 50    14.317906996235251
# 51    13.953223371878266
# 52    13.94574885815382
# 53    13.928890533745289
# 54    13.870008070021868
# 55    13.778199722990394
# 56    13.713717775419354
# 57    15.828098432160914
# 58    19.985529206693172
# 59    15.713323254138231