In [5]:
import pandas as pd

In [6]:
# Read the labelled training workbook from the raw-data folder and preview
# the first rows.
train_path = '..//0.data//raw/CCC_Train.xlsx'
train = pd.read_excel(train_path)
train.head()

Unnamed: 0,ID,Match_ID,Over,Commentary,Over_Run_Total,Target
0,0,803965640511,49.6,and india reach 300. there has been a 300 in ...,4,Run_Bw_Wickets
1,1,803965640511,49.5,"slower ball, ashwin bunts this to leg for -99...",4,Run_Bw_Wickets
2,2,803965640511,49.4,"full toss on off, he just slogs, gets a thick...",4,Run_Bw_Wickets
3,3,803965640511,49.3,that's the closest you'll get to a hat-trick ...,4,Dot
4,4,803965640511,49.2,sohail is doing the sajda after bowling rahan...,4,Wicket


## Clean Commentary Text

In [7]:
from nltk.tokenize import RegexpTokenizer
from nltk import stem

def clean_paragraph(para):
    """Normalise one commentary string into space-separated lemmas.

    The text is split into runs of word characters (so punctuation is
    dropped), each token is lower-cased and passed through the WordNet
    lemmatizer, and the results are joined back with single spaces.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = stem.WordNetLemmatizer()

    lemmas = []
    for word in word_tokenizer.tokenize(para):
        lemmas.append(lemmatizer.lemmatize(word.lower()))

    return ' '.join(lemmas)

In [8]:
import time

# Time the cleaning pass over every training commentary (wall-clock seconds).
t0 = time.time()

train['Commentary'] = [clean_paragraph(text) for text in train['Commentary']]

print(time.time() - t0)

23.746646881103516


In [9]:
# Spot-check the cleaned text stored under index label 0.
train['Commentary'][0]

'and india reach 300 there ha been a 300 in every first inning of this cup so far shami drive this full ball over point and come back a second'

## Create Classes and Assign Probabilities

In [33]:
# How many training rows fall into each target class.
train['Target'].value_counts()

Run_Bw_Wickets    43085
Dot               42522
Boundary          12134
Wicket             3893
Name: Target, dtype: int64

In [44]:
'''Convert Class names to numbers for easy row indexing'''
# Class name -> row index used by the count matrices built further down.
class_ids = {"Run_Bw_Wickets": 0, "Dot": 1, "Boundary": 2, "Wicket": 3}

# One vectorised pass instead of iterrows() with a per-cell .at write.
# Values that are not one of the four names (including ids left by an earlier
# run of this cell) are passed through unchanged, so re-running is harmless.
train['Target'] = train['Target'].map(lambda label: class_ids.get(label, label))

train.Target.value_counts()

0    43085
1    42522
2    12134
3     3893
Name: Target, dtype: int64

In [82]:
'''Find probabilites of each class'''
totalDocCount = len(train)

# value_counts() returns classes ordered by descending frequency, NOT by class
# id, so taking its values positionally only lines up with ids 0..3 when the
# class sizes happen to be in that order. Align on the class id explicitly;
# a class with no training rows gets probability 0.
# Expects train.Target to already hold the numeric ids 0-3.
class_counts = train.Target.value_counts().reindex(range(4), fill_value=0)

# Plain Series indexed 0..3, so prob_of_each_class[class_id] is the prior of
# that class.
prob_of_each_class = pd.Series(class_counts.to_numpy() / totalDocCount)
print(prob_of_each_class); prob_of_each_class.sum()

0    0.423923
1    0.418384
2    0.119389
3    0.038304
dtype: float64


1.0

## Create Word Matrix

In [64]:
'''Get Unique Words from the text data'''

# tokenDict maps each distinct token to a column index, numbered in order of
# first appearance while scanning the cleaned training commentary.
tokenDict = {}
i = 0
for commentary in train.Commentary:
    for token in commentary.split(' '):
        if token not in tokenDict:
            tokenDict[token] = i
            i += 1

len(tokenDict)

'''Unique Tokens = 13957'''

'Unique Tokens = 13957'

## Initialize Matrix

In [108]:
from scipy.sparse import coo_matrix
import numpy as np
# One row per class id (0-3) and one column per vocabulary entry. The column
# count is taken from tokenDict rather than a hand-copied literal, so the
# matrix stays in step with the vocabulary if the data or cleaning changes.
matrix = coo_matrix((4, len(tokenDict)))
matrix = matrix.tocsr()
matrix

<4x13957 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

## Fill Matrix 

In [110]:
'''Fill the Matrix'''

# Accumulate the (class, token) occurrence counts in a small dense array.
# Writing single elements into a CSR matrix inside a Python loop changes its
# sparsity structure on every new cell and is extremely slow.
token_counts = np.zeros(matrix.shape)

for class_id, commentary in zip(train['Target'], train['Commentary']):
    for token in commentary.split(' '):
        token_counts[class_id, tokenDict[token]] += 1

# Rebuild the sparse matrix from the finished counts. Because it is rebuilt
# rather than incremented, re-running this cell no longer doubles every count.
matrix = coo_matrix(token_counts).tocsr()



In [147]:
'''Calculate total count'''
# Total occurrences of each token across all classes. A single axis-0 sum
# replaces one sparse column slice per vocabulary entry; the result is kept as
# a plain list with one entry per matrix column.
wordCountVector = np.asarray(matrix.sum(axis=0)).ravel().tolist()

print(len(wordCountVector))
print(wordCountVector[:10])
print(matrix[:, 0].sum())

13957
[76743.0, 242.0, 1039.0, 20.0, 4160.0, 4248.0, 1658.0, 61734.0, 21021.0, 82.0]
76743.0


In [148]:
# Sanity check: per-class counts stored in column 0, followed by the overall
# total recorded for the token 'and'.
first_column = matrix[:, 0]
print(first_column)

and_column = tokenDict['and']
print(wordCountVector[and_column])

  (0, 0)	27379.0
  (1, 0)	26616.0
  (2, 0)	15806.0
  (3, 0)	6942.0
76743.0


## Make Predictions on Test Data - Load, Clean, Predict Prob

In [149]:
'''Load Test Data'''
test_path = '..//0.data//raw/CCC_Test.xlsx'
test = pd.read_excel(test_path)

# Empty placeholder column; the prediction cells write into it later.
test['Target'] = None

test.head()

Unnamed: 0,ID,Match_ID,Over,Commentary,Over_Run_Total,Target
0,0,803965640511,47.6,"the yorker gone wrong. low full toss, and dho...",9,
1,1,803965640511,47.5,"short of a length, no room, worked to leg for...",9,
2,2,803965640511,47.4,"good yorker, dhoni is making room, but can't ...",9,
3,3,803965640511,47.3,raina is gone. finally to the short ball. but...,9,
4,4,803965640511,47.2,not called a wide. great call -999 say. he ha...,9,


In [150]:
'''Clean Test Commentary Text Data'''

import time

# Apply the same cleaning used for the training text and report the elapsed
# wall-clock seconds.
t0 = time.time()

test['Commentary'] = [clean_paragraph(text) for text in test['Commentary']]

print(time.time() - t0)

5.518242120742798


In [155]:
# Predict a class id (0-3) for every test commentary from its tokens.
#
# Scoring rule (unchanged): for each class, start from the class prior and
# multiply by count(class, token) / count(token) for every token occurrence
# that exists in tokenDict. Tokens outside the vocabulary are skipped, and a
# ratio of exactly 0 is skipped instead of zeroing the product. The class with
# the largest score wins; on a tie the lowest class id wins.
#
# What changed:
#   * scores are summed in log space, so a long commentary can no longer
#     underflow the product to 0.0 and silently fall back to class 0;
#   * the ratios are computed once in a dense table instead of indexing the
#     sparse matrix for every token of every class of every row.
#
# NOTE(review): count(class, token) / count(token) is not the usual
# naive-Bayes likelihood P(token | class), and skipping zero ratios favours
# classes that never saw a token. Both are kept as-is so results stay
# comparable with the recorded accuracy -- worth revisiting (e.g. Laplace
# smoothing).

class_token_counts = matrix.toarray()
token_totals = np.asarray(wordCountVector, dtype=float)

with np.errstate(divide='ignore', invalid='ignore'):
    token_ratio = class_token_counts / token_totals
    log_prior = np.log(np.array([prob_of_each_class[c] for c in range(4)], dtype=float))

# log(ratio) where the ratio is positive; 0.0 (a neutral factor of 1)
# everywhere else, which reproduces the "skip zero ratios" rule.
log_ratio = np.zeros_like(token_ratio)
usable = token_ratio > 0
log_ratio[usable] = np.log(token_ratio[usable])

predicted = []
for commentary in test['Commentary']:
    columns = np.array(
        [tokenDict[token] for token in commentary.split(' ') if token in tokenDict],
        dtype=np.intp,
    )
    class_scores = log_prior + log_ratio[:, columns].sum(axis=1)
    # argmax returns the first maximum, matching the original strict ">" test.
    predicted.append(int(np.argmax(class_scores)))

# Keep the column as object dtype so the ids can later be replaced by names.
test['Target'] = pd.Series(predicted, index=test.index, dtype=object)

## Convert Labels back to Names

In [157]:
# Translate the numeric class ids written by the prediction step back into
# the original class names.
class_names = {0: "Run_Bw_Wickets", 1: "Dot", 2: "Boundary", 3: "Wicket"}

# One vectorised pass instead of iterrows() with per-cell .at writes; any
# value that is not a known id (e.g. an already-converted name) is left as is.
test['Target'] = test['Target'].map(lambda class_id: class_names.get(class_id, class_id))

In [159]:
# Save the test frame, including the predicted Target column, to the outputs
# folder.
output_path = '..//5.outputs//output.csv'
test.to_csv(output_path)

# Accuracy so far - 61.828 %

## Include Over_Run_Total Information Also

In [165]:
# Which class ids occur for each distinct Over_Run_Total value.
rows_by_over_runs = train.groupby(['Over_Run_Total'])
rows_by_over_runs['Target'].unique()

Over_Run_Total
0           [1, 3]
1        [1, 3, 0]
2        [1, 0, 3]
3        [1, 0, 3]
4     [0, 1, 3, 2]
5     [0, 1, 2, 3]
6     [0, 1, 2, 3]
7     [0, 1, 2, 3]
8     [0, 1, 2, 3]
9     [0, 2, 1, 3]
10    [0, 1, 2, 3]
11    [1, 0, 2, 3]
12    [1, 2, 0, 3]
13    [2, 1, 0, 3]
14    [0, 2, 1, 3]
15    [2, 0, 1, 3]
16    [2, 0, 3, 1]
17    [0, 2, 1, 3]
18    [2, 1, 0, 3]
19    [2, 0, 1, 3]
20    [1, 2, 0, 3]
21    [0, 2, 1, 3]
22    [2, 0, 1, 3]
23    [2, 0, 1, 3]
24    [2, 0, 1, 3]
25    [2, 0, 1, 3]
26    [2, 1, 0, 3]
27       [2, 0, 1]
28       [2, 0, 1]
29       [2, 0, 3]
30          [2, 0]
31          [0, 2]
32       [0, 2, 3]
34          [2, 0]
36             [2]
Name: Target, dtype: object

In [168]:
# Number of distinct Over_Run_Total values in the training data.
train['Over_Run_Total'].nunique()

35

In [175]:
'''Create Matrix to store prob info for runs'''

# Rows 0-3 hold per-class counts and row 4 the per-column total; columns are
# indexed directly by the Over_Run_Total value. The width is derived from the
# largest value present in either data set instead of a literal 37, so a
# larger run total in train or test cannot index past the end of the matrix.
max_over_runs = int(max(train['Over_Run_Total'].max(), test['Over_Run_Total'].max()))

matrix_runs_prob = coo_matrix((5, max_over_runs + 1))
matrix_runs_prob = matrix_runs_prob.tocsr()
matrix_runs_prob

<5x37 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [176]:
'''Fill Matrix of Runs'''

# Count in a dense array and convert once at the end: element-wise writes into
# a CSR matrix inside iterrows() are very slow, and incrementing the existing
# matrix meant that re-running the cell doubled every count.
run_counts = np.zeros(matrix_runs_prob.shape)

class_ids = train['Target'].to_numpy().astype(int)
over_runs = train['Over_Run_Total'].to_numpy().astype(int)

# np.add.at accumulates correctly when the same (class, runs) cell repeats,
# which plain fancy-index "+=" does not.
np.add.at(run_counts, (class_ids, over_runs), 1)

# Row 4: one increment per training row, i.e. the total number of rows seen
# for each Over_Run_Total value.
np.add.at(run_counts[4], over_runs, 1)

matrix_runs_prob = coo_matrix(run_counts).tocsr()



In [180]:
# Show the stored counts for Over_Run_Total == 0 (class rows plus total row 4).
zero_runs_column = matrix_runs_prob[:, 0]
print(zero_runs_column)

  (1, 0)	2845.0
  (3, 0)	218.0
  (4, 0)	3063.0


In [181]:
'''Make New Predictions'''

# Predict a class id (0-3) for every test row from its tokens and its
# Over_Run_Total.
#
# Scoring rule (unchanged): class prior
#   * count(class, runs) / count(runs)            for the row's Over_Run_Total
#   * count(class, token) / count(token)          for every known token
# where a ratio of exactly 0 is skipped instead of zeroing the product and
# tokens outside tokenDict are ignored. Highest score wins; ties go to the
# lowest class id.
#
# What changed:
#   * an Over_Run_Total never seen in training used to give 0/0 = NaN, which
#     poisoned every class score and forced the prediction to class 0; it is
#     now a neutral factor, so the text still decides. A value outside the
#     table (which used to raise, or wrap around if negative) is treated the
#     same way;
#   * scores are summed in log space so long commentaries cannot underflow;
#   * ratios are precomputed densely instead of indexing the sparse matrices
#     per token, per class, per row.
#
# NOTE(review): the token ratio is not the usual naive-Bayes P(token | class)
# and skipping zero ratios favours classes that never saw a token/run total.
# Kept as-is so results stay comparable with the recorded accuracy.

class_token_counts = matrix.toarray()
token_totals = np.asarray(wordCountVector, dtype=float)
run_counts = matrix_runs_prob.toarray()

with np.errstate(divide='ignore', invalid='ignore'):
    token_ratio = class_token_counts / token_totals
    run_ratio = run_counts[:4] / run_counts[4]
    log_prior = np.log(np.array([prob_of_each_class[c] for c in range(4)], dtype=float))

# log(ratio) where the ratio is positive; 0.0 (a neutral factor of 1) where it
# is zero or undefined.
log_token_ratio = np.zeros_like(token_ratio)
usable_tokens = token_ratio > 0
log_token_ratio[usable_tokens] = np.log(token_ratio[usable_tokens])

log_run_ratio = np.zeros_like(run_ratio)
usable_runs = run_ratio > 0
log_run_ratio[usable_runs] = np.log(run_ratio[usable_runs])

predicted = []
for commentary, over_runs in zip(test['Commentary'], test['Over_Run_Total']):
    if 0 <= over_runs < log_run_ratio.shape[1]:
        run_term = log_run_ratio[:, int(over_runs)]
    else:
        run_term = 0.0

    columns = np.array(
        [tokenDict[token] for token in commentary.split(' ') if token in tokenDict],
        dtype=np.intp,
    )
    class_scores = log_prior + run_term + log_token_ratio[:, columns].sum(axis=1)
    # argmax returns the first maximum, matching the original strict ">" test.
    predicted.append(int(np.argmax(class_scores)))

# Keep the column as object dtype so the ids can later be replaced by names.
test['Target'] = pd.Series(predicted, index=test.index, dtype=object)


In [183]:
# Translate the numeric class ids back into class names, then write the second
# set of predictions to the outputs folder.
class_names = {0: "Run_Bw_Wickets", 1: "Dot", 2: "Boundary", 3: "Wicket"}

# One vectorised pass instead of iterrows() with per-cell .at writes; any
# value that is not a known id is left untouched.
test['Target'] = test['Target'].map(lambda class_id: class_names.get(class_id, class_id))

test.to_csv('..//5.outputs//output2.csv')

# Accuracy = 62.45% 