In [0]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_excel('CCC_Train.xlsx')
train.head()

Unnamed: 0,ID,Match_ID,Over,Commentary,Over_Run_Total,Target
0,0,803965640511,49.6,and india reach 300. there has been a 300 in ...,4,Run_Bw_Wickets
1,1,803965640511,49.5,"slower ball, ashwin bunts this to leg for -99...",4,Run_Bw_Wickets
2,2,803965640511,49.4,"full toss on off, he just slogs, gets a thick...",4,Run_Bw_Wickets
3,3,803965640511,49.3,that's the closest you'll get to a hat-trick ...,4,Dot
4,4,803965640511,49.2,sohail is doing the sajda after bowling rahan...,4,Wicket


In [3]:
train.columns

Index(['ID', 'Match_ID', 'Over', 'Commentary', 'Over_Run_Total', 'Target'], dtype='object')

## Clean Commentary Text

In [4]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
from nltk.tokenize import RegexpTokenizer
from nltk import stem

def clean_paragraph(para):
    lmtzr = stem.WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    
    p = ' '.join([lmtzr.lemmatize(token.lower()) for token in tokenizer.tokenize(para)])
    
    return p

In [6]:
import time; t0 = time.time()

train.Commentary = [clean_paragraph(para) for para in train.Commentary]

print(time.time() - t0)

9.811755657196045


In [7]:
train.Commentary[0]

'and india reach 300 there ha been a 300 in every first inning of this cup so far shami drive this full ball over point and come back a second'

## Build Count Vector on Training Data

In [0]:
import sklearn, nltk
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
vectorizer = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize)
train_vectorized_data = vectorizer.fit_transform(train.Commentary).toarray()

In [10]:
'''
101634 docs, 8828 unique tokens
'''
print(train_vectorized_data.shape)
print(type(train_vectorized_data[0]))
len(train_vectorized_data[534])

(101634, 8828)
<class 'numpy.ndarray'>


8828

In [11]:
train.Target.value_counts()

Run_Bw_Wickets    43085
Dot               42522
Boundary          12134
Wicket             3893
Name: Target, dtype: int64

In [12]:
'''Convert Class names to numbers for easy row indexing'''

for index, row in train.iterrows():
    if row.Target == "Run_Bw_Wickets":
        train.at[index, 'Target'] = 0
    if row.Target == "Dot":
        train.at[index, 'Target'] = 1
    if row.Target == "Boundary":
        train.at[index, 'Target'] = 2
    if row.Target == "Wicket":
        train.at[index, 'Target'] = 3

train.Target.value_counts()

0    43085
1    42522
2    12134
3     3893
Name: Target, dtype: int64

In [13]:
train.Target[:10]

0    0
1    0
2    0
3    1
4    3
5    3
6    3
7    1
8    0
9    0
Name: Target, dtype: object

## Clean and Vectorize Test Data

In [14]:
test = pd.read_excel('CCC_Test.xlsx')
print(test.head())
print(test.shape)

'''Clean test Data'''
test.Commentary = [clean_paragraph(para) for para in test.Commentary]

# Run vectorizer on it.
test_vectorized_data = vectorizer.transform(test.Commentary).toarray()

   ID  ...  Over_Run_Total
0   0  ...               9
1   1  ...               9
2   2  ...               9
3   3  ...               9
4   4  ...               9

[5 rows x 5 columns]
(26143, 5)


In [15]:
'''Verify Shapes of input variables'''

print(test_vectorized_data.shape)
print(train_vectorized_data.shape)
test.Commentary[0]

(26143, 8828)
(101634, 8828)


'the yorker gone wrong low full toss and dhoni ha been waiting he sends it sailing over midwicket for 999'

## Run Network on Train Data

In [16]:
from keras import models, layers

Using TensorFlow backend.


In [17]:
network = models.Sequential()
network.add(layers.Dense(64, activation='relu', input_shape=(8828,)))
network.add(layers.Dense(32, activation='relu', input_shape=(8828,)))
network.add(layers.Dense(16, activation='relu', input_shape=(8828,)))
network.add(layers.Dense(8, activation='relu', input_shape=(8828,)))
network.add(layers.Dense(4, activation='softmax'))






In [18]:
network.compile(optimizer = 'rmsprop',
             loss = 'sparse_categorical_crossentropy',
             metrics = ['accuracy'])





In [19]:
'''Without Validation sets = 83.04%'''
x_validation_set = train_vectorized_data[:15246]
x_train_set = train_vectorized_data[15246:]

y_validation_set = train.Target[:15246]
y_train_set = train.Target[15246:]

network.fit(train_vectorized_data, train.Target, epochs=10, batch_size=512)
            # validation_data = (x_validation_set, y_validation_set))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0de33eada0>

In [0]:
predicted_test_labels = network.predict(test_vectorized_data)

In [21]:
predicted_test_labels.shape

(26143, 4)

In [22]:
np.sum(predicted_test_labels[12])

1.0000001

In [23]:
np.argmax(predicted_test_labels[1])

1

In [24]:
pred_labels = []
for row in predicted_test_labels:
    pred_labels.append(np.argmax(row))

pred_labels[:10]

[2, 1, 1, 3, 1, 0, 0, 1, 1, 1]

In [0]:
test['Target'] = pred_labels

In [26]:
test.head()

Unnamed: 0,ID,Match_ID,Over,Commentary,Over_Run_Total,Target
0,0,803965640511,47.6,the yorker gone wrong low full toss and dhoni ...,9,2
1,1,803965640511,47.5,short of a length no room worked to leg for 999,9,1
2,2,803965640511,47.4,good yorker dhoni is making room but can t get...,9,1
3,3,803965640511,47.3,raina is gone finally to the short ball but af...,9,3
4,4,803965640511,47.2,not called a wide great call 999 say he ha mov...,9,1


In [27]:
print(test.dtypes)
print(type(test.Target[0]))

test.Target = test.Target.astype('str')

print(test.dtypes)

ID                  int64
Match_ID            int64
Over              float64
Commentary         object
Over_Run_Total      int64
Target              int64
dtype: object
<class 'numpy.int64'>
ID                  int64
Match_ID            int64
Over              float64
Commentary         object
Over_Run_Total      int64
Target             object
dtype: object


In [28]:
'''Convert Labels back to Names'''

for index, row in test.iterrows():
    if row.Target == "0":
        test.at[index, 'Target'] = "Run_Bw_Wickets"
    if row.Target == "1":
        test.at[index, 'Target'] = "Dot"
    if row.Target == "2":
        test.at[index, 'Target'] = "Boundary"
    if row.Target == "3":
        test.at[index, 'Target'] = "Wicket"
        
test.head()

Unnamed: 0,ID,Match_ID,Over,Commentary,Over_Run_Total,Target
0,0,803965640511,47.6,the yorker gone wrong low full toss and dhoni ...,9,Boundary
1,1,803965640511,47.5,short of a length no room worked to leg for 999,9,Dot
2,2,803965640511,47.4,good yorker dhoni is making room but can t get...,9,Dot
3,3,803965640511,47.3,raina is gone finally to the short ball but af...,9,Wicket
4,4,803965640511,47.2,not called a wide great call 999 say he ha mov...,9,Dot


In [0]:
test.to_csv('output_DeepLearning_v3.csv')

# Best Accuracy so far - 83.175 %

1. v1 - without validation set - epochs=10, batch_size=512 -> 83.175 %
2. v2 - with validation set till 15246 - epochs=20, batch_size=128 -> 80.726 %
3. v3 - without validation - epochs=20, batch_size=128 -> 80.932%