In [1]:
import numpy as np 
import pandas as pd 
import os

In [2]:
import pandas as pd
from fastai import *
from fastai.text import *
import matplotlib.pyplot as plt

##### Preparing TrainingData and TestData csv

In [5]:
# Build the two slim CSVs that the fastai data bunches read later on:
#   TrainingData.csv -> columns Target (integer class id) and Commentary
#   TestData.csv     -> column Commentary only
df_tr = pd.read_csv('../Data/CCC_TrainingData.csv')
df_ts = pd.read_csv('../Data/CCC_TestData.csv')

# Integer ids for the four outcome classes.
target_ids = {'Dot': 0, 'Run_Bw_Wickets': 1, 'Boundary': 2, 'Wicket': 3}

df_tr1 = df_tr[['Target', 'Commentary']].copy()
# Encode the label column only. A frame-wide replace would also rewrite any
# Commentary cell whose entire text happens to equal one of the label names.
df_tr1['Target'] = df_tr1['Target'].replace(target_ids)

df_ts1 = df_ts[['Commentary']].copy()

df_tr1.to_csv('../Data/TrainingData.csv', index=False)
df_ts1.to_csv('../Data/TestData.csv', index=False)

In [6]:
# Directory and file name of the prepared training CSV.
# (On Kaggle the directory was '/kaggle/input/cricket-commentary/'.)
datapath, datafile = '../Data/', 'TrainingData.csv'

#### Creating the dataset for Language Model

In [7]:
# Language-model data bunch built from the prepared training CSV.
data_lm = TextLMDataBunch.from_csv(path=datapath, csv_name=datafile)

##### Creating Language Model, with AWD_LSTM architecture

In [8]:
# AWD-LSTM language model. drop_mult=0.5 was also tried and gave less accuracy.
learn_lm = language_model_learner(
    data_lm,
    AWD_LSTM,
    drop_mult=0.7,
)

##### Finding optimum learning rate

In [None]:
#learn_lm.lr_find(start_lr=1e-8, end_lr=1e2)
#learn_lm.recorder.plot()

##### Training using fit one cycle approach

In [None]:
# Model files for this learner go to ../Output/ (the Kaggle run used '/kaggle/working/').
learn_lm.model_dir = '../Output/'

# Stage 1: a single one-cycle epoch before anything is unfrozen
# (noted as the better-accuracy setting, version 14).
learn_lm.fit_one_cycle(cyc_len=1, max_lr=1e-1)

# Stage 2 (version 24): unfreeze all layers and train for 9 epochs.
# Earlier attempts with the same learning-rate slice: 20 epochs, and 8 epochs (version 21).
# Version 27 tried 1 epoch at 1e-3 with moms=(0.8, 0.7) and the callbacks
# ShowGraph + SaveModelCallback(monitor='accuracy', mode='max');
# its note reads: "9 is best 2.517068 2.853009 0.405768".
learn_lm.unfreeze()
learn_lm.fit_one_cycle(cyc_len=9, max_lr=slice(1e-2/10, 1e-2))

learn_lm.recorder.plot_metrics()

In [None]:
#Version 21 update
#learn_lm.model_dir='/kaggle/working/'
# Run one epoch with lower layers 
#learn_lm.fit_one_cycle(cyc_len=1, max_lr=1e-3, moms=(0.8, 0.7))

# Run for many epochs with all layers unfrozen
#learn_lm.unfreeze()
#learn_lm.fit_one_cycle(cyc_len=20, max_lr=1e-3, moms=(0.8, 0.7))

#Version 21
#learn_lm.recorder.plot_losses()




In [None]:
# Quick qualitative check: let the language model continue a commentary
# fragment by 15 words.
learn_lm.predict(
    text="the yorker gone ",
    n_words=15,
)

#### Saving the encoder

In [None]:
# Persist the language model's encoder under the name 'lm_enc' so the
# classifier can load it afterwards.
learn_lm.save_encoder(name='lm_enc')

##### Building the Classifier

In [None]:
# Classifier data bunch; it reuses the language model's vocabulary.
# Multiple batch sizes were tried; 32 is the one kept.
data_clas = TextClasDataBunch.from_csv(
    path=datapath,
    csv_name=datafile,
    vocab=data_lm.train_ds.vocab,
    bs=32,
)

In [None]:
# AWD-LSTM text classifier. A dropout multiplier of 0.7 gave better accuracy
# than the 0.5 that was also tried.
learn_clas = text_classifier_learner(
    data_clas,
    AWD_LSTM,
    drop_mult=0.7,
)

In [None]:
# Load the encoder that the language model saved as 'lm_enc'.
#
# The language-model learner had model_dir set to '../Output/' when it saved,
# so the file sits at <learner path>/../Output/lm_enc.pth. fastai v1 resolves
# load_encoder(name) as <learner path>/<model_dir>/<name>.pth, and this
# learner's model_dir has not been changed yet, so passing '../Output/lm_enc'
# would be resolved relative to the default model directory and miss the file.
# Point the classifier at the same model_dir and load by bare name instead.
# (The Kaggle run used the absolute name '/kaggle/working/lm_enc'.)
learn_clas.model_dir = '../Output/'
learn_clas.load_encoder('lm_enc')

##### Handling class imbalance of labels

In [None]:
# Class frequencies of the encoded training labels.
# (Kaggle path: '../input/cricket-commentary/TrainingData.csv')
df = pd.read_csv('../Data/TrainingData.csv')

label_sum = df.shape[0]                        # total number of rows
labelcounts = df.groupby(["Target"]).size()    # rows per Target value
class_imbalance = [n_rows / label_sum for n_rows in labelcounts]  # share of each class

In [None]:
# Weight each class by (1 - its share of the training rows), so classes with
# fewer rows get a larger weight in the loss.
weights_balance = [(1 - count / label_sum) for count in labelcounts]

# Put the weights on the same device as the learner's data instead of
# hard-coding CUDA, so the cell also runs on a CPU-only machine.
loss_weights = torch.FloatTensor(weights_balance).to(learn_clas.data.device)

# fastai v1 learners take their criterion from `loss_func`. Assigning to
# `crit` (the attribute name used by fastai 0.7) only creates an unused
# attribute, which means the class weights were never applied during training.
# NOTE(review): accuracy figures recorded in this notebook were presumably
# obtained without the weights taking effect - re-validate after this change.
learn_clas.loss_func = partial(F.cross_entropy, weight=loss_weights)

##### Finding learning rate for classifier

In [None]:
## Find Learning rate for classifier
#learn_clas.model_dir='/kaggle/working/'
#learn_clas.freeze()
#learn_clas.lr_find(start_lr=1e-8, end_lr=1e2)
#learn_clas.recorder.plot()


In [None]:
## Version 20 update
#learn_clas.freeze()
#learn_clas.fit_one_cycle(cyc_len=1, max_lr=1e-3, moms=(0.8, 0.7))

#learn_clas.freeze_to(-2)
#learn_clas.fit_one_cycle(1, slice(1e-4,1e-2), moms=(0.8,0.7))

#learn_clas.freeze_to(-3)
#learn_clas.fit_one_cycle(1, slice(1e-5,5e-3), moms=(0.8,0.7))

#learn_clas.unfreeze()
#learn_clas.fit_one_cycle(5, slice(1e-5,1e-3), moms=(0.8,0.7))

##### Training the Classifier

In [None]:
# Version 15: train the layers step by step. Start fully frozen, then call
# freeze_to(-2) and freeze_to(-3) in turn, one epoch each.
learn_clas.freeze()
learn_clas.fit_one_cycle(cyc_len=1, max_lr=1e-2)

for freeze_point in (-2, -3):
    learn_clas.freeze_to(freeze_point)
    learn_clas.fit_one_cycle(cyc_len=1, max_lr=slice(1e-3/10, 1e-3))

# Version 24: everything unfrozen, 13 epochs.
# Recorded notes from the runs:
#   13 epochs -> "13 is best 0.174861 0.310772 0.895312"
#   11 epochs -> "0.174883 0.339358 0.888621" (marked "best" before the 13-epoch run)
# A 20-epoch run was also tried, as was a 1-epoch run with the callbacks
# ShowGraph + SaveModelCallback(monitor='accuracy', mode='max') to find the
# best accuracy.
learn_clas.unfreeze()
learn_clas.fit_one_cycle(cyc_len=13, max_lr=slice(1e-3/10, 1e-3))

In [None]:
## Best accuracy - commenting for now
#learn_clas.fit_one_cycle(20, 1e-2)
#learn_clas.unfreeze()
#learn_clas.fit_one_cycle(20, slice(1e-3/10, 1e-3))

##### Saving the classifier 

In [None]:
# Export the trained classifier as '../Output/sub.pkl'
# (the Kaggle run used '/kaggle/working/sub.pkl'), then plot the metrics
# recorded during training.
path = Path('../Output/sub.pkl')
learn_clas.export(path)

learn_clas.recorder.plot_metrics()

##### Prediction

In [None]:
# Score the test commentary with the trained classifier and write the raw
# submission file.
# (Kaggle paths: '../input/testdata/TestData.csv' in,
#  '/kaggle/working/submission.csv' out.)
test_df = pd.read_csv('../Data/TestData.csv')

learn_clas.data.add_test(test_df['Commentary'])

# ordered=True requests the predictions in the row order of the test set.
prob_preds = learn_clas.get_preds(ds_type=DatasetType.Test, ordered=True)

# prob_preds[0] holds one row of class scores per test item. Take the arg-max
# on the tensor itself and hand pandas a plain NumPy array, rather than
# routing a torch tensor through np.argmax and pd.DataFrame.
labels = prob_preds[0].argmax(dim=1).cpu().numpy()

# Use the real IDs from the original test file instead of a synthesized
# 0..n-1 index: a later cell merges this submission with CCC_TestData.csv on
# 'ID', so the two must agree even if the IDs do not start at 0 or have gaps.
# TestData.csv was written from CCC_TestData.csv row for row, so the order
# of the predictions matches the order of these IDs.
test_ids = pd.read_csv('../Data/CCC_TestData.csv')['ID']
if len(test_ids) != len(labels):
    raise ValueError(
        f'Got {len(labels)} predictions for {len(test_ids)} test IDs'
    )

df_f = pd.DataFrame({'ID': test_ids.to_numpy(), 'Target': labels})
# Map the integer class ids back to their label names.
df_f['Target'] = df_f['Target'].replace(
    {0: 'Dot', 1: 'Run_Bw_Wickets', 2: 'Boundary', 3: 'Wicket'}
)

df_f.to_csv('../Output/submission.csv', index=False)

In [5]:
# Reload the model's submission (dfa) and the original test file (dfb).
dfa, dfb = (
    pd.read_csv(csv_path)
    for csv_path in ('../Output/submission.csv', '../Data/CCC_TestData.csv')
)

In [8]:
# Join each prediction with its test-file row, matching on ID
# (inner join: only IDs present in both frames are kept).
df_merge = dfa.merge(dfb, how='inner', on='ID')

In [14]:
# Predicted classes that the Over_Run_Total rules may overwrite:
# c2 for the "< 4" rule, c1 for the "== 0" rule.
c2 = ['Boundary']
c1 = [*c2, 'Run_Bw_Wickets']

##### Creating a Rule to utilize Over Run Total feature

In [15]:
# Rule 1: a predicted Boundary or Run_Bw_Wickets with Over_Run_Total equal
# to 0 is relabelled as Dot.
df_merge['Target'] = df_merge['Target'].mask(
    df_merge['Target'].isin(c1) & df_merge['Over_Run_Total'].eq(0),
    'Dot',
)
# Rule 2 (evaluated on the result of rule 1): a predicted Boundary with
# Over_Run_Total below 4 is relabelled as Run_Bw_Wickets.
df_merge['Target'] = df_merge['Target'].mask(
    df_merge['Target'].isin(c2) & df_merge['Over_Run_Total'].lt(4),
    'Run_Bw_Wickets',
)


In [18]:
# Keep only the two submission columns and write the rule-adjusted result.
df_sub = df_merge.loc[:, ['ID', 'Target']].copy()
df_sub.to_csv(path_or_buf='../Output/final.csv', index=False)