In [1]:
import pandas as pd
import numpy as np

np.random.seed(1)

In [2]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers.wrappers import TimeDistributed
from keras.layers.embeddings import Embedding
from keras.utils.np_utils import to_categorical
from keras.layers import Bidirectional

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
df = pd.read_csv('train_data/eng_train.csv', encoding='utf-8')

In [4]:
df.head(20)

Unnamed: 0,ids,sentences,OCR
0,1.1.1.1,oṁ namo bhagavate vāsudevāya,om namo bhagavate vasudevaya
1,1.1.2.1,dharmaḥ projjhita-kaitavo ’tra paramo nirmatsa...,dharmah projjhita-kaitavo 'tra paramo nirmatsa...
2,1.1.2.2,vedyaṁ vāstavam atra vastu śivadaṁ tāpa-trayon...,vedyam vastavam atra vastu sivadam tapa-trayon...
3,1.1.2.3,śrīmad-bhāgavate mahā-muni-kṛte kiṁ vā parair ...,Ssrimad-bhagavate maha-muni-krte kim va parair...
4,1.1.2.4,sadyo hṛdy avarudhyate ’tra kṛtibhiḥ śuśrūṣubh...,sadyo hrdy avarudhyate 'tra krtibhih sugsrusub...
5,1.1.3.1,nigama-kalpa-taror galitaṁ phalaṁ,nigama-kalpa-taror galitam phalam
6,1.1.3.2,śuka-mukhād amṛta-drava-saṁyutam,suka-mukhad amrta-drava-samyutam
7,1.1.3.3,pibata bhāgavataṁ rasam ālayam,pibata bhagavatam rasam alayam
8,1.1.3.4,muhur aho rasikā bhuvi bhāvukāḥ,muhur aho rasika bhuvi bhavukah
9,1.1.4.1,naimiṣe ’nimiṣa-kṣetre,naimise 'nimisa-ksetre


In [5]:
df.describe()

Unnamed: 0,ids,sentences,OCR
count,50980,50980,50971
unique,45781,43891,43843
top,4.22.19.3,śrī-śuka uvāca,Sri-Suka uvaca
freq,2,346,346


In [6]:
filter_values = df['sentences'].str.len() == df['OCR'].str.len()

In [7]:
len(filter_values)

50980

In [8]:
filtered_df = df[filter_values]

In [9]:
filtered_df.head()

Unnamed: 0,ids,sentences,OCR
0,1.1.1.1,oṁ namo bhagavate vāsudevāya,om namo bhagavate vasudevaya
1,1.1.2.1,dharmaḥ projjhita-kaitavo ’tra paramo nirmatsa...,dharmah projjhita-kaitavo 'tra paramo nirmatsa...
2,1.1.2.2,vedyaṁ vāstavam atra vastu śivadaṁ tāpa-trayon...,vedyam vastavam atra vastu sivadam tapa-trayon...
5,1.1.3.1,nigama-kalpa-taror galitaṁ phalaṁ,nigama-kalpa-taror galitam phalam
6,1.1.3.2,śuka-mukhād amṛta-drava-saṁyutam,suka-mukhad amrta-drava-samyutam


In [10]:
len(filtered_df) / len(df)

0.9351510396233818

In [11]:
filtered_df['OCR'] = filtered_df['OCR'].astype(str).str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [12]:
filtered_df['OCR'].head()

0                         om namo bhagavate vasudevaya
1    dharmah projjhita-kaitavo 'tra paramo nirmatsa...
2    vedyam vastavam atra vastu sivadam tapa-trayon...
5                    nigama-kalpa-taror galitam phalam
6                     suka-mukhad amrta-drava-samyutam
Name: OCR, dtype: object

In [13]:
vocab = set(filtered_df['sentences'].str.cat() + filtered_df['OCR'].str.cat())

In [14]:
string_lengths = sorted(filtered_df['OCR'].str.len())

In [15]:
SEQ_LENGTH = int(np.percentile(sorted(string_lengths), 99.5))
VOCAB_SIZE = 56

In [16]:
SEQ_LENGTH = 150

In [17]:
vocab = sorted(vocab)
vocab_to_int = {c: i+1 for i, c in enumerate(vocab)}
int_to_vocab = {i+1:c for i,c in enumerate(vocab)}

In [18]:
int_to_vocab[1]

' '

In [19]:
X = filtered_df['OCR'].tolist()

In [20]:
Y = filtered_df['sentences'].tolist()

In [21]:
print(X[0])
print(Y[0])

om namo bhagavate vasudevaya
oṁ namo bhagavate vāsudevāya


In [22]:
X_train = []
Y_train = []

for x,y in zip(X, Y):
    
    X_train.append(np.array([vocab_to_int[ch] for ch in x]))
    Y_train.append(np.array([vocab_to_int[ch] for ch in y]))

In [23]:
X_train = np.array(X_train)
X_train[0]

array([26, 24,  1, 25, 12, 24, 26,  1, 13, 19, 12, 18, 12, 33, 12, 31, 16,
        1, 33, 12, 30, 32, 15, 16, 33, 12, 34, 12])

In [24]:
X_train = pad_sequences(X_train, maxlen=SEQ_LENGTH, dtype='int32', padding='post', truncating='post')
Y_train = pad_sequences(Y_train, maxlen=SEQ_LENGTH, dtype='int32', padding='post', truncating='post')

In [25]:
X_train[0]

array([26, 24,  1, 25, 12, 24, 26,  1, 13, 19, 12, 18, 12, 33, 12, 31, 16,
        1, 33, 12, 30, 32, 15, 16, 33, 12, 34, 12,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

In [26]:
Y_train = to_categorical(Y_train)

In [27]:
print(X_train.shape)
print(Y_train.shape)

(47674, 150)
(47674, 150, 56)


In [28]:
model = Sequential()
model.add(Embedding(VOCAB_SIZE, 100, input_length=SEQ_LENGTH))
model.add(LSTM(64, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 100)          5600      
_________________________________________________________________
lstm_1 (LSTM)                (None, 150, 64)           42240     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 150, 56)           3640      
_________________________________________________________________
activation_1 (Activation)    (None, 150, 56)           0         
Total params: 51,480
Trainable params: 51,480
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model.fit(X_train, Y_train, batch_size=512, verbose=1, epochs=25, validation_split=0.05)

Train on 45290 samples, validate on 2384 samples
Epoch 1/25
 2560/45290 [>.............................] - ETA: 2:06 - loss: 0.0483 - acc: 0.9797

In [30]:
prediction = np.argmax(model.predict(X_train[:1]), axis=2)
prediction[0]

array([26, 24,  1, 25, 12, 24, 26,  1, 13, 19, 12, 18, 12, 33, 12, 31, 16,
        1, 33, 12, 30, 32, 15, 16, 33, 12, 34, 12,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [31]:
prediction_string = "".join([int_to_vocab[c] if c > 0 else '' for c in prediction[0]])
print(prediction_string)

om namo bhagavate vasudevaya


In [32]:
X_train[1], np.argmax(Y_train[1], axis=1)

(array([15, 19, 12, 29, 24, 12, 19,  1, 27, 29, 26, 21, 21, 19, 20, 31, 12,
         7, 22, 12, 20, 31, 12, 33, 26,  1,  5, 31, 29, 12,  1, 27, 12, 29,
        12, 24, 26,  1, 25, 20, 29, 24, 12, 31, 30, 12, 29, 12, 25, 12, 24,
         1, 30, 12, 31, 12, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       dtype=int32),
 array([15, 19, 12, 29, 24, 12, 45,  1, 27, 29, 26, 21, 21, 19, 20, 31, 12,
         7, 22, 12, 20, 31, 12, 33, 26,  1, 55, 31, 29, 12,  1, 27, 12, 29,
        12, 24, 26,  1, 25, 20, 29, 24, 12, 31, 30, 12, 29, 40, 50, 40, 48,
         1, 30, 12, 31, 40, 48,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  

In [33]:
X[0]

'om namo bhagavate vasudevaya'

In [34]:
Y[0]

'oṁ namo bhagavate vāsudevāya'

In [58]:
!ls

README.md                 [31mfor_all_cantos.sh[m[m         [31mtesseract_ocr.sh[m[m
bhagabatam_dataprep.ipynb results_bilstm.csv        [1m[36mtest_data[m[m
[1m[36mcantos[m[m                    seq2seq-lstm.ipynb        text2img
cantos_crawl.py           seq2seq.ipynb             [1m[36mtrain_data[m[m


In [35]:
test_df = pd.read_csv('test_data/test_final.csv', encoding='utf-8')

In [36]:
test_df['source'] = test_df['source'].astype(str).str.lower()

In [37]:
X_test = []
y_test = []
origin = []
for x, y, s in zip(test_df['source'].tolist(), test_df['target'].tolist(), test_df['origin']):
    try:
        X_test.append(np.array([vocab_to_int[ch] for ch in x]))
        y_test.append(y)
        origin.append(s)
    except:
        print("error")

error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error


In [38]:
X_test[:2]

[array([15, 19, 29, 31, 12, 29, 12, 30, 31, 29, 12,  1, 32, 33, 12, 14, 12]),
 array([15, 19, 12, 29, 24, 12,  7, 22, 30, 16, 31, 29, 16,  1, 22, 32, 29,
        32,  7, 22, 30, 16, 31, 29, 16,  1, 30, 12, 24, 12, 33, 16, 31, 12,
         1, 34, 32, 34, 32, 31, 30, 12, 33, 12, 19])]

In [39]:
X_test = pad_sequences(X_test, maxlen=SEQ_LENGTH, dtype='int32', padding='post', truncating='post')

In [40]:
x_test_pred = model.predict_classes(X_test, verbose=1)



In [41]:
test_prediction_string = []
x_test_string = []
for row in x_test_pred:
    test_prediction_string.append("".join([int_to_vocab[c] if c > 0 else '' for c in row]))
    
    
for row in X_test:
    x_test_string.append("".join([int_to_vocab[c] if c > 0 else '' for c in row]))


In [42]:
test_prediction_string

['dhrtarastra uvāca',
 'dharma-kṣetre kuru-kṣetre samaveta yuyutsavaḥ',
 'mamakaḥ pandavas caiva kim akurvata sanjaya',
 'karpaṇya-doṣopaḥata-svabhavaḥ',
 'prechaṁi tvaṁ dharma-sammudha-cetaḥ',
 'yac chreyaḥ syan niṣcitaṁ bṛuhi tan me',
 'siśyas te ’haṁ sadhi mam tvaṁ prapannaṁ',
 'srī-bhagavan uvaca',
 'asocyan anvasocas tvaṁ prajña-vadams ca bhasase',
 'gatasun agatasums ca nanuṣocanti panditaḥ',
 'na tv evahaṁ jatu nasaṁ na tvaṁ neme janadhipaḥ',
 'na caiva na bhaviṣyaṁaḥ sarve vayaṁ ataḥ param',
 'dehino ’smin yatha dehe kaumaram yauvanaṁ jara',
 'tathā dehantara-praptir dhiras tatra na muhyati',
 'matra-sparṣas tu kaunteya sitoṣna-sukha-duḥkha-dah',
 'agamapayino ’nityas tams titikṣasva bharata',
 'avinasi tu tad viddhi yena sarvam idam tatam',
 'vinasaṁ avyayasyasya na kascit kartum arhati',
 'na jayate mṛiyate va kadacin nayaṁ bhutva bhavita va na bhuyaḥ',
 'ajo nityaḥ sasvato ’yam puraṇo na hanyate hanyamane sarire',
 'vasaṁsi jirṇani yatha vihaya',
 'navani gṛhṇati naro ’paraṇ

In [43]:
final_df = pd.DataFrame(columns=["source", "target", "prediction", "origin"])

In [44]:
final_df['source'] = x_test_string
final_df['target'] = y_test
final_df['prediction'] = test_prediction_string
final_df['origin'] = origin

In [45]:
final_df.head()

Unnamed: 0,source,target,prediction,origin
0,dhrtarastra uvaca,dhṛtarāṣṭra uvāca,dhrtarastra uvāca,pdf
1,dharma-ksetre kuru-ksetre samaveta yuyutsavah,dharma-kṣetre kuru-kṣetre samavetā yuyutsavaḥ,dharma-kṣetre kuru-kṣetre samaveta yuyutsavaḥ,pdf
2,mamakah pandavas caiva kim akurvata sanjaya,māmakāḥ pāṇḍavāś caiva kim akurvata sañjaya,mamakaḥ pandavas caiva kim akurvata sanjaya,pdf
3,karpanya-dosopahata-svabhavah,kārpaṇya-doṣopahata-svabhāvaḥ,karpaṇya-doṣopaḥata-svabhavaḥ,pdf
4,prechami tvam dharma-sammudha-cetah,pṛcchāmi tvāṁ dharma-sammūḍha-cetāḥ,prechaṁi tvaṁ dharma-sammudha-cetaḥ,pdf


In [46]:
final_df.to_csv('results_lstm_seq_len_150_emb_100.csv', encoding='utf-8')