In [None]:
!pip install python-crfsuite



In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, HTTPError
import pycrfsuite
import urllib.request


In [None]:
# Download the training corpus into the working directory.
url = 'https://github.com/ravichoudhary98/CRFs_sanskrit_word_segmenation/raw/master/data/'
file_name = 'output_train_sent1.txt'
base_url = url + file_name
urllib.request.urlretrieve(base_url, file_name)

# Build the list of Sanskrit sentences, one per line of the file.
# The corpus is Devanagari text, so decode it explicitly as UTF-8 instead of
# relying on the platform's default encoding (which is not UTF-8 everywhere).
with open(file_name, encoding='utf-8') as f:
    # strip() removes the trailing `\n` and any surrounding whitespace of each line
    sentences = [line.strip() for line in f]

In [None]:
# Number of lines read from the training file.
len(sentences)

102712

In [None]:
# Remove very short "sentences": keep only lines longer than 5 characters.
sentences = [s for s in sentences if len(s) > 5]

In [None]:
# Show one sample sentence; words are separated by single spaces.
print(sentences[200])

▁यस्य ▁साड्य ▁अ Bआव् अः ▁प्रमाRअ ▁अन्तर् एRअ ▁निScइत ः ▁स ▁बाड् इतः


To prepare the training data, every sentence is converted into a list of characters, each paired with a label indicating whether that character marks the beginning of a new word.

In [None]:
# Convert every sentence into a list of (char, label) pairs, where label is 1
# if the character starts a new word (i.e. it followed a space in the original
# sentence) and 0 otherwise. The first character of a sentence is labelled 0.
prepared_sentences = []

for sentence in sentences:
    # Cumulative word lengths give, in the space-free string, the index at
    # which the *next* word begins. A set keeps the membership test below O(1)
    # instead of scanning a list once per character.
    boundaries = set()
    next_pos = 0
    for word in sentence.split(" "):
        next_pos += len(word)
        boundaries.add(next_pos)

    concatenated = sentence.replace(" ", "")

    # The final boundary equals len(concatenated) and therefore never matches
    # an index, so no spurious label is produced for the sentence end.
    prepared_sentences.append(
        [(c, 1 if i in boundaries else 0) for i, c in enumerate(concatenated)]
    )


print(prepared_sentences[21])

[('▁', 0), ('द', 0), ('े', 0), ('व', 0), ('ा', 0), ('ग', 1), ('ा', 0), ('र', 0), ('्', 0), ('अ', 1), ('स', 0), ('्', 0), ('य', 0), ('▁', 1), ('ख़', 0), ('न', 1), ('्', 0), ('अ', 1), ('न', 0), ('म', 0), ('्', 0), ('▁', 1), ('न', 0), ('ि', 0), ('ड', 0), ('ा', 0), ('न', 0), ('▁', 1), ('ख़', 0), ('न', 1), ('्', 0), ('अ', 1), ('न', 0), ('म', 0), ('्', 0), ('▁', 1), ('त', 0), ('ट', 0), ('ा', 0)]


## Transforming the characters to feature vectors.

Finally, we can create some simple n-gram features. Obviously, you could think of much more sophisticated features and possibly improve our model's performance.

In [None]:
def create_char_features(sentence, i):
    """Build the list of string features for the character at index ``i``.

    ``sentence`` is an indexable sequence whose items expose the character as
    element ``[0]`` (either ``(char, label)`` pairs or a plain string).

    The features are a bias term, the character itself, and character n-grams
    reaching up to three positions to the left and to the right. ``"BOS"`` is
    emitted when there is no left neighbour and ``"EOS"`` when there is no
    right neighbour.
    """
    def span(first, last):
        # Characters at offsets first..last (inclusive, relative to i), joined.
        return ''.join(sentence[i + k][0] for k in range(first, last + 1))

    feats = ['bias', 'char=' + span(0, 0)]

    # --- left context -----------------------------------------------------
    if i < 1:
        feats.append("BOS")
    else:
        feats += [
            'char-1=' + span(-1, -1),
            'char-1:0=' + span(-1, 0),
        ]

    if i >= 2:
        feats += [
            'char-2=' + span(-2, -2),
            'char-2:0=' + span(-2, 0),
            'char-2:-1=' + span(-2, -1),
        ]

    if i >= 3:
        feats += [
            'char-3:0=' + span(-3, 0),
            'char-3:-1=' + span(-3, -1),
        ]

    # --- right context ----------------------------------------------------
    remaining = len(sentence) - 1 - i  # number of characters after position i

    if remaining < 1:
        feats.append("EOS")
    else:
        feats += [
            'char+1=' + span(1, 1),
            'char:+1=' + span(0, 1),
        ]

    if remaining >= 2:
        feats += [
            'char+2=' + span(2, 2),
            'char:+2=' + span(0, 2),
            'char+1:+2=' + span(1, 2),
        ]

    if remaining >= 3:
        feats += [
            'char:+3=' + span(0, 3),
            'char+1:+3=' + span(1, 3),
        ]

    return feats



def create_sentence_features(prepared_sentence):
    """Return one feature list per position of ``prepared_sentence``, in order."""
    feature_rows = []
    for position in range(len(prepared_sentence)):
        feature_rows.append(create_char_features(prepared_sentence, position))
    return feature_rows

def create_sentence_labels(prepared_sentence):
    """Return the label (element ``[1]`` of every item) as a list of strings."""
    label_strings = []
    for item in prepared_sentence:
        label_strings.append(str(item[1]))
    return label_strings

In [None]:
# Hold out the last 30000 prepared sentences for testing; train on the rest.
n_test = 30000
train_part = prepared_sentences[:-n_test]
test_part = prepared_sentences[-n_test:]

X = list(map(create_sentence_features, train_part))
y = list(map(create_sentence_labels, train_part))

X_test = list(map(create_sentence_features, test_part))
y_test = list(map(create_sentence_labels, test_part))

In [None]:
# Sanity check: number of sequences in the training and held-out splits.
len(X),len(y),len(X_test),len(y_test)

(71957, 71957, 30000, 30000)

## Training a CRF
Now, we use Python-CRFSuite for training a CRF.

In [None]:
# Create a non-verbose trainer and register every training sequence
# (per-character feature lists plus the matching label strings) with it.
trainer = pycrfsuite.Trainer(verbose=False)

for feature_seq, label_seq in zip(X, y):
    trainer.append(feature_seq, label_seq)

In [None]:
# Training hyper-parameters (meanings as given in the CRFsuite manual).
crf_params = {
    'c1': 1.0,                              # coefficient for L1 regularization
    'c2': 1e-3,                             # coefficient for L2 regularization
    'max_iterations': 60,                   # cap on optimizer iterations
    'feature.possible_transitions': True,   # also generate transitions unseen in training
}
trainer.set_params(crf_params)

In [None]:
# Inspect the concrete type of the trainer object.
type(trainer)

pycrfsuite._pycrfsuite.Trainer

In [None]:
# Train on the appended sequences; the model file named here is the one the tagger opens next.
trainer.train('sanskrit-text-segmentation.crfsuite')

In [None]:
# Create a tagger and load the model file written by the training cell.
tagger = pycrfsuite.Tagger()
tagger.open('sanskrit-text-segmentation.crfsuite')

<contextlib.closing at 0x7f684ff4a898>

In [None]:
def segment_sentence(sentence):
    """Return ``sentence`` re-segmented by the trained tagger.

    All spaces are removed first; then a single space is inserted before
    every character the tagger labels ``"1"``. Any other character of the
    input (including a trailing newline) is kept as is.
    """
    sent = sentence.replace(" ", "")
    tags = tagger.tag(create_sentence_features(sent))

    pieces = []
    for idx, tag in enumerate(tags):
        char = sent[idx]
        pieces.append(" " + char if tag == "1" else char)
    return "".join(pieces)

In [None]:
# Download the raw input text that should be segmented.
url = 'https://github.com/ravichoudhary98/CRFs_sanskrit_word_segmenation/raw/master/data/'
file_name = 'input.txt'
base_url = url + file_name
urllib.request.urlretrieve(base_url, file_name)

# Convert the whole input file into segmented text, written to output.txt.
# Both files are managed by `with`, so output.txt is flushed and closed before
# any later cell reads it; UTF-8 is requested explicitly because the text is
# Devanagari and the platform default encoding is not UTF-8 everywhere.
with open('input.txt', 'r', encoding='utf-8') as f, \
        open("output.txt", "w", encoding='utf-8') as outF:
    for line in f:
        if len(line) <= 10:
            # Lines of at most 10 characters (newline included) are copied unchanged.
            outF.write(line)
        else:
            # segment_sentence only strips spaces, so the line's trailing
            # newline is carried through into the output.
            outF.write(segment_sentence(line))

In [None]:
# Load the segmented output as a fixed-width table without a header and show its row count.
# NOTE(review): read_fwf infers column boundaries from whitespace; the row count
# may differ from the raw line count of output.txt (e.g. blank lines) — confirm.
import pandas as pd
df = pd.read_fwf('output.txt', header=None)
len(df)

6283

In [None]:
# Try the segmenter on two unsegmented input strings.
print(segment_sentence("त्रिपञ्चाशदधिकैकशतं"))
print(segment_sentence("एकदिवसीयक्रिकेटस्पर्धामालिकायां"))

त्रि पञ् चाशदधिकैकशतं
एक दिवसीय क्रि के ट स्पर्धा मालिका यां


Finally, let's find out how well our model performs.

In [None]:
# Per-character evaluation on the held-out split.
# tp / fp / fn refer to the positive class "1" (character starts a new word);
# n_correct / n_incorrect count every character regardless of class.
tp = 0
fp = 0
fn = 0
n_correct = 0
n_incorrect = 0

# X_test / y_test already hold the features and gold labels of the last 30000
# prepared sentences, so they are reused instead of being recomputed.
for test_features, gold_labels in zip(X_test, y_test):
    predicted_labels = tagger.tag(test_features)
    # Count explicitly rather than building throw-away lists whose element
    # expression relied on IPython's implicit `_` variable.
    for predicted, gold in zip(predicted_labels, gold_labels):
        if predicted == gold:
            n_correct += 1
            if predicted == "1":
                tp += 1
        else:
            n_incorrect += 1
            if predicted == "1" and gold == "0":
                fp += 1
            elif predicted == "0" and gold == "1":
                fn += 1

In [None]:
# Report precision and recall for the word-start class and overall per-character accuracy.
print("Precision:\t{}".format(tp / (tp + fp)))
print("Recall:\t\t{}".format(tp / (tp + fn)))
print("Accuracy:\t{}".format(n_correct / (n_correct + n_incorrect)))

Precision:	0.979928550540218
Recall:		0.9284058871761847
Accuracy:	0.9815456482318868
