First, perform some imports. The Python CRFSuite can be installed via
__ pip install python-crfsuite __

In [None]:
!pip install python-crfsuite



In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, HTTPError
import pycrfsuite

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#sentences list for Bangali data
sentences = list()
with open("/content/drive/My Drive/IIT(BHU) LAB/Completed  work/CRF_DECOMPOUNDING_BENGALI/bengali_train.txt") as f:
    sentences = f.readlines()
#Remove whitespace characters like `\n` at the end of each line
sentences = [x.strip() for x in sentences]

In [None]:
len(sentences)

106084

In [None]:
sentences = [s for s in sentences if len(s) > 5] # remove very short "sentences"

In [None]:
print(sentences[200]),len(sentences),

বললে টাটা সুমো এসেছিল!


(None, 55173)

For preparing our training data, every sentence is converted into a char list together with the information wether the char marks the beginning of a new word.

In [None]:
prepared_sentences = list()
#apply loop on sentences list to select one sentence
for sentence in sentences:
    #create a list lengths that have length of every word in sentence
    lengths = [len(w) for w in sentence.split(" ")]
    positions = []

    next_pos = 0
    #apply loop on 'lengths' list to select one word length
    for length in lengths:
        #set next position to next word
        next_pos = next_pos + length
        #append every word position to positions list
        positions.append(next_pos)
    #remove white spaces from 'sentence' and make one single string and store in 'concatenated' 
    concatenated = sentence.replace(" ", "")
    #take every character from 'concatenated' string variable and store in 'chars' list
    chars = [c for c in concatenated]
    #generate labels for every character if it is at starting position then 1 else 0 
    labels = [0 if not i in positions else 1 for i, c in enumerate(concatenated)]
    #add every character with there labels into list 'prepared_sentences'
    prepared_sentences.append(list(zip(chars, labels)))
    
    
print([d for d in prepared_sentences[200]])

[('ব', 0), ('ল', 0), ('ল', 0), ('ে', 0), ('ট', 1), ('া', 0), ('ট', 0), ('া', 0), ('স', 1), ('ু', 0), ('ম', 0), ('ো', 0), ('এ', 1), ('স', 0), ('ে', 0), ('ছ', 0), ('ি', 0), ('ল', 0), ('!', 0)]


## Transforming the characters to feature vectors.

Finally, we can create some simple n-gram features. Obviously, you could think of much more sophisticated features and possibly improve our model's performance.

In [None]:
#here sentence is prepared_sentence and i is length of prepared_sentence
def create_char_features(sentence, i):
    #set initial feature set char as first char in prepared_sentence
    features = [
        'bias',
        'char=' + sentence[i][0] 
    ]
    #if i >=1 then go to previous character else append 'BOS' in features list 
    if i >= 1:
        features.extend([
            'char-1=' + sentence[i-1][0],
            'char-1:0=' + sentence[i-1][0] + sentence[i][0],
        ])
    else:
        features.append("BOS")
        
    if i >= 2:
        features.extend([
            'char-2=' + sentence[i-2][0],
            'char-2:0=' + sentence[i-2][0] + sentence[i-1][0] + sentence[i][0],
            'char-2:-1=' + sentence[i-2][0] + sentence[i-1][0],
        ])
        
    if i >= 3:
        features.extend([
            'char-3:0=' + sentence[i-3][0] + sentence[i-2][0] + sentence[i-1][0] + sentence[i][0],
            'char-3:-1=' + sentence[i-3][0] + sentence[i-2][0] + sentence[i-1][0],
        ])
    #if i+1 < len(sentence) then go to next character and set it to next character and set char to next two characters else append 'EOS' to features list
    if i + 1 < len(sentence):
        features.extend([
            'char+1=' + sentence[i+1][0],
            'char:+1=' + sentence[i][0] + sentence[i+1][0],
        ])
    else:
        features.append("EOS")
    #if first if condition satisfy then go to second and third if condition and do the same work for next characters    
    if i + 2 < len(sentence):
        features.extend([
            'char+2=' + sentence[i+2][0],
            'char:+2=' + sentence[i][0] + sentence[i+1][0] + sentence[i+2][0],
            'char+1:+2=' + sentence[i+1][0] + sentence[i+2][0],
        ])
    
    if i + 3 < len(sentence):
        features.extend([
            'char:+3=' + sentence[i][0] + sentence[i+1][0] + sentence[i+2][0]+ sentence[i+3][0],
            'char+1:+3=' + sentence[i+1][0] + sentence[i+2][0] + sentence[i+3][0],
        ])
    return features



def create_sentence_features(prepared_sentence):
    return [create_char_features(prepared_sentence, i) for i in range(len(prepared_sentence))]

def create_sentence_labels(prepared_sentence):
    return [str(part[1]) for part in prepared_sentence]

In [None]:
#example for this string 'nec tinia nec vermes tangent' create_char_features() output ['bias', 'char=n', 'BOS', 'char+1=e', 'char:+1=ne', 'char+2=c', 'char:+2=nec', 'char+1:+2=ec', 'char:+3=nect', 'char+1:+3=ect']    
#labels for this string [('n', 0), ('e', 0), ('c', 0), ('t', 1), ('i', 0), ('n', 0), ('i', 0), ('a', 0), ('n', 1), ('e', 0), ('c', 0), ('v', 1), ('e', 0), ('r', 0), ('m', 0), ('e', 0), ('s', 0), ('t', 1), ('a', 0), ('n', 0), ('g', 0), ('e', 0), ('n', 0), ('t', 0)]

In [None]:
len(prepared_sentences)

55173

In [None]:
#X list for training model with sentence features 
X = [create_sentence_features(ps) for ps in prepared_sentences[:-20000]]
#y list for training model with sentence labels(0 or 1) 
y = [create_sentence_labels(ps)   for ps in prepared_sentences[:-20000]]

#X_test list for testing model with sentence features 
X_test = [create_sentence_features(ps) for ps in prepared_sentences[-2173:]]
#y_test list for testing model with sentence labels(0 or 1) 
y_test = [create_sentence_labels(ps)   for ps in prepared_sentences[-2173:]]

In [None]:
len(X),len(y),len(X_test),len(y_test)

(35173, 35173, 2173, 2173)

## Training a CRF
Now, we use Python-CRFSuite for training a CRF.

In [None]:
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X, y):
    trainer.append(xseq, yseq)

In [None]:
trainer.set_params({
    'c1': 1.0, 
    'c2': 1e-3,
    'max_iterations': 60,
    'feature.possible_transitions': True
})

In [None]:
#training model 
trainer.train('bengali-text-segmentation.crfsuite')

In [None]:
#open trained model
tagger = pycrfsuite.Tagger()
tagger.open('bengali-text-segmentation.crfsuite')

<contextlib.closing at 0x7fd2e7a302e8>

In [None]:
#segment sentence or word by trained model
def segment_sentence(sentence):
    #remove white spaces from sentence
    sent = sentence.replace(" ", "")
    #tag sentence by trained model or create sentence features 
    prediction = tagger.tag(create_sentence_features(sent))
    #assign 'complete' to empty string 
    complete = ""
    #apply for loop on taged sentence
    for i, p in enumerate(prediction):
        #if label of character in sentence is 1 then brack that word from that place and add into complete
        if p == "1":
            complete += " " + sent[i]
        #if label of character in sentence is 0 then add that word as it is into complete
        else:
            complete += sent[i]
    return complete

In [None]:
#for check our model
sentence = 'সংবাদসংস্থাঅমৃতসর রাহুল'
prediction = tagger.tag(create_sentence_features(sentence))
complete = ""
for i, p in enumerate(prediction):
  if p == "1":
    complete += " " + sentence[i]
  else:
    complete += sentence[i]
complete,sentence

('সংবাদসংস্থা অমৃত সর  রাহুল', 'সংবাদসংস্থাঅমৃতসর রাহুল')

In [None]:
#print Segmented sentences or Segmented words
print(segment_sentence("কাটোয়া-আহমদপরসমেতবর্-নিউ ফরাক্কালাইনের লাভ-ক্ষনিয়েসমীক্ষা। "))
print(segment_sentence("পঞ্জাবসীমান্তেরগ্রামগুলোতেএবারদেওয়ালিপালনকরতেপারেননিগ্রামবাসীরা।সীমান্তেএতটেশনছিল,উৎসবেমেজাজ ছিলই না। সচিনেরপাকিস্তানকে গুঁড়িয়ে দেওয়ার পরে সেই আলোর উৎসব পালন করলেন"))

In [None]:
#use this if you want to convert whole text file into segmented text file that store in 'output.txt' file
outF = open("/content/output.txt", "w")

with open('/content/drive/My Drive/IIT(BHU) LAB/Completed  work/CRF_DECOMPOUNDING_BENGALI/bengali_validation.txt','r') as f:
  for line in f:
    l1 = ""
    l2 = ""
    if len(line)<=5:
      l1=line
      #outF.write(line)
      outF.write(l1)
    else:
      seg = segment_sentence(line)
      l2 = seg
      outF.write(l2)

In [None]:
import pandas as pd
from bs4 import BeautifulSoup as soup
df = pd.read_fwf('/content/output.txt', header=None)
len(df)

4490

Finally, let's find out how well our model performs.

In [None]:
tp = 0
fp = 0
fn = 0
n_correct = 0
n_incorrect = 0

for s in prepared_sentences[-2176:]:
    prediction = tagger.tag(create_sentence_features(s))
    correct = create_sentence_labels(s)
    zipped = list(zip(prediction, correct))
    tp +=        len([_ for l, c in zipped if l == c and l == "1"])
    fp +=        len([_ for l, c in zipped if l == "1" and c == "0"])
    fn +=        len([_ for l, c in zipped if l == "0" and c == "1"])
    n_incorrect += len([_ for l, c in zipped if l != c])
    n_correct   += len([_ for l, c in zipped if l == c])

In [None]:
print("Precision:\t" + str(tp/(tp+fp)))
print("Recall:\t\t" + str(tp/(tp+fn)))
print("Accuracy:\t" + str(n_correct/(n_correct+n_incorrect)))

Precision:	0.9898973912584738
Recall:		0.9880209567438064
Accuracy:	0.9958070509591747


In [None]:
Precision:	0.929995347955685
Recall:		0.916343634521903
Accuracy:	0.9764714297802332