# **Mandarin Word Segmentation**

![](https://i.imgur.com/Y9JDMFK.jpg)



> **The code here is divided into 2 parts**

*   Part 1: segmentation using the Viterbi algorithm
*   Part 2: segmentation using CRFsuite

In [None]:
# Importing libraries

import nltk, re
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from IPython.display import HTML, display
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
'''
        Data Loading TRAIN:
        ----------
        The data is loaded from a tsv file
        ---------

        Creating Sentences: 
        ----------
        For tags the sentences are created using " 。" to define end of a sentence
        ----------
'''
whole_text = []

def tagSetupTrain(): 
  testfile = open('train.tsv', 'r')
  sentence = []
  for line in testfile:
    pieces = line.rstrip("\n").split("\t")
    if pieces[0]=='。':
      whole_text.append((sentence))
      sentence = []
    else:
      sentence.append(tuple(pieces))

In [None]:
tagSetupTrain()

In [None]:
len(whole_text)

179491

In [None]:
'''
        Data Loading TEST:
        ----------
        The data is loaded from a tsv file
        ---------

        Creating Sentences: 
        ----------
        For tags the sentences are created using " 。" to define end of a sentence
        ----------
'''
whole_test_text = []
def tagSetupTest():
    testfile = open('test.tsv', 'r')
    sentence_test = []
    for line in testfile:
        pieces = line.rstrip("\n").split("\t")
        if pieces[0]=='。':
          whole_test_text.append((sentence_test))
          sentence_test = []
        else:
          sentence_test.append(tuple(pieces))

In [None]:
tagSetupTest()

In [None]:
# train_set / test_set are additional names for the very same list objects as
# whole_text / whole_test_text (no copy is made).
train_set,test_set = whole_text, whole_test_text

In [None]:
# Quick overview of the loaded corpora: sizes plus the first training sentence.
print("-" * 100)
print(f"Training Set Length - {len(train_set)}")
print(f"Testing Set Length - {len(test_set)}")
print("-" * 100)
print("Training Data Glimpse -\n")
print(train_set[:1])
print("-" * 100)

----------------------------------------------------------------------------------------------------
Training Set Length - 179491
Testing Set Length - 3351
----------------------------------------------------------------------------------------------------
Training Data Glimpse -

[[('時', '0'), ('間', '1'), ('：', '1'), ('三', '0'), ('月', '1'), ('十', '0'), ('日', '1'), ('（', '1'), ('星', '0'), ('期', '0'), ('四', '1'), ('）', '1'), ('上', '0'), ('午', '1'), ('十', '0'), ('時', '1')]]
----------------------------------------------------------------------------------------------------


In [None]:
# Flatten the sentence lists:
#   train_tagged_words -> every tuple of the training set, in corpus order
#   test_tagged_words  -> first field only of every tuple of the test set
train_tagged_words = [
    pair
    for sentence in train_set
    for pair in sentence
]
test_tagged_words = [
    pair[0]
    for sentence in test_set
    for pair in sentence
]
print(len(train_tagged_words))
print(len(test_tagged_words))

8188676
194345


In [None]:
# check some of the tagged words.
train_tagged_words[1:5]

[('間', '1'), ('：', '1'), ('三', '0'), ('月', '1')]

In [None]:
# Collect the distinct tags that occur in the training data
tags = set(label for _token, label in train_tagged_words)
print(len(tags))
print(tags)

2
{'0', '1'}


In [None]:
# Collect the distinct tokens (the vocabulary) that occur in the training data
vocab = set(token for token, _label in train_tagged_words)
print(len(vocab))

6115


In [None]:
# compute the counts behind the emission probability of a word given a tag
def word_given_tag(word,tag,train_bag= train_tagged_words):
    """
    Count how often `word` occurs with `tag` in the training data.

    Parameters
    ----------
    word : the token to look up (compared with element 0 of each pair).
    tag : the tag to condition on (compared with element 1 of each pair).
    train_bag : sequence of (token, tag) pairs; defaults to the
        `train_tagged_words` list as it existed when this function was defined.

    Returns
    -------
    (word_count_given_tag, tag_count)
        word_count_given_tag: number of pairs equal to (word, tag);
        tag_count: number of pairs carrying `tag`.
        The emission probability is the ratio of the two; the division is
        left to the caller.
    """
    tag_count = 0
    word_count_given_tag = 0
    # Single pass with plain counters: avoids materialising two temporary
    # lists over the whole training set on every call.
    for pair in train_bag:
        if pair[1] == tag:
            tag_count += 1
            if pair[0] == word:
                word_count_given_tag += 1

    return (word_count_given_tag,tag_count)

In [None]:
# compute the counts behind the transition probability from tag t1 to tag t2
def t2_given_t1(t2,t1,train_bag=train_tagged_words):
    """
    Count how often tag `t2` immediately follows tag `t1` in the training data.

    Parameters
    ----------
    t2 : the following tag.
    t1 : the preceding tag.
    train_bag : sequence of (token, tag) pairs; defaults to the
        `train_tagged_words` list as it existed when this function was defined.
        It is treated as one continuous sequence: every pair of consecutive
        entries counts as a transition.

    Returns
    -------
    (count_t2_given_t1, count_of_t1)
        count_t2_given_t1: number of positions where `t1` is directly
        followed by `t2`;
        count_of_t1: total number of occurrences of `t1` (including a
        possible occurrence at the very last position).
        The transition probability is the ratio of the two; the division is
        left to the caller.
    """
    tag_sequence = [pair[1] for pair in train_bag]
    count_of_t1 = tag_sequence.count(t1)

    # Pair every tag with its successor without copying the list.
    successors = iter(tag_sequence)
    next(successors, None)
    count_t2_given_t1 = sum(
        1
        for current, following in zip(tag_sequence, successors)
        if current == t1 and following == t2
    )
    return(count_t2_given_t1,count_of_t1)

In [None]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)):
        # Call t2_given_t1 once per cell and reuse both counts: every call
        # walks the whole training list.
        count_t2_after_t1, count_t1 = t2_given_t1(t2, t1)
        tags_matrix[i, j] = count_t2_after_t1/count_t1

In [None]:
# Wrap the transition matrix in a DataFrame so rows (previous tag) and
# columns (next tag) can be addressed by tag label.
tags_df = pd.DataFrame(
    data=tags_matrix,
    index=list(tags),
    columns=list(tags),
)

# dataset glimpse
tags_df

Unnamed: 0,0,1
0,0.159006,0.840994
1,0.465715,0.534285


In [None]:
# Let's test our Viterbi algorithm on sample sentences of the test dataset.
# We are using sample sentences to minimize server crashes.

random.seed(1234)

# choose 20 random sentence indices (repeats are possible).
# randrange(n) yields 0 .. n-1, so every index is valid and sentence 0 can be
# drawn; randint(1, n) includes n itself, which is out of range for test_set.
rndom = [random.randrange(len(test_set)) for x in range(20)]

# list of sents
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [None]:
def Viterbi_1(words, train_bag = train_tagged_words):
    """
    Tag `words` one at a time, left to right.

    Despite the name, decoding is greedy: for each word the tag maximising
    emission * transition (given the tag already chosen for the previous
    word) is kept, and earlier choices are never revisited.

    Parameters
    ----------
    words : sequence of tokens to tag.
    train_bag : sequence of (token, tag) pairs used for the emission counts;
        defaults to the `train_tagged_words` list as it existed when this
        function was defined. Transition probabilities are read from the
        notebook-level `tags_df`.

    Returns
    -------
    list of (word, tag) tuples, one per input word.

    Notes
    -----
    * The first word has no predecessor, so its transition factor is 1 and
      the emission probability alone decides its tag.
    * If every tag scores 0 (e.g. the word never occurs in `train_bag`), the
      tag with the highest transition probability is chosen.
    * Tags are visited in sorted order, so ties always resolve the same way.
    """
    from collections import Counter

    # Count the training data once instead of rescanning it for every
    # (word, tag) combination.
    pair_counts = Counter((pair[0], pair[1]) for pair in train_bag)
    tag_counts = Counter()
    for (_token, seen_tag), occurrences in pair_counts.items():
        tag_counts[seen_tag] += occurrences

    # Sorted so the candidate order does not depend on set iteration order.
    T = sorted(tag_counts)

    state = []
    for key, word in enumerate(words):
        p = []             # emission * transition for each candidate tag
        p_transition = []  # transition probability for each candidate tag
        for tag in T:
            if key == 0:
                # No previous tag: neutral factor so the emission decides.
                transition_p = 1.0
            else:
                transition_p = tags_df.loc[state[-1], tag]

            emission_p = pair_counts[(word, tag)] / tag_counts[tag]
            p.append(emission_p * transition_p)
            p_transition.append(transition_p)

        pmax = max(p)
        if pmax == 0:
            # Nothing scored above zero: fall back to the transition
            # probability alone.
            state_max = T[p_transition.index(max(p_transition))]
        else:
            state_max = T[p.index(pmax)]

        state.append(state_max)
    return list(zip(words, state))

In [None]:
tagged_seq_v1 = Viterbi_1(test_tagged_words)


In [None]:
# Keep the predicted (word, tag) tuples that equal the reference tuple at the
# same position, then report the share of matches as a percentage.
check_v1 = [
    predicted
    for predicted, expected in zip(tagged_seq_v1, test_run_base)
    if predicted == expected
]
accuracy_v1 = len(check_v1) / len(tagged_seq_v1)
print('Modified Viterbi_1 Accuracy: ', accuracy_v1 * 100)

Modified Viterbi_1 Accuracy:  76.93920335429769


In [None]:
# Separate predicted and reference tags (second element of every pair) and
# print the per-class report followed by the confusion matrix.
pred = [tag for _word, tag in tagged_seq_v1]
true = [tag for _word, tag in test_run_base]
target_names = ['0', '1']
print(classification_report(true, pred, target_names=target_names))
print(confusion_matrix(true, pred, labels=["0", "1"]))


              precision    recall  f1-score   support

           0       0.76      0.65      0.70       396
           1       0.78      0.85      0.81       558

    accuracy                           0.77       954
   macro avg       0.77      0.75      0.76       954
weighted avg       0.77      0.77      0.77       954

[[259 137]
 [ 83 475]]


In [None]:
# Importing libraries

import pycrfsuite
import urllib.request


In [None]:
# Importing data 
# Invoke the train/test loaders defined earlier in the notebook; they fill the
# module-level lists whole_text and whole_test_text.

tagSetupTrain()
tagSetupTest()

In [None]:
# Assigning data to variables to use 
# prepared_sentences / prepared_test_sentences are further names for the same
# list objects as whole_text / whole_test_text (no copy is made).

prepared_sentences,prepared_test_sentences = whole_text,whole_test_text

In [None]:
# Running Sample data

print([d for d in prepared_test_sentences[21]])

[('楊', '0'), ('建', '1'), ('為', '0'), ('了', '1'), ('沒', '0'), ('有', '1'), ('好', '0'), ('好', '1'), ('保', '0'), ('存', '1'), ('父', '0'), ('親', '1'), ('的', '1'), ('手', '0'), ('稿', '1'), ('，', '1'), ('向', '1'), ('台', '0'), ('灣', '1'), ('社', '0'), ('會', '1'), ('大', '0'), ('眾', '1'), ('鞠', '0'), ('躬', '1'), ('道', '0'), ('歉', '1')]


**Transforming the characters to feature vectors.**

Finally, we can create some simple n-gram features.

In [None]:
def create_char_features(sentence, i):
    """Build the CRF feature strings for the character at position `i`.

    `sentence` is a sequence of items whose element 0 is the character; `i`
    is the index of the character being described (not the sentence length).

    The result always starts with 'bias' and 'char=<c>'. Depending on how
    many neighbours exist on each side it then adds the previous/next single
    characters and the n-grams (up to four characters) that end or start at
    the current position, using 'BOS' when there is no previous character and
    'EOS' when there is no next one.
    """
    size = len(sentence)

    def span(first, last):
        # Characters at offsets first..last (inclusive, relative to i), joined.
        return ''.join(sentence[i + offset][0] for offset in range(first, last + 1))

    features = ['bias', 'char=' + span(0, 0)]

    # ---- left context -------------------------------------------------
    if i >= 1:
        features += [
            'char-1=' + span(-1, -1),
            'char-1:0=' + span(-1, 0),
        ]
    else:
        features.append("BOS")

    if i >= 2:
        features += [
            'char-2=' + span(-2, -2),
            'char-2:0=' + span(-2, 0),
            'char-2:-1=' + span(-2, -1),
        ]

    if i >= 3:
        features += [
            'char-3:0=' + span(-3, 0),
            'char-3:-1=' + span(-3, -1),
        ]

    # ---- right context ------------------------------------------------
    if i + 1 < size:
        features += [
            'char+1=' + span(1, 1),
            'char:+1=' + span(0, 1),
        ]
    else:
        features.append("EOS")

    if i + 2 < size:
        features += [
            'char+2=' + span(2, 2),
            'char:+2=' + span(0, 2),
            'char+1:+2=' + span(1, 2),
        ]

    if i + 3 < size:
        features += [
            'char:+3=' + span(0, 3),
            'char+1:+3=' + span(1, 3),
        ]
    return features



def create_sentence_features(prepared_sentence):
    """Return one feature list per position of `prepared_sentence`, in order."""
    return [
        create_char_features(prepared_sentence, position)
        for position, _item in enumerate(prepared_sentence)
    ]

def create_sentence_labels(prepared_sentence):
    """Return element 1 of every item in `prepared_sentence`, converted to str."""
    labels = []
    for item in prepared_sentence:
        labels.append(str(item[1]))
    return labels

In [None]:
# Turn every sentence into its feature sequence (X*) and label sequence (y*),
# for the training and the test corpus respectively.
X = list(map(create_sentence_features, prepared_sentences))
y = list(map(create_sentence_labels, prepared_sentences))

X_test = list(map(create_sentence_features, prepared_test_sentences))
y_test = list(map(create_sentence_labels, prepared_test_sentences))

**Training a CRF**

Now, we use Python-CRFSuite for training a CRF.

In [None]:
# Create the CRFsuite trainer and register every training sentence as one
# (feature sequence, label sequence) item.
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X, y):
    trainer.append(xseq, yseq)

In [None]:
# Training hyper-parameters handed to CRFsuite.
# NOTE(review): per the CRFsuite documentation c1 and c2 are the L1 and L2
# regularisation coefficients and max_iterations caps the optimiser's
# iterations — confirm against the installed python-crfsuite version.
trainer.set_params({
    'c1': 1.0, 
    'c2': 1e-3,
    'max_iterations': 60,
    'feature.possible_transitions': True
})

In [None]:
type(trainer)

pycrfsuite._pycrfsuite.Trainer

In [None]:
#training model 

trainer.train('mandarin-text-segmentation.crfsuite')

In [None]:
#open trained model
# Loads the file 'mandarin-text-segmentation.crfsuite' (the name passed to
# trainer.train earlier) into a Tagger.
# NOTE(review): open() returns a closing context manager (see the cell
# output); the tagger is never explicitly closed in this notebook.

tagger = pycrfsuite.Tagger()
tagger.open('mandarin-text-segmentation.crfsuite')

<contextlib.closing at 0x7f47cdc824c0>

In [None]:
# Token-level evaluation of the CRF tagger on the test sentences.
# Label "1" is treated as the positive class:
#   tp - predicted "1", reference "1"
#   fp - predicted "1", reference "0"
#   fn - predicted "0", reference "1"
tp = 0
fp = 0
fn = 0
n_correct = 0
n_incorrect = 0

for s in prepared_test_sentences:
    prediction = tagger.tag(create_sentence_features(s))
    correct = create_sentence_labels(s)
    # Count in one pass with explicit counters. The previous comprehensions
    # yielded `_`, a name they never bound, so they only ran where a global
    # `_` happens to exist (as in an IPython session).
    for predicted, actual in zip(prediction, correct):
        if predicted == actual:
            n_correct += 1
            if predicted == "1":
                tp += 1
        else:
            n_incorrect += 1
            if predicted == "1" and actual == "0":
                fp += 1
            elif predicted == "0" and actual == "1":
                fn += 1

In [None]:
# Report precision, recall and accuracy from the counters gathered above.
print(f"Precision:\t{tp / (tp + fp)}")
print(f"Recall:\t\t{tp / (tp + fn)}")
print(f"Accuracy:\t{n_correct / (n_correct + n_incorrect)}")

Precision:	0.9689790662352242
Recall:		0.9841709006153898
Accuracy:	0.9709485708405156


In [None]:
def evaluate():
    """Print the evaluation summary of both parts of the notebook.

    Takes no arguments and returns nothing. It reads these notebook-level
    names, which must already be defined when it is called:

    * Part 1: `true`, `pred`, `target_names` (passed to classification_report)
    * Part 2: `tp`, `fp`, `fn`, `n_correct`, `n_incorrect`
    """
    # Heading previously misspelled the algorithm's name ('Virtebi').
    print('Part 1 - Viterbi')
    print(classification_report(true,pred, target_names=target_names))
    print()
    print('Part 2 ')
    print("Precision:\t" + str(tp/(tp+fp)))
    print("Recall:\t\t" + str(tp/(tp+fn)))
    print("Accuracy:\t" + str(n_correct/(n_correct+n_incorrect)))