In [17]:
!pip install --user python-crfsuite
!pip install --user sklearn-crfsuite
!pip install --user scikit-learn



Installing PyCm for confusion matrix with multiclass labels

In [18]:
!pip install --user pycm



Import specific libraries

In [19]:
import pycrfsuite
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV,cross_validate,GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics
from sklearn_crfsuite import CRF
from pycm import *

Count number of non-blank lines in social media text file

In [20]:
# Maps each counted file name to its number of non-blank lines.
count_file_dict=dict()
def count_non_blank_lines(filename):
    """Count the non-blank lines of ``filename`` and record the result.

    A line is non-blank when it still contains characters after
    ``str.strip()``. The count is stored in the module-level
    ``count_file_dict`` under ``filename`` and is also returned.

    Parameters
    ----------
    filename : str
        Path of the text file to scan.

    Returns
    -------
    int
        Number of non-blank lines in the file.
    """
    # The context manager closes the handle as soon as counting is done;
    # a bare ``open(...)`` inside the generator would leave it open until
    # garbage collection.
    with open(filename) as infile:
        non_blank_count = sum(1 for line in infile if line.strip())
    count_file_dict[filename] = non_blank_count
    return non_blank_count

Merge the contents of several code-mixed social media text files into a single file

In [21]:
def merge_content(filenames,outfilename):
    """Concatenate several text files, in order, into ``outfilename``.

    Each input file is copied line by line. If an input file's last line
    has no terminating newline, one is written so that this row is not
    fused with the first row of the next file.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the files to concatenate, in the order given.
    outfilename : str
        Path of the file to create (overwritten if it exists).

    Returns
    -------
    file object
        The output file handle. It is already closed when returned, so it
        is only useful for attributes such as ``.name``.
    """
    with open(outfilename, 'w') as outfile:
        for fname in filenames:
            last_line = ''
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)
                    last_line = line
            # Only non-empty files whose final row lacks '\n' need the fix.
            if last_line and not last_line.endswith('\n'):
                outfile.write('\n')
    return outfile

Initialize model numbers and get the merged files

In [22]:
# Model numbers, one per merged data file (paired by position below).
trans_model=[1,2,3]

# Source files, grouped so that consecutive entries belong to the same
# merged data file (same order as ``trans_filenames``).
filenames = [
    'FB_HI_EN_CR.txt', 'TWT_HI_EN_CR.txt', 'WA_HI_EN_CR.txt',
    'FB_BN_EN_CR.txt', 'TWT_BN_EN_CR.txt', 'WA_BN_EN_CR.txt',
    'FB_TE_EN_CR.txt', 'TWT_TE_EN_CR.txt', 'WA_TE_EN_CR.txt',
]
for filename in filenames:
    count_non_blank_lines(filename)

trans_filenames=["HI_EN_Data.txt","BN_EN_Data.txt","TE_EN_Data.txt"]

# Number of source files merged into each output file. Derived from the
# two lists instead of hard-coding the total of 9 and the step of 3.
FILES_PER_LANGUAGE = len(filenames) // len(trans_filenames)
for j, merged_name in enumerate(trans_filenames):
    start = j * FILES_PER_LANGUAGE
    outfile = merge_content(filenames[start:start + FILES_PER_LANGUAGE], merged_name)

Read a file and tokenise it while removing whitespace/newline characters

In [23]:
def read_file_tokenise(filename):
    """Read a tab-separated token file and group its rows into sentences.

    Every line is stripped of its trailing newline and split on tabs. A row
    whose first field is empty (for example a blank line) ends the current
    sentence; every other row is appended to the current sentence.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of list of list of str
        One entry per sentence; each sentence is a list of rows and each
        row is the list of tab-separated fields of one line.
    """
    with open(filename, 'r') as fp:
        rows = [line.strip('\n').split('\t') for line in fp]

    result = []
    temp = []

    for token in rows:
        if token[0] == '':
            # Separator row: close the sentence collected so far, if any.
            if temp:
                result.append(temp)
                temp = []
        else:
            temp.append(token)

    # A file that does not end with a separator row would otherwise lose
    # its final sentence.
    if temp:
        result.append(temp)
    return result

Randomly shuffle result array based on seed value

In [24]:
def shuffle_tokenised_data(result, seed=52):
    """Return the sentences of ``result`` in a seeded pseudo-random order.

    The sentences are reordered through a permutation of their indices
    rather than by converting ``result`` to a NumPy array: sentences have
    different lengths, and building an array from such ragged nested lists
    is rejected by recent NumPy releases.

    Parameters
    ----------
    result : list
        Sentences to shuffle. The input list itself is not modified.
    seed : int, optional
        Seed passed to ``np.random.seed`` (default 52). Note that this
        reseeds NumPy's global random state.

    Returns
    -------
    list
        A new list holding the same sentence objects in shuffled order.
    """
    np.random.seed(seed)
    order = np.random.permutation(len(result))
    return [result[idx] for idx in order]

Compute the fraction of ASCII characters and the fraction of vowels in a word

In [25]:
def asciiPercentage(s):
    """Return the fraction (0.0 to 1.0) of characters in ``s`` that are ASCII.

    A character counts as ASCII when its code point is below 128. An empty
    string yields 0.0 instead of raising ``ZeroDivisionError``.

    Parameters
    ----------
    s : str
        Text to inspect.

    Returns
    -------
    float
        Number of ASCII characters divided by ``len(s)``.
    """
    if not s:
        return 0.0
    ascii_count = sum(1 for char in s if ord(char) < 128)
    return ascii_count / float(len(s))

def vowelPercentage(s):
    """Return the fraction (0.0 to 1.0) of characters in ``s`` that are vowels.

    Only the lowercase letters ``a``, ``e``, ``i``, ``o`` and ``u`` are
    counted; uppercase vowels are not. An empty string yields 0.0 instead
    of raising ``ZeroDivisionError``.

    Parameters
    ----------
    s : str
        Text to inspect.

    Returns
    -------
    float
        Number of lowercase vowels divided by ``len(s)``.
    """
    if not s:
        return 0.0
    vowels = "aeiou"
    vowel_count = sum(1 for char in s if char in vowels)
    return vowel_count / float(len(s))

Convert tokenised word to corresponding features

In [26]:
def word_to_features(sent, i):
    """Build the feature dictionary for the ``i``-th token of ``sent``.

    Parameters
    ----------
    sent : sequence
        One sentence; ``sent[i][0]`` is read as the word and ``sent[i][1]``
        is stored under the ``'lang'`` feature.
    i : int
        Index of the token inside ``sent``.

    Returns
    -------
    dict
        Features of the token: the raw word, a lowercased copy reduced to
        ASCII letters (stored as both ``'wordClean'`` and
        ``'normalizedWord'``), the ``'lang'`` field, capitalisation flags,
        the word length, a special-character flag, the ASCII fraction and
        prefixes/suffixes of length 1 to 3.
    """
    word = sent[i][0]
    lang = sent[i][1]

    # Keep only ASCII letters, then lowercase.
    ascii_letters = 'asdfghjklqwertyuiopzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM'
    wordClean = ''.join(ch for ch in word if ch in ascii_letters).lower()
    # wordClean is already lowercase, so lowercasing it again is a no-op;
    # the key is kept so the feature set stays unchanged.
    normalizedWord = wordClean

    anyCap = any(char.isupper() for char in word)
    # Code points 33..64: ASCII punctuation and digits.
    hasSpecial = any(32 < ord(char) < 65 for char in word)

    features = {
        'word': word,
        'wordClean': wordClean,
        'normalizedWord': normalizedWord,
        'lang': lang,
        'isTitle': word.istitle(),
        'wordLength': len(word),
        'anyCap': anyCap,
        'allCap': word.isupper(),
        'hasSpecial': hasSpecial,
        'asciiPer': asciiPercentage(word),
    }

    # Character n-gram affixes; for short words the slices simply return
    # the whole word.
    features['suffix3'] = word[-3:]
    features['prefix3'] = word[:3]
    features['suffix2'] = word[-2:]
    features['prefix2'] = word[:2]
    features['suffix1'] = word[-1:]
    features['prefix1'] = word[:1]

    return features

Convert sentence to features,labels and tokens

In [27]:
def sent_to_features(sent):
    """Return one feature dictionary per token of ``sent``, in token order.

    Each dictionary is produced by ``word_to_features(sent, position)``.
    """
    return [word_to_features(sent, position) for position in range(len(sent))]

def sent_to_labels(sent):
    """Return the label of every token in ``sent``.

    The label is the element at index 2 of each token.
    """
    return [token[2] for token in sent]

def sent_to_tokens(sent):
    """Return the surface form of every token in ``sent``.

    The surface form is the element at index 0 of each token.
    """
    return [token[0] for token in sent]

Split the data into a training set (first 80%) and a held-out validation set (remaining 20%)

In [28]:
def generate_training_test_set(result):
    """Split ``result`` into a leading 80% part and a trailing 20% part.

    The cut position is ``int(len(result) * 0.8)``; no shuffling happens
    here.

    Returns
    -------
    tuple
        ``(training_part, validation_part)``, both slices of ``result``.
    """
    cut = int(len(result) * 0.8)
    training_part = result[:cut]
    validation_part = result[cut:]
    return training_part, validation_part

Cross validating training data with 5 folds<br>
c1,c2 and max_iterations value obtained from GridSearchCV

In [29]:
def training_data_crossval(result,model_no):
    """Run 5-fold cross-validation of a CRF tagger and print a report.

    ``result`` is cut into 5 contiguous folds. For each fold a model is
    trained on the other four folds, written to the file
    ``'pos<model_no>_crf_<fold>'`` in the working directory, reloaded and
    used to tag the held-out fold. The predictions of all folds are pooled
    and summarised with a single ``classification_report``.

    Parameters
    ----------
    result : list
        Sentences as accepted by ``sent_to_features`` / ``sent_to_labels``.
    model_no : int or str
        Identifier inserted into the model file names.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    k = 5
    c1=0.0001
    c2=0.1
    n_sents = len(result)

    allTestPredictions = []
    allTestGroundTruth = []

    for fold in range(k):

        print("cross validation", fold, 'for', 'c1 :', c1, 'c2 :', c2)

        # Integer arithmetic keeps fold boundaries exact; the last fold
        # always ends at len(result) regardless of float rounding.
        start = (fold * n_sents) // k
        stop = ((fold + 1) * n_sents) // k
        test_sents = result[start:stop]
        train_sents = result[:start] + result[stop:]

        print("--> Extracting Train Set ...")
        X_train = [sent_to_features(s) for s in train_sents]
        y_train = [sent_to_labels(s) for s in train_sents]

        print("--> Extracting Test Set ...")
        X_test = [sent_to_features(s) for s in test_sents]
        y_test = [sent_to_labels(s) for s in test_sents]

        print("--> Loading CRF module ...")
        trainer = pycrfsuite.Trainer(verbose=False)

        for xseq, yseq in zip(X_train, y_train):
            trainer.append(xseq, yseq)

        trainer.set_params({
            'c1': c1,   # coefficient for L1 penalty
            'c2': c2,  # coefficient for L2 penalty
            'max_iterations': 1000,  # stop earlier

        # include transitions that are possible, but not observed
            'feature.possible_transitions': True,
            'feature.possible_states' : True
        })

        model_path = 'pos'+str(model_no)+'_crf_' + str(fold)

        print("Training ...")
        trainer.train(model_path)

        print("Testing ...")
        tagger = pycrfsuite.Tagger()
        tagger.open(model_path)
        try:
            # Pool token-level predictions and gold labels across folds.
            for xseq, yseq in zip(X_test, y_test):
                allTestPredictions.extend(tagger.tag(xseq))
                allTestGroundTruth.extend(yseq)
        finally:
            # Release the model file held by the tagger.
            tagger.close()

    print(""" CRF Classification for Cross-Validation Set """)
    print('c1 :', c1, 'c2 :', c2)
    print(classification_report(allTestGroundTruth, allTestPredictions, digits = 4))

Validating model performance on validation set

In [30]:
def train_test_model_eval(resultTrainsent,resultValsent,model_no):
    """Train a CRF tagger on one set and evaluate it on a validation set.

    The model is trained on ``resultTrainsent``, written to the file
    ``'pos<model_no>_crf'`` in the working directory, reloaded and used to
    tag ``resultValsent``. A ``classification_report`` and a confusion
    matrix over all validation tokens are printed.

    Parameters
    ----------
    resultTrainsent : list
        Training sentences as accepted by ``sent_to_features`` /
        ``sent_to_labels``.
    resultValsent : list
        Validation sentences in the same format.
    model_no : int or str
        Identifier inserted into the model file name.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    c1=0.0001
    c2=0.1
    print("--> Extracting Train Set ...")
    X_train = [sent_to_features(s) for s in resultTrainsent]
    y_train = [sent_to_labels(s) for s in resultTrainsent]

    X_test = [sent_to_features(s) for s in resultValsent]
    y_test = [sent_to_labels(s) for s in resultValsent]

    print("--> Loading CRF module ...")
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': c1,   # coefficient for L1 penalty
        'c2': c2,  # coefficient for L2 penalty
        'max_iterations': 1000,  # stop earlier

    # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
        'feature.possible_states' : True
    })

    model_path = 'pos'+str(model_no)+'_crf'

    print("Training ...")
    trainer.train(model_path)
    print("Testing ...")
    tagger = pycrfsuite.Tagger()
    tagger.open(model_path)

    allTestPredictions = []
    allTestGroundTruth = []
    try:
        # Flatten per-sentence tag sequences into token-level lists.
        for xseq, yseq in zip(X_test, y_test):
            allTestPredictions.extend(tagger.tag(xseq))
            allTestGroundTruth.extend(yseq)
    finally:
        # Release the model file held by the tagger.
        tagger.close()

    print(""" CRF Classification For Validation Set """)
    print('c1 :', c1, 'c2 :', c2)
    print(classification_report(allTestGroundTruth, allTestPredictions, digits = 4))
    print(""" Confusion Matrix For Validation Set """)
    cm=ConfusionMatrix(allTestGroundTruth,allTestPredictions)
    print(cm)

Processing input and model creation and evaluation of its performance<br>
Printing Classification report for training and validation of model with the confusion matrix on the evaluation of validation set by the model

In [31]:
# For every merged data file: load and shuffle the sentences, cross-validate
# on the full set, then train on the leading 80% and evaluate on the rest.
for i, data_file in enumerate(trans_filenames):
    language_tag = data_file[0:5]  # first five characters of the file name
    result = shuffle_tokenised_data(read_file_tokenise(data_file))
    resultTrainsent, resultValsent = generate_training_test_set(result)

    print("Cross validating model for", language_tag)
    print()  # for empty line
    training_data_crossval(result, trans_model[i])

    print("Testing model on validation set for", language_tag)
    print()  # for empty line
    train_test_model_eval(resultTrainsent, resultValsent, trans_model[i])

Cross validating model for HI_EN

cross validation 0 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 1 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 2 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 3 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 4 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
 CRF Classification for Cross-Validation Set 
c1 : 0.0001 c2 : 0.1
              precision    recall  f1-score   support

           #     0.9321    0.9353    0.9337       587
           $     0.8699  

The save_csv method can be used to save the confusion matrix in csv format and have a better demonstration of it.


Predict     #           $           @           CC          DT          E           G_J         G_N         G_PRP       G_PRT       G_R         G_SYM       G_V         G_X         PSP         U           ~           
Actual
#           97          2           0           0           0           0           0           3           1           1           0           0           2           0           0           0           0           

$           3           72          0           0           0           0           2           3           1           0           0           6           3           0           1           0           0           

@           0           0           180         0           0           0           0           2           0           0           0           0           0           0           1           0           0           

CC          0           0           0           87          10          0           0           9           1           15

Training ...
Testing ...
cross validation 1 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 2 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 3 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 4 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
 CRF Classification for Cross-Validation Set 
c1 : 0.0001 c2 : 0.1
              precision    recall  f1-score   support

           #     1.0000    0.9580    0.9785       119
           $     0.9048    0.7600    0.8261       100
           @     0.9926    0.9890    0.9908       272
          CC     0.8866    0.8459    0.8658       305
          DT     0.8355  

--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 1 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 2 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 3 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
cross validation 4 for c1 : 0.0001 c2 : 0.1
--> Extracting Train Set ...
--> Extracting Test Set ...
--> Loading CRF module ...
Training ...
Testing ...
 CRF Classification for Cross-Validation Set 
c1 : 0.0001 c2 : 0.1
              precision    recall  f1-score   support

           #     0.8716    0.9247    0.8974       279
           $     0.7195    0.7864    0.7515       323
           @     0.9977    1.0000    0.9988      1278
          CC     0.9408 