In [64]:
# Importing libraries

import nltk, re
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from IPython.display import HTML, display
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [42]:
'''
        Data Loading TRAIN:
        ----------
        The data is loaded from a tsv file
        ---------

        Creating Sentences: 
        ----------
        For tags the sentences are created using " 。" to define end of a sentence
        ----------
'''
whole_text = []

def tagSetupTrain(): 
  testfile = open('train.tsv', 'r')
  sentence = []
  for line in testfile:
    pieces = line.rstrip("\n").split("\t")
    if pieces[0]=='。':
      whole_text.append((sentence))
      sentence = []
    else:
      sentence.append(tuple(pieces))

In [43]:
tagSetupTrain()

In [44]:
len(whole_text)

179491

In [46]:
'''
        Data Loading TEST:
        ----------
        The data is loaded from a tsv file
        ---------

        Creating Sentences: 
        ----------
        For tags the sentences are created using " 。" to define end of a sentence
        ----------
'''
whole_test_text = []

testfile = open('test.tsv', 'r')
sentence_test = []
for line in testfile:
    pieces = line.rstrip("\n").split("\t")
    if pieces[0]=='。':
      whole_test_text.append((sentence_test))
      sentence_test = []
    else:
      sentence_test.append(tuple(pieces))

In [49]:
train_set,test_set = whole_text, whole_test_text

In [50]:
print("-" * 100)
print("Training Set Length -", len(train_set))
print("Testing Set Length -", len(test_set))
print("-" * 100)
print("Training Data Glimpse -\n")
print(train_set[:1])
print("-" * 100)

----------------------------------------------------------------------------------------------------
Training Set Length - 179491
Testing Set Length - 3351
----------------------------------------------------------------------------------------------------
Training Data Glimpse -

[[('時', '0'), ('間', '1'), ('：', '1'), ('三', '0'), ('月', '1'), ('十', '0'), ('日', '1'), ('（', '1'), ('星', '0'), ('期', '0'), ('四', '1'), ('）', '1'), ('上', '0'), ('午', '1'), ('十', '0'), ('時', '1')]]
----------------------------------------------------------------------------------------------------


In [51]:
# create list of train and test tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
test_tagged_words = [tup[0] for sent in test_set for tup in sent]
print(len(train_tagged_words))
print(len(test_tagged_words))

8188676
194345


In [52]:
# check some of the tagged words.
train_tagged_words[1:5]

[('間', '1'), ('：', '1'), ('三', '0'), ('月', '1')]

In [53]:
# let's check how many unique tags are present in training data
tags = {tag for word,tag in train_tagged_words}
print(len(tags))
print(tags)

2
{'0', '1'}


In [54]:
# let's check how many words are present in vocabulary
vocab = {word for word,tag in train_tagged_words}
print(len(vocab))

6115


In [55]:
# compute emission probability for a given word for a given tag
def word_given_tag(word,tag,train_bag= train_tagged_words):
    """"
        Parameters:
        ----------
        word: individualw word w
        train_bag: it is the training set that we initialized at top.
        
        What the function does?
        -----------------------
        It computes emission probabilties for a given word.
        
    """
    taglist = [pair for pair in train_bag if pair[1] == tag]
    tag_count = len(taglist)    
    w_in_tag = [pair[0] for pair in taglist if pair[0]==word]    
    word_count_given_tag = len(w_in_tag)    
    
    return (word_count_given_tag,tag_count)

In [56]:
# compute transition probabilities of a previous and next tag
def t2_given_t1(t2,t1,train_bag=train_tagged_words):
    """"
        Parameters:
        ----------
        t2: tag
        t1: tag
        train_bag: it is the training set that we initialized at top.
        
        What the function does?
        -----------------------
        It ompute transition probabilities of a previous and next tag
        
    """

    tags = [pair[1] for pair in train_bag]
    t1_tags = [tag for tag in tags if tag==t1]
    count_of_t1 = len(t1_tags)
    t2_given_t1 = [tags[index+1] for index in range(len(tags)-1) if tags[index] == t1 and tags[index+1] == t2]
    count_t2_given_t1 = len(t2_given_t1)
    return(count_t2_given_t1,count_of_t1)

In [57]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [58]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))

# dataset glimpse
tags_df

Unnamed: 0,0,1
0,0.159006,0.840994
1,0.465715,0.534285


In [67]:
# # Let's test our Viterbi algorithm on the sample sentences of test dataset. We are using sample senetences to minimize server crash

random.seed(1234)

# choose random 5 sents
rndom = [random.randint(1,len(test_set)) for x in range(5)]

# list of sents
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [68]:
def Viterbi_1(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        p_transition =[] # list for storing transition probabilities
        for tag in T:
            if key == 0:
                transition_p = 0
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            p_transition.append(transition_p)
            
        pmax = max(p)
        state_max = T[p.index(pmax)] 
        
      
        # if probability is zero (unknown word) then use transition probability
        if(pmax==0):
            pmax = max(p_transition)
            state_max = T[p_transition.index(pmax)]
                           
        else:
            state_max = T[p.index(pmax)] 
        
        state.append(state_max)
    return list(zip(words, state))

In [69]:
tagged_seq_v1 = Viterbi_1(test_tagged_words)


In [70]:
check_v1 = [i for i, j in zip(tagged_seq_v1, test_run_base) if i == j] 
accuracy_v1 = len(check_v1)/len(tagged_seq_v1)
print('Modified Viterbi_1 Accuracy: ',accuracy_v1*100)

Modified Viterbi_1 Accuracy:  75.80645161290323


In [71]:
pred = []
true = []
for i, j in (tagged_seq_v1):
#     print(j)
    pred.append(j)
for i, j in (test_run_base):
#     print(j)
    true.append(j)
target_names = ['0', '1']
print(classification_report(true,pred, target_names=target_names))
print(confusion_matrix(true, pred, labels=["0", "1"]))


              precision    recall  f1-score   support

           0       0.72      0.66      0.69       100
           1       0.78      0.82      0.80       148

    accuracy                           0.76       248
   macro avg       0.75      0.74      0.75       248
weighted avg       0.76      0.76      0.76       248

[[ 66  34]
 [ 26 122]]


In [75]:
datasets = {}

# from sklearn.datasets import fetch_mldata
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Fetch abalone dataset from mldata.org
data = whole_text
preprocessing_pipe = make_pipeline(
    OneHotEncoder(categorical_features=[0], sparse=False),
    #Scale all from 0 to 1
    MinMaxScaler())
# Apply preprocessing pipe to dataset and store the dataset in dict.
X = ct.fit_transform(X)
# datasets["chinese"] = {
#     "X": preprocessing_pipe.fit_transform(data[0]),
#     "y": data[1]
# }

In [76]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Class that only holds a collection of different 
# base classifiers for usage with SSL methods.
class base_classifiers:
    KNN = KNeighborsClassifier(
        n_neighbors=3,
        metric="euclidean",
        n_jobs=2  # Parallelize work on CPUs
    )
    NB = GaussianNB(
        priors=None
    )
    #SVM = SVC(
    #    C=1.0,
    #    kernel='poly',
    #    degree=1,
    #    tol=0.001,
    #)
    CART = DecisionTreeClassifier(
        criterion='entropy',
        # splitter='best',
        # max_depth=None,
        # min_samples_split=2,
        min_samples_leaf=2,
        # min_weight_fraction_leaf=0.0,
        # max_features=None,
        # random_state=None,
        # max_leaf_nodes=None,
        # min_impurity_split=1e-07,
        # class_weight=None,
        # presort=False,
    )

In [77]:

from standard_self_training import StandardSelfTraining
from tri_training import TriTraining

# All classifiers used for testing
classifiers = [

    TriTraining("TriTraining (CART)", base_classifiers.CART),
    StandardSelfTraining("Self-Training (CART)", base_classifiers.CART)
]
labeling_rates = [0.10, 0.20, 0.30, 0.40]

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import numpy as np

def _training_scoring_iteration(clf, X, y, training_index, test_index, labeling_rate):
    """ 
    One iteration of fully training and scoring a 
    classifier on given data (one Kfold split)
    """
    #Testing set is set aside.. - 1/10th of the data
    X_test, y_test = X[test_index], y[test_index]

    #For generating a testing and transductive set
    split_data = train_test_split(
        X[training_index],
        y[training_index],
        test_size=labeling_rate,
        random_state=42
    )
    (X_unlabeled, X_labeled, y_unlabeled, y_labeled) = split_data

    #Training set - 9/10 of data
    X_train = np.concatenate((X_labeled, X_unlabeled))
    y_train = np.concatenate((
        y_labeled.astype(str),
        np.full_like(y_unlabeled.astype(str), "unlabeled")
    ))
    
    #Train the classifier
    clf.fit(X_train, y_train)
    
    #Score the classifier
    transductive_score = clf.score(X_unlabeled, y_unlabeled.astype(str))
    testing_score = clf.score(X_test, y_test.astype(str))

    cnf_matrix = pd.DataFrame(
        confusion_matrix(y_test.astype(str), clf.predict(X_test).astype(str))
    )
    
    return transductive_score, testing_score, cnf_matrix
    
def train_and_score(clf, X, y, cv, labeling_rate):
    """
    Perform KFold cross-validation of a classifier on a given data
    and labelling rate
    """
    transductive_scores = []
    testing_scores = []
    for training_index, test_index in cv.split(X,y):
        transductive_score, testing_score, cnf_matrix = _training_scoring_iteration(clf, X, y, training_index, test_index, labeling_rate)
        
        transductive_scores.append(transductive_score)
        testing_scores.append(testing_score)
        print("#", end="")
    print()
    scores = {
        "trans_mean": np.mean(transductive_scores),
        "test_mean": np.mean(testing_scores),
        "trans_std": np.std(transductive_scores),
        "test_std": np.std(testing_scores)
    }
    return scores, cnf_matrix

In [79]:

from sklearn.model_selection import KFold
import pandas as pd

""" 
The main loop for testing 
all classifiers with 
all datasets and 
all labeling rates
"""
results = None
cnf_matrixes = {}
for classifier in classifiers:
    cnf_matrixes[classifier.name] = {}
    print(classifier.name)
    for dataset_name, dataset in datasets.items():
        cnf_matrixes[classifier.name][dataset_name] = {}
        print("dataset:", dataset_name, "\t")
        for labeling_rate in labeling_rates:
            print("rate:", labeling_rate, end=" ")

            test_info = { "classifier": classifier.name, "dataset":dataset_name, "labeling_rate":labeling_rate}
            cv = KFold(n_splits=10, random_state=42)
            scores, cnf_matrix = train_and_score(classifier, dataset["X"], dataset["y"], cv, labeling_rate)

            if results is None:
                results = pd.DataFrame([{**test_info, **scores}])
            else:
                results.loc[len(results.index)] = {**test_info, **scores}
            cnf_matrixes[classifier.name][dataset_name][labeling_rate] = cnf_matrix
    print()
    print("--------")

In [None]:
whole_test_text = []

def evaluate():
    testfile = open('test.tsv', 'r')
    sentence_test = []
    for line in testfile:
        pieces = line.rstrip("\n").split("\t")
        if pieces[0]=='。':
          whole_test_text.append((sentence_test))
          sentence_test = []
        else:
          sentence_test.append(tuple(pieces))
    
    ## Uncomment this line to use random sentences. The reason for using random sentences is to minimize
    ## the time and server crashing
#     test = whole_test_text[50000:55000]
#     random.seed(1234)
#     rndom = [random.randint(1,len(whole_test_text)) for x in range(5)]
#     test_run = [test[i] for i in rndom]
    test_run_base = [tup for sent in whole_test_text for tup in sent]
    test_tagged_words = [tup[0] for sent in whole_test_text for tup in sent]
    

    tagged_seq2 = Viterbi_1(test_tagged_words)
    pred = []
    true = []
    for i, j in (tagged_seq2):
    #     print(j)
        pred.append(j)
    for i, j in (test_run_base):
    #     print(j)
        true.append(j)
    target_names = ['0', '1']
    print(classification_report(true,pred, target_names=target_names))
    print(confusion_matrix(true, pred, labels=["0", "1"]))
