In [69]:
import pandas as pd
import numpy as np
import os
import math
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

### Import Data as Dataframe

In [37]:
# Locate the training CSV below the current working directory.
cwd = os.getcwd()
rel_path = '/data_release'
filename = '/train.csv'
train_csv_path = ''.join((cwd, rel_path, filename))

# The file is decoded as ISO-8859-1 when it is read.
train_data = pd.read_csv(train_csv_path, encoding="ISO-8859-1")
train_data.head(5)

Unnamed: 0,sentence,pos_seq,label_seq
0,Ca n't fail to be entertaining .,"['VERB', 'ADV', 'VERB', 'PART', 'VERB', 'ADJ',...","[0, 0, 0, 0, 0, 0, 0]"
1,How much was he going to tell her ?,"['ADV', 'ADJ', 'VERB', 'PRON', 'VERB', 'PART',...","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"Up until that news hit the Committee , Don had...","['ADP', 'ADP', 'DET', 'NOUN', 'VERB', 'DET', '...","[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,Could go on to the rugby and go with them coul...,"['VERB', 'VERB', 'PART', 'ADP', 'DET', 'NOUN',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"Finally , we went to the office and they gave ...","['ADV', 'PUNCT', 'PRON', 'VERB', 'ADP', 'DET',...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."


### Unknown Word Handling

In [38]:
def unknown_words(wordlist, p=0.005, unk_token='UNK', seed=None):
    """Randomly replace entries of ``wordlist`` with an unknown-word token.

    Every position is replaced independently with probability ``p`` (one
    Bernoulli draw per word).

    Parameters
    ----------
    wordlist : list
        Tokens to process. The list is modified in place.
    p : float, default 0.005
        Probability of replacing any single token.
    unk_token : str, default 'UNK'
        Value written into the replaced positions.
    seed : int or None, default None
        When given, the draws come from a private
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When ``None`` the global NumPy random state is used.

    Returns
    -------
    list
        The same list object that was passed in.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    # One vectorised draw for all positions instead of one RNG call per word.
    tosses = rng.binomial(n=1, p=p, size=len(wordlist))
    for idx in np.flatnonzero(tosses):
        wordlist[idx] = unk_token
    return wordlist

### Converting all sentences into corresponding Words in a List

In [39]:
# Flatten all training sentences into one lower-cased list of tokens.

sentence = train_data['sentence'].tolist()
# Every sentence is followed by a single space so that neighbouring
# sentences do not fuse into one token when the text is split again.
word_str = ''.join(text + ' ' for text in sentence).lower()

# Whitespace-split the text, then let unknown_words() swap some tokens for 'UNK'.
word_list = unknown_words(word_str.split())          # def unknown_words()
print(len(word_list))

116622


### Converting all metaphor labels into one huge list of labels of 0's and 1's

In [40]:
# Flatten the per-sentence label strings into one list of integer labels.

tag = train_data['label_seq'].tolist()
# Each entry is a string; its first and last characters (the brackets in the
# displayed data) are stripped and the remainders are chained with ', '.
tag_str = ''.join(labels[1:-1] + ', ' for labels in tag)

# The trailing ', ' yields one empty token at the very end; drop it.
tag_list_str = tag_str.split(', ')[:-1]
tag_list = [int(token) for token in tag_list_str]

len(tag_list)

116622

### Creating Dictionary for each word and its count of tags

In [54]:
# For every word, count how often it occurs with label 0 and with label 1.

word_dict = {}

for word, label in zip(word_list, tag_list):
    # counts[0] / counts[1] hold the occurrences of `word` with label 0 / 1.
    counts = word_dict.setdefault(word, [0, 0])
    counts[label] += 1

print(word_dict['UNK'])

[499, 71]


### Creating Dictionary of Tag counts (Unigrams and Bigrams)

In [41]:
# Count label unigrams and label bigrams in one shared dictionary.

tag_dict = {}

tag = train_data['label_seq'].tolist()
# A '<s>' marker is appended after the labels of every sentence.
tag_str = ''.join(labels[1:-1] + ', ' + '<s>' + ', ' for labels in tag)

# The trailing ', ' yields one empty token at the very end; drop it.
tag_list_str_s = tag_str.split(', ')[:-1]

# A bigram key is two neighbouring tokens concatenated, e.g. '01' or '0<s>'.
tag_list_str_bigram = [
    left + right
    for left, right in zip(tag_list_str_s, tag_list_str_s[1:])
]

# Unigrams are counted before bigrams, which fixes the key order of the dict.
for tokens in (tag_list_str_s, tag_list_str_bigram):
    for key in tokens:
        tag_dict[key] = tag_dict.get(key, 0) + 1

print(tag_dict)

{'0': 103571, '<s>': 6323, '1': 13051, '00': 87135, '0<s>': 6267, '<s>0': 5829, '01': 10169, '10': 10606, '11': 2389, '<s>1': 493, '1<s>': 56}


### Transition Probability

In [42]:
def t_prob(tag_dict, i, j):
    """Return the probability of tag ``i`` given the previous tag ``j``.

    Parameters
    ----------
    tag_dict : dict
        Counts keyed by strings: unigram keys such as ``'0'`` and ``'1'`` and
        bigram keys formed as ``str(previous) + str(current)``.
    i : int or str
        Current tag.
    j : int or str
        Previous tag. The value ``-1`` means "no previous tag"; the result is
        then the share of tag ``i`` among all ``'0'`` and ``'1'`` counts.

    Returns
    -------
    float
        ``count(j, i) / count(j)``. A bigram that has no entry in
        ``tag_dict`` counts as zero occurrences, so the result is ``0.0``
        rather than a ``KeyError``.
    """
    if j == -1:
        total = tag_dict[str(0)] + tag_dict[str(1)]
        return tag_dict[str(i)] / total

    # An unseen bigram has no key in the table; treat its count as zero.
    bigram_count = tag_dict.get(str(j) + str(i), 0)
    return bigram_count / tag_dict[str(j)]

### Emission Probability

In [53]:
def emiss_prob(word_dict, tag_dict, word, i):
    """Return the count of ``word`` under tag ``i`` divided by the count of tag ``i``.

    ``word_dict`` maps a word to a sequence of per-tag counts and ``tag_dict``
    maps ``str(tag)`` to the tag's total count. A word that is not a key of
    ``word_dict`` uses the counts stored under ``'UNK'``.
    """
    # Words missing from the table share the counts of the 'UNK' entry.
    counts = word_dict[word] if word in word_dict else word_dict['UNK']
    return counts[i] / tag_dict[str(i)]

### Mapping POS Tags to Integer IDs

In [43]:
# Assign every distinct POS token an integer id, numbered from 1 in order of
# first appearance in the training data.
pos_seq = train_data['pos_seq'].tolist()
# Strip the first and last character of each row string and chain the rows.
pos_str = ''.join(tags[1:-1] + ', ' for tags in pos_seq)

# The trailing ', ' yields one empty token at the very end; drop it.
pos_list = pos_str.split(', ')[:-1]

pos_dict = {}
for pos in pos_list:
    if pos not in pos_dict:
        # Keys keep the quotes from the CSV text, e.g. "'VERB'".
        pos_dict[pos] = len(pos_dict) + 1

print(pos_dict)

{"'VERB'": 1, "'ADV'": 2, "'PART'": 3, "'ADJ'": 4, "'PUNCT'": 5, "'PRON'": 6, "'ADP'": 7, "'DET'": 8, "'NOUN'": 9, "'PROPN'": 10, "'CCONJ'": 11, "'NUM'": 12, "'INTJ'": 13, "'X'": 14, "'SYM'": 15}


### Creation of Feature Matrix for Classification Model

In [120]:
def develop_feature_matrix(feature_len, data, wordlist, pos_ids=None):
    """Build a matrix of POS ids taken from a window around every token.

    Row ``r`` describes the ``r``-th token of ``data`` (sentences in order,
    tokens obtained with ``str.split``). With ``half = (feature_len - 1) // 2``,
    column ``j + half`` holds the id of the POS tag ``j`` positions away from
    the token, for ``j`` in ``-half .. half``. Window positions that fall
    outside the sentence keep the value 0.

    Parameters
    ----------
    feature_len : int
        Number of columns of the matrix (window width).
    data : pandas.DataFrame
        Needs a ``'sentence'`` column and a ``'pos_seq'`` column; each
        ``pos_seq`` entry is a string whose first and last characters are
        stripped and whose remainder is split on ``', '``.
    wordlist : sequence
        Only its length is used: it sets the number of rows.
    pos_ids : dict, optional
        Maps a POS token (as it appears in ``pos_seq``) to an integer id.
        Defaults to the notebook-level ``pos_dict``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(wordlist), feature_len)``.
    """
    if pos_ids is None:
        # Fall back to the mapping built earlier in the notebook.
        pos_ids = pos_dict

    half = (feature_len - 1) // 2
    feature_X = np.zeros((len(wordlist), feature_len), dtype=int)

    sentences = data['sentence'].tolist()
    pos_seqs = data['pos_seq'].tolist()

    row = 0
    for text, pos_repr in zip(sentences, pos_seqs):
        n_tokens = len(text.split())
        tags = pos_repr[1:-1].split(', ')

        for i in range(n_tokens):
            # Clip the window to the sentence; clipped-off columns stay 0.
            first = max(0, i - half)
            last = min(n_tokens, i + half + 1)
            for k in range(first, last):
                feature_X[row, k - i + half] = pos_ids[tags[k]]
            row += 1

    return feature_X

#### This is an alternative implementation of Feature Matrix

In [156]:
# def develop_feature_matrix(feature_len, data, wordlist):    
# #     feature_len = 9           # posi-4, posi-3, posi-2, posi-1, posi, posi+1, posi+2, posi+3, posi+4 
#     Lend = int((feature_len-1)/2)
#     nrow = len(wordlist)
#     ncol = feature_len
#     # feature_train_X = np.full((nrow, ncol), -1, dtype=int)
#     feature_X = np.zeros((nrow, (ncol+1)), dtype=int)
    
#     sentence = data['sentence'].tolist()
#     tagseq = data['pos_seq'].tolist()
    
#     # sentence = sentence[0:1]
#     word_c = 0
#     for item,tag in zip(sentence, tagseq):
#         item = item.split()
#         tag = tag[1:-1]
#         tag = tag.split(', ')

#         for i in range(len(item)):
#             for j in range(-Lend, Lend+1):
#                 if(i+j>=0 and i+j<len(item)):
#                     feature_X[word_c, (j+Lend)] = pos_dict[tag[i+j]]
            
#             feature_X[word_c, (ncol)] = word_dict[word_c][1]
#             word_c=word_c+1 

#     return feature_X    

In [158]:
# feature_len = 9
# X_train = develop_feature_matrix(feature_len, train_data, word_list)
# X_train[0:11,:]


### Classification Model

#### Naive Bayes

In [45]:
def naive_bayes(feature_X, label_Y):
    """Fit a ``MultinomialNB`` classifier on the given data and return it.

    The estimator is created with ``alpha=1``, ``fit_prior=True`` and
    ``class_prior=None``.
    """
    model = MultinomialNB(alpha=1, class_prior=None, fit_prior=True)
    # fit() returns the fitted estimator itself.
    return model.fit(feature_X, label_Y)

In [46]:
def classifier_prob_naive_bayes(model, X):
    """Return ``model.predict_proba(X)`` for the given fitted model."""
    return model.predict_proba(X)

#### Logistic Regression

In [148]:
def logistic_regression(feature_X, label_Y):
    """Fit a class-weighted ``LogisticRegression`` on the given data and return it.

    Label 1 is weighted four times as heavily as label 0. The estimator uses
    the ``lbfgs`` solver, ``multi_class='multinomial'`` and ``random_state=0``.
    """
    model = LogisticRegression(
        random_state=0,
        solver='lbfgs',
        class_weight={0: 1, 1: 4},
        multi_class='multinomial',
    )
    # fit() returns the fitted estimator itself.
    return model.fit(feature_X, label_Y)

In [149]:
def classifier_logistic_regression(model, X):
    """Return ``model.predict_proba(X)`` for the given fitted model."""
    return model.predict_proba(X)

### Viterbi Algorithm on Classification

In [74]:
def viterbi(item, Y_prob, word_dict, tag_dict, ntags, word_c):
    """Decode a tag sequence for one sentence with a Viterbi-style recursion.

    The score of tag ``c`` at position ``t`` is the best score at position
    ``t - 1`` multiplied by the classifier probability ``Y_prob[word_c + t, c]``
    and by ``emiss_prob(word_dict, tag_dict, token, c)``. No transition
    probability enters the score. Scores are plain probability products
    (not log-probabilities).

    Parameters
    ----------
    item : str
        The sentence; it is tokenised with ``str.split``.
    Y_prob : array-like, indexed as ``Y_prob[row, tag]``
        Classifier probabilities for a whole data set, one row per token.
    word_dict, tag_dict : dict
        Count tables handed through to ``emiss_prob``.
    ntags : int
        Number of tags.
    word_c : int
        Row of ``Y_prob`` that belongs to the first token of ``item``.

    Returns
    -------
    list of int
        One tag index per token; an empty list for a sentence without tokens.
    """
    tokens = item.split()
    n_tokens = len(tokens)
    if n_tokens == 0:
        # Nothing to decode; also avoids indexing tokens[0] below.
        return []

    score = np.zeros((ntags, n_tokens))
    b_ptr = np.zeros((ntags, n_tokens), dtype=int)

    # Initialisation. The first token of this sentence sits in row `word_c` of
    # Y_prob (row 0 is only correct for the very first sentence of the data).
    for c in range(ntags):
        score[c, 0] = Y_prob[word_c, c] * emiss_prob(word_dict, tag_dict, tokens[0], c)

    # Recursion over the remaining tokens.
    for t in range(1, n_tokens):
        row = word_c + t
        for c in range(ntags):
            emission_prob = emiss_prob(word_dict, tag_dict, tokens[t], c)
            candidates = [
                score[j, t - 1] * Y_prob[row, c] * emission_prob
                for j in range(ntags)
            ]
            best = max(candidates)
            score[c, t] = best
            # index() returns the first maximum, i.e. ties go to the lowest tag.
            b_ptr[c, t] = candidates.index(best)

    # Backtrace from the best final tag.
    Tseq = [-1] * n_tokens
    Tseq[-1] = int(np.argmax(score[:, -1]))
    for t in range(n_tokens - 2, -1, -1):
        Tseq[t] = int(b_ptr[Tseq[t + 1], t + 1])

    return Tseq

### Training a *classifier* and predicting on the validation/test set

In [154]:
# Load the validation set from below the current working directory.
cwd = os.getcwd()
rel_path = '/data_release'
filename = '/val.csv'
# filename = '/test_no_label.csv'
val_csv_path = ''.join((cwd, rel_path, filename))

# The file is decoded as ISO-8859-1 when it is read.
val_data = pd.read_csv(val_csv_path, encoding="ISO-8859-1")
val_data.head(5)

Unnamed: 0,sentence,pos_seq,label_seq
0,Four alternative approaches have been describe...,"['NUM', 'ADJ', 'NOUN', 'VERB', 'VERB', 'VERB',...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,"I wanted to say , you see , that I know you th...","['PRON', 'VERB', 'PART', 'VERB', 'PUNCT', 'PRO...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,The one with you chop the chop and then there ...,"['DET', 'NOUN', 'ADP', 'PRON', 'VERB', 'DET', ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"Given that most GIS are rather dumb systems , ...","['VERB', 'ADP', 'ADJ', 'PROPN', 'VERB', 'ADV',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Lacking a goal that might have altered its che...,"['VERB', 'DET', 'NOUN', 'ADJ', 'VERB', 'VERB',...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [155]:
# Train the classifier on the training data.
feature_len = 5
X_train = develop_feature_matrix(feature_len, train_data, word_list)      # feature_matrix for classifier
Y_truth = tag_list
# model = naive_bayes(X_train, Y_truth)                                   # Classifier
model = logistic_regression(X_train, Y_truth)                             # Classifier


# Flatten the validation sentences into one lower-cased list of tokens.
sentence = list(val_data['sentence'])
word_str = ' '.join(sentence).lower()
val_word_list = word_str.split()

# Replace tokens never seen in training by 'UNK'. The training tokens are put
# into a set first so that each lookup is a hash lookup, not a scan of the list.
train_vocab = set(word_list)
val_word_list = [word if word in train_vocab else 'UNK' for word in val_word_list]

X_val = develop_feature_matrix(feature_len, val_data, val_word_list)      # feature_matrix for classifier
# Y_prob = classifier_prob_naive_bayes(model, X_val)                      # Prediction from classifier
Y_prob = classifier_logistic_regression(model, X_val)                     # Prediction from classifier

ntags = 2
tagseq = []                                   # Predicted Labels
word_c = 0                                    # Row of Y_prob of the current sentence's first token
for item in sentence:
    item = item.lower()
#     tagseq += viterbi(item, word_dict, tag_dict, ntags)
    tagseq += viterbi(item, Y_prob, word_dict, tag_dict, ntags, word_c)
    word_c += len(item.split())

# One predicted label per validation token.
assert len(tagseq) == len(val_word_list)
print(len(tagseq))

# Write the predictions with a 1-based 'idx' index and a single 'label' column.
output = pd.DataFrame({'label': tagseq})
output.index = pd.RangeIndex(start=1, stop=len(output) + 1, name='idx')
path = os.path.join(os.getcwd(), 'result_classifier_val.csv')
output.to_csv(path)
output.head()

38628


Unnamed: 0_level_0,label
idx,Unnamed: 1_level_1
1,0
2,0
3,1
4,0
5,0
