In [1]:
# Import the required libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, random
from copy import copy, deepcopy
import sys
from collections import Counter
import queue
from scipy import stats
from sklearn.metrics import precision_recall_fscore_support

In [2]:
import nltk
# nltk.download('stopwords')
# nltk.download('tagsets')
# from nltk.corpus import stopwords
# nltk.download('averaged_perceptron_tagger')
from nltk import FreqDist
from nltk import ngrams
from nltk.tag import pos_tag

In [3]:
def get_text_labe(data):
    text = []
    answer_type = []
    label = []
    sent_length = []
    for line in data:
        a = line.split(':', maxsplit=1)
        label.append(a[0])
        b = a[1].strip().split(' ',maxsplit=1)
        text.append(b[1].lower())
        answer_type.append(b[0])
    # remove punctuations
    clean_text = [re.sub(r'([^\w\s]|[0-9])', ' ', line) for line in text]
    clean_text = [re.sub(r'(\s+)', ' ', line) for line in clean_text]
    return clean_text, label

In [4]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    stop_words.add('')
    text_tokens = [sent.split() for sent in text]
    text_no_stopwords = [[w for w in words if w not in stop_words] for words in text_tokens]
    return text_no_stopwords

In [5]:
def ngram_topk(token_list, n, k, feat_dicts=None):
    ngrams_list = [list(ngrams(sent, n)) for sent in token_list]
    if feat_dicts:
        ngrams_topk_dict, idx2ngram_dict = feat_dicts
    else:
        all_ngrams = sum(ngrams_list, [])
        freq_dist = FreqDist(all_ngrams)
        freq_dist_k = freq_dist.most_common(k)
        ngrams_topk_list =  [ngram_token for ngram_token, _ in freq_dist_k]
        ngrams_topk_dict = {ngram_token:i for i, ngram_token in enumerate(ngrams_topk_list)}
        idx2ngram_dict = {v: k for k, v in ngrams_topk_dict.items()}
    
    ngrams_freq_feat = []
    for ngram_tokens in ngrams_list:
        ngram_token_freq = np.zeros(k, dtype = np.int32)
        for ngram_token in ngram_tokens:
            if ngram_token in ngrams_topk_dict.keys():
                ngram_token_freq[ ngrams_topk_dict[ngram_token] ]+=1
        ngrams_freq_feat.append(ngram_token_freq)
    
    return np.asarray(ngrams_freq_feat, dtype = np.int32), ngrams_topk_dict, idx2ngram_dict, ngrams_list

In [6]:
# Read the file line by line and clean the text of punctuation.
with open('Data/train_5500.label', 'r',encoding='latin-1') as file:
    data = file.readlines()

X_train, Y_train = get_text_labe(data)

with open('./Data/TREC_10.label', 'r',encoding='latin-1') as file:
    data = file.readlines()

X_test, Y_test = get_text_labe(data)

Todo:
get vocab. (top 500)

get sentence length feature (1)
get lexical features: 
presence of n grams (1000)
        1-gram (500)
        2-gram (300)
        3-gram (200)
        
Use nltk pos tagger and get tags
for bag of tags model, assign presence to 1 iff the word is in 500 vocab


In [7]:
def get_features_train(X_train, feats_to_use = ('len', 'uni' ,'bi' ,'tri' ,'pos')):
    X_train = [sent.split() for sent in X_train]
    features = []
    feat_dicts_list = []

    if 'len' in feats_to_use:
        X_train_sentlen = np.reshape(np.asarray([len(sent) for sent in X_train], dtype = np.int32),(-1,1))
        feat_dicts_list.append(None)
    else:
        X_train_sentlen = np.reshape(np.asarray([-1 for sent in X_train], dtype = np.int32),(-1,1))
    features.append(X_train_sentlen)
    
    if 'uni' in feats_to_use:
        X_train_unigram = ngram_topk(X_train, 1, 500)
        features.append(X_train_unigram[0])
        feat_dicts_list.append(X_train_unigram[1:-1])
    if 'bi' in feats_to_use:
        X_train_bigram = ngram_topk(X_train, 2, 300)
        features.append(X_train_bigram[0])
        feat_dicts_list.append(X_train_bigram[1:-1])
    if 'tri' in feats_to_use:
        X_train_trigram = ngram_topk(X_train, 3, 200)
        features.append(X_train_trigram[0])
        feat_dicts_list.append(X_train_trigram[1:-1])
    if 'pos' in feats_to_use:
        X_train_pos = [pos_tag(tokens) for tokens in X_train]
        X_train_pos_unigram = ngram_topk(X_train_pos, 1, 500)
        features.append(X_train_pos_unigram[0])
        feat_dicts_list.append(X_train_pos_unigram[1:-1])

    X_train_feats = np.concatenate( features, axis=1 )
    
    return X_train_feats, feat_dicts_list

In [8]:
def get_features_test(X_test, feat_dicts_list, feats_to_use = ('len', 'uni' ,'bi' ,'tri' ,'pos')):
    X_test = [sent.split() for sent in X_test]
    features = []
    feat_dicts_list_idx = 0
    assert len(feat_dicts_list) == len(feats_to_use)
    if 'len' in feats_to_use:
        X_test_sentlen = np.reshape(np.asarray([len(sent) for sent in X_test], dtype = np.int32),(-1,1))
        feat_dicts_list_idx += 1
    else:
        X_test_sentlen = np.reshape(np.asarray([-1 for sent in X_test], dtype = np.int32),(-1,1))
    features.append(X_test_sentlen)
    
    if 'uni' in feats_to_use:
        X_test_unigram = ngram_topk(X_test, 1, 500, feat_dicts_list[feat_dicts_list_idx])
        features.append(X_test_unigram[0])
        feat_dicts_list_idx += 1
    if 'bi' in feats_to_use:
        X_test_bigram = ngram_topk(X_test, 2, 300, feat_dicts_list[feat_dicts_list_idx])
        features.append(X_test_bigram[0])
        feat_dicts_list_idx += 1
    if 'tri' in feats_to_use:
        X_test_trigram = ngram_topk(X_test, 3, 200, feat_dicts_list[feat_dicts_list_idx])
        features.append(X_test_trigram[0])
        feat_dicts_list_idx += 1
    if 'pos' in feats_to_use:
        X_test_pos = [pos_tag(tokens) for tokens in X_test]
        X_test_pos_unigram = ngram_topk(X_test_pos, 1, 500, feat_dicts_list[feat_dicts_list_idx])
        features.append(X_test_pos_unigram[0])
        feat_dicts_list_idx += 1
    
    assert len(feat_dicts_list) == feat_dicts_list_idx
    X_test_feats = np.concatenate( features, axis=1 )
    return X_test_feats


In [9]:
X_train_feats, feat_dicts_list = get_features_train(X_train, feats_to_use = ('len', 'uni' ,'bi' ,'tri' ,'pos'))

In [10]:
X_test_feats = get_features_test(X_test, feat_dicts_list, feats_to_use = ('len', 'uni' ,'bi' ,'tri' ,'pos'))

In [11]:
label2idx = {lab: i for i, lab in enumerate(set(Y_train))}
idx2label = {i: lab for lab, i in label2idx.items()}
print(label2idx)

{'DESC': 0, 'LOC': 1, 'NUM': 2, 'ABBR': 3, 'ENTY': 4, 'HUM': 5}


In [12]:
Y_train_idx = np.asarray([label2idx[lab] for lab in Y_train], dtype=np.int32)
Y_test_idx = np.asarray([label2idx[lab] for lab in Y_test], dtype=np.int32)

In [13]:
# def sublists(input_list):
#     subs = []

#     for i in range(0, len(input_list) + 1):
#         temp = [list(x) for x in combinations(input_list, i)]

#         if len(temp) > 0:
#             subs.extend(temp)

#     return subs

In [14]:
# # Needs Work
# def tester(X, Y, iterations = 1):
#     kf = KFold(n_splits = 10, shuffle = True)
#     scores = []
#     mean_scores = []

#     for i in range(iterations):
#         for train_index, test_index in kf.split(X):
#             train_length = len(train_index)
#             valid_index = train_index[: train_length // 10]
#             train_index = train_index[train_length // 10 :]
#             X_train, X_test = X.iloc[train_index].drop(['index'], axis = 1),
#                               X.iloc[test_index].drop(['index'], axis = 1)
#             Y_train, Y_test = Y.iloc[train_index].drop(['index'], axis = 1),
#                               Y.iloc[test_index].drop(['index'], axis = 1)
#             clf = linear_model.LogisticRegression(solver = 'liblinear', penalty = 'l2',
#                   max_iter = 200).fit(X_train, Y_train.values.ravel())
#             scores.append(clf.score(X_test, Y_test))

#         mean_scores.append(np.mean(scores))

#     return np.mean(mean_scores)

In [15]:
# # Needs Work

# power_metrics = sublists(metrics)
# max_score = -1

# for metric_list in power_metrics:
#     if metric_list == []:
#         continue

#     X = data[metric_list].reset_index()
#     Y = data['Diabetic'].reset_index()
#     score = tester(X, Y, 1, False)

#     if score > max_score:
#         max_score = score
#         print(metric_list)
#         print(max_score)

In [16]:
class Node():
    cnt = 0
    def __init__(self, ):
        self.leaf = False
        self.majority_class = None
        self.attribute_index = None
        self.children = dict() # key: attribute_value, value: child_node
        self.sent_len_split_val = None # Used at inference time, if attribute_index is 0
        self.id = Node.cnt
        Node.cnt+=1
    
    def __str__(self,):
        return 'ID: {}  isLeaf: {} majority: {} split_idx: {} split_val = {}'.format(self.id, 
                                                                                    self.leaf, 
                                                                                    self.majority_class, 
                                                                                    self.attribute_index, 
                                                                                    list(self.children.keys())
                                                                                   )
    def __repr__(self):
        return str(self)
    
    def traverse_print(self,):
        print(self)
        for _, child in self.children:
              child.traverse_print()


In [37]:
class DecisionTree():
    
    # score has to be from 'entropy', 'gini', 'misclassification'
    def __init__(self, score='entropy'):
        score_functions = {'entropy': (DecisionTree.compute_entropy, DecisionTree.get_gain_entropy),
           'gini': (DecisionTree.compute_gini, DecisionTree.get_gain_gini),
           'misclassification': (DecisionTree.compute_misclassification, DecisionTree.get_gain_misclassification)}
        
        self.root = None
        assert score in score_functions.keys()
        self.score = score
        self.compute_score = score_functions[score][0]
        self.get_gain = score_functions[score][1]
        return
    
    @staticmethod
    def compute_entropy(labels):
        entropy = 0.0
        totSamples = len(labels)
        labelSet = set(labels.reshape(-1))
        for label in labelSet:
            prob = np.sum(labels == label) / totSamples
            if prob > 1e-12:
                entropy -= np.log(prob) * prob
        
        return entropy
    
    @staticmethod
    def get_gain_entropy(parent_info, data_i, labels):
        attr_split_info = 0
        attr_count = dict()
        for attr_val in set(data_i.reshape(-1)):
            ids = np.where(data_i == attr_val)[0]
            attr_count[attr_val] = len(ids)
            attr_split_info += attr_count[attr_val] * DecisionTree.compute_entropy(labels[ids])
        attr_gain = parent_info - attr_split_info
        attr_gain_ratio = DecisionTree.compute_dict_entropy(attr_count) * attr_gain
        return attr_gain, attr_gain_ratio, attr_count.keys()
    
    @staticmethod
    def compute_dict_entropy(attr_count):
        entropy = 0
        totSamples = sum(attr_count.values())
       
        labelSet = attr_count.keys()
        for label in labelSet:
            prob = attr_count[label] / totSamples
            if prob > 1e-12:
                entropy -= np.log(prob) * prob
        return entropy
    
    @staticmethod
    def compute_gini(labels):
        pass
    
    @staticmethod
    def get_gain_gini(parent_info, data_i, labels):
        pass

    @staticmethod
    def compute_misclassification(labels):
        pass
    
    @staticmethod
    def get_gain_misclassification(parent_info, data_i, labels):
        pass
    
    def split_node(self, parent, data, labels, used_attr_index):
        num_instances = data.shape[0]
        parent_info = self.compute_score(labels) * num_instances
        parent.majority_class = Counter(labels.reshape(-1)).most_common(1)[0][0]
        
        if parent_info == 0 :
            parent.leaf = True
        
        best_attr_index = None
        best_info_gain = -float('inf')
        best_gain_ratio = -float('inf')
        best_attr_keys = None
#         sent length case special
#         attr_split_info = 0
#         attr_count = dict()
        sent_len_split_val = stats.mode(data[:, 0])[0][0]
        le_ids = np.where(data[:, 0] <= sent_len_split_val)[0]
        gt_ids = np.where(data[:, 0] > sent_len_split_val)[0]
        data_0 = np.zeros(data.shape[0], dtype=np.int32)
        data_0[gt_ids] = 1
#         attr_count[0] = le_ids.shape[0]
#         attr_count[1] = gt_ids.shape[0]
#         attr_split_info = (attr_count[0] * self.compute_entropy(labels[le_ids])) + (attr_count[1] * self.compute_entropy(labels[gt_ids]) )    
#         attr_gain = parent_info - attr_split_info
        attr_gain, attr_gain_ratio, attr_count_keys = self.get_gain(parent_info, data_0, labels)
#         attr_gain_ratio = self.compute_dict_entropy(attr_count) * attr_gain
        if best_gain_ratio < attr_gain_ratio and  attr_gain_ratio > 0 :
                best_attr_index = 0
                best_info_gain = attr_gain
                best_gain_ratio = attr_gain_ratio
                best_attr_keys = attr_count_keys
        
        # during ablation, sentence length can be initialized to all zeros this will prevent splittiung in sent dimension/.
        for i in range(1, data.shape[1]): # starts from 1 as zero is sentence length (always.) .
            if i in used_attr_index:
                continue
            attr_gain, attr_gain_ratio, attr_count_keys = self.get_gain(parent_info, data[:, i], labels)
            if best_gain_ratio < attr_gain_ratio:
                best_attr_index = i
                best_info_gain = attr_gain
                best_gain_ratio = attr_gain_ratio
                best_attr_keys = attr_count_keys
        if best_gain_ratio <= 0 :
            parent.leaf = True
            return [] # TO Check    
        else:
            parent.attribute_index =  best_attr_index
            parent.children = { i: Node() for i in best_attr_keys}
            to_return = []
            if best_attr_index != 0:
                used_attr_index.append(best_attr_index)
                for i in best_attr_keys:
                    inds = np.where(data[:, best_attr_index] == i)[0]
                    to_return.append( (parent.children[i], data[inds], labels[inds], used_attr_index) )
            else:
                parent.sent_len_split_val = sent_len_split_val
                to_return.append( (parent.children[0], data[le_ids], labels[le_ids], used_attr_index) )
                to_return.append( (parent.children[1], data[gt_ids], labels[gt_ids], used_attr_index) )
            return to_return
    
    def build_tree(self, data, labels):
        traversal_q = queue.Queue()
        root = Node()
        traversal_q.put_nowait( (root, data, labels, [] ))
        while not traversal_q.empty():
            node_to_split = traversal_q.get_nowait()
            child_nodes = self.split_node(*node_to_split)
            for child in child_nodes:
                traversal_q.put_nowait(child)
        self.root = root
        return root
    
    def split_infer(self, node, data, data_indices):
        if node.leaf:
            return (True, data_indices, np.zeros( (data.shape[0]), dtype = np.int32) + node.majority_class)
        else:
            to_queue = []
            if(node.attribute_index == 0):
                left_idx = np.where(data[:,0] <= node.sent_len_split_val)[0]
                right_idx = np.where(data[:,0] > node.sent_len_split_val)[0]
                to_queue.append( (node.children[0], data[left_idx], data_indices[left_idx]) )
                to_queue.append( (node.children[1], data[right_idx], data_indices[right_idx]) )
                return (False, to_queue)
            else:
                for i in node.children.keys():
                    split_inds = np.where( data[:, node.attribute_index]  == i)[0]
                    if len(split_inds) > 0:
                        to_queue.append( (node.children[i], data[split_inds], data_indices[split_inds]) )
                return (False, to_queue)
    
    def get_labels(self, data):
        root = self.root
        data_idx = np.arange(data.shape[0], dtype = np.int32)
        labels = np.zeros( (data.shape[0]), dtype = np.int32) + -1
        traversal_q = queue.Queue()
        traversal_q.put_nowait( (root, data, data_idx ))
        while not traversal_q.empty():
            node_to_split = traversal_q.get_nowait()
            split_return = self.split_infer(*node_to_split)
            if split_return[0]:
                labels[split_return[1]] = split_return[2]
            else:
                for child in split_return[1]:
                    traversal_q.put_nowait(child)
        return labels


In [38]:
# Binarize data
X_train_feats_bin = deepcopy(X_train_feats)
X_train_feats_bin[:, 1:] = (X_train_feats[:, 1:] > 0).astype(np.int32)

X_test_feats_bin = deepcopy(X_test_feats)
X_test_feats_bin[:, 1:] = (X_test_feats[:, 1:] > 0).astype(np.int32)

In [39]:
dtree = DecisionTree()
root = dtree.build_tree(data=X_train_feats_bin, labels=Y_train_idx)

In [40]:
root

ID: 6  isLeaf: False majority: 4 split_idx: 2 split_val = [0, 1]

In [41]:
Node.cnt

2847

In [42]:
y_pred_test = dtree.get_labels(data=X_test_feats_bin)

In [43]:
def get_scores(Y_test_idx, y_pred_test):
    acc = (y_pred_test == Y_test_idx).mean()
    prec, rec, fscore, _ = precision_recall_fscore_support(Y_test_idx, y_pred_test, average='weighted')
    return acc, prec, rec, fscore

In [44]:
print('Acc: {}, Prec: {}, Rec: {}, Fscore: {}'.format(*get_scores(Y_test_idx, y_pred_test)))

Acc: 0.784, Prec: 0.7989338031578664, Rec: 0.784, Fscore: 0.7853790197104482


In [45]:
precision_recall_fscore_support(Y_test_idx, y_pred_test, average='macro')

(0.8241854510067325, 0.748570809698703, 0.7763632769347054, None)

In [None]:
all_features = {'len', 'uni' ,'bi' ,'tri' ,'pos'}
dtree_list = dict()
scores_list = dict()

for feat_to_drop in all_features:
    feats_to_use = frozenset(all_features - {feat_to_drop})
    X_train_feats, feat_dicts_list = get_features_train(X_train, feats_to_use = feats_to_use)
    X_test_feats = get_features_test(X_test, feat_dicts_list, feats_to_use = feats_to_use)
    X_train_feats_bin = deepcopy(X_train_feats)
    X_train_feats_bin[:, 1:] = (X_train_feats[:, 1:] > 0).astype(np.int32)
    
    X_test_feats_bin = deepcopy(X_test_feats)
    X_test_feats_bin[:, 1:] = (X_test_feats[:, 1:] > 0).astype(np.int32)
    dtree = DecisionTree()
    _ = dtree.build_tree(data=X_train_feats_bin, labels=Y_train_idx)
    dtree_list[feats_to_use] = dtree
    y_pred_test = dtree.get_labels(data=X_test_feats_bin)
    all_scores = get_scores(Y_test_idx, y_pred_test)
    scores_list[feats_to_use] = all_scores
    print('Features: {}, Missing Feature: {}'.format(feats_to_use, feat_to_drop))
    print('Acc: {}, Prec: {}, Rec: {}, Fscore: {}'.format(*all_scores))
    print()

In [None]:
all_features = {'len', 'uni' ,'bi' ,'tri' ,'pos'}

feats_to_use = frozenset(all_features - {'uni', 'bi', 'tri'})
X_train_feats, feat_dicts_list = get_features_train(X_train, feats_to_use = feats_to_use)
X_test_feats = get_features_test(X_test, feat_dicts_list, feats_to_use = feats_to_use)
X_train_feats_bin = deepcopy(X_train_feats)
X_train_feats_bin[:, 1:] = (X_train_feats[:, 1:] > 0).astype(np.int32)

X_test_feats_bin = deepcopy(X_test_feats)
X_test_feats_bin[:, 1:] = (X_test_feats[:, 1:] > 0).astype(np.int32)
dtree = DecisionTree()
_ = dtree.build_tree(data=X_train_feats_bin, labels=Y_train_idx)
dtree_list[feats_to_use] = dtree
y_pred_test = dtree.get_labels(data=X_test_feats_bin)
all_scores = get_scores(Y_test_idx, y_pred_test)
scores_list[feats_to_use] = all_scores
print('Features: {}, Missing Feature: {}'.format(feats_to_use, feat_to_drop))
print('Acc: {}, Prec: {}, Rec: {}, Fscore: {}'.format(*all_scores))
print()