In [61]:
import pandas as pd
import numpy as np
import re, nltk
import pdb
from collections import defaultdict
from sklearn import preprocessing
import math

def unique_vals(rows, col):
    """Find the unique values for a column in a dataset."""
    return set([row[col] for row in rows])

#######
# Demo:
# unique_vals(training_data, 0)
# unique_vals(training_data, 1)
#######


def class_counts(rows):
    """Counts the number of each type of example in a dataset."""
    counts = {}  # a dictionary of label -> count.
    for row in rows:
        # in our dataset format, the label is always the last column
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

#######
# Demo:
# class_counts(training_data)
#######




def max_label(dict):
    max_count = 0
    label = ""

    for key, value in dict.items():
        if dict[key] > max_count:
            max_count = dict[key]
            label = key

    return label




def is_numeric(value):
    """Test if a value is numeric."""
    return isinstance(value, int) or isinstance(value, float)

#######
# Demo:
# is_numeric(7)
# is_numeric("Red")
#######


class Question:
    """A Question is used to partition a dataset.
    This class just records a 'column number' (e.g., 0 for Color) and a
    'column value' (e.g., Green). The 'match' method is used to compare
    the feature value in an example to the feature value stored in the
    question. See the demo below.
    """

    def __init__(self, column, value, header):
        self.column = column
        self.value = value
        self.header = header

    def match(self, example):
        # Compare the feature value in an example to the
        # feature value in this question.
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        # This is just a helper method to print
        # the question in a readable format.
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            self.header[self.column], condition, str(self.value))


def partition(rows, question):
    """Partitions a dataset.
    For each row in the dataset, check if it matches the question. If
    so, add it to 'true rows', otherwise, add it to 'false rows'.
    """
    true_rows, false_rows = [], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows





def gini(rows):
    """Calculate the Gini Impurity for a list of rows.
    There are a few different ways to do this, I thought this one was
    the most concise. See:
    https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity
    """
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl**2
    return impurity

## TODO: Step 3
def entropy(rows):

    # compute the entropy.
    entries = class_counts(rows)
    avg_entropy = 0
    size = float(len(rows))
    for label in entries:
        prob = entries[label] / size
        avg_entropy = avg_entropy + (prob * math.log(prob, 2))
    return -1*avg_entropy

def missClassification(rows):
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl**2
    return impurity

def info_gain(left, right, current_uncertainty):
    """Information Gain.
    The uncertainty of the starting node, minus the weighted impurity of
    two child nodes.
    """
    p = float(len(left)) / (len(left) + len(right))

    
    return current_uncertainty - p * entropy(left) - (1 - p) * entropy(right)
    #return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

def find_best_split(rows, header):
    """Find the best question to ask by iterating over every feature / value
    and calculating the information gain."""
    best_gain = 0  # keep track of the best information gain
    best_question = None  # keep train of the feature / value that produced it
    current_uncertainty = entropy(rows)
    n_features = len(rows[0]) - 1  # number of columns

    for col in range(n_features):  # for each feature

        values = set([row[col] for row in rows])  # unique values in the column

        for val in values:  # for each value

            question = Question(col, val, header)

            # try splitting the dataset
            true_rows, false_rows = partition(rows, question)

            # Skip this split if it doesn't divide the
            # dataset.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # Calculate the information gain from this split
            gain = info_gain(true_rows, false_rows, current_uncertainty)

            # You actually can use '>' instead of '>=' here
            # but I wanted the tree to look a certain way for our
            # toy dataset.
            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

## TODO: Step 2
class Leaf:
    """A Leaf node classifies data.
    This holds a dictionary of class (e.g., "Apple") -> number of times
    it appears in the rows from the training data that reach this leaf.
    """

    def __init__(self, rows, id, depth):
        self.predictions = class_counts(rows)
        self.predicted_label = max_label(self.predictions)
        self.id = id
        self.depth = depth

## TODO: Step 1
class Decision_Node:
    """A Decision Node asks a question.
    This holds a reference to the question, and to the two child nodes.
    """

    def __init__(self,
                 question,
                 true_branch,
                 false_branch,
                 depth,
                 id,
                 rows):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch
        self.depth = depth
        self.id = id
        self.rows = rows


## TODO: Step 3
def build_tree(rows, header, depth=0, id=0):
    """Builds the tree.
    Rules of recursion: 1) Believe that it works. 2) Start by checking
    for the base case (no further information gain). 3) Prepare for
    giant stack traces.
    """
    # depth = 0
    # Try partitioing the dataset on each of the unique attribute,
    # calculate the information gain,
    # and return the question that produces the highest gain.

    gain, question = find_best_split(rows, header)

    # Base case: no further info gain
    # Since we can ask no further questions,
    # we'll return a leaf.
    if gain == 0:
        return Leaf(rows, id, depth)

    # If we reach here, we have found a useful feature / value
    # to partition on.
    # nodeLst.append(id)
    true_rows, false_rows = partition(rows, question)

    # Recursively build the true branch.
    true_branch = build_tree(true_rows, header, depth + 1, 2 * id + 2)

    # Recursively build the false branch.
    false_branch = build_tree(false_rows, header, depth + 1, 2 * id + 1)

    # Return a Question node.
    # This records the best feature / value to ask at this point,
    # as well as the branches to follow
    # depending on on the answer.
    return Decision_Node(question, true_branch, false_branch, depth, id, rows)

## TODO: Step 8 - already done for you
def prune_tree(node, prunedList):
    """Builds the tree.
    Rules of recursion: 1) Believe that it works. 2) Start by checking
    for the base case (no further information gain). 3) Prepare for
    giant stack traces.
    """

    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        return node
    # If we reach a pruned node, make that node a leaf node and return. Since it becomes a leaf node, the nodes
    # below it are automatically not considered
    if int(node.id) in prunedList:
        return Leaf(node.rows, node.id, node.depth)

    # Call this function recursively on the true branch
    node.true_branch = prune_tree(node.true_branch, prunedList)

    # Call this function recursively on the false branch
    node.false_branch = prune_tree(node.false_branch, prunedList)

    return node

## TODO: Step 6
def classify(row, node):
    """See the 'rules of recursion' above."""

    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        return node.predicted_label

    # Decide whether to follow the true-branch or the false-branch.
    # Compare the feature / value stored in the node,
    # to the example we're considering.
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

## TODO: Step 4
def print_tree(node, spacing=""):
    """World's most elegant tree printing function."""

    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        print(spacing + "Leaf id: " + str(node.id) + " Predictions: " + str(node.predictions) + " Label Class: " + str(node.predicted_label))
        return

    # Print the question at this node
    print(spacing + str(node.question) + " id: " + str(node.id) + " depth: " + str(node.depth))

    # Call this function recursively on the true branch
    print(spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    # Call this function recursively on the false branch
    print(spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")


def print_leaf(counts):
    """A nicer way to print the predictions at a leaf."""
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
    return probs

## TODO: Step 5
def getLeafNodes(node, leafNodes =[]):

    # Base case
    if isinstance(node, Leaf):
        leafNodes.append(node)
        return

    # Recursive right call for true values
    getLeafNodes(node.true_branch, leafNodes)

    # Recursive left call for false values
    getLeafNodes(node.false_branch, leafNodes)

    return leafNodes


def getInnerNodes(node, innerNodes =[]):

    # Base case
    if isinstance(node, Leaf):
        return

    innerNodes.append(node)

    # Recursive right call for true values
    getInnerNodes(node.true_branch, innerNodes)

    # Recursive left call for false values
    getInnerNodes(node.false_branch, innerNodes)

    return innerNodes

class Result:
    

    def __init__(self,accuracy,clazz):
        self.accuracy = accuracy
        self.clazz = clazz

        
## TODO: Step 6
def computeAccuracy(rows, node):

    count = len(rows)
    if count == 0:
        return 0

    accuracy = 0
    labels=[]
    for row in rows:
        # last entry of the column is the actual label
        label = classify(row, node)
        labels.append(label)
        if row[-1] == label:
            accuracy += 1
    return Result (round(accuracy/count, 2),labels)

In [62]:
import pandas as pd
import numpy as np
import re, nltk
import pdb
from collections import defaultdict
from sklearn import preprocessing
import math






category = []
subcategory = []
questions = []
length =[]

with open('train_5500.label',mode='r',encoding = "ISO-8859-1") as f:
    for line in f:
        #print(line)
        split_index1 = line.index(":")
        split_index2 = line.index(" ")
        category.append(line[:split_index1])
        subcategory.append(line[split_index1+1:split_index2])
        questions.append(line[(split_index2+1):].replace('\n',''))
                

training_data = pd.DataFrame(zip(category,questions), columns=['Class', 'Questions'])
training_data['CleanedQuestions'] = training_data['Questions'].str.replace('\W', ' ')
training_data['Length'] = training_data['CleanedQuestions'].str.split()
training_data['Length']= training_data['Length'].str.len()

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
training_data['CleanedQuestions'] = training_data['CleanedQuestions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
training_data['CleanedQuestions'] = training_data['CleanedQuestions'].str.lower()
training_data['Words'] = training_data['CleanedQuestions'].str.split()

# build the corpus
vocabulary = []
for question in training_data['Words']:
    for word in question:
        vocabulary.append(word)

#Set returns tuple of unique words
vocabulary = list(vocabulary)
len(vocabulary)
#print(vocabulary)

#print(training_data.head())
#print(training_data)



    
def generate_N_grams(text,ngram=1):
    words = text.split(" ")
    temp=zip(*[words[i:] for i in range(0,ngram)])
    ans=[' '.join(ngram) for ngram in temp]
    return ans    


gram_1_values=defaultdict(int)
gram_2_values= defaultdict(int)
gram_3_values = defaultdict(int)


for text in training_data["CleanedQuestions"]:
    for word in generate_N_grams(text):
        gram_1_values[word]+=1
        
gram_1_df=pd.DataFrame(sorted(gram_1_values.items(),key=lambda x:x[1],reverse=True),columns=["Word","Length"])
gram_1_words = gram_1_df['Word'].head(500).tolist()

total_1_gram_count= gram_1_df["Length"].sum()
gram_1_dict = dict(zip(gram_1_df.Word, gram_1_df.Length))



for text in training_data["CleanedQuestions"]:
    for word in generate_N_grams(text,2):
        gram_2_values[word]+=1

gram_2_df=pd.DataFrame(sorted(gram_2_values.items(),key=lambda x:x[1],reverse=True),columns=["Word","Length"])
gram_2_words = gram_2_df['Word'].head(300).tolist()
total_2_gram_count= gram_2_df["Length"].sum()
gram_2_dict = dict(zip(gram_2_df.Word, gram_2_df.Length))

for text in training_data["CleanedQuestions"]:
    for word in generate_N_grams(text,3):
        gram_3_values[word]+=1

gram_3_df=pd.DataFrame(sorted(gram_3_values.items(),key=lambda x:x[1],reverse=True),columns=["Word","Length"])
gram_3_words=gram_3_df['Word'].head(200).tolist()
total_3_gram_count= gram_3_df["Length"].sum()
gram_3_dict = dict(zip(gram_3_df.Word, gram_3_df.Length))

#store the lexical and tagged words
training_data.head()
lexicalWords=[]
taggedWords=[]
for words in training_data["Words"]:
    taggedWord= nltk.pos_tag(words)
    lexicalWord=[]
    for word in words:
        if word in gram_1_words:
            lexicalWord.append(word)
    lexicalWords.append(lexicalWord)
    taggedWords.append(taggedWord)
training_data["Lexical"]= lexicalWords   
training_data["Tagged"]= taggedWords
training_data.head()  


brown_news_tagged = nltk.pos_tag(vocabulary)
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd = tag_fd.most_common(500)
tag_fd_dict= dict(tag_fd)

total_tag=0
most_common_tags=[]
for j,k in tag_fd:
    most_common_tags.append(j)
    total_tag=total_tag+k
#print("tags",most_common_tags)
#print(" ")
#print(total_tag)

tagsList=[]
for tags in training_data["Tagged"]:
    tag=[]
    for j,k in tags:
        if k in most_common_tags:
            tag.append(k)
    tagsList.append(tag)
training_data["Tagged"]= tagsList

training_data.head()









  training_data['CleanedQuestions'] = training_data['Questions'].str.replace('\W', ' ')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shashadhar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shashadhar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,Class,Questions,CleanedQuestions,Length,Words,Lexical,Tagged
0,DESC,How did serfdom develop in and then leave Russ...,how serfdom develop leave russia,9,"[how, serfdom, develop, leave, russia]",[how],"[WRB, JJ, VB, JJ, NN]"
1,ENTY,What films featured the character Popeye Doyle ?,what films featured character popeye doyle,7,"[what, films, featured, character, popeye, doyle]","[what, character]","[WP, VBD, JJ, NN, NN, NN]"
2,DESC,How can I find a list of celebrities ' real na...,how i find list celebrities real names,10,"[how, i, find, list, celebrities, real, names]","[how, i, find, list, real, names]","[WRB, JJ, VBP, JJ, NNS, JJ, NNS]"
3,ENTY,What fowl grabs the spotlight after the Chines...,what fowl grabs spotlight chinese year monkey,12,"[what, fowl, grabs, spotlight, chinese, year, ...","[what, chinese, year]","[WP, NN, NN, NN, JJ, NN, NN]"
4,ABBR,What is the full form of .com ?,what full form com,7,"[what, full, form, com]","[what, full, form, com]","[WP, JJ, NN, NN]"


In [63]:
#caculate prob of the features to convert it to numerical value
#caculate prob of the features to convert it to numerical value
prob_lex=[]
for text in training_data["CleanedQuestions"]:
    
    for word in generate_N_grams(text,1):
        prob_1_gram=.0001
        if word in gram_1_words:
            #pdb.set_trace()
            prob_1_gram = prob_1_gram * ((gram_1_dict.get(word)/total_1_gram_count))      

    for word in generate_N_grams(text,2):           
        prob_2_gram=.0001
        if word in gram_2_words:
            prob_2_gram = prob_2_gram * ((gram_2_dict.get(word)/total_2_gram_count))



    for word in generate_N_grams(text,3):           
        prob_3_gram=.0001
        if word in gram_3_words:
            prob_3_gram = prob_3_gram * ((gram_3_dict.get(word)/total_3_gram_count))
    total_prob= (prob_1_gram*prob_2_gram*prob_3_gram)
    prob_lex.append(total_prob)  
    #print(prob_lex)
                            
prob_lex = abs(np.log(prob_lex))

prob_tag =[]
for tags in training_data["Tagged"]:
    for tag in tags:
        prob_1_gram=.0001
        prob_1_gram = prob_1_gram * ((tag_fd_dict.get(tag)/total_tag)) 

    prob_tag.append(prob_1_gram)
    
prob_tag =abs(np.log(prob_tag))   

feature_dataframe = pd.DataFrame(training_data["Length"],columns=['Length'])
feature_dataframe["Length"] = training_data["Length"]
feature_dataframe["Lexical"]= prob_lex
feature_dataframe["POSTag"] = prob_tag
feature_dataframe["Class"] = training_data["Class"]
feature_dataframe.head()

feature_dataframe.head()

Unnamed: 0,Length,Lexical,POSTag,Class
0,9,27.631021,10.305454,DESC
1,7,27.631021,10.305454,ENTY
2,10,34.736081,11.891937,DESC
3,12,27.631021,10.305454,ENTY
4,7,35.509271,10.305454,ABBR


In [64]:
# test data preparation

category = []
subcategory = []
questions = []
length =[]

with open('TREC_10.label',mode='r',encoding = "ISO-8859-1") as f:
    for line in f:
        #print(line)
        split_index1 = line.index(":")
        split_index2 = line.index(" ")
        category.append(line[:split_index1])
        subcategory.append(line[split_index1+1:split_index2])
        questions.append(line[(split_index2+1):].replace('\n',''))
                

test_data = pd.DataFrame(zip(category,questions), columns=['Class', 'Questions'])
test_data['CleanedQuestions'] = test_data['Questions'].str.replace('\W', ' ')
test_data['Length'] = test_data['CleanedQuestions'].str.split()
test_data['Length']= test_data['Length'].str.len()

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
test_data['CleanedQuestions'] = test_data['CleanedQuestions'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
test_data['CleanedQuestions'] = test_data['CleanedQuestions'].str.lower()
test_data['Words'] = test_data['CleanedQuestions'].str.split()
test_data.head()


lexicalWords=[]
taggedWords=[]
for words in test_data["Words"]:
    taggedWord= nltk.pos_tag(words)
    lexicalWord=[]
    for word in words:
        if word in gram_1_words:
            lexicalWord.append(word)
    lexicalWords.append(lexicalWord)
    taggedWords.append(taggedWord)
test_data["Lexical"]= lexicalWords   
test_data["Tagged"]= taggedWords
test_data.head()  


tagsList=[]
for tags in test_data["Tagged"]:
    tag=[]
    for j,k in tags:
        if k in most_common_tags:
            tag.append(k)
    tagsList.append(tag)
test_data["Tagged"]= tagsList

test_data.head()


# Calcate probability for the numerical values
prob_lex=[]
for text in test_data["CleanedQuestions"]:
    
    for word in generate_N_grams(text,1):
        prob_1_gram=.0001
        if word in gram_1_words:
            #pdb.set_trace()
            prob_1_gram = prob_1_gram * ((gram_1_dict.get(word)/total_1_gram_count))      

    for word in generate_N_grams(text,2):           
        prob_2_gram=.0001
        if word in gram_2_words:
            prob_2_gram = prob_2_gram * ((gram_2_dict.get(word)/total_2_gram_count))



    for word in generate_N_grams(text,3):           
        prob_3_gram=.0001
        if word in gram_3_words:
            prob_3_gram = prob_3_gram * ((gram_3_dict.get(word)/total_3_gram_count))
    total_prob= (prob_1_gram*prob_2_gram*prob_3_gram)
    prob_lex.append(total_prob)  
    #print(prob_lex)
                            
prob_lex = abs(np.log(prob_lex))

prob_tag =[]
for tags in test_data["Tagged"]:
    for tag in tags:
        prob_1_gram=.0001
        prob_1_gram = prob_1_gram * ((tag_fd_dict.get(tag)/total_tag)) 

    prob_tag.append(prob_1_gram)
    
prob_tag =abs(np.log(prob_tag))   

test_feature_dataframe = pd.DataFrame(test_data["Length"],columns=['Length'])
test_feature_dataframe["Length"] = test_data["Length"]
test_feature_dataframe["Lexical"]= prob_lex
test_feature_dataframe["POSTag"] = prob_tag
test_feature_dataframe["Class"] = test_data["Class"]
test_feature_dataframe.head()

test_feature_dataframe.head()


from sklearn import metrics

  test_data['CleanedQuestions'] = test_data['Questions'].str.replace('\W', ' ')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shashadhar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shashadhar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [65]:

import pandas as pd
from sklearn import model_selection


header = ['Length', 'Lexical', 'POSTag','Class']
train_lst = feature_dataframe.values.tolist()


# splitting the data set into train and test
trainDF, testDF = model_selection.train_test_split(train_lst, test_size=0.1)



# building the tree
t = build_tree(trainDF, header)

# get leaf and inner nodes
# print("\nLeaf nodes ****************")
# leaves = getLeafNodes(t)
# for leaf in leaves:
#     print("id = " + str(leaf.id) + " depth =" + str(leaf.depth))

# print("\nNon-leaf nodes ****************")
# innerNodes = getInnerNodes(t)

# for inner in innerNodes:
#     print("id = " + str(inner.id) + " depth =" + str(inner.depth))

# print tree

# training set data report
result = computeAccuracy(testDF, t)
actual_labels =[]
for row in testDF:
    actual_labels.append(row[-1])
    
maxAccuracy = result.accuracy
predicted_labels = result.clazz
print("Training Data Report")
print("\nAccuracy : " + str((1-maxAccuracy)*100) + "\n")
print(metrics.classification_report(actual_labels, predicted_labels,target_names= ["DESC","NUM","ENTY",'LOC','HUM','ABBR']))

#Test set data 

# splitting the data into 10 folding way
test_lst = test_feature_dataframe.values.tolist()

result = computeAccuracy(test_lst, t)
actual_labels =[]
for row in test_lst:
    actual_labels.append(row[-1])    
maxAccuracy1 = result.accuracy
predicted_labels = result.clazz
#print(actual_labels)
#print(result.clazz)
print("Test Data Report")
print("\nAccuracy: " + str((1-maxAccuracy1)*100) + "\n")
print(metrics.classification_report(actual_labels, predicted_labels,target_names= ["DESC","NUM","ENTY",'LOC','HUM','ABBR']))
    
      


#print_tree(t)



Training Data Report

Accuracy : 69.0

              precision    recall  f1-score   support

        DESC       0.60      0.38      0.46         8
         NUM       0.43      0.38      0.41       110
        ENTY       0.24      0.35      0.29       116
         LOC       0.27      0.31      0.29       122
         HUM       0.31      0.26      0.29        91
        ABBR       0.35      0.20      0.26        99

    accuracy                           0.31       546
   macro avg       0.37      0.31      0.33       546
weighted avg       0.32      0.31      0.31       546

Test Data Report

Accuracy: 52.0

              precision    recall  f1-score   support

        DESC       0.83      0.56      0.67         9
         NUM       0.62      0.91      0.74       138
        ENTY       0.38      0.61      0.46        94
         LOC       0.28      0.25      0.26        65
         HUM       0.36      0.16      0.22        81
        ABBR       0.57      0.23      0.33       113

    