In [1]:
import nltk;

In [2]:
def gold_standard(): 
    """
        :return : A list of the gold standard named entity status of all dataset tokens [True, False, False,...] 
    """
    result=[] 
    for line in open ( 'ner_dataset.txt' ).readlines(): 
        if line.strip():
            token, part_of_speech, chunk, ne_category=line.split() 
            result.append(ne_category!= 'O' )
    return result

In [3]:
def ner(): 
    """
        :return : A list of the predicted named entity status of all dataset tokens [True, False, False,...] 
    """
    result=[];
    sentence_start = True 
    for line in open ( 'ner_dataset.txt' ).readlines(): 
        if line.strip():
            token,part_of_speech,chunk,ne_category=line.split();
            if sentence_start == True:
                result.append(False);
                sentence_start = False;
                continue;
            else:
                if token[0].isupper():
                    result.append(True);
                else:
                    result.append(False);
        else : 
            sentence_start = True 
    return result;

In [4]:
#Source: http://www.nltk.org/book/ch07.html
#Problem 1: Extra Credits
def improved_ner(): 
    """
        :return : A list of the predicted named entity status of all dataset tokens [True, False, False,...] 
    """
    result=[];
    for line in open ( 'ner_dataset.txt' ).readlines(): 
        isNamedEntity = False;
        if line.strip():
            try:
                token,part_of_speech,chunk,ne_category=line.split();
                tokenized_token = nltk.word_tokenize(token);
                pos_tagged = nltk.pos_tag(tokenized_token);
                chunk_tree = nltk.ne_chunk(pos_tagged,binary=True);
                for node in chunk_tree.subtrees():
                    if node.label() == 'NE':
                        isNamedEntity = True;
                        result.append(True);
                        
                if isNamedEntity == False:
                    result.append(False);
                    isNamedEntity = False;
            except Exception:
                print line;
    return result;

In [5]:
#Problem 2: Evaluation function
def evaluate(gold_standard_result, prediction_result):
    if len(gold_standard_result) == len(prediction_result):
        true_positive = 0.0;
        false_positive = 0.0;
        true_negative = 0.0;
        false_negative = 0.0;
        for i in range(0,len(prediction_result)):
            predicted_ner = prediction_result[i];
            gold_standard_ner = gold_standard_result[i];
            if gold_standard_ner == True and predicted_ner == True:
                true_positive += 1;
            elif gold_standard_ner == True and predicted_ner == False:
                false_negative += 1;
            elif gold_standard_ner == False and predicted_ner == False:
                true_negative += 1;
            else:
                false_positive += 1;

        precision = true_positive/(true_positive+false_positive);
        recall = true_positive/(true_positive+false_negative);
        f1_score = (2*precision*recall)/(precision+recall);
        print "Precision: " + repr(precision);
        print "Recall:" + repr(recall);
        print "F1-Score:" + repr(f1_score);
    else:
        print "Error";
    return;    

In [6]:
#Problem 2: Strict measure method - Extra Credits
def evaluate_strict(gold_standard_result, prediction_result):
    if len(gold_standard_result) == len(prediction_result):
        true_positive = 0.0;
        false_positive = 0.0;
        true_negative = 0.0;
        false_negative = 0.0;
        length = len(gold_standard_result);
       
        gold_standard_named_entity_list = [];
        gold_standard_negative_list = [];
        
        i = 0;
        j = 0;
        while(i < length):
            if(gold_standard_result[i] == True and prediction_result[i] == True):
                startIndex = i;
                endIndex = i;
                while(endIndex < length and gold_standard_result[endIndex] == True):
                    endIndex = endIndex + 1;
                
                while(startIndex < endIndex):
                    if(prediction_result[startIndex] == False):
                        break;
                    else:
                        startIndex = startIndex + 1;
               
                if(startIndex == endIndex):
                    true_positive = true_positive + 1;
                i = endIndex;
            elif(gold_standard_result[i] == True and prediction_result[i] == False):
                startIndex = i;
                endIndex = i;
                while(endIndex < length and gold_standard_result[endIndex] == True):
                    endIndex = endIndex + 1;    
                
                if(startIndex == (endIndex-1)):
                    false_negative = false_negative + 1;
                
                i = endIndex;                
            elif(gold_standard_result[i] == False and prediction_result[i] == True):
                i += 1;
                false_positive = false_positive + 1;
            else:
                i += 1;
        precision = true_positive/(true_positive+false_positive);
        recall = true_positive/(true_positive+false_negative);
        f1_score = (2*precision*recall)/(precision+recall);
        print "TP: " + repr(true_positive);
        print "FP:" + repr(false_positive);
        print "TN:" + repr(true_negative);
        print "FN:" + repr(false_negative);
        print "Precision: " + repr(precision);
        print "Recall:" + repr(recall);
        print "F1-Score:" + repr(f1_score);
    else:
        print "Error";
    return;

In [7]:
#Problem 2: Lenient measure method - Extra Credits
def evaluate_lenient(gold_standard_result, prediction_result):
    if len(gold_standard_result) == len(prediction_result):
        true_positive = 0.0;
        false_positive = 0.0;
        true_negative = 0.0;
        false_negative = 0.0;
        length = len(gold_standard_result);
       
        gold_standard_named_entity_list = [];
        gold_standard_negative_list = [];
        
        i = 0;
        j = 0;
        while(i < length):
            if(gold_standard_result[i] == True and prediction_result[i] == True):
                startIndex = i;
                endIndex = i;
                while(endIndex < length and gold_standard_result[endIndex] == True):
                    endIndex = endIndex + 1;
                
                while(startIndex < endIndex):
                    if(prediction_result[startIndex] == False):
                        break;
                    else:
                        startIndex = startIndex + 1;
               
                if(startIndex <= endIndex):
                    true_positive = true_positive + 1;
                i = endIndex;
            elif(gold_standard_result[i] == True and prediction_result[i] == False):
                i = i + 1;
                false_negative = false_negative + 1;       
            elif(gold_standard_result[i] == False and prediction_result[i] == True):
                i += 1;
                false_positive = false_positive + 1;
            else:
                i += 1;
        precision = true_positive/(true_positive+false_positive);
        recall = true_positive/(true_positive+false_negative);
        f1_score = (2*precision*recall)/(precision+recall);
        print "TP: " + repr(true_positive);
        print "FP:" + repr(false_positive);
        print "TN:" + repr(true_negative);
        print "FN:" + repr(false_negative);
        print "Precision: " + repr(precision);
        print "Recall:" + repr(recall);
        print "F1-Score:" + repr(f1_score);
    else:
        print "Error";
    return; 

In [8]:
gold_standard_result=gold_standard() 
prediction_result=ner()
improved_prediction_result = improved_ner();

In [9]:
print len(gold_standard_result);
print len(prediction_result);
print len(improved_prediction_result);

46666
46666
46666


In [11]:
print "Precision, Recall & F1-Score from original NER:"
evaluate(gold_standard_result, prediction_result)
print "\nPrecision, Recall & F1-Score from improved NER:"
evaluate(gold_standard_result, improved_prediction_result)

Precision, Recall & F1-Score from original NER:
Precision: 0.7890855457227138
Recall:0.7914201183431953
F1-Score:0.7902511078286558

Precision, Recall & F1-Score from improved NER:
Precision: 0.8682933228387774
Recall:0.8159516765285996
F1-Score:0.8413091833492214


In [14]:
#Problem 2: Strict measure
evaluate_strict(gold_standard_result, improved_prediction_result);

TP: 4230.0
FP:1004.0
TN:0.0
FN:509.0
Precision: 0.8081773022544899
Recall:0.8925933741295632
F1-Score:0.8482903840368997


In [13]:
#Problem 2: Lenient measure
evaluate_lenient(gold_standard_result, improved_prediction_result);

TP: 4865.0
FP:1004.0
TN:0.0
FN:991.0
Precision: 0.8289316749020276
Recall:0.8307718579234973
F1-Score:0.8298507462686566
