In [4]:
import numpy as np
import pandas as pd
import string

from nltk import data, word_tokenize, pos_tag, ne_chunk, chunk
from nltk.corpus import wordnet, stopwords, treebank
from nltk.metrics import *
from nltk.tag import *
from nltk.tokenize import punkt

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout


In [6]:
# Constants
# Locations of the Stanford NER model and jar, relative to the notebook directory.
STANFORD_NER_PATH = './english.all.3class.distsim.crf.ser.gz'
STANFORD_JAR_PATH = './stanford-ner.jar'

# Fetch the tagged Treebank sentences once and slice every split from that view.
_treebank_sents = treebank.tagged_sents()

# Nested training prefixes of increasing size (first 500 ... first 3000 sentences).
train_500, train_1000, train_1500, train_2000, train_2500, train_3000 = (
    _treebank_sents[:size] for size in (500, 1000, 1500, 2000, 2500, 3000)
)

# Held-out evaluation sentences.
# NOTE(review): the largest training slice ends at index 2999 and this slice
# starts at index 3001, so the sentence at index 3000 is in neither split.
# Confirm whether `[3000:]` was intended.
test_data = _treebank_sents[3001:]

# Training sets ordered by size; the learning-curve cells iterate over this list.
TRAINING_DATA = [train_500, train_1000, train_1500, train_2000, train_2500, train_3000]


In [7]:
# Inspect the first held-out sentence: a list of (word, POS tag) tuples.
test_data[0]

[('In', 'IN'),
 ('early', 'RB'),
 ('trading', 'NN'),
 ('in', 'IN'),
 ('Tokyo', 'NNP'),
 ('Thursday', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('Nikkei', 'NNP'),
 ('index', 'NN'),
 ('fell', 'VBD'),
 ('63.79', 'CD'),
 ('points', 'NNS'),
 ('to', 'TO'),
 ('35500.64', 'CD'),
 ('.', '.')]

In [10]:
# Part A) Consider the Treebank corpus. Train HMM, TnT, perceptron and CRF models
# using the first 500, 1000, 1500, 2000, 2500 and 3000 sentences. Evaluate the
# resulting 24 models using the sentences from 3001 onwards.
# Provide a figure with four learning curves, one per model type (X=training set
# size; Y=accuracy). Which model would you select? Justify the answer.

In [11]:
def train_test_hmm(train, test):
    """Train a supervised HMM tagger on `train` and score it on `test`.

    Both arguments are sequences of tagged sentences. Their sizes are printed
    before training so the progress of a multi-run experiment is visible.

    Returns whatever the trained tagger's `evaluate(test)` reports.
    """
    # print() stringifies the length and inserts the separating space itself.
    print('Train data size: ', len(train))
    print('Test data size: ', len(test))
    trainer = HiddenMarkovModelTrainer()
    hmm_tagger = trainer.train_supervised(train)
    score = hmm_tagger.evaluate(test)
    return score

In [4]:
def train_test_tnt(train, test):
    """Train a TnT tagger on `train` and score it on `test`.

    Both arguments are sequences of tagged sentences. Their sizes are printed
    before training so the progress of a multi-run experiment is visible.

    Returns whatever the trained tagger's `evaluate(test)` reports.
    """
    # print() stringifies the length and inserts the separating space itself.
    print('Train data size: ', len(train))
    print('Test data size: ', len(test))
    tnt_tagger = TnT()
    tnt_tagger.train(train)
    score = tnt_tagger.evaluate(test)
    return score

In [5]:
def train_test_crf(train, test):
    """Train a CRF tagger on `train` and score it on `test`.

    Both arguments are sequences of tagged sentences. Their sizes are printed
    before training so the progress of a multi-run experiment is visible.
    The tagger is trained with 'crf_tagger_model' as its model file argument,
    so every call reuses that same name.

    Returns whatever the trained tagger's `evaluate(test)` reports.
    """
    # print() stringifies the length and inserts the separating space itself.
    print('Train data size: ', len(train))
    print('Test data size: ', len(test))
    model_file = 'crf_tagger_model'
    crf_tagger = CRFTagger()
    crf_tagger.train(train, model_file)
    score = crf_tagger.evaluate(test)
    return score

In [6]:
def train_test_pcp(train, test):
    """Train a perceptron tagger from scratch on `train` and score it on `test`.

    Both arguments are sequences of tagged sentences. Their sizes are printed
    before training so the progress of a multi-run experiment is visible.
    The tagger is created with `load=False` and then trained only on `train`.

    Returns whatever the trained tagger's `evaluate(test)` reports.
    """
    # print() stringifies the length and inserts the separating space itself.
    print('Train data size: ', len(train))
    print('Test data size: ', len(test))
    pcp_tagger = PerceptronTagger(load=False)
    pcp_tagger.train(train)
    score = pcp_tagger.evaluate(test)
    return score

In [12]:
def _run_learning_curve(label, train_and_test):
    """Announce the model, then score it once per training set in TRAINING_DATA."""
    print('Running ' + label)
    return [train_and_test(subset, test_data) for subset in TRAINING_DATA]


# One accuracy list per model type, ordered like TRAINING_DATA (smallest set first).
hmm = _run_learning_curve('hmm', train_test_hmm)
tnt = _run_learning_curve('tnt', train_test_tnt)
crf = _run_learning_curve('crf', train_test_crf)
pcp = _run_learning_curve('pcp', train_test_pcp)

Running hmm
Train data size:  500
Test data size:  913
Train data size:  1000
Test data size:  913
Train data size:  1500
Test data size:  913
Train data size:  2000
Test data size:  913
Train data size:  2500
Test data size:  913
Train data size:  3000
Test data size:  913
Running tnt
Train data size:  500
Test data size:  913
Train data size:  1000
Test data size:  913
Train data size:  1500
Test data size:  913
Train data size:  2000
Test data size:  913
Train data size:  2500
Test data size:  913
Train data size:  3000
Test data size:  913
Running crf
Train data size:  500
Test data size:  913
Train data size:  1000
Test data size:  913
Train data size:  1500
Test data size:  913
Train data size:  2000
Test data size:  913
Train data size:  2500
Test data size:  913
Train data size:  3000
Test data size:  913
Running pcp
Train data size:  500
Test data size:  913
Train data size:  1000
Test data size:  913
Train data size:  1500
Test data size:  913
Train data size:  2000
Test data

In [2]:
# Gather the per-model accuracy lists under their model names for plotting.
# NOTE: hmm, tnt, crf and pcp must already exist in the kernel; running this
# cell before the training cell raises NameError.
df = {'hmm': hmm, 'tnt': tnt, 'crf': crf, 'pcp': pcp}

NameError: name 'hmm' is not defined

In [18]:
# X-axis labels, one per training set, in the same order as the accuracy lists.
train_labels = ['train_500', 'train_1000', 'train_1500', 'train_2000', 'train_2500', 'train_3000']

# One trace per model. A plain list is built directly instead of growing a
# NumPy array with np.append: that re-copied the array on every iteration and
# pushed plotly trace objects through NumPy's array coercion, only for the
# result to be converted back to a list for Figure.
plot_data = [
    Scatter(x=list(train_labels), y=accuracies, name=model)
    for model, accuracies in df.items()
]
layout = Layout(
    xaxis=dict(title="Training set"),
    yaxis=dict(title="Accuracy")
)
fig = Figure(data=plot_data, layout=layout)
# Offline plot; the recorded output of this cell is the path of the written HTML file.
plot(fig)

# The plot shows that all the algorithms improve accuracy with larger training sets. 
# It also shows that the pcp and crf models have the highest accuracy.
# Finally, it shows that the hmm has the lowest accuracy, but that it improves significantly as the
# training size is increased.

'file:///Users/jnalexander/Projects/IHLT-lab7/temp-plot.html'

In [180]:
# B) Read the first three pairs of sentences of the training file within the
# evaluation framework of the project. Compute their similarities by considering
# the following approaches:
# - words and Jaccard coefficient (same as in Session 5)
# - words plus NEs and Jaccard coefficient
# Print the results. Do you think it could be relevant to use NEs to compute the
# similarity between two sentences? Justify the answer.

In [39]:
# Pretrained NLTK Punkt sentence detector for English, used by lines_to_sentences.
sent_detector = data.load('tokenizers/punkt/english.pickle')
# Stanford NER tagger built from the model and jar paths set in the constants cell;
# used by compare_sentences to tag the extracted words.
stanford_tagger = StanfordNERTagger(STANFORD_NER_PATH, STANFORD_JAR_PATH)

def lines_to_sentences(line):
    """Split one raw text line into sentences.

    Leading and trailing whitespace is stripped first, then the module-level
    `sent_detector` tokenizes the remainder. Returns its result unchanged.
    """
    trimmed = line.strip()
    sentences = sent_detector.tokenize(trimmed)
    return sentences

def remove_punctuation(line):
    """Return `line` with every character of `string.punctuation` deleted.

    All other characters, whitespace included, are kept exactly as they are.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return line.translate(deletion_table)

def extract_words(sent):
    """Tokenize `sent` and drop English stopwords.

    Parameters:
        sent: the sentence text to tokenize with `word_tokenize`.

    Returns:
        A list of the tokens, in their original order, that are not in NLTK's
        English stopword list. The comparison is an exact string match, so a
        token is removed only if it equals a stopword entry character for
        character (NOTE(review): capitalized forms such as a sentence-initial
        'The' presumably survive the filter; confirm whether that is intended).
    """
    words = word_tokenize(sent)
    # Load the stopword list once per call and use a set for O(1) membership;
    # previously the list was re-read and scanned linearly for every token.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def compare_sentences(sent_0, sent_1):
    """Compare two sentences by words and by NE-tagged words.

    Each sentence is stripped of punctuation and reduced to its non-stopword
    tokens. Two `jaccard_distance` values are then computed: one over the sets
    of tokens, and one over the sets of (token, NE tag) pairs produced by the
    module-level `stanford_tagger`. The compared sets are printed as a side effect.

    Returns:
        (word_jaccard, nes_jaccard). These are distances, not similarities:
        the more the two sets overlap, the smaller the value.
    """
    # Words: punctuation-free, stopword-filtered tokens of each sentence.
    tokens_a = extract_words(remove_punctuation(sent_0))
    tokens_b = extract_words(remove_punctuation(sent_1))
    vocab_a, vocab_b = set(tokens_a), set(tokens_b)
    print("\nWord set:\n", vocab_a, "\nand:\n", vocab_b)
    word_jaccard = jaccard_distance(vocab_a, vocab_b)

    # Words plus NEs: each token paired with the tag the Stanford tagger gives it.
    tagged_a = stanford_tagger.tag(tokens_a)
    tagged_b = stanford_tagger.tag(tokens_b)
    entities_a, entities_b = set(tagged_a), set(tagged_b)
    print("\nNEs set:\n", entities_a, "\nand:\n", entities_b)
    nes_jaccard = jaccard_distance(entities_a, entities_b)

    return word_jaccard, nes_jaccard

In [40]:
# The three raw lines, each expected to hold one pair of sentences.
lines = [msr_line_1, msr_line_2, msr_line_3]
# map() is lazy, so each line is split right before its own pair is compared.
for pair in map(lines_to_sentences, lines):
    # Only the first two detected sentences of a line are compared.
    first, second = pair[0], pair[1]
    print("\nComparing\n", first, "\nWith\n", second)
    word_jaccard, nes_jaccard = compare_sentences(first, second)
    print("Word_jaccard: ", word_jaccard)
    print("Nes_jaccard: ", nes_jaccard)


Comparing
 Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence. 
With
 Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.

Word set:
 {'deliberately', 'distorting', 'witness', 'brother', 'accused', 'evidence', 'Amrozi', 'called'} 
and:
 {'deliberately', 'distorting', 'witness', 'brother', 'Referring', 'accused', 'evidence', 'Amrozi'}

NEs set:
 {('witness', 'O'), ('Amrozi', 'PERSON'), ('evidence', 'O'), ('called', 'O'), ('brother', 'O'), ('deliberately', 'O'), ('accused', 'O'), ('distorting', 'O')} 
and:
 {('witness', 'O'), ('Referring', 'O'), ('Amrozi', 'PERSON'), ('evidence', 'O'), ('brother', 'O'), ('deliberately', 'O'), ('accused', 'O'), ('distorting', 'O')}
Word_jaccard:  0.2222222222222222
Nes_jaccard:  0.2222222222222222

Comparing
 Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion. 
With
 Yucaipa bought Dominick's in 1995 for $693 million a

In [22]:
# Do you think it could be relevant to use NEs to compute the
# similarity between two sentences? 

# The printed results show that NEs alone are not enough to distinguish between sentences:
# for example, a sentence that references a company named 'APPLE' will have the same 'ORGANIZATION'
# entity associated with it as a sentence that references a company named 'MICROSOFT'.
# However, when combined with words, NEs provide a better measure of similarity, as they allow
# a sentence such as 'I bought an APPLE at APPLE' to be distinguished from 'I bought APPLE at
# APPLE', for example.
