# Task B

This notebook explores how feature selection affects performance on Task B, using the pipeline runner to run the experiments with different classifiers.

`Preprocessor` is a module written to hold the functions `tokenise` and `extract_features`. Keeping them in a module means they can be compiled and reused for faster execution, and it keeps the notebook clearer.

`PipelineRunner` is a module written for this exercise. It wraps training-set extraction, 10-fold cross-validation, and testing in functions for ease of use.

First we compile our modules Preprocessor.py and PipelineRunner.py

In [1]:
from functools import partial

import PipelineRunner
import Preprocessor

# `reload` is a builtin only on Python 2. On Python 3.4+ it lives in
# importlib, so import it from there when available and otherwise fall back
# to the Python 2 builtin. This keeps the cell runnable on both.
try:
    from importlib import reload
except ImportError:
    pass

# Re-import the project modules so that edits made to the .py files are
# picked up without restarting the kernel.
reload(PipelineRunner)
reload(Preprocessor)

# Tokeniser used by the rest of the notebook, with lemmatization switched on.
tokenise = partial(Preprocessor.tokenise, lemmatization=True)

# Build the lexicon / word-embedding transformers once, up front, so the
# feature pipeline defined later reuses these instances (faster startup).
lexicon_liuhu = Preprocessor.lexicon_liuhu()
lexicon_emoticon = Preprocessor.lexicon_emoticon()
lexicon_NRC_unigram = Preprocessor.lexicon_NRC_unigram()
lexicon_NRC_bigram = Preprocessor.lexicon_NRC_bigram()
WE_GloVe_Twitter = Preprocessor.WE_GloVe_Twitter()

In [2]:
import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.decomposition import TruncatedSVD

# Branches of the FeatureUnion, as (name, transformer) pairs.
# Comment a branch out to try a different combination, and remember to edit
# `weights` so that it matches the branches that are left enabled.

def _tfidf_branch(extractor):
    """Chain `extractor` -> DictVectorizer -> TfidfTransformer in a Pipeline."""
    return Pipeline([
        ('ngram', extractor),
        ('dict', DictVectorizer()),
        ('tfidf', TfidfTransformer()),
    ])


def _normalised_branch(steps, scaler):
    """Return a Pipeline made of `steps` followed by a final 'normalise' step."""
    return Pipeline(list(steps) + [('normalise', scaler)])


feature_list = [
    # --- n-gram TF-IDF branches ---
    ('unigram_tfidf', _tfidf_branch(
        Preprocessor.extract_ngram(ngram_range=(1,1), hashing=True))),
    ('kskip_bigram_tfidf', _tfidf_branch(
        Preprocessor.extract_kskip_bigram(skip=1, hashing=True))),
    ('trigram_tfidf', _tfidf_branch(
        Preprocessor.extract_ngram(ngram_range=(3,3), hashing=True))),

    # --- lexicon branches ---
    # sentiwordnet is slow to run and does not improve much, so it stays
    # commented out:
    # ('lexi_sentiwordnet', _normalised_branch(
    #     [('swn', Preprocessor.lexicon_sentiwordnet(feature='score'))],
    #     preprocessing.MinMaxScaler())),
    ('lexi_liuhu', _normalised_branch(
        [('liuhu', lexicon_liuhu)],
        preprocessing.Normalizer())),
    ('lexi_emoticon', _normalised_branch(
        [('emoticon', lexicon_emoticon)],
        preprocessing.Normalizer())),
    ('lexicon_NRC_unigram', _normalised_branch(
        [('nrc1', lexicon_NRC_unigram)],
        preprocessing.Normalizer())),
    # The NRC bigram lexicon is fed by its own skip-bigram extractor.
    ('lexicon_NRC_bigram', _normalised_branch(
        [('bigram', Preprocessor.extract_kskip_bigram(skip=3)),
         ('nrc2', lexicon_NRC_bigram)],
        preprocessing.Normalizer())),

    # --- word-embedding and tweet-related branches ---
    ('WE_GloVe', _normalised_branch(
        [('glove', WE_GloVe_Twitter)],
        preprocessing.MinMaxScaler())),
    ('related', _normalised_branch(
        [('trl', Preprocessor.extract_tweeter_related())],
        preprocessing.MinMaxScaler())),
]

# Per-branch weights; each key must match a branch name in feature_list.
# A branch that has no entry here defaults to a weight of 1.0.
weights = dict(
    unigram_tfidf=1.0,
    kskip_bigram_tfidf=0.4,
    trigram_tfidf=0.5,
    # lexi_sentiwordnet=0.3,   # re-enable together with its branch
    lexi_liuhu=0.5,
    lexi_emoticon=0.2,
    lexicon_NRC_unigram=0.2,
    lexicon_NRC_bigram=0.2,
    WE_GloVe=0.4,
    related=0.2,
)

# Seed for the classifier's random number generator. SGD training is
# stochastic, so without a fixed seed every run of the notebook reports
# slightly different scores and experiments cannot be compared reliably.
RANDOM_SEED = 0

# Classifier step as a (name, estimator) pair, appended to the Pipeline.
clf = ('SGD', SGDClassifier(n_iter=50, average=10, random_state=RANDOM_SEED))
# Alternative classifier to try instead:
# clf = ('SVC', SVC(kernel='linear'))

# Weighted union of all feature branches, followed by the classifier step.
feature_union = FeatureUnion(transformer_list=feature_list,
                             transformer_weights=weights)
pipeline = Pipeline([('features', feature_union), clf])

# Run the experiments through our own PipelineRunner and unpack what it
# returns: the pipeline plus a score, gold labels and predictions for each
# of the two tests.
# NOTE(review): going by the notebook text, tf_* come from the 10-fold
# cross-validation and dev_* from the dev-set run -- confirm in PipelineRunner.
run_outputs = PipelineRunner.runAllTaskB(pipeline, tokenise)
(pipeline,
 tf_mfs, tf_gold, tf_result,
 dev_mfs, dev_gold, dev_result) = run_outputs


>>>Start 10 fold validation:
test range: [0, 798] accuracy: 0.670838548185
test range: [799, 1597] accuracy: 0.730913642053
test range: [1598, 2396] accuracy: 0.742177722153
test range: [2397, 3195] accuracy: 0.73717146433
test range: [3196, 3994] accuracy: 0.743429286608
test range: [3995, 4793] accuracy: 0.734668335419
test range: [4794, 5591] accuracy: 0.689223057644
test range: [5592, 6389] accuracy: 0.74686716792
test range: [6390, 7187] accuracy: 0.709273182957
test range: [7188, 7985] accuracy: 0.713032581454
         |    n         p |
         |    e    n    o |
         |    g    e    s |
         |    a    u    i |
         |    t    t    t |
         |    i    r    i |
         |    v    a    v |
         |    e    l    e |
---------+----------------+
negative | <552> 434  165 |
 neutral |  198<3101> 563 |
positive |  116  746<2111>|
---------+----------------+
(row = reference; col = test)

positive f-measure: 0.726428
neutral f-measure: 0.761636
negative f-measure: 0.547

### Macro F-score of positive and negative from the two tests
The PipelineRunner calculates the macro F-measure as:  
(f-score_for_positive + f-score_for_negative)/2  

From the above experiment we get two macro F-scores that we can use to evaluate performance. Both are computed on unseen data. The second score, measured on the dev set, might be the more realistic one because the model is trained on the entire training set and tested on an unseen dev set. It is very similar to the score we got from 10-fold cross-validation, only slightly lower.

We can average the two scores from the two experiments so that it is easier to compare different settings and classifiers. (Note that this average is NOT the macro F-score itself, which is obtained by averaging the positive and negative classes.)

In [3]:
# Average the two scores so that different settings and classifiers can be
# compared with a single number.
mean_mfs = (tf_mfs + dev_mfs) / 2
mean_mfs

0.647364476688519

## Inspect 
The F-score for the negative class is relatively low.
Let's inspect the incorrectly classified tweets for the negative class in the dev set.
We find the false-negative and false-positive indexes and look them up in the file.

In [4]:
# Collect the dev-set indexes where the 'negative' class was misclassified.

# False negatives: the gold label is 'negative' but the prediction differs.
FN_Neg = [idx for idx, gold in enumerate(dev_gold)
          if gold == 'negative' and gold != dev_result[idx]]

# False positives: the prediction is 'negative' but the gold label differs.
FP_Neg = [idx for idx, gold in enumerate(dev_gold)
          if dev_result[idx] == 'negative' and gold != dev_result[idx]]

# Look up the false-negative tweets in the dev gold file.
FN_Tweets = PipelineRunner.inspectTweetsB(
    PipelineRunner.twitter_dev_gold_B_path, FN_Neg)

In [5]:
# Show each false negative as (index, gold label, predicted label, tweet text).
[(idx, gold, dev_result[idx], text) for idx, gold, text in FN_Tweets]

[(5,
  'negative',
  'positive',
  "@jacquelinemegan I'm sorry, I Heart Paris is no longer available at the Rockwell branch! You may call 8587000 to get a copy transferred! :)\n"),
 (14,
  'negative',
  'neutral',
  '@prodnose is this one of your little jokes like Elvis playing at the Marquee  next Tuesday?\n'),
 (15,
  'negative',
  'neutral',
  'Gold edges down ahead of US jobs data: SINGAPORE (Reuters) - Gold edged lower on Friday, with investors waiting for... http://t.co/CiqFona1\n'),
 (18,
  'negative',
  'neutral',
  'Monday before I leave Singapore, I am going to post something that might be offensive.\n'),
 (23,
  'negative',
  'neutral',
  "'Love-cheat' Daniel Radcliffe splits with girlfriend Rosie Coker: London, Oct 19: Daniel Radcliffe has split wit... http://t.co/ZVlsK2HQ\n"),
 (44,
  'negative',
  'positive',
  "@JoshNorris @Rotoworld_Draft I'd be pretty mad if the Packers took Bernard in the 1st just bc, Cooper/Eifert would be better IMO.\n"),
 (61,
  'negative',
  'neut

In [6]:
# Same listing for the false positives: look the tweets up in the dev gold
# file, then show (index, gold label, predicted label, tweet text).
FP_Tweets = PipelineRunner.inspectTweetsB(
    PipelineRunner.twitter_dev_gold_B_path, FP_Neg)
[(idx, gold, dev_result[idx], text) for idx, gold, text in FP_Tweets]

[(26,
  'positive',
  'negative',
  "@KevOrf_5 Yeah I think so. We saw Suarez score up near us and we played pretty well 2nd half so it wasn't so bad. Probably should've had ET\n"),
 (27,
  'neutral',
  'negative',
  'I may exit off twitter and fb  and thug with instagram btw its blonde_lifestyle:insta\n'),
 (57,
  'neutral',
  'negative',
  "@CoachVac heey do you know anything about UVA's fallll fest loll they invited me so im going this sat but i really dont know what it is loll\n"),
 (79,
  'neutral',
  'negative',
  "My Pain may be the reason for somebody's laugh. But my laugh must never be the reason for somebody's pain - Charlie Ch http://t.co/iw1fy2wo\n"),
 (130,
  'neutral',
  'negative',
  'Suarez is 1 YC away from a domestic suspension. If he picks up a YC this Sunday vs Newcastle, then he will miss the clash at SB next w/end.\n'),
 (179,
  'neutral',
  'negative',
  'Guarantee if I go to maths tomorrow ill spend the whole time talking\n'),
 (188,
  'neutral',
  'negative',
 


## Run Classifier on test set and write to result file

In [7]:
# Predict a label for every tweet in the test set, then write the results.
test_set = PipelineRunner.getTrainingSetB(PipelineRunner.twitter_test_B_path, tokenise)
test_tokens = test_set['tokens']
result = list(pipeline.predict(test_tokens))

# There must be exactly one prediction per tokenised tweet.
assert len(result) == len(test_tokens)

with open('result/test-B-final.txt', 'w') as resultfile:
    rows_written = 0
    with open(PipelineRunner.twitter_test_B_path) as tsvfile:
        for row_index, raw_line in enumerate(tsvfile):
            fields = raw_line.strip().split('\t')
            # Keep the first two columns of the input row and append the
            # predicted label as the third column.
            output_fields = fields[0:2] + [result[row_index]]
            resultfile.write('\t'.join(output_fields) + '\n')
            rows_written = row_index + 1
    # Every prediction must have been paired with exactly one input line.
    assert len(result) == rows_written