In [89]:
import ast
import os

import gensim
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [2]:
# Location of the SemEval-PIT2015 checkout, relative to the current working directory.
semeval_path = './SemEval-PIT2015-py3'

In [3]:
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [5]:
def split_tags(string, n_fields=5):
    """Parse a whitespace-separated string of slash-delimited tagged tokens.

    Each token is split from the right into at most ``n_fields`` parts, so a
    word that itself contains ``/`` (for example ``1/2``) stays intact in
    position 0 instead of shifting every following annotation field.

    Args:
        string: Tagged sentence, one ``word/tag/.../tag`` token per
            whitespace-separated chunk.
        n_fields: Number of fields per token (the word plus its tags).

    Returns:
        A list with one tuple of strings per token; empty for empty input.
    """
    # rsplit keeps any extra "/" characters inside the leading word field.
    return [tuple(token.rsplit("/", n_fields - 1)) for token in string.split()]

def readTrainData(filename):
    """Read labelled sentence pairs from a training/dev data file.

    Only lines with exactly seven tab-separated fields are used. The fifth
    field (``judge``) is parsed as a Python literal and its first element is
    taken as the number of "yes" votes.

    Args:
        filename: Path of the tab-separated data file.

    Returns:
        A list of ``(orig_tokens, cand_tokens, label)`` tuples, where the
        token lists come from ``split_tags`` and ``label`` is ``True`` for
        three or more "yes" votes and ``False`` for one or fewer. Pairs with
        exactly two "yes" votes are dropped.
    """
    data = []
    # The context manager closes the file even if parsing raises.
    with open(filename) as infile:
        for line in infile:
            fields = line.strip().split('\t')
            if len(fields) != 7:
                continue
            judge, origsenttag, candsenttag = fields[4:]
            # literal_eval parses the vote literal without executing
            # arbitrary code from the data file (unlike eval).
            nYes = ast.literal_eval(judge)[0]
            if nYes >= 3:
                amt_label = True
            elif nYes <= 1:
                amt_label = False
            else:
                # Middle label: neither clearly positive nor negative.
                continue
            data.append((split_tags(origsenttag), split_tags(candsenttag), amt_label))
    return data

def readTestData(filename):
    """Read labelled sentence pairs from a test data file.

    Only lines with exactly seven tab-separated fields are used. The first
    character of the fifth field (``judge``) is converted to an integer score.

    Args:
        filename: Path of the tab-separated data file.

    Returns:
        A list of ``(orig_tokens, cand_tokens, label)`` tuples, where the
        token lists come from ``split_tags`` and ``label`` is ``True`` for a
        score of four or more and ``False`` for two or less. Pairs scored
        exactly three are dropped.
    """
    data = []
    # The context manager closes the file even if parsing raises.
    with open(filename) as infile:
        for line in infile:
            fields = line.strip().split('\t')
            if len(fields) != 7:
                continue
            judge, origsenttag, candsenttag = fields[4:]
            # Only the first character of the judge field is read.
            nYes = int(judge[0])
            if nYes >= 4:
                expert_label = True
            elif nYes <= 2:
                expert_label = False
            else:
                # Middle score: skip the pair.
                continue
            data.append((split_tags(origsenttag), split_tags(candsenttag), expert_label))
    return data

# Build both paths from semeval_path so the dataset location is configured in one place.
# NOTE(review): the training examples are read from dev.data - confirm this is intentional.
train_data = readTrainData(os.path.join(semeval_path, "data", "dev.data"))
test_data = readTestData(os.path.join(semeval_path, "data", "test.data"))

In [6]:
# Inspect one parsed example: (tokens of the first tweet, tokens of the second tweet, label).
test_data[0]

([('All', 'O', 'DT', 'B-NP', 'O'),
  ('the', 'O', 'DT', 'I-NP', 'O'),
  ('home', 'O', 'NN', 'I-NP', 'O'),
  ('alones', 'O', 'VBZ', 'B-VP', 'O'),
  ('watching', 'O', 'VBG', 'I-VP', 'B-EVENT'),
  ('8', 'O', 'CD', 'B-NP', 'O'),
  ('mile', 'O', 'NN', 'I-NP', 'O')],
 [('The', 'O', 'DT', 'B-NP', 'O'),
  ('last', 'O', 'JJ', 'I-NP', 'O'),
  ('rap', 'O', 'NN', 'I-NP', 'B-EVENT'),
  ('battle', 'O', 'NN', 'I-NP', 'B-EVENT'),
  ('in', 'O', 'IN', 'B-PP', 'O'),
  ('8', 'O', 'CD', 'B-NP', 'O'),
  ('Mile', 'O', 'NNP', 'I-NP', 'O'),
  ('nevr', 'O', 'NN', 'I-NP', 'O'),
  ('gets', 'O', 'VBZ', 'B-VP', 'O'),
  ('old', 'O', 'JJ', 'B-NP', 'O'),
  ('ahah', 'O', 'JJ', 'I-NP', 'O')],
 False)

In [103]:
# Memoises WordNet lookups. Keys are (word, max_synsets, include_hypernyms) so
# results computed under different settings never get mixed up.
wordnet_cache = {}


def get_wordnet_features(tweet, max_synsets=4, include_hypernyms=False):
    """Collect WordNet lemma names for the non-stop-word tokens of a tweet.

    Args:
        tweet: Iterable of token tuples; element 0 of each tuple is the word.
        max_synsets: Only the first ``max_synsets`` synsets returned by
            WordNet for a word contribute lemma names.
        include_hypernyms: When true, lemma names of each used synset's
            hypernyms are added as well (the "synonyms + hypernyms" variant).

    Returns:
        A set of lemma-name strings; empty when the tweet has no usable words.
    """
    feature_set = set()

    for item in tweet:
        word = item[0].lower()
        if word in stop_words:
            continue
        cache_key = (word, max_synsets, include_hypernyms)
        lemmas = wordnet_cache.get(cache_key)
        if lemmas is None:
            lemmas = set()
            for syn in wordnet.synsets(word)[:max_synsets]:
                lemmas.update(syn.lemma_names())
                if include_hypernyms:
                    for hyp in syn.hypernyms():
                        lemmas.update(hyp.lemma_names())
            # Empty results are cached too, so unknown words are looked up once.
            wordnet_cache[cache_key] = lemmas
        feature_set.update(lemmas)
    return feature_set

In [104]:
def _content_tokens(tweet):
    """Return the lower-cased words of a tweet with stop words removed."""
    lowered = (token[0].lower() for token in tweet)
    return [word for word in lowered if word not in stop_words]


def get_vec_similarity(tweet1, tweet2, missing_value=0.0):
    """Return ``model.wmdistance`` between the content words of two tweets.

    Despite the name, the value is a distance: smaller means more similar.
    The function reads the module-level ``model``, which must be loaded
    before the first call.

    Args:
        tweet1: Iterable of token tuples; element 0 of each tuple is the word.
        tweet2: Same structure as ``tweet1``.
        missing_value: Value returned when the distance is infinite.
            NOTE(review): the default of 0.0 is the historical behaviour, but
            it is also the distance of identical texts, so pairs that could
            not be compared look like perfect matches - confirm this is wanted.

    Returns:
        The distance reported by the model, or ``missing_value`` when that
        distance is ``inf``.
    """
    distance = model.wmdistance(_content_tokens(tweet1), _content_tokens(tweet2))
    # An infinite value cannot be fed to the scaler/classifier downstream.
    if distance == float("inf"):
        return missing_value
    return distance

In [105]:
def extract_features(data):
    """Turn labelled tweet pairs into feature dicts and a parallel label list.

    Args:
        data: Iterable of ``(tweet1, tweet2, label)`` triples.

    Returns:
        ``(features, labels)``. Each feature dict has ``'sim'`` (how many
        WordNet features the two tweets share) and ``'vec_sim'`` (the value
        from ``get_vec_similarity``); ``labels`` holds the third element of
        each triple in the same order.
    """
    features = []
    labels = []
    for first, second, is_paraphrase in data:
        shared = get_wordnet_features(first) & get_wordnet_features(second)
        features.append({
            'sim': len(shared),
            'vec_sim': get_vec_similarity(first, second),
        })
        labels.append(is_paraphrase)
    return features, labels

In [106]:
# Feature dicts and labels for the training pairs.
# NOTE: extract_features calls get_vec_similarity, which needs the global `model` to be loaded already.
x_train, y_train = extract_features(train_data)

In [107]:
# Feature dicts and labels for the test pairs.
x_test, y_test = extract_features(test_data)

In [108]:
# Feature dicts -> feature matrix -> variance scaling -> logistic regression.
# with_mean=False scales without centring, which is required for the sparse
# matrix DictVectorizer produces by default.
# random_state pins the stochastic 'sag' solver so repeated fits give the same model.
clf = Pipeline([
    ('vectorizer', DictVectorizer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('logistregress', LogisticRegression(solver='sag', random_state=42)),
])

### Synonyms

In [48]:
# Work from inside the SemEval checkout: the output files, evaluation script and
# word-vector file used below are all given relative to it.
# NOTE(review): semeval_path is relative, so re-running this cell after the
# directory change has already happened will presumably fail - restart the kernel instead.
os.chdir(semeval_path)

In [93]:
filename_s = 'systemoutputs/PIT2015_KOSTROV_05_SYNONYMS.output'
clf.fit(x_train, y_train)
predicted_probability = clf.predict_proba(x_test)
output = [(prob_true > prob_false, prob_true if prob_true > prob_false else prob_false) for [prob_false, prob_true] in predicted_probability]
with open(filename_s , 'a+') as f:
    f.truncate(0)
    for line in output:
        f.write(str(line[0]).lower() + '\t' + str(line[1]) + '\n')
!python scripts/pit2015_eval_single.py data/test.label $filename_s

838	KOSTROV	05_SYNONYMS		F: 0.516	Prec: 0.565	Rec: 0.474		P-corr: -0.095	F1: 0.346	Prec: 0.209	Rec: 1.000


### Synonyms + hypernyms

In [98]:
filename_s = 'systemoutputs/PIT2015_KOSTROV_05_HYPERNYMS.output'
clf.fit(x_train, y_train)
predicted_probability = clf.predict_proba(x_test)
output = [(prob_true > prob_false, prob_true if prob_true > prob_false else prob_false) for [prob_false, prob_true] in predicted_probability]
with open(filename_s , 'a+') as f:
    f.truncate(0)
    for line in output:
        f.write(str(line[0]).lower() + '\t' + str(line[1]) + '\n')
!python scripts/pit2015_eval_single.py data/test.label $filename_s

838	KOSTROV	05_HYPERNYMS		F: 0.478	Prec: 0.483	Rec: 0.474		P-corr: -0.044	F1: 0.348	Prec: 0.213	Rec: 0.949


### Synonyms + vector similarity

In [None]:
# Load the word vectors from a word2vec-format file; the relative path is
# resolved against the current working directory.
# `model` is read as a global by get_vec_similarity, so it must exist before
# features are extracted.
model = gensim.models.KeyedVectors.load_word2vec_format('data/wiki-news-300d-1M-subword.vec')

In [109]:
filename_s = 'systemoutputs/PIT2015_KOSTROV_05_VECTOR.output'
clf.fit(x_train, y_train)
predicted_probability = clf.predict_proba(x_test)
output = [(prob_true > prob_false, prob_true if prob_true > prob_false else prob_false) for [prob_false, prob_true] in predicted_probability]
with open(filename_s , 'a+') as f:
    f.truncate(0)
    for line in output:
        f.write(str(line[0]).lower() + '\t' + str(line[1]) + '\n')
!python scripts/pit2015_eval_single.py data/test.label $filename_s

838	KOSTROV	05_VECTOR		F: 0.514	Prec: 0.579	Rec: 0.463		P-corr: -0.100	F1: 0.347	Prec: 0.210	Rec: 0.994
