In [3]:
import csv
import os
import sys
import os.path
from gensim.models import Word2Vec
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
import re , math
from nltk import word_tokenize


In [4]:
# Load the 300-d GloVe vectors as gensim KeyedVectors.  gensim reads the
# word2vec text format, so the raw GloVe file is converted first.
glove_input_file = 'glove.6B.300d.txt'
word2vec_output_file = 'glove.6B.300d.txt.word2vec'

# The conversion rewrites the entire embedding file, so reuse the result of an
# earlier run when it is already on disk.
# NOTE: delete the .word2vec file to force a fresh conversion (for example if
# an interrupted run left a partial file behind).
if not os.path.exists(word2vec_output_file):
    glove2word2vec(glove_input_file, word2vec_output_file)

filename = word2vec_output_file
model = KeyedVectors.load_word2vec_format(filename, binary=False)
print ('Done loading Glove embedding')

Done loading Glove embedding


In [19]:
def get_hypernyms(syn):

    """
    Input: a WordNet synset (any hashable object with a hypernyms() method)
    Output: list of all hypernyms of this synset in the WordNet hierarchy
    (direct and inherited), each listed once, in order of discovery
    """

    hypernyms = []
    seen = set()   # hash-based membership test instead of scanning the list
    cursor = 0     # index of the next collected synset whose parents are not yet expanded

    for hyp in syn.hypernyms():
        if hyp in seen:
            continue
        seen.add(hyp)
        hypernyms.append(hyp)

        # Everything before `cursor` already has all of its hypernyms in the
        # list, so only the entries added since the last pass are expanded.
        # The list grows while it is walked, which yields the full closure.
        while cursor < len(hypernyms):
            for parent in hypernyms[cursor].hypernyms():
                if parent not in seen:
                    seen.add(parent)
                    hypernyms.append(parent)
            cursor += 1

    return hypernyms

In [20]:
def get_all_hypernyms(word):

    """
    Input: word (str)
    Output: list of all synsets containing the word and all of their hypernym
    synsets.  Each synset appears once; every synset of the word is followed
    by those of its hypernyms that were not already collected.
    """

    all_syns_hypernyms = []
    seen = set()   # synsets already collected (they are hashable)

    for syn in wn.synsets(word):
        # get_hypernyms() returns the complete (transitive) hypernym list of
        # `syn`, so no further walking of the hierarchy is needed here.
        for candidate in [syn] + get_hypernyms(syn):
            if candidate not in seen:
                seen.add(candidate)
                all_syns_hypernyms.append(candidate)

    return all_syns_hypernyms


In [21]:
def get_all_definitions(word):

    """
    Input: word (str)
    Output: dict mapping each synset of the word (and each of their hypernym
    synsets) to its WordNet definition, tokenized and with English stopwords
    removed (list of str)
    """

    # The stop list does not change between tokens: load it once and keep it
    # in a set so each token costs a single hash lookup.
    stop_words = set(stopwords.words('english'))

    all_definitions = dict()

    for syn in get_all_hypernyms(word):
        def_tokenized = word_tokenize(syn.definition())
        # Exclude stopwords (tokens are compared as-is, without lowercasing)
        all_definitions[syn] = [token for token in def_tokenized if token not in stop_words]

    return all_definitions

In [22]:
def unit_vector(vector):

    """
    Scale a vector to unit (Euclidean) length
    Input: vector (1-D numpy array or sequence of numbers)
    Output: unit vector (numpy array).  A vector of magnitude 0 has no
    direction; it is returned as an all-zero array instead of producing
    NaNs or a ZeroDivisionError.
    """

    vec = np.asarray(vector)
    mag = np.linalg.norm(vec)

    if mag == 0:
        return np.zeros_like(vec, dtype=float)

    return vec / mag


In [23]:
def sim_wv(wv1, wv2, model):

    """
    Calculate cosine similarity of two words, each given either as a string
    (looked up in the model) or directly as a vector
    Input: word1 (str or numpy array), word2 (str or numpy array), model
    (w2v model, indexed as model[word])
    Output: cosine similarity between the vectors (float)
    Note: no vocabulary check is done here; callers must make sure that string
    arguments are present in the model before calling.
    """

    # isinstance (rather than type() ==) so that str subclasses such as
    # numpy.str_ are looked up in the model as well.
    vec1 = model[wv1] if isinstance(wv1, str) else wv1
    vec2 = model[wv2] if isinstance(wv2, str) else wv2

    # Normalize both vectors so that their dot product is the cosine
    vec1_unit = unit_vector(vec1)
    vec2_unit = unit_vector(vec2)

    return np.dot(vec1_unit, vec2_unit)


In [24]:
def sim_definition(prop, definition, model):

    """
    Highest cosine similarity between the property and any token of one
    definition
    Input: prop (str), definition (list of str tokens), model (word-vector
    model supporting `word in model` and `model[word]`)
    Output: maximum similarity (float).  Tokens missing from the model count
    as 0.0; the result is 0.0 when the definition is empty or prop itself is
    not in the model.
    """

    # Membership is tested on the model itself: that works for gensim
    # KeyedVectors both before and after the `.vocab` attribute was removed.
    if prop not in model:
        return 0.0

    sims = [
        sim_wv(word, prop, model) if word in model else 0.0
        for word in definition
    ]

    return max(sims, default=0.0)

In [25]:
def get_highest_def_sim(prop, definition_dict, model):

    """
    Find the definition that is most similar to the property
    Input: prop (str), definition_dict (dict: synset -> list of tokens),
    model (w2v model)
    Output: tuple (similarity, synset) with the largest value, or (0.0, '-')
    when definition_dict is empty
    """

    scored = [
        (float(sim_definition(prop, tokens, model)), synset)
        for synset, tokens in definition_dict.items()
    ]

    if not scored:
        return (0.0, '-')

    # Tuples are compared element-wise: similarity first, then the synset
    return max(scored)

In [26]:
def sim_def_check(concept1, concept2, prop, threshold1, threshold2, model):

    """
    Decide whether prop discriminates concept1 from concept2 by comparing prop
    with the words of their WordNet definitions (embedding similarity)
    Input: concept1 (str), concept2 (str), prop (str), threshold1 (float,
    a similarity above it counts as high), threshold2 (float, a similarity
    below it counts as low), model (w2v model)
    Output: dict with 'system' ('def_sim') and 'answer' ('1', '0' or 'None')
    """

    def_dict1 = get_all_definitions(concept1)
    def_dict2 = get_all_definitions(concept2)

    sim1, syn1 = get_highest_def_sim(prop, def_dict1, model)
    sim2, syn2 = get_highest_def_sim(prop, def_dict2, model)

    high1, high2 = sim1 > threshold1, sim2 > threshold1
    low1, low2 = sim1 < threshold2, sim2 < threshold2

    if high1 and low2:
        # prop is close to concept1 only
        answer = '1'
    elif (high2 and low1) or (low1 and low2) or (high1 and high2):
        # close to concept2 only, to neither, or to both
        answer = '0'
    else:
        # similarities between (or equal to) the thresholds: no decision
        answer = 'None'

    return {'system': 'def_sim', 'answer': answer}


In [27]:
def direct_def_check(concept1, concept2, prop):

    """
    Decide whether prop discriminates concept1 from concept2 by looking for
    prop literally among the definition tokens of each concept
    Input: concept1 (str), concept2 (str), prop (str)
    Output: dict with 'answer' ('1', '0' or 'None') and 'system' ('def')
    """

    def_dict1 = get_all_definitions(concept1)
    def_dict2 = get_all_definitions(concept2)

    in_first = any(prop in tokens for tokens in def_dict1.values())
    in_second = any(prop in tokens for tokens in def_dict2.values())

    if in_first and not in_second:
        answer = '1'
    elif in_second:
        # prop occurs for concept2 (with or without concept1)
        answer = '0'
    else:
        # prop occurs in neither set of definitions: no decision
        answer = 'None'

    return {'answer': answer, 'system': 'def'}

In [14]:
#create dictionary of words in vocab
# Maps every word of the GloVe file (first field of each line) to 1; it is
# only used for `word in vocab` membership tests.
vocab = {}
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open('glove.6B.300d.txt' , 'r', encoding='utf-8') as inpfile:
    for line in inpfile:
        # Only the leading token is needed; the vector values are not parsed.
        fields = line.split(None, 1)
        if fields:   # skip blank lines
            vocab[fields[0]] = 1
print('Done making vocab')

Done making vocab


In [28]:
def sim_check(concept1, concept2, prop, model):

    """
    Decide by direct embedding similarity: is prop closer to concept1 than to
    concept2?
    Input: concept1 (str), concept2 (str), prop (str), model (w2v model)
    Output: dict with 'system' ('sim') and 'answer' ('1' or '0'); the answer
    is '0' whenever one of the three words is missing from `vocab`
    """

    answer = '0'

    if all(word in vocab for word in (concept1, concept2, prop)):
        if sim_wv(concept1, prop, model) > sim_wv(concept2, prop, model):
            answer = '1'

    return {'system': 'sim', 'answer': answer}

In [32]:
import time

# Build the training set from rows of the form concept1,concept2,prop,label.
# Each usable row becomes one feature pair
#   [cos(concept1, prop), cos(concept2, prop)]
# plus its integer label.
label = []
data = []
total = 0   # rows read
cnt = 0     # rows skipped because one of the three words is not in `vocab`

# time.clock() no longer exists in Python 3.8+; perf_counter() is the
# replacement for measuring elapsed time.
st = time.perf_counter()
with open('original_train.txt', newline='') as train_file:
    reader = csv.reader(train_file)
    for row in reader:
        total += 1
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            cnt += 1
            continue

        concept1 = row[0]
        concept2 = row[1]
        prop = row[2]

        # cosine_similarity works on 2-D inputs (samples x features), so every
        # word vector is passed as a single-row matrix.
        prop_vec = model[prop].reshape(1, -1)
        data.append([
            cosine_similarity(model[concept1].reshape(1, -1), prop_vec)[0][0],
            cosine_similarity(model[concept2].reshape(1, -1), prop_vec)[0][0],
        ])
        label.append(int(row[3]))

print('Done')
print( time.perf_counter()-st)

Done
13.922207000000071


In [33]:
#train logistic regression model by using cosine values as feature vector
# `data` holds one [cos(concept1, prop), cos(concept2, prop)] row per training
# example and `label` the matching integer labels; fit() returns the fitted
# estimator itself.
logistic_regr_model_cosine_sim = LogisticRegression().fit(data, label)

print("Done fitting")

Done fitting


In [36]:
def _print_cascade_report(correct, seen, tp, fp, fn):
    """
    Print accuracy, precision, recall and F1 for the counts collected so far.
    Input: correct (number of correct predictions), seen (number of evaluated
    rows), tp / fp / fn (true-positive, false-positive, false-negative counts)
    Output: tuple (precision, recall)
    Any ratio whose denominator is zero is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    accuracy = correct / seen if seen else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1 = 2.0 * ((precision * recall) / (precision + recall))
    else:
        f1 = 0.0

    print (correct , seen , accuracy)
    print ('precision = ',  precision , ' recall = ' , recall )
    print ( 'F1 score = ' , f1 )
    print('done validating')
    print (correct , seen , accuracy)
    print('done validating')

    return precision, recall


# Validate a cascade of three systems on the validation rows
# (concept1,concept2,prop,label):
#   1. direct_def_check  - prop occurs literally in a definition
#   2. sim_def_check     - prop is similar to a definition word
#   3. logistic regression on the two cosine similarities (fallback)
# The first system that returns an answer other than 'None' decides.
total =0
cnt = 0
true_positive  = 0.0
true_negative  = 0.0
false_positive = 0.0
false_negative = 0.0
def_answer = None
def_sim_answer = None
sim_answer = None
with open('original_validation.txt', newline='') as validation_file:
    reader = csv.reader(validation_file)
    for row in reader :
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            continue
        total += 1

        concept1 = row[0]
        concept2 = row[1]
        prop = row[2]
        gold = int(row[3])

        def_decision_dict = direct_def_check(concept1, concept2, prop)
        def_answer = def_decision_dict['answer'] # '1' or '0' or 'None'

        if def_answer != 'None':
            predicted = int(def_answer)
        else:
            # The similarity-based definition check is only needed (and only
            # computed) when the literal check could not decide.
            # .75 / .23 are the upper and lower similarity thresholds.
            def_sim_decision_dict = sim_def_check(concept1, concept2, prop, .75, .23 , model)
            def_sim_answer = def_sim_decision_dict['answer'] # '1' or '0' or 'None'

            if def_sim_answer != 'None':
                predicted = int(def_sim_answer)
            else:
                # cosine_similarity and predict() both work on 2-D inputs
                # (samples x features): word vectors are passed as single-row
                # matrices and the feature pair as a one-row batch.
                prop_vec = model[prop].reshape(1, -1)
                features = [
                    cosine_similarity(model[concept1].reshape(1, -1), prop_vec)[0][0],
                    cosine_similarity(model[concept2].reshape(1, -1), prop_vec)[0][0],
                ]
                predicted = logistic_regr_model_cosine_sim.predict([features])[0]

        if predicted == gold:
            cnt += 1
        if gold == 1:
            if predicted == 1:
                true_positive += 1.0
            else:
                false_negative += 1.0
        else:
            if predicted == 1:
                false_positive += 1.0
            else:
                true_negative += 1.0

        # progress report every 25 evaluated rows
        if total % 25 == 0:
            _print_cascade_report(cnt, total, true_positive, false_positive, false_negative)

precision, recall = _print_cascade_report(cnt, total, true_positive, false_positive, false_negative)
        
        
    

15 25 0.6
precision =  0.5  recall =  0.5
F1 score =  0.5
done validating
15 25 0.6
done validating
29 50 0.58
precision =  0.5454545454545454  recall =  0.5217391304347826
F1 score =  0.5333333333333332
done validating
29 50 0.58
done validating
43 75 0.5733333333333334
precision =  0.5666666666666667  recall =  0.4722222222222222
F1 score =  0.5151515151515152
done validating
43 75 0.5733333333333334
done validating
58 100 0.58
precision =  0.5853658536585366  recall =  0.4897959183673469
F1 score =  0.5333333333333333
done validating
58 100 0.58
done validating
72 125 0.576
precision =  0.5818181818181818  recall =  0.5161290322580645
F1 score =  0.5470085470085471
done validating
72 125 0.576
done validating
90 150 0.6
precision =  0.6060606060606061  recall =  0.5405405405405406
F1 score =  0.5714285714285714
done validating
90 150 0.6
done validating
106 175 0.6057142857142858
precision =  0.6025641025641025  recall =  0.5529411764705883
F1 score =  0.5766871165644172
done valida

774 1250 0.6192
precision =  0.634020618556701  recall =  0.5838607594936709
F1 score =  0.6079077429983525
done validating
774 1250 0.6192
done validating
791 1275 0.6203921568627451
precision =  0.6380471380471381  recall =  0.5848765432098766
F1 score =  0.6103059581320451
done validating
791 1275 0.6203921568627451
done validating
807 1300 0.6207692307692307
precision =  0.6390728476821192  recall =  0.583963691376702
F1 score =  0.6102766798418974
done validating
807 1300 0.6207692307692307
done validating
822 1325 0.6203773584905661
precision =  0.6404494382022472  recall =  0.588495575221239
F1 score =  0.6133743274404304
done validating
822 1325 0.6203773584905661
done validating
842 1350 0.6237037037037036
precision =  0.6425196850393701  recall =  0.5921625544267054
F1 score =  0.6163141993957704
done validating
842 1350 0.6237037037037036
done validating
859 1375 0.6247272727272727
precision =  0.6449612403100775  recall =  0.5917496443812233
F1 score =  0.6172106824925816
d

1481 2400 0.6170833333333333
precision =  0.634011090573013  recall =  0.5674110835401158
F1 score =  0.5988651243998254
done validating
1481 2400 0.6170833333333333
done validating
1495 2425 0.6164948453608248
precision =  0.6335460346399271  recall =  0.5682747342600164
F1 score =  0.5991379310344829
done validating
1495 2425 0.6164948453608248
done validating
1508 2450 0.6155102040816327
precision =  0.6305329719963866  recall =  0.5670186839967506
F1 score =  0.5970915312232677
done validating
1508 2450 0.6155102040816327
done validating
1520 2475 0.6141414141414141
precision =  0.629695885509839  recall =  0.5654618473895582
F1 score =  0.595852729581041
done validating
1520 2475 0.6141414141414141
done validating
1536 2500 0.6144
precision =  0.6268788682581786  recall =  0.5667466027178257
F1 score =  0.5952980688497062
done validating
1536 2500 0.6144
done validating
1552 2525 0.6146534653465346
precision =  0.6272965879265092  recall =  0.567246835443038
F1 score =  0.59576235

In [None]:
#validate using logistic regression model using cosine values as features vector

def _print_regression_report(correct, seen, tp, fp, fn, closing):
    """
    Print accuracy, precision, recall and F1 for the counts collected so far,
    followed by the `closing` line.
    Input: correct (number of correct predictions), seen (number of evaluated
    rows), tp / fp / fn (true-positive, false-positive, false-negative
    counts), closing (str printed last)
    Output: tuple (precision, recall)
    Any ratio whose denominator is zero is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    accuracy = correct / seen if seen else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1 = 2.0 * ((precision * recall) / (precision + recall))
    else:
        f1 = 0.0

    print (correct , seen , accuracy)
    print ('precision = ',  precision , ' recall = ' , recall )
    print ( 'F1 score = ' , f1 )
    print('done validating')
    print (correct , seen , accuracy)
    print(closing)

    return precision, recall


total =0.0
cnt = 0.0
true_positive  = 0.0
true_negative  = 0.0
false_positive = 0.0
false_negative = 0.0
with open('original_validation.txt', newline='') as validation_file:
    reader = csv.reader(validation_file)
    for row in reader :
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            continue
        total += 1.0

        concept1 = row[0]
        concept2 = row[1]
        prop = row[2]
        gold = int(row[3])

        # The classifier is fit on two features per example,
        # [cos(concept1, prop), cos(concept2, prop)], so exactly those two are
        # computed here.  cosine_similarity and predict() both work on 2-D
        # inputs (samples x features): word vectors are passed as single-row
        # matrices and the feature pair as a one-row batch.
        prop_vec = model[prop].reshape(1, -1)
        features = [
            cosine_similarity(model[concept1].reshape(1, -1), prop_vec)[0][0],
            cosine_similarity(model[concept2].reshape(1, -1), prop_vec)[0][0],
        ]
        predicted_class = logistic_regr_model_cosine_sim.predict([features])

        if predicted_class[0] == gold:
            cnt += 1
        if gold == 1:
            if predicted_class[0] == 1:
                true_positive += 1.0
            else:
                false_negative += 1.0
        else:
            if predicted_class[0] == 1:
                false_positive += 1.0
            else:
                true_negative += 1.0

        # progress report every 100 evaluated rows
        if total % 100 == 0:
            _print_regression_report(cnt, total, true_positive, false_positive, false_negative, '\n\n')


precision, recall = _print_regression_report(cnt, total, true_positive, false_positive, false_negative, 'done validating')