In [1]:
import csv
import os
import sys
import os.path
from gensim.models import Word2Vec
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from gensim.scripts.glove2word2vec import glove2word2vec


In [51]:
#create dictionary of words in vocab
# Only the first whitespace-separated field of every line (the token) is kept,
# so later cells can test membership with `word in vocab`.
vocab = {}
# The encoding is stated explicitly so reading the embedding file does not
# depend on the platform's default locale encoding.
with open('glove.6B.300d.txt', 'r', encoding='utf-8') as inpfile:
    for line in inpfile:
        fields = line.split()
        if not fields:
            # A blank line has no token; indexing fields[0] would raise IndexError.
            continue
        vocab[fields[0]] = 1
print('Done making vocab')

#load word2vec embedding
# The GloVe text file is first converted to word2vec text format and the
# converted file is then loaded as KeyedVectors into `model`.
glove_input_file = 'glove.6B.300d.txt'
word2vec_output_file = 'glove.6B.300d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)
filename = 'glove.6B.300d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
print ('Done loading embedding')

Done making vocab
Done loading embedding


In [52]:
#function returns cosine similarity between given words
def calculate_cosine_similartiy(word1 , word2):
    """Return the cosine similarity between the embeddings of two words.

    Both words are looked up in the module-level ``model``. Each vector is
    reshaped into a single-row 2-D array before being handed to
    ``cosine_similarity``, and the only entry of the resulting matrix is
    returned.
    """
    row_a = np.reshape(model[word1], (1, -1))
    row_b = np.reshape(model[word2], (1, -1))
    return cosine_similarity(row_a, row_b)[0][0]
    

In [53]:
#convert training data in representational format
# Each CSV row is read as (word1, word2, attribute, label). Rows with any word
# missing from `vocab` are skipped. For the remaining rows this cell builds:
#   * cosine_similarity_words_features: [cos(word1, attribute), cos(word2, attribute)]
#   * concatinated_word_embeddings:     one embedding-based feature list per row
#   * label:                            [int(label)] per row
# and logs every kept row to train_output_one.txt / train_output_zero.txt
# depending on whether its label is '1'.

cosine_similarity_words_features = []
concatinated_word_embeddings     = []
label = []


total = 0   #total rows in train data
cnt   = 0   #rows skipped because at least one word is not in vocab

# All three files are managed by the `with` statement so the two output files
# are flushed and closed even if a row raises part-way through.
with open('train_output_one.txt', 'w') as output_file_one, \
        open('train_output_zero.txt', 'w') as output_file_zero, \
        open('train.txt') as train_file:
    reader = csv.reader(train_file)
    for row in reader :
        total += 1
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            cnt += 1
            continue
        cosine_similarity_between_w1_atb = calculate_cosine_similartiy(row[0] , row[2])
        cosine_similarity_between_w2_atb = calculate_cosine_similartiy(row[1] , row[2])

        # Positive rows go to one log file, everything else to the other.
        output_file = output_file_one if row[3] == '1' else output_file_zero
        output_file.write(row[0]+" " + row[1]+" " + row[2] +'\n' )
        output_file.write(str(cosine_similarity_between_w1_atb)+" " + str(cosine_similarity_between_w2_atb)+" " + str( row[3] )+'\n' )

        label.append([int(row[3])])
        cosine_similarity_words_features.append([cosine_similarity_between_w1_atb , cosine_similarity_between_w2_atb ])
        # NOTE(review): `+` here adds the three vectors element-wise, so the
        # feature is their sum rather than a concatenation, despite the name.
        # The validation cells build their input the same way, so the
        # behaviour is kept as-is here.
        concatinated_list = (model[row[0]]  +  model[row[1]]  + model[row[2]] ).tolist()
        concatinated_word_embeddings.append(concatinated_list  )

print('Done representing')
    


Done representing


In [54]:
#train logistic regression model by using cosine values as feature vector

# fit() returns the estimator itself, so construction and training are chained.
logistic_regr_model_cosine_sim = LogisticRegression().fit(
    cosine_similarity_words_features,
    label,
)

print("Done fitting")

Done fitting


In [55]:
#train logistic regression model using the embedding-based values as feature vector

# max_iter is raised to 300 for this model; fit() returns the estimator
# itself, so construction and training are chained.
logistic_regr_model_concatinated = LogisticRegression(max_iter = 300).fit(
    concatinated_word_embeddings,
    label,
)
print("Done fitting")


Done fitting


In [56]:
#validate using logistic regression model using cosine values as features vector
# Rows of validation.txt with any out-of-vocabulary word are skipped; every
# other row is classified and tallied into a confusion matrix.

total =0   # validation rows that were actually classified
cnt = 0    # rows whose prediction matches the gold label
true_positive  = 0
true_negative  = 0
false_positive = 0
false_negative = 0
with open('validation.txt') as validation_file:
    reader = csv.reader(validation_file)
    for row in reader :
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            continue
        total += 1

        cosine_similarity_between_w1_atb = calculate_cosine_similartiy(row[0] , row[2])
        cosine_similarity_between_w2_atb = calculate_cosine_similartiy(row[1] , row[2])

        # predict() expects a 2-D (n_samples, n_features) input, so the single
        # sample is wrapped in an outer list.
        predicted_class = logistic_regr_model_cosine_sim.predict(
            [[cosine_similarity_between_w1_atb , cosine_similarity_between_w2_atb]] )
        actual_class = int(row[3])

        if predicted_class[0] == actual_class:
            cnt += 1

        if actual_class == 1:
            if predicted_class[0] == 1:
                true_positive += 1
            else:
                false_negative += 1
        else:
            if predicted_class[0] == 1:
                false_positive += 1
            else:
                true_negative += 1

# Guard every ratio so an empty or one-sided validation set reports 0.0
# instead of raising ZeroDivisionError.
predicted_positive = true_positive + false_positive
actual_positive    = true_positive + false_negative
precision = true_positive/predicted_positive if predicted_positive else 0.0
recall    = true_positive/actual_positive if actual_positive else 0.0
f1_score  = 2*((precision*recall)/ (precision+recall)) if (precision+recall) else 0.0
accuracy  = cnt/total if total else 0.0

print (cnt , total , accuracy)
print ('precision = ' , precision , ' recall = ' , recall )
print ( 'F1 score = ' , f1_score  )
print('done validating')

1397 2721 0.5134141859610437
precision =  0.5144337527757217  recall =  0.5099046221570066
F1 score =  0.5121591746499632
done validating


In [57]:
#validate using logistic regression model using the embedding-based features vector
# Rows of validation.txt with any out-of-vocabulary word are skipped; every
# other row is classified and tallied into a confusion matrix.

total =0   # validation rows that were actually classified
cnt = 0    # rows whose prediction matches the gold label
true_positive  = 0
true_negative  = 0
false_positive = 0
false_negative = 0

with open('validation.txt') as validation_file:
    reader = csv.reader(validation_file)
    for row in reader :
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            continue
        total += 1
        # Same feature construction as the training cell (element-wise `+` of
        # the three word vectors).
        concatinated_list = (model[row[0]]  +  model[row[1]]  + model[row[2]] ).tolist()
        # predict() expects a 2-D (n_samples, n_features) input, so the single
        # sample is wrapped in an outer list.
        predicted_class = logistic_regr_model_concatinated.predict([concatinated_list])
        actual_class = int(row[3])

        if predicted_class[0] == actual_class:
            cnt += 1
        if actual_class == 1:
            if predicted_class[0] == 1:
                true_positive += 1
            else:
                false_negative += 1
        else:
            if predicted_class[0] == 1:
                false_positive += 1
            else:
                true_negative += 1

# Guard every ratio so an empty or one-sided validation set reports 0.0
# instead of raising ZeroDivisionError.
predicted_positive = true_positive + false_positive
actual_positive    = true_positive + false_negative
precision = true_positive/predicted_positive if predicted_positive else 0.0
recall    = true_positive/actual_positive if actual_positive else 0.0
f1_score  = 2*((precision*recall)/ (precision+recall)) if (precision+recall) else 0.0
accuracy  = cnt/total if total else 0.0

print (cnt , total , accuracy)
print ('precision = ' , precision , ' recall = ' , recall )
print ( 'F1 score = ' , f1_score  )
print('done validating')

1407 2721 0.5170893054024256
precision =  0.5248730964467005  recall =  0.3793103448275862
F1 score =  0.4403747870528109
done validating


In [58]:
#Gaussian Naive Bayes models, one per feature representation
from sklearn.naive_bayes import GaussianNB

# Both estimators are created first and then trained. fit() returns the
# estimator itself, so each NB_model_* name refers to the same object as the
# matching gnb_* name.
gnb_cosine = GaussianNB()
gnb_concat = GaussianNB()
NB_model_cosine = gnb_cosine.fit(cosine_similarity_words_features , label)
NB_model_concat = gnb_concat.fit(concatinated_word_embeddings , label)

print("Done model fitting")

Done model fitting


In [59]:
#validate using Naive Bayes model using cosine values as features vector
# Rows of validation.txt with any out-of-vocabulary word are skipped; every
# other row is classified and tallied into a confusion matrix.

total =0   # validation rows that were actually classified
cnt = 0    # rows whose prediction matches the gold label
true_positive  = 0
true_negative  = 0
false_positive = 0
false_negative = 0
with open('validation.txt') as validation_file:
    reader = csv.reader(validation_file)
    for row in reader :
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            continue
        total += 1

        cosine_similarity_between_w1_atb = calculate_cosine_similartiy(row[0] , row[2])
        cosine_similarity_between_w2_atb = calculate_cosine_similartiy(row[1] , row[2])

        # predict() expects a 2-D (n_samples, n_features) input, so the single
        # sample is wrapped in an outer list.
        predicted_class = NB_model_cosine.predict(
            [[cosine_similarity_between_w1_atb , cosine_similarity_between_w2_atb]] )
        actual_class = int(row[3])

        if predicted_class[0] == actual_class:
            cnt += 1
        if actual_class == 1:
            if predicted_class[0] == 1:
                true_positive += 1
            else:
                false_negative += 1
        else:
            if predicted_class[0] == 1:
                false_positive += 1
            else:
                true_negative += 1

# Guard every ratio so an empty or one-sided validation set reports 0.0
# instead of raising ZeroDivisionError.
predicted_positive = true_positive + false_positive
actual_positive    = true_positive + false_negative
precision = true_positive/predicted_positive if predicted_positive else 0.0
recall    = true_positive/actual_positive if actual_positive else 0.0
f1_score  = 2*((precision*recall)/ (precision+recall)) if (precision+recall) else 0.0
accuracy  = cnt/total if total else 0.0

print (cnt , total , accuracy)
print ('precision = ', precision , ' recall = ' , recall )
print ( 'F1 score = ' , f1_score  )
print('done validating')

1267 2721 0.46563763322307977
precision =  0.46885694729637234  recall =  0.5025678650036683
F1 score =  0.48512747875354106
done validating


In [60]:
#validate using Naive Bayes model using the embedding-based features vector
# Rows of validation.txt with any out-of-vocabulary word are skipped; every
# other row is classified and tallied into a confusion matrix.

total =0   # validation rows that were actually classified
cnt = 0    # rows whose prediction matches the gold label
true_positive  = 0
true_negative  = 0
false_positive = 0
false_negative = 0
with open('validation.txt') as validation_file:
    reader = csv.reader(validation_file)
    for row in reader :
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            continue
        total += 1
        # Same feature construction as the training cell (element-wise `+` of
        # the three word vectors).
        concatinated_list = (model[row[0]]  +  model[row[1]]  + model[row[2]] ).tolist()
        # predict() expects a 2-D (n_samples, n_features) input, so the single
        # sample is wrapped in an outer list.
        predicted_class = NB_model_concat.predict([concatinated_list])
        actual_class = int(row[3])

        if predicted_class[0] == actual_class:
            cnt += 1

        if actual_class == 1:
            if predicted_class[0] == 1:
                true_positive += 1
            else:
                false_negative += 1
        else:
            if predicted_class[0] == 1:
                false_positive += 1
            else:
                true_negative += 1

# Guard every ratio so an empty or one-sided validation set reports 0.0
# instead of raising ZeroDivisionError.
predicted_positive = true_positive + false_positive
actual_positive    = true_positive + false_negative
precision = true_positive/predicted_positive if predicted_positive else 0.0
recall    = true_positive/actual_positive if actual_positive else 0.0
f1_score  = 2*((precision*recall)/ (precision+recall)) if (precision+recall) else 0.0
accuracy  = cnt/total if total else 0.0

print (cnt , total , accuracy)
print ('precision = ',  precision , ' recall = ' ,recall )
print ( 'F1 score = ' , f1_score  )
print('done validating')

1282 2721 0.47115031238515254
precision =  0.46  recall =  0.3206162876008804
F1 score =  0.3778642455685257
done validating


In [61]:
#Additional feature extraction using wordnet
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import re
# English stop-word list from the NLTK corpus, stored as a set; create_definition
# uses it to filter tokens out of WordNet definitions.
stop_words = set(stopwords.words('english'))

def create_definition(word):
    """Return the definition words of every WordNet synset of ``word`` as one string.

    Each synset definition is split on whitespace and tokens found in the
    module-level ``stop_words`` set are dropped. The surviving tokens are
    joined with single spaces, and every character that is not an ASCII
    letter, digit or space is then removed from the joined text.
    """
    kept_tokens = [
        token
        for synset in wordnet.synsets(word)
        for token in synset.definition().split()
        if token not in stop_words
    ]
    return re.sub(r'[^a-zA-Z0-9 ]',r''," ".join(kept_tokens))
        
        

In [62]:
#Modified feature vectors
# Rebuilds `label` and builds `feature_mat` from train.txt. Each kept row
# yields six hand-built features followed by an embedding block:
#   a, b : attribute word appears in the WordNet definition words of word1 / word2
#   c, d : word1 / word2 appears in the WordNet definition words of the attribute
#   e, f : cosine similarity of word1 / word2 with the attribute
#   then : concatenation of X, Y, Z, X-Z, Y-Z (the three word vectors and the
#          differences of each word vector with the attribute vector)

total = 0   # rows read from train.txt
cnt = 0     # rows skipped (out of vocab); initialised here so this cell does
            # not depend on a value left over from an earlier cell
feature_mat = []
label = []
with open('train.txt') as train_file:
    reader = csv.reader(train_file)
    for row in reader:
        total += 1

        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            cnt += 1
            continue
        w1_definition  = create_definition(row[0]).split()
        w2_definition  = create_definition(row[1]).split()
        att_definition = create_definition(row[2]).split()

        # Binary definition-overlap indicators.
        a = 1.0 if row[2] in w1_definition else 0.0
        b = 1.0 if row[2] in w2_definition else 0.0
        c = 1.0 if row[0] in att_definition else 0.0
        d = 1.0 if row[1] in att_definition else 0.0

        e = calculate_cosine_similartiy(row[0] , row[2])
        f = calculate_cosine_similartiy(row[1] , row[2])
        label.append(int(row[3]))
        X = model[row[0]]
        Y = model[row[1]]
        Z = model[row[2]]
        temp_list = np.concatenate((X,Y,Z , X-Z ,Y-Z )).tolist()
        feature_mat.append( [a,b,c,d,e,f]+temp_list )


print('done')

done


In [46]:
#check feature vector
# P is the first row of feature_mat; Q is an embedding block (X, Y, Z, X-Z, Y-Z)
# built for three fixed words. The printed number is len(P) + len(Q).
# NOTE(review): this cell's execution count is lower than its neighbours', so
# the recorded output may come from an earlier kernel state - confirm by re-running.
X, Y, Z = model['sugar'], model['tea'], model['water']
P = feature_mat[0]
Q = np.concatenate([X, Y, Z, X - Z, Y - Z]).tolist()
print (len(P + Q))

506


In [63]:
#train logistic regression model on the extended feature matrix (feature_mat)

# fit() returns the estimator itself, so construction and training are chained.
logistic_regr_model_feature_extract = LogisticRegression().fit(
    feature_mat,
    label,
)

print("Done fitting")

Done fitting


In [64]:
# Validate the logistic regression model trained on the extended feature
# matrix. Rows of validation.txt with any out-of-vocabulary word are skipped;
# every other row is turned into the same feature layout as the training rows
# ([a,b,c,d,e,f] + embedding block), classified and tallied.
total =0   # validation rows that were actually classified
cnt = 0    # rows whose prediction matches the gold label
true_positive  = 0
true_negative  = 0
false_positive = 0
false_negative = 0

with open('validation.txt') as validation_file:
    reader = csv.reader(validation_file)
    for row in reader :
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            continue
        total += 1
        w1_definition  = create_definition(row[0]).split()
        w2_definition  = create_definition(row[1]).split()
        att_definition = create_definition(row[2]).split()

        # Binary definition-overlap indicators.
        a = 1 if row[2] in w1_definition else 0
        b = 1 if row[2] in w2_definition else 0
        c = 1 if row[0] in att_definition else 0
        d = 1 if row[1] in att_definition else 0

        e = calculate_cosine_similartiy(row[0] , row[2])
        f = calculate_cosine_similartiy(row[1] , row[2])
        # The training `label` list is deliberately NOT appended to here:
        # adding validation labels would leave it longer than feature_mat and
        # break any later re-fit on (feature_mat, label).
        actual_class = int(row[3])
        X = model[row[0]]
        Y = model[row[1]]
        Z = model[row[2]]
        temp_list = np.concatenate((X,Y,Z , X-Z ,Y-Z )).tolist()
        # predict() expects a 2-D (n_samples, n_features) input, so the single
        # sample is wrapped in an outer list.
        predicted_class = logistic_regr_model_feature_extract.predict([[a,b,c,d,e,f]+temp_list])

        if predicted_class[0] == actual_class:
            cnt += 1
        if actual_class == 1:
            if predicted_class[0] == 1:
                true_positive += 1
            else:
                false_negative += 1
        else:
            if predicted_class[0] == 1:
                false_positive += 1
            else:
                true_negative += 1

# Guard every ratio so an empty or one-sided validation set reports 0.0
# instead of raising ZeroDivisionError.
predicted_positive = true_positive + false_positive
actual_positive    = true_positive + false_negative
precision = true_positive/predicted_positive if predicted_positive else 0.0
recall    = true_positive/actual_positive if actual_positive else 0.0
f1_score  = 2*((precision*recall)/ (precision+recall)) if (precision+recall) else 0.0
accuracy  = cnt/total if total else 0.0

print (cnt , total , accuracy)
print ('precision = ' , precision , ' recall = ' , recall )
print ( 'F1 score = ' , f1_score  )
print('done validating')

1521 2721 0.5589856670341786
precision =  0.5612321562734786  recall =  0.5480557593543653
F1 score =  0.55456570155902
done validating
