In [1]:
import csv
import os
import os.path
import sys
import warnings

# Silence library warnings before the third-party imports below run.
warnings.filterwarnings('ignore')

import numpy as np
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from scipy import spatial
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import GaussianNB


In [2]:
# Build a lookup of every token in the GloVe file, then load the embedding.
# The first whitespace-separated field of each line is stored as a key of
# `vocab`, so later cells can test membership with `word in vocab`.
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'

vocab = {}
# Read as UTF-8 explicitly (the default encoding gensim uses when it loads the
# converted file) instead of depending on the platform default encoding.
with open(glove_input_file, 'r', encoding='utf-8') as inpfile:
    for line in inpfile:
        fields = line.split()
        if not fields:
            # A blank line has no token; skip it rather than fail on fields[0].
            continue
        vocab[fields[0]] = 1
print('Done making vocab')

# Convert the GloVe text file to word2vec text format and load it with gensim.
glove2word2vec(glove_input_file, word2vec_output_file)
filename = word2vec_output_file
model = KeyedVectors.load_word2vec_format(filename, binary=False)
print ('Done loading embedding')

Done making vocab
Done loading embedding


In [3]:
def calculate_cosine_similartiy(word1 , word2):
    """Return the cosine similarity between the embeddings of two words.

    Both words are looked up in the module-level ``model``. Each vector is
    reshaped into a single-row 2-D array before being handed to
    ``cosine_similarity``, and the only entry of the resulting 1x1 matrix is
    returned.
    """
    first_row = np.reshape(model[word1], (1, -1))
    second_row = np.reshape(model[word2], (1, -1))
    similarity_matrix = cosine_similarity(first_row, second_row)
    return similarity_matrix[0][0]
    

In [4]:
#convert training data in representational format
#
# Rows of train.txt are read positionally: row[0] and row[1] are the two words,
# row[2] is the term both are compared against (the "atb" in the variable
# names) and row[3] is the class label, parsed with int().

cosine_similarity_words_features = []
concatinated_word_embeddings     = []
label = []


total = 0   #total rows in train data
cnt   = 0   #rows skipped because at least one word is missing from vocab

# The two log files share the `with` statement with the input file so that
# they are flushed and closed when the loop ends, even if a row raises.
with open('train_output_one.txt', 'w') as output_file_one, \
        open('train_output_zero.txt', 'w') as output_file_zero, \
        open('train.txt') as train_file:
    reader = csv.reader(train_file)
    for row in reader :
        total += 1
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            cnt += 1
            continue
        cosine_similarity_between_w1_atb = calculate_cosine_similartiy(row[0] , row[2])
        cosine_similarity_between_w2_atb = calculate_cosine_similartiy(row[1] , row[2])

        # Log the example (words, then cosine values and label) to the file
        # that matches its label: '1' goes to *_one, everything else to *_zero.
        output_file = output_file_one if row[3] == '1' else output_file_zero
        output_file.write(row[0]+" " + row[1]+" " + row[2] +'\n' )
        output_file.write(str(cosine_similarity_between_w1_atb)+" " + str(cosine_similarity_between_w2_atb)+" " + str( row[3] )+'\n' )

        # Each label is stored as a single-element list (one row per sample).
        label.append([int(row[3])])
        cosine_similarity_words_features.append([cosine_similarity_between_w1_atb , cosine_similarity_between_w2_atb ])
        # NOTE(review): `+` between the embedding vectors adds them
        # element-wise, so despite its name this feature is the SUM of the
        # three embeddings, not their concatenation. It is left unchanged
        # because the validation cells build their features the same way;
        # switch all of them together if concatenation is what is intended.
        concatinated_list = (model[row[0]]  +  model[row[1]]  + model[row[2]] ).tolist()
        concatinated_word_embeddings.append(concatinated_list  )

print('Done representing')
    


Done representing


In [5]:
#train logistic regression model by using cosine values as feature vector

logistic_regr_model_cosine_sim   = LogisticRegression(max_iter = 300)
# `label` holds one single-element list per sample, i.e. a column vector.
# fit() expects a 1-D target of shape (n_samples,), so flatten it explicitly
# instead of relying on scikit-learn's implicit conversion, whose warning is
# hidden by the notebook-wide warnings filter.
logistic_regr_model_cosine_sim.fit( cosine_similarity_words_features , np.ravel(label) )

print("Done fitting")

Done fitting


In [6]:
#train logistic regression model using the combined word embedding values as feature vector


logistic_regr_model_concatinated = LogisticRegression(max_iter = 300)
# `label` holds one single-element list per sample, i.e. a column vector.
# fit() expects a 1-D target of shape (n_samples,), so flatten it explicitly
# instead of relying on scikit-learn's implicit conversion, whose warning is
# hidden by the notebook-wide warnings filter.
logistic_regr_model_concatinated.fit( concatinated_word_embeddings , np.ravel(label) )
print("Done fitting")


Done fitting


In [17]:
#validate the logistic regression model that uses cosine values as feature vector

total = 0   #validation rows whose three words are all in vocab
cnt = 0     #rows where the predicted class equals the label
true_positive  = 0
true_negative  = 0
false_positive = 0
false_negative = 0
with open('validation.txt') as validation_file:
    reader = csv.reader(validation_file)
    for row in reader :
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            continue
        total += 1
        actual_class = int(row[3])

        cosine_similarity_between_w1_atb = calculate_cosine_similartiy(row[0] , row[2])
        cosine_similarity_between_w2_atb = calculate_cosine_similartiy(row[1] , row[2])

        # predict() takes a 2-D input of shape (n_samples, n_features); wrap
        # the single sample in an outer list so it is one row, not a 1-D array.
        predicted_class = logistic_regr_model_cosine_sim.predict( [[cosine_similarity_between_w1_atb , cosine_similarity_between_w2_atb]] )

        if predicted_class[0] == actual_class:
            cnt += 1

        # Update the confusion-matrix counters.
        if actual_class == 1:
            if predicted_class[0] == 1:
                true_positive += 1
            else:
                false_negative += 1
        else:
            if predicted_class[0] == 1:
                false_positive += 1
            else:
                true_negative += 1

# Guard every ratio: an empty validation set, or a run with no predicted or
# actual positives, would otherwise raise ZeroDivisionError.
predicted_positive = true_positive + false_positive
actual_positive    = true_positive + false_negative
precision = true_positive / predicted_positive if predicted_positive else 0.0
recall    = true_positive / actual_positive if actual_positive else 0.0
accuracy  = cnt / total if total else 0.0
f1_score  = 2*((precision*recall)/ (precision+recall)) if (precision + recall) else 0.0

print (cnt , total , accuracy)
print ('precision = ' , precision , ' recall = ' , recall )
print ( 'F1 score = ' , f1_score  )
print('done validating')

1341 2721 0.4928335170893054
precision =  0.49402670414617006  recall =  0.5157740278796772
F1 score =  0.5046661880832735
done validating


In [18]:
#validate the logistic regression model that uses the combined embeddings as feature vector

total = 0   #validation rows whose three words are all in vocab
cnt = 0     #rows where the predicted class equals the label
true_positive  = 0
true_negative  = 0
false_positive = 0
false_negative = 0

with open('validation.txt') as validation_file:
    reader = csv.reader(validation_file)
    for row in reader :
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            continue
        total += 1
        actual_class = int(row[3])

        # `+` adds the three embedding vectors element-wise, which matches how
        # the training cell builds `concatinated_word_embeddings`.
        concatinated_list = (model[row[0]]  +  model[row[1]]  + model[row[2]] ).tolist()
        # predict() takes a 2-D input of shape (n_samples, n_features); wrap
        # the single sample in an outer list so it is one row, not a 1-D array.
        predicted_class = logistic_regr_model_concatinated.predict([concatinated_list])

        if predicted_class[0] == actual_class:
            cnt += 1

        # Update the confusion-matrix counters.
        if actual_class == 1:
            if predicted_class[0] == 1:
                true_positive += 1
            else:
                false_negative += 1
        else:
            if predicted_class[0] == 1:
                false_positive += 1
            else:
                true_negative += 1

# Guard every ratio: an empty validation set, or a run with no predicted or
# actual positives, would otherwise raise ZeroDivisionError.
predicted_positive = true_positive + false_positive
actual_positive    = true_positive + false_negative
precision = true_positive / predicted_positive if predicted_positive else 0.0
recall    = true_positive / actual_positive if actual_positive else 0.0
accuracy  = cnt / total if total else 0.0
f1_score  = 2*((precision*recall)/ (precision+recall)) if (precision + recall) else 0.0

print (cnt , total , accuracy)
print ('precision = ' , precision , ' recall = ' , recall )
print ( 'F1 score = ' , f1_score  )
print('done validating')

1354 2721 0.4976111723631018
precision =  0.4978021978021978  recall =  0.33235509904622157
F1 score =  0.3985921689397272
done validating


In [11]:
#Gaussian Naive Bayes models: one on the cosine features, one on the combined embeddings
# (GaussianNB is imported with the other imports at the top of the notebook.)

# `label` holds one single-element list per sample; flatten it to the 1-D
# target of shape (n_samples,) that fit() expects.
labels_1d = np.ravel(label)

gnb_cosine = GaussianNB()
NB_model_cosine = gnb_cosine.fit(cosine_similarity_words_features , labels_1d)
gnb_concat = GaussianNB()
NB_model_concat = gnb_concat.fit(concatinated_word_embeddings , labels_1d)

print("Done model fitting")

Done model fitting


In [19]:
#validate the Naive Bayes model that uses cosine values as feature vector

total = 0   #validation rows whose three words are all in vocab
cnt = 0     #rows where the predicted class equals the label
true_positive  = 0
true_negative  = 0
false_positive = 0
false_negative = 0
with open('validation.txt') as validation_file:
    reader = csv.reader(validation_file)
    for row in reader :
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            continue
        total += 1
        actual_class = int(row[3])

        cosine_similarity_between_w1_atb = calculate_cosine_similartiy(row[0] , row[2])
        cosine_similarity_between_w2_atb = calculate_cosine_similartiy(row[1] , row[2])

        # predict() takes a 2-D input of shape (n_samples, n_features); wrap
        # the single sample in an outer list so it is one row, not a 1-D array.
        predicted_class = NB_model_cosine.predict( [[cosine_similarity_between_w1_atb , cosine_similarity_between_w2_atb]] )

        if predicted_class[0] == actual_class:
            cnt += 1

        # Update the confusion-matrix counters.
        if actual_class == 1:
            if predicted_class[0] == 1:
                true_positive += 1
            else:
                false_negative += 1
        else:
            if predicted_class[0] == 1:
                false_positive += 1
            else:
                true_negative += 1

# Guard every ratio: an empty validation set, or a run with no predicted or
# actual positives, would otherwise raise ZeroDivisionError.
predicted_positive = true_positive + false_positive
actual_positive    = true_positive + false_negative
precision = true_positive / predicted_positive if predicted_positive else 0.0
recall    = true_positive / actual_positive if actual_positive else 0.0
accuracy  = cnt / total if total else 0.0
f1_score  = 2*((precision*recall)/ (precision+recall)) if (precision + recall) else 0.0

print (cnt , total , accuracy)
print ('precision = ', precision , ' recall = ' , recall )
print ( 'F1 score = ' , f1_score  )
print('done validating')

1242 2721 0.4564498346196251
precision =  0.46432964329643295  recall =  0.5539251650770359
F1 score =  0.5051856808297089
done validating


In [20]:
#validate the Naive Bayes model that uses the combined embeddings as feature vector

total = 0   #validation rows whose three words are all in vocab
cnt = 0     #rows where the predicted class equals the label
true_positive  = 0
true_negative  = 0
false_positive = 0
false_negative = 0
with open('validation.txt') as validation_file:
    reader = csv.reader(validation_file)
    for row in reader :
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            continue
        total += 1
        actual_class = int(row[3])

        # `+` adds the three embedding vectors element-wise, which matches how
        # the training cell builds `concatinated_word_embeddings`.
        concatinated_list = (model[row[0]]  +  model[row[1]]  + model[row[2]] ).tolist()
        # predict() takes a 2-D input of shape (n_samples, n_features); wrap
        # the single sample in an outer list so it is one row, not a 1-D array.
        predicted_class = NB_model_concat.predict([concatinated_list])

        if predicted_class[0] == actual_class:
            cnt += 1

        # Update the confusion-matrix counters.
        if actual_class == 1:
            if predicted_class[0] == 1:
                true_positive += 1
            else:
                false_negative += 1
        else:
            if predicted_class[0] == 1:
                false_positive += 1
            else:
                true_negative += 1

# Guard every ratio: an empty validation set, or a run with no predicted or
# actual positives, would otherwise raise ZeroDivisionError.
predicted_positive = true_positive + false_positive
actual_positive    = true_positive + false_negative
precision = true_positive / predicted_positive if predicted_positive else 0.0
recall    = true_positive / actual_positive if actual_positive else 0.0
accuracy  = cnt / total if total else 0.0
f1_score  = 2*((precision*recall)/ (precision+recall)) if (precision + recall) else 0.0

print (cnt , total , accuracy)
print ('precision = ',  precision , ' recall = ' ,recall )
print ( 'F1 score = ' , f1_score  )
print('done validating')

1223 2721 0.4494671076809996
precision =  0.4432296047098402  recall =  0.38664710198092445
F1 score =  0.41300940438871475
done validating


In [None]:
#Additional feature extraction using wordnet
# NOTE(review): as far as this cell is visible it only iterates train.txt and
# updates two counters; no WordNet features are computed here. The cell looks
# unfinished - confirm before relying on it.
# NOTE(review): `total` and `cnt` are not reset in this cell, so they keep
# accumulating on top of whatever values an earlier cell left in them.

with open('train.txt') as train_file:
    reader = csv.reader(train_file)
    for row in reader :
        # Count every row read from train.txt.
        total += 1
        # Count and skip rows where any of the first three fields is not a key of `vocab`.
        if (row[0] not in vocab) or (row[1] not in vocab) or (row[2] not in vocab):
            cnt += 1
            continue
