In [1]:
import os
import pickle

import gensim
# Load word vectors stored in binary word2vec format from the gzipped GoogleNews file.
# The resulting `model` is read as a module-level global by get_sent2vec.
# NOTE(review): the file name suggests 300-dimensional vectors — confirm against the file.
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)

In [1]:
import numpy as np
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
import tqdm

def get_sent2vec(df, vectors=None, dim=300):
    """Build one unit-length sentence vector per entry of ``df``.

    Every entry is converted with ``str()``, lower-cased and split on
    whitespace. The vectors of all words found in ``vectors`` are summed and
    the sum is scaled to unit L2 norm.

    Parameters
    ----------
    df : object exposing ``.values`` (e.g. a pandas Series)
        The questions to vectorise, one per element.
    vectors : mapping, optional
        Word -> vector lookup that raises ``KeyError`` for unknown words.
        Defaults to the module-level ``model``.
    dim : int, optional
        Length of the zero vector emitted for an entry in which no word was
        found in ``vectors``. Defaults to 300.

    Returns
    -------
    list of numpy.ndarray
        One vector per entry, in input order. Entries without any known word,
        or whose summed vector has zero norm, map to an all-zero vector.
    """
    if vectors is None:
        vectors = model

    s2v = []

    for question in df.values:
        word_vectors = []
        for word in str(question).lower().split():
            try:
                word_vectors.append(vectors[word])
            except KeyError:
                # Out-of-vocabulary word: skip it. Any other error is a real
                # problem and is allowed to propagate.
                continue

        if not word_vectors:
            # Always hand back an ndarray so every element has the same type.
            s2v.append(np.zeros(dim))
            continue

        v = np.array(word_vectors).sum(axis=0)
        norm = np.sqrt((v ** 2).sum())
        # Word vectors can cancel out exactly; avoid dividing by zero (NaN).
        s2v.append(v / norm if norm > 0 else np.zeros_like(v))

    return s2v

def get_s2v_features(arr1, arr2):
    """Return a dict of distance features between two vectors.

    Parameters
    ----------
    arr1, arr2 : array-like
        The two 1-D vectors to compare; passed unchanged to the
        ``scipy.spatial.distance`` functions.

    Returns
    -------
    dict
        Maps feature name to the corresponding distance value. The key
        ``"w2v_jacard"`` keeps its historical spelling so existing consumers
        of the dict are not broken.
    """
    return {
        "w2v_cosine": cosine(arr1, arr2),
        "w2v_cityblock": cityblock(arr1, arr2),
        # The function is spelled `jaccard`; `jacard` was an undefined name.
        "w2v_jacard": jaccard(arr1, arr2),
        "w2v_canberra": canberra(arr1, arr2),
        # The function is spelled `euclidean`; `eucliedean` was an undefined name.
        "w2v_euclidean": euclidean(arr1, arr2),
        "w2v_braycurtis": braycurtis(arr1, arr2)
    }

In [3]:
import pandas as pd
# Read the train and test CSV files from the working directory.
# Missing cells are replaced with empty strings.
df_train = pd.read_csv('./train.csv').fillna("")
df_test = pd.read_csv('./test.csv').fillna("")

In [71]:
# Sentence vectors for the question1 column of the training set.
# Load the cached pickle when it exists; otherwise compute the vectors and
# write the cache, so the cell also works on a fresh checkout.
# NOTE: pickle.load executes arbitrary code from the file — only load caches
# that this notebook produced.
if os.path.exists("train_q1_s2v.pickle"):
    with open("train_q1_s2v.pickle", "rb") as handle:
        train_q1_s2v = pickle.load(handle)
else:
    train_q1_s2v = get_sent2vec(df_train.question1)
    with open("train_q1_s2v.pickle", "wb") as handle:
        pickle.dump(train_q1_s2v, handle)

In [76]:
# Sentence vectors for the question2 column of the training set.
# Load the cached pickle when it exists; otherwise compute the vectors and
# write the cache, so the cell also works on a fresh checkout.
# NOTE: pickle.load executes arbitrary code from the file — only load caches
# that this notebook produced.
if os.path.exists("train_q2_s2v.pickle"):
    with open("train_q2_s2v.pickle", "rb") as handle:
        train_q2_s2v = pickle.load(handle)
else:
    train_q2_s2v = get_sent2vec(df_train.question2)
    with open("train_q2_s2v.pickle", "wb") as handle:
        pickle.dump(train_q2_s2v, handle)

In [90]:
# Distance features between the question1 and question2 sentence vectors of
# the training set: one output column per metric, filled in the order below.
_train_metrics = [
    ('cosine_distance', cosine),
    ('cityblock_distance', cityblock),
    ('jaccard_distance', jaccard),
    ('canberra_distance', canberra),
    ('euclidean_distance', euclidean),
    # Minkowski distance is evaluated with order p = 3.
    ('minkowski_distance', lambda u, v: minkowski(u, v, 3)),
    ('braycurtis_distance', braycurtis),
]

train_s2v_features = pd.DataFrame()
for _column, _metric in _train_metrics:
    train_s2v_features[_column] = [
        _metric(q1_vec, q2_vec)
        for q1_vec, q2_vec in zip(train_q1_s2v, train_q2_s2v)
    ]

# Disabled per-vector shape statistics:
# from scipy.stats import skew, kurtosis
# train_s2v_features['skew_q1vec'] = [skew(x) for x in train_q1_s2v]
# train_s2v_features['skew_q2vec'] = [skew(x) for x in train_q2_s2v]
# train_s2v_features['kur_q1vec'] = [kurtosis(x) for x in train_q1_s2v]
# train_s2v_features['kur_q2vec'] = [kurtosis(x) for x in train_q2_s2v]

train_s2v_features.to_csv("train_s2v_features.csv", index = False)

In [93]:
# Remove the training vectors and the training feature frame from the
# interactive namespace; -f skips the confirmation prompt.
%reset_selective -f train_q1_s2v
%reset_selective -f train_q2_s2v
%reset_selective -f train_s2v_features

In [4]:
def split_and_get(part_df, part_df_filename):
    """Compute and pickle the sentence vectors of both question columns.

    For ``part_df.question1`` and then ``part_df.question2`` the vectors are
    obtained from ``get_sent2vec`` and written to
    ``<part_df_filename>_q1_s2v.pickle`` and
    ``<part_df_filename>_q2_s2v.pickle`` respectively. Nothing is returned.
    """
    targets = (
        ("question1", "_q1_s2v.pickle"),
        ("question2", "_q2_s2v.pickle"),
    )
    for column, suffix in targets:
        sentence_vectors = get_sent2vec(getattr(part_df, column))
        with open(part_df_filename + suffix, "wb") as out_file:
            pickle.dump(sentence_vectors, out_file)

In [5]:
# Vectorise the test set in six consecutive row slices of 390000 rows each,
# written under the prefixes test_1 ... test_6. The last slice is open-ended
# and takes every remaining row.
_chunk_rows = 390000
for _part in range(6):
    _start = _part * _chunk_rows
    _stop = _start + _chunk_rows if _part < 5 else None
    split_and_get(df_test[_start:_stop], 'test_' + str(_part + 1))

In [2]:
import pandas as pd
def test_s2v_features(part_df_filename):
    
    with open(part_df_filen
              ame + "_q1_s2v.pickle", "rb") as handle:
        test_q1_s2v = pickle.load(handle)

    with open(part_df_filename + "_q2_s2v.pickle", "rb") as handle:
        test_q2_s2v = pickle.load(handle)

    test_s2v_features = pd.DataFrame()
    test_s2v_features['cosine_distance'] = [cosine(x, y) for (x, y) in zip(test_q1_s2v,test_q2_s2v)]
    test_s2v_features['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(test_q1_s2v,test_q2_s2v)]
    test_s2v_features['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(test_q1_s2v,test_q2_s2v)]
    test_s2v_features['canberra_distance'] = [canberra(x, y) for (x, y) in zip(test_q1_s2v,test_q2_s2v)]
    test_s2v_features['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(test_q1_s2v,test_q2_s2v)]
    test_s2v_features['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(test_q1_s2v,test_q2_s2v)]
    test_s2v_features['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(test_q1_s2v,test_q2_s2v)]

    # from scipy.stats import skew, kurtosis
    # test_s2v_features['skew_q1vec'] = [skew(x) for x in test_q1_s2v]
    # test_s2v_features['skew_q2vec'] = [skew(x) for x in test_q2_s2v]
    # test_s2v_features['kur_q1vec'] = [kurtosis(x) for x in test_q1_s2v]
    # test_s2v_features['kur_q2vec'] = [kurtosis(x) for x in test_q2_s2v]

    test_s2v_features.to_csv(part_df_filename + "_s2v_features.csv", index = False)

In [3]:
import pickle
# Build the feature CSV for each of the six test parts, test_1 ... test_6.
for _part in range(1, 7):
    test_s2v_features('test_' + str(_part))

hi


  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))
  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))
  / np.double(np.bitwise_or(u != 0, v != 0).sum()))
  return abs(u - v).sum() / abs(u + v).sum()


In [5]:
# Read back the six per-part feature CSVs.
test_1_s2v_features = pd.read_csv("test_1_s2v_features.csv")
test_2_s2v_features = pd.read_csv("test_2_s2v_features.csv")
test_3_s2v_features = pd.read_csv("test_3_s2v_features.csv")
test_4_s2v_features = pd.read_csv("test_4_s2v_features.csv")
test_5_s2v_features = pd.read_csv("test_5_s2v_features.csv")
test_6_s2v_features = pd.read_csv("test_6_s2v_features.csv")

# Stack the parts row-wise in part order. Each part keeps its own row labels,
# which is harmless here because the CSV below is written without the index.
test_s2v_features = pd.concat([test_1_s2v_features, test_2_s2v_features, test_3_s2v_features, test_4_s2v_features, test_5_s2v_features, test_6_s2v_features])

# Write the combined test-set features to a single CSV.
test_s2v_features.to_csv("test_s2v_features.csv", index = False)