In [None]:
#Count common words, overall word count and question length
def common_words(x):
    """Count the distinct lower-cased words shared by a pair of questions.

    Parameters
    ----------
    x : pair-like
        Two-element iterable ``(q1, q2)``. Each element is converted with
        ``str()``, lower-cased and split on whitespace.

    Returns
    -------
    int
        Size of the intersection of the two word sets.
    """
    first, second = x
    first_tokens = set(str(first).lower().split())
    second_tokens = set(str(second).lower().split())
    return len(first_tokens.intersection(second_tokens))

def words_count(question):
    """Return the number of whitespace-separated tokens in ``str(question)``."""
    tokens = str(question).split()
    return len(tokens)

def length(question):
    """Return the character count of the string form of ``question``."""
    as_text = str(question)
    return len(as_text)

In [None]:
#Features Set 1 (FS1)
#Count common words, overall word count and question length for both Train and Test Data
# Add the same FS1 columns, in the same order, to the train and the test frame:
# per-question word counts and character lengths, the number of shared words,
# and the signed length difference (q1 minus q2).
for _frame in (traindata, testdata):
    for _target, _source, _counter in (
        ('q1_words_num', 'q1_cleaned', words_count),
        ('q2_words_num', 'q2_cleaned', words_count),
        ('q1_length', 'q1_cleaned', length),
        ('q2_length', 'q2_cleaned', length),
    ):
        _frame[_target] = _frame[_source].map(_counter)

    _frame['common_words'] = _frame[['q1_cleaned', 'q2_cleaned']].apply(common_words, axis=1)

    # Difference in question length (can be negative when q2 is longer)
    _frame['diff_length'] = _frame['q1_length'] - _frame['q2_length']

# Do not leave loop helpers behind in the notebook namespace
del _frame, _target, _source, _counter

In [None]:
#Features Set 2 (FS2)
#Create fuzzy string-similarity features (fuzzywuzzy scores on the cleaned question pairs)
from fuzzywuzzy import fuzz
# Fuzzy features for both traindata and testdata.
# Each scorer is called row by row on the string form of the two cleaned
# questions; columns are created in the order listed below.
for _frame in (traindata, testdata):
    for _column, _scorer in (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    ):
        _frame[_column] = _frame.apply(
            lambda row: _scorer(str(row['q1_cleaned']), str(row['q2_cleaned'])),
            axis=1,
        )

# Do not leave loop helpers behind in the notebook namespace
del _frame, _column, _scorer

In [None]:
#Word2vec features 

import gensim
#Merging all sentences: every cleaned question 1 of the train data, followed by every cleaned question 2
# NOTE(review): Word2Vec expects each sentence to be a list of tokens - presumably
# the *_cleaned columns already hold token lists; confirm against the cleaning step.
sentences = [
    question
    for column in ('q1_cleaned', 'q2_cleaned')
    for question in traindata[column].values.tolist()
]

#Creating a model (min_count=1 keeps every token that occurs at least once)
model = gensim.models.Word2Vec(sentences=sentences, min_count=1)

In [None]:
def sent2vec(words): #takes array of cleaned tokens
    """Return the L2-normalised sum of the word vectors of ``words``.

    Parameters
    ----------
    words : iterable of str
        Cleaned tokens of one question. Tokens that are not purely
        alphabetic, or that are missing from the vocabulary of the
        module-level ``model``, are ignored.

    Returns
    -------
    numpy.ndarray or float
        The summed vector divided by its Euclidean norm, or ``np.nan`` when
        no token contributes a vector (empty input or only unknown tokens).
    """
    # gensim >= 4 only allows lookups through ``model.wv``; fall back to the
    # object itself so any plain token -> vector mapping keeps working.
    vectors = getattr(model, 'wv', model)

    # Skip out-of-vocabulary tokens: the model is fitted on the train
    # questions only, so unseen test tokens would otherwise raise KeyError.
    M = [vectors[w] for w in words if w.isalpha() and w in vectors]
    if not M:
        # Same NaN the 0/0 division used to produce, without the RuntimeWarning
        return np.nan

    v = np.array(M).sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

In [None]:
#Create word2vec vectors for both Train and Test Data based on cleaned questions
# One sentence vector per cleaned question, for the train and the test frame
for _frame in (traindata, testdata):
    for _target, _source in (
        ('q1_vec', 'q1_cleaned'),
        ('q2_vec', 'q2_cleaned'),
    ):
        _frame[_target] = _frame[_source].apply(sent2vec)

# Do not leave loop helpers behind in the notebook namespace
del _frame, _target, _source

In [None]:
#Drop NaN values for Train Data
# Remove every row that has a missing value in any column (pandas defaults spelled out).
# NOTE(review): presumably meant to discard questions for which sent2vec yielded NaN - confirm.
traindata = traindata.dropna(axis=0, how='any')

In [None]:
import scipy.spatial.distance as dist
import scipy.stats as stats

In [None]:
#Features Set 3 (FS3)
#Word2vec on traindata
# Distances between the two sentence vectors of each train row
for _column, _metric in (
    ('euclidean', dist.euclidean),
    ('manhattan', dist.cityblock),
    ('canberra', dist.canberra),
    ('minkowski', lambda u, v: dist.minkowski(u, v, 3)),  # Minkowski distance of order 3
    ('braycurtis', dist.braycurtis),
    ('cosine', dist.cosine),
):
    traindata[_column] = traindata.apply(
        lambda row: _metric(row['q1_vec'], row['q2_vec']),
        axis=1,
    )

# Skewness and kurtosis of each sentence vector
# (the 'scew_*' spelling is kept because it is the column name)
for _column, _source, _moment in (
    ('scew_q1', 'q1_vec', stats.skew),
    ('scew_q2', 'q2_vec', stats.skew),
    ('kurtosis_q1', 'q1_vec', stats.kurtosis),
    ('kurtosis_q2', 'q2_vec', stats.kurtosis),
):
    traindata[_column] = traindata[_source].apply(_moment)

# Do not leave loop helpers behind in the notebook namespace
del _column, _metric, _source, _moment

In [None]:
#Features Set 3 (FS3)
#Word2vec on testdata
# Distances between the two sentence vectors of each test row
# NOTE(review): unlike traindata, testdata is not passed through dropna() in the
# visible cells, so rows whose vector is NaN may reach these calls - confirm.
for _column, _metric in (
    ('euclidean', dist.euclidean),
    ('manhattan', dist.cityblock),
    ('canberra', dist.canberra),
    ('minkowski', lambda u, v: dist.minkowski(u, v, 3)),  # Minkowski distance of order 3
    ('braycurtis', dist.braycurtis),
    ('cosine', dist.cosine),
):
    testdata[_column] = testdata.apply(
        lambda row: _metric(row['q1_vec'], row['q2_vec']),
        axis=1,
    )

# Skewness and kurtosis of each sentence vector
# (the 'scew_*' spelling is kept because it is the column name)
for _column, _source, _moment in (
    ('scew_q1', 'q1_vec', stats.skew),
    ('scew_q2', 'q2_vec', stats.skew),
    ('kurtosis_q1', 'q1_vec', stats.kurtosis),
    ('kurtosis_q2', 'q2_vec', stats.kurtosis),
):
    testdata[_column] = testdata[_source].apply(_moment)

# Do not leave loop helpers behind in the notebook namespace
del _column, _metric, _source, _moment

In [None]:
#Check whether the questions pairs have the same start word
def same_start_word(row):
    """Tell whether both cleaned questions begin with the same first element.

    Parameters
    ----------
    row : mapping
        Must provide ``'q1_cleaned'`` and ``'q2_cleaned'``. Presumably token
        lists (first element = first word); for plain strings the first
        characters are compared instead.

    Returns
    -------
    int or float
        ``1`` if the first elements are equal, ``0`` if they differ, and
        ``np.nan`` when either question is empty/falsy.
    """
    if row['q1_cleaned'] and row['q2_cleaned']:
        heads_match = row['q1_cleaned'][0] == row['q2_cleaned'][0]
        return int(heads_match)
    return np.nan

#Check for total unique words
def total_unique_words(row):
    """Count the distinct elements found across both cleaned questions.

    ``row`` must provide ``'q1_cleaned'`` and ``'q2_cleaned'`` as iterables of
    hashable items (presumably token lists; a plain string would be counted
    character by character).
    """
    vocabulary = set(row['q1_cleaned'])
    vocabulary.update(row['q2_cleaned'])
    return len(vocabulary)

In [None]:
#Features Set 4 (FS4)
# Same two FS4 columns for the train and the test frame, computed row by row
# from the pair of cleaned questions.
for _frame in (traindata, testdata):
    _pair = _frame[['q1_cleaned', 'q2_cleaned']]
    _frame['same_start_word'] = _pair.apply(same_start_word, axis=1)
    _frame['total_unique_words'] = _pair.apply(total_unique_words, axis=1)

# Do not leave loop helpers behind in the notebook namespace
del _frame, _pair