In [1]:
import pandas as pd
import numpy as np
import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
stop_words = stopwords.words('english')
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,ExtraTreesClassifier 
from sklearn import preprocessing

In [2]:
def wmd(s1, s2):
    """Word Mover's Distance between two sentences using the global ``model``.

    Each input is converted with ``str``, lower-cased and split on whitespace.
    English stop words are removed before the two token lists are passed to
    ``model.wmdistance``.

    Args:
        s1: First sentence (any object; coerced with ``str``).
        s2: Second sentence (any object; coerced with ``str``).

    Returns:
        The value returned by ``model.wmdistance`` for the filtered tokens.
    """
    stop_words = stopwords.words('english')

    def _content_tokens(sentence):
        # Lower-case, whitespace-split, then drop stop words.
        return [tok for tok in str(sentence).lower().split() if tok not in stop_words]

    tokens_a = _content_tokens(s1)
    tokens_b = _content_tokens(s2)
    return model.wmdistance(tokens_a, tokens_b)

In [3]:
def norm_wmd(s1, s2):
    """Word Mover's Distance between two sentences using ``norm_model``.

    Pre-processing matches the plain WMD feature: ``str`` coercion,
    lower-casing, whitespace split and English stop-word removal. Only the
    embedding object differs (the global ``norm_model``).

    Args:
        s1: First sentence (any object; coerced with ``str``).
        s2: Second sentence (any object; coerced with ``str``).

    Returns:
        The value returned by ``norm_model.wmdistance`` for the filtered tokens.
    """
    stop_words = stopwords.words('english')
    filtered = []
    for sentence in (s1, s2):
        tokens = str(sentence).lower().split()
        filtered.append([tok for tok in tokens if tok not in stop_words])
    return norm_model.wmdistance(filtered[0], filtered[1])

In [4]:
def sent2vec(s):
    """Embed a sentence as the L2-normalised sum of its word vectors.

    The input is coerced with ``str``, lower-cased and tokenised with
    ``word_tokenize``. Stop words (module-level ``stop_words``) and
    non-alphabetic tokens are discarded; the remaining tokens are looked up
    in the global ``model`` and their vectors are summed.

    Args:
        s: Sentence to embed (any object; coerced with ``str``).

    Returns:
        A 1-D array holding the summed word vectors divided by their
        Euclidean norm. If no token has a vector, or the summed vector has
        zero norm, an all-NaN vector is returned. NaN is kept as the
        "no embedding" marker because the feature cells pass these vectors
        through ``np.nan_to_num`` before using them.
    """
    tokens = word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stop_words and w.isalpha()]

    vectors = []
    for w in tokens:
        try:
            vectors.append(model[w])
        except KeyError:
            # Out-of-vocabulary token: skip it. Any other error (bad model
            # object, interrupt, ...) is allowed to propagate.
            continue

    if not vectors:
        # Previously this path divided 0 by 0 and yielded a scalar NaN plus a
        # RuntimeWarning; return an explicit NaN vector of the model's width.
        return np.full(model.vector_size, np.nan)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # A zero-norm sum cannot be normalised; same NaN marker as above.
        return np.full_like(v, np.nan)
    return v / norm

In [5]:
# Two hand-written sentence pairs used as the demo input for the feature
# pipeline; row i of 'sentence1' is compared with row i of 'sentence2'.
d = {
    'sentence1': [
        ' I am going to India ',
        'I will be eating coffee',
    ],
    'sentence2': [
        'I am going to bharat',
        'I will be drinking coffee',
    ],
}
data = pd.DataFrame(d)

In [6]:
# Basic length / overlap features. Columns are added in a fixed order:
# len_q1, len_q2, diff_len, len_char_q1, len_char_q2, len_word_q1,
# len_word_q2, common_words.
q1_text = data['sentence1'].map(str)
q2_text = data['sentence2'].map(str)


def _distinct_chars(text):
    """Number of distinct characters in ``text`` once spaces are removed."""
    return len(set(text.replace(' ', '')))


def _shared_words(row):
    """Number of distinct lower-cased whitespace tokens present in both sentences."""
    left = set(str(row['sentence1']).lower().split())
    right = set(str(row['sentence2']).lower().split())
    return len(left & right)


# Raw string lengths (spaces included) and their signed difference.
data['len_q1'] = q1_text.map(len)
data['len_q2'] = q2_text.map(len)
data['diff_len'] = data['len_q1'] - data['len_q2']

# Despite the column name, these count *distinct* non-space characters.
data['len_char_q1'] = q1_text.map(_distinct_chars)
data['len_char_q2'] = q2_text.map(_distinct_chars)

# Whitespace-delimited token counts.
data['len_word_q1'] = q1_text.map(lambda text: len(text.split()))
data['len_word_q2'] = q2_text.map(lambda text: len(text.split()))

data['common_words'] = data.apply(_shared_words, axis=1)

In [7]:
# Fuzzy string-matching features: each scorer is applied row-wise to the
# (sentence1, sentence2) pair. The order below is the column order.
fuzz_scorers = (
    ('fuzz_qratio', fuzz.QRatio),
    ('fuzz_WRatio', fuzz.WRatio),
    ('fuzz_partial_ratio', fuzz.partial_ratio),
    ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
    ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
    ('fuzz_token_set_ratio', fuzz.token_set_ratio),
    ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
)
for feature_name, scorer in fuzz_scorers:
    # `scorer=scorer` binds the current scorer to the lambda at definition time.
    data[feature_name] = data.apply(
        lambda row, scorer=scorer: scorer(str(row['sentence1']), str(row['sentence2'])),
        axis=1,
    )


In [8]:
# Load the pretrained word2vec vectors twice from the same file: `model` keeps
# the vectors as loaded, `norm_model` is passed through init_sims(replace=True).
word2vec_path = 'data/GoogleNews-vectors-negative300.bin.gz'
load_vectors = gensim.models.KeyedVectors.load_word2vec_format

model = load_vectors(word2vec_path, binary=True)
norm_model = load_vectors(word2vec_path, binary=True)
norm_model.init_sims(replace=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [9]:
# Word Mover's Distance features, one value per sentence pair: first with the
# `wmd` helper, then with the `norm_wmd` helper.
for feature_name, distance_fn in (('wmd', wmd), ('norm_wmd', norm_wmd)):
    data[feature_name] = data.apply(
        lambda row, distance_fn=distance_fn: distance_fn(row['sentence1'], row['sentence2']),
        axis=1,
    )

In [10]:
# Word2Vec features: one sent2vec embedding per row for each sentence column.
EMBEDDING_DIM = 300  # row width of both sentence-vector matrices
error_count = 0  # not updated anywhere in this cell; kept in case other cells read it

# tqdm wraps the array itself rather than the enumerate() generator, so it can
# read the length and show a total / ETA instead of a bare iteration counter.
sentence1_vectors = np.zeros((data.shape[0], EMBEDDING_DIM))
for i, q in enumerate(tqdm(data.sentence1.values)):
    sentence1_vectors[i, :] = sent2vec(q)

sentence2_vectors = np.zeros((data.shape[0], EMBEDDING_DIM))
for i, q in enumerate(tqdm(data.sentence2.values)):
    sentence2_vectors[i, :] = sent2vec(q)

2it [00:00, 175.38it/s]
2it [00:00, 1734.98it/s]


In [11]:
# Pairwise distances between the two sentence embeddings of each row.
# Both matrices go through np.nan_to_num once and every metric reuses them.
q1_vectors_clean = np.nan_to_num(sentence1_vectors)
q2_vectors_clean = np.nan_to_num(sentence2_vectors)

# (column name, metric) in the order the columns are added.
distance_metrics = (
    ('cosine_distance', cosine),
    ('cityblock_distance', cityblock),
    ('jaccard_distance', jaccard),
    ('canberra_distance', canberra),
    ('euclidean_distance', euclidean),
    ('minkowski_distance', lambda u, v: minkowski(u, v, 3)),  # order-3 Minkowski
    ('braycurtis_distance', braycurtis),
)
for feature_name, metric in distance_metrics:
    data[feature_name] = [metric(u, v) for u, v in zip(q1_vectors_clean, q2_vectors_clean)]

In [12]:
# Skewness and kurtosis of each sentence embedding (taken over the vector's
# components), after np.nan_to_num. Tuple order is the column order.
moment_features = (
    ('skew_q1vec', skew, sentence1_vectors),
    ('skew_q2vec', skew, sentence2_vectors),
    ('kur_q1vec', kurtosis, sentence1_vectors),
    ('kur_q2vec', kurtosis, sentence2_vectors),
)
for feature_name, statistic, vectors in moment_features:
    data[feature_name] = [statistic(row) for row in np.nan_to_num(vectors)]

# Testing

In [13]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows containing NaN or +/-inf.

    The input frame is left untouched; rows are dropped on a copy.

    Args:
        df: ``pd.DataFrame`` whose surviving values must be castable to
            ``np.float64``.

    Returns:
        A new DataFrame holding only rows in which every value is finite and
        non-missing, cast to ``np.float64``. Original index labels are kept.

    Raises:
        AssertionError: If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() without inplace=True, so the caller's frame is not mutated.
    without_missing = df.dropna()
    # `axis` is passed by keyword: pandas 2.x rejects the positional form `.any(1)`.
    has_infinite = without_missing.isin([np.inf, -np.inf]).any(axis=1)
    return without_missing[~has_infinite].astype(np.float64)

In [14]:
# Feature matrix: every engineered column, without the two raw text columns.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.x no longer accepts.
Z = data.drop(columns=['sentence1', 'sentence2'])
Z.head()

Unnamed: 0,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzz_qratio,fuzz_WRatio,...,cityblock_distance,jaccard_distance,canberra_distance,euclidean_distance,minkowski_distance,braycurtis_distance,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec
0,21,20,1,9,11,5,5,4,77,77,...,11.18545,1.0,158.725751,0.794813,0.356511,0.441748,-0.121869,-0.135944,0.385477,-0.279203
1,23,25,-2,13,14,5,5,4,83,83,...,9.256322,1.0,146.048932,0.665047,0.300408,0.355684,0.164981,0.057053,0.551481,-0.196682


In [15]:
# X starts out as another name for Z (plain assignment, no copy is made);
# the bare expression below displays the whole frame.
X = Z
X

Unnamed: 0,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzz_qratio,fuzz_WRatio,...,cityblock_distance,jaccard_distance,canberra_distance,euclidean_distance,minkowski_distance,braycurtis_distance,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec
0,21,20,1,9,11,5,5,4,77,77,...,11.18545,1.0,158.725751,0.794813,0.356511,0.441748,-0.121869,-0.135944,0.385477,-0.279203
1,23,25,-2,13,14,5,5,4,83,83,...,9.256322,1.0,146.048932,0.665047,0.300408,0.355684,0.164981,0.057053,0.551481,-0.196682


In [16]:
import pickle

# Load the previously saved scaler object.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load a 'scaler.sav' that comes from a trusted source.
scalerfile = 'scaler.sav'
# The context manager closes the file handle even if unpickling fails.
with open(scalerfile, 'rb') as scaler_fh:
    min_max_scaler = pickle.load(scaler_fh)

In [17]:
# Scale from Z rather than from X itself: X is bound to Z before this cell
# first runs, so the first result is the same, and re-running the cell no
# longer scales values that were already scaled.
X = min_max_scaler.transform(Z)

In [18]:
# Loading the saved decision tree model pickle
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load a 'model.pkl' that comes from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open('model.pkl', 'rb') as loaded_pkl:
    clf = pickle.load(loaded_pkl)

In [19]:
# Predict one label per row of the scaled feature matrix with the loaded model.
prediction1 = clf.predict(X)

In [20]:
# Display the predicted labels (one entry per sentence pair in `data`).
prediction1

array([0., 1.])