In [2]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from bs4 import BeautifulSoup
import distance
from fuzzywuzzy import fuzz
from wordcloud import WordCloud, STOPWORDS
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from tqdm.notebook import tqdm_notebook

In [None]:
# Load the raw question-pair file and report how many rows it holds
df = pd.read_csv('train.csv')
print(f'Number of data points: {len(df)}')
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame
df.info()

In [None]:
# Bar chart of the number of rows carrying each is_duplicate label
class_counts = df.groupby('is_duplicate')['id'].count()
class_counts.plot.bar()

In [None]:
# One row per question pair, so the row count is the number of pairs
print(f'Total number of question pairs: {df.shape[0]}')

In [None]:
# Percentage of rows labelled is_duplicate == 1, rounded to two decimals;
# the other class is reported as the complement of that rounded value.
duplicate_pct = round((df.groupby('is_duplicate')['id'].count()[1] / df.shape[0]) * 100, 2)
print(f'Similar questions (is_duplicate=1): {duplicate_pct}%')
print(f'Dissimilar questions (is_duplicate=0): {100 - duplicate_pct}%')

In [None]:
# Pool the question ids from both columns into one Series
qids = pd.Series(df['qid1'].to_list() + df['qid2'].to_list())

# Occurrence count per question id, reused by the statistics below
qid_counts = qids.value_counts()

unique_qs = len(np.unique(qids))
print(f'Total number of unique questions: {unique_qs}')

qs_morethan_onetime = np.sum(qid_counts > 1)
print(f'Questions that appear more than one time: {qs_morethan_onetime}')

print(f'Max number of time a single question is repeated: {np.max(qid_counts)}')

In [None]:
# Compare the number of distinct questions with the number that occur more than once.
# NOTE: this cell rebinds the notebook-level names `x` and `y`.
x = ['Unique Questions', 'Repeated Questions']
y = [unique_qs, qs_morethan_onetime]
plt.figure(figsize=(10,6))
plt.title('Plot representing unique and repeated questions')
# The data vectors are passed by keyword: current seaborn releases no longer
# accept them as positional arguments.
sns.barplot(x=x, y=y)
plt.show()

In [None]:
# Collapse the frame to one row per (qid1, qid2) combination; any shortfall
# against the original row count is the number of repeated pairs.
pair_duplicates = (
    df[['qid1', 'qid2', 'is_duplicate']]
    .groupby(['qid1', 'qid2'])
    .count()
    .reset_index()
)
print(f'Duplicate data points: {df.shape[0] - pair_duplicates.shape[0]}')

In [None]:
# Histogram of how often each question id occurs, with a log-scaled count axis
plt.figure(figsize=(20,10))
plt.hist(qids.value_counts(), bins=160)
# `nonpositive` is the current name of the keyword that clips empty bins on a
# log axis; the older per-axis spelling `nonposy` is no longer accepted.
plt.yscale('log', nonpositive='clip')
plt.title('Log-Histogram of question appearance count')
plt.xlabel('Number of occurance')
plt.ylabel('Number of questions')
plt.show()

In [None]:
# Show each row that has at least one missing value (one entry per row),
# then replace every missing value with an empty string.
nan_rows = df[df.isnull().any(axis=1)]
print(nan_rows)
df = df.fillna('')

In [None]:
# Positional split: the first five columns become X, the last column becomes y
X, y = df.iloc[:, :5], df.iloc[:, -1]
print(X.shape, y.shape, sep='\n')

In [None]:
# Write stratified 80/20 train/test CSVs the first time this runs; when the
# training CSV already exists, reload it into `df` instead.
if not os.path.isfile('quora_train.csv'):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=7, stratify=y
    )
    train = [X_train, y_train]
    df_train = pd.concat(train, axis=1)
    df_train.to_csv('quora_train.csv', index=False)

    test = [X_test, y_test]
    df_test = pd.concat(test, axis=1)
    df_test.to_csv('quora_test.csv', index=False)
else:
    df = pd.read_csv('quora_train.csv', encoding='latin-1').fillna('')

In [None]:
# Features computed on the raw (uncleaned) question text. The result is cached
# in 'quora_fe_without_preprocessing.csv' and reloaded on later runs.
if not os.path.isfile('quora_fe_without_preprocessing.csv'):
    df = pd.read_csv('quora_train.csv', encoding='latin-1').fillna('')

    # qid1_freq / qid2_freq: number of rows sharing the same qid1 / qid2
    df['qid1_freq'] = df.groupby('qid1')['qid1'].transform('count')
    df['qid2_freq'] = df.groupby('qid2')['qid2'].transform('count')

    # q1_len / q2_len: character length of each question
    df['q1_len'] = df['question1'].str.len()
    df['q2_len'] = df['question2'].str.len()

    # q1_n_words / q2_n_words: number of pieces after splitting on a single space
    df['q1_n_words'] = df['question1'].apply(lambda text: len(text.split(' ')))
    df['q2_n_words'] = df['question2'].apply(lambda text: len(text.split(' ')))

    def normalized_common_words(row):
        """Return, as a float, how many distinct lower-cased words the two questions share."""
        w1 = {word.lower().strip() for word in row['question1'].split(' ')}
        w2 = {word.lower().strip() for word in row['question2'].split(' ')}
        return float(len(w1 & w2))

    # common_words: number of common unique words in question1 and question2
    df['common_words'] = df.apply(normalized_common_words, axis=1)

    def normalized_word_total(row):
        """Return, as a float, the sum of the distinct lower-cased word counts of both questions."""
        w1 = {word.lower().strip() for word in row['question1'].split(' ')}
        w2 = {word.lower().strip() for word in row['question2'].split(' ')}
        return float(len(w1) + len(w2))

    # total_words: distinct words in question1 plus distinct words in question2
    df['total_words'] = df.apply(normalized_word_total, axis=1)

    # shared_words: common_words/total_words
    df['shared_words'] = df['common_words'] / df['total_words']

    # qid1+qid2_freq: sum of the two id frequencies
    df['qid1+qid2_freq'] = df['qid1_freq'] + df['qid2_freq']

    # qid1-qid2_freq: absolute difference of the two id frequencies
    df['qid1-qid2_freq'] = (df['qid1_freq'] - df['qid2_freq']).abs()

    df.to_csv('quora_fe_without_preprocessing.csv', index=False)
else:
    df = pd.read_csv('quora_fe_without_preprocessing.csv', encoding='latin-1').fillna('')

In [None]:
# Smallest word count on each side of the pair, and how many questions have
# exactly that word count. The minimum is computed from the data instead of
# being assumed to be 1, so the counts always match the reported minimum.
min_q1_words = df['q1_n_words'].min()
min_q2_words = df['q2_n_words'].min()

print('Minimum length of questions in question1: {}'.format(min_q1_words))
print('Minimum length of questions in question2: {}'.format(min_q2_words))

print('Number of questions with minimum length in question1: {}'.format(df[df['q1_n_words']==min_q1_words].shape[0]))
print('Number of questions with minimum length in question2: {}'.format(df[df['q2_n_words']==min_q2_words].shape[0]))

In [None]:
# shared_words per class: violin plot on the left, overlaid distributions on the right
fig, (ax_violin, ax_dist) = plt.subplots(1, 2, figsize=(12, 8))

sns.violinplot(x='is_duplicate', y='shared_words', data=df, ax=ax_violin)

# Duplicates (label 1) in red first, then non-duplicates (label 0) in blue
for label, colour in ((1, 'red'), (0, 'blue')):
    sns.distplot(df[df['is_duplicate'] == label]['shared_words'], color=colour, ax=ax_dist)
plt.show()

In [None]:
# common_words per class: violin plot on the left, overlaid distributions on the right
fig, (ax_violin, ax_dist) = plt.subplots(1, 2, figsize=(12, 8))

sns.violinplot(x='is_duplicate', y='common_words', data=df, ax=ax_violin)

# Duplicates (label 1) in red first, then non-duplicates (label 0) in blue
for label, colour in ((1, 'red'), (0, 'blue')):
    sns.distplot(df[df['is_duplicate'] == label]['common_words'], color=colour, ax=ax_dist)
plt.show()

In [None]:
# Small constant added to the ratio denominators in get_token_features so a
# zero-sized set never causes a division by zero.
SAFE_DIV = 0.0001
# NLTK's English stop-word list; get_token_features uses it to separate stop
# words from the remaining words of a question.
STOP_WORDS = stopwords.words('english')

def pre_process(col):
    """Normalise one question string before the NLP features are computed.

    The steps run in this order:

    1. convert the value to ``str`` and lower-case it;
    2. apply a fixed list of literal replacements (thousands/millions
       abbreviations, apostrophe variants, English contractions, percent and
       currency symbols);
    3. abbreviate digit runs ending in ``000000`` / ``000`` to ``m`` / ``k``;
    4. replace every non-word character with a single space;
    5. run the Porter stemmer and return the text extracted by BeautifulSoup.

    Parameters
    ----------
    col : object
        The raw question; any value is accepted because it is passed through
        ``str()`` first.

    Returns
    -------
    str
        The cleaned question text.
    """
    col = str(col).lower()

    # The order of the replacements matters (e.g. "won't" must be handled
    # before the generic "n't"), so it is kept exactly as a sequence.
    col = (
        col.replace(",000,000", "m")
        .replace(",000", "k")
        .replace("′", "'")
        .replace("’", "'")
        .replace("won't", "will not")
        .replace("cannot", "can not")
        .replace("can't", "can not")
        .replace("n't", " not")
        .replace("what's", "what is")
        .replace("it's", "it is")
        .replace("'ve", " have")
        .replace("i'm", "i am")
        .replace("'re", " are")
        .replace("he's", "he is")
        .replace("she's", "she is")
        .replace("'s", " own")
        .replace("%", " percent ")
        .replace("₹", " rupee ")
        .replace("$", " dollar ")
        .replace("€", " euro ")
        .replace("'ll", " will")
    )

    col = re.sub(r"([0-9]+)000000", r"\1m", col)
    col = re.sub(r"([0-9]+)000", r"\1k", col)

    # Raw string literal: '\W' (any non-alphanumeric character) written in a
    # plain string is an invalid escape sequence.
    col = re.sub(r'\W', ' ', col)

    # NOTE(review): the stemmer receives the whole question as a single
    # "word", not one token at a time. Kept as-is because changing it would
    # alter every downstream feature; confirm whether per-token stemming was
    # intended.
    col = PorterStemmer().stem(col)

    # Name the parser explicitly so the result does not depend on which
    # optional HTML parsers happen to be installed.
    # NOTE(review): '<' and '>' were already replaced by spaces above, so no
    # markup is left for this step to strip - confirm the intended order.
    return BeautifulSoup(col, 'html.parser').get_text()

In [None]:
def get_token_features(q1, q2):
    """Compute ten token-overlap features for a pair of questions.

    Parameters
    ----------
    q1, q2 : str
        The two question strings; they are tokenised on whitespace.

    Returns
    -------
    list
        Ten values, in this order:

        0. cwc_min  - common non-stop words / smaller non-stop word count
        1. cwc_max  - common non-stop words / larger non-stop word count
        2. csc_min  - common stop words / smaller stop word count
        3. csc_max  - common stop words / larger stop word count
        4. ctc_min  - common tokens / smaller token count
        5. ctc_max  - common tokens / larger token count
        6. last_word_eq  - 1 if both questions end with the same token
        7. first_word_eq - 1 if both questions start with the same token
        8. abs_len_diff  - absolute difference of the token counts
        9. mean_length   - mean of the two token counts

        A list of ten zeros is returned when either question has no tokens.
    """
    # create a list of size 10 and initialize each element with 0
    token_features = [0.0]*10

    # split() without an argument discards the empty strings that repeated,
    # leading or trailing spaces would otherwise produce; such empty "tokens"
    # would count as shared words and make the first/last token comparisons
    # succeed for unrelated questions. It also yields an empty list for a
    # blank question, which is what the guard below relies on.
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if not q1_tokens or not q2_tokens:
        return token_features

    # words other than stop words
    q1_words = {word for word in q1_tokens if word not in STOP_WORDS}
    q2_words = {word for word in q2_tokens if word not in STOP_WORDS}

    # stop words
    q1_stops = {word for word in q1_tokens if word in STOP_WORDS}
    q2_stops = {word for word in q2_tokens if word in STOP_WORDS}

    common_token_count = len(set(q1_tokens) & set(q2_tokens))
    common_words_count = len(q1_words & q2_words)
    common_stop_count = len(q1_stops & q2_stops)

    # SAFE_DIV keeps every denominator non-zero when one of the sets is empty
    # cwc_min / cwc_max
    token_features[0] = common_words_count/(min(len(q1_words),len(q2_words))+SAFE_DIV)
    token_features[1] = common_words_count/(max(len(q1_words),len(q2_words))+SAFE_DIV)

    # csc_min / csc_max
    token_features[2] = common_stop_count/(min(len(q1_stops),len(q2_stops))+SAFE_DIV)
    token_features[3] = common_stop_count/(max(len(q1_stops),len(q2_stops))+SAFE_DIV)

    # ctc_min / ctc_max
    token_features[4] = common_token_count/(min(len(q1_tokens),len(q2_tokens))+SAFE_DIV)
    token_features[5] = common_token_count/(max(len(q1_tokens),len(q2_tokens))+SAFE_DIV)

    # last_word_eq
    token_features[6] = int(q1_tokens[-1]==q2_tokens[-1])

    # first_word_eq
    token_features[7] = int(q1_tokens[0]==q2_tokens[0])

    # abs_len_diff
    token_features[8] = abs(len(q1_tokens)-len(q2_tokens))

    # mean_length
    token_features[9] = (len(q1_tokens)+len(q2_tokens))/2

    return token_features

In [None]:
def get_longest_substr_ratio(a, b):
    """Return the longest-common-substring length relative to the shorter input.

    longest_substr_ratio = len(longest common substring) / (min(len(a), len(b)) + 1)

    Parameters
    ----------
    a, b : str
        The two strings to compare.

    Returns
    -------
    int or float
        ``0`` when ``distance.lcsubstrings`` finds no common substring,
        otherwise the ratio described above.
    """
    strs = list(distance.lcsubstrings(a, b))
    if len(strs) == 0:
        return 0
    # Normalise by the length of the shorter string; the +1 belongs to the
    # denominator and must not be passed to min() as a third candidate, which
    # would cap the denominator at 1 and return a raw length.
    return len(strs[0]) / (min(len(a), len(b)) + 1)

In [None]:
def extract_features(df):
    """Clean both question columns and append the token and fuzzy-match features.

    The frame is modified in place (the question columns are overwritten with
    their pre-processed text and fifteen feature columns are added) and the
    same frame is returned.
    """
    for question_col in ('question1', 'question2'):
        df[question_col] = df[question_col].apply(pre_process)

    token_features = df.apply(
        lambda row: get_token_features(row['question1'], row['question2']), axis=1
    )

    # Column names in the order get_token_features returns its values
    token_feature_names = (
        'cwc_min', 'cwc_max', 'csc_min', 'csc_max', 'ctc_min', 'ctc_max',
        'last_word_eq', 'first_word_eq', 'abs_len_diff', 'mean_len',
    )
    for position, name in enumerate(token_feature_names):
        df[name] = [features[position] for features in token_features]

    # Each scorer takes the two cleaned question strings of one row
    pairwise_scorers = (
        ('token_set_ratio', fuzz.token_set_ratio),
        ('token_sort_ratio', fuzz.token_sort_ratio),
        ('fuzz_ratio', fuzz.QRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('longest_substr_ratio', get_longest_substr_ratio),
    )
    for name, scorer in pairwise_scorers:
        df[name] = df.apply(
            lambda row, scorer=scorer: scorer(row['question1'], row['question2']), axis=1
        )

    return df

In [None]:
# Load the cached NLP features, or compute them from the training split and cache them
if os.path.isfile('quora_fe_nlp.csv'):
    df = pd.read_csv('quora_fe_nlp.csv', encoding='latin-1')
    # fillna returns a new frame, so the result has to be assigned back
    df = df.fillna('')
else:
    df = pd.read_csv('quora_train.csv', encoding='latin-1')
    df = df.fillna('')
    df = extract_features(df)
    df.to_csv('quora_fe_nlp.csv', index=False)

In [None]:
# Load the per-class question text when both dump files exist; otherwise
# write the dumps from the current frame.
if os.path.isfile('quora_train_p.txt') and os.path.isfile('quora_train_n.txt'):
    # Context managers close the files as soon as they have been read
    with open('quora_train_p.txt') as file_p:
        text_p = file_p.read()
    with open('quora_train_n.txt') as file_n:
        text_n = file_n.read()
else:
    df_duplicate = df[df['is_duplicate']==1]
    df_nonduplicate = df[df['is_duplicate']==0]

    # Interleave question1 and question2 of each pair into one flat array per class
    p = np.dstack([df_duplicate['question1'], df_duplicate['question2']]).flatten()
    n = np.dstack([df_nonduplicate['question1'], df_nonduplicate['question2']]).flatten()

    print('Number of data points in class 1: {}'.format(len(p)))
    print('Number of data points in class 2: {}'.format(len(n)))

    np.savetxt('quora_train_p.txt', p, delimiter=' ', fmt='%s')
    np.savetxt('quora_train_n.txt', n, delimiter=' ', fmt='%s')

In [None]:
# Read the per-class text dumps; the context managers close the files again
with open('quora_train_p.txt') as file_p:
    text_p = file_p.read()
with open('quora_train_n.txt') as file_n:
    text_n = file_n.read()

# Word-cloud stop words: start from the wordcloud defaults, add a few noise
# tokens and keep 'not', 'no' and 'like' visible in the clouds.
# NOTE: this rebinds the notebook-level name `stopwords`, which the imports
# cell bound to nltk.corpus.stopwords; re-running the cell that builds
# STOP_WORDS after this one will fail until the imports cell is run again.
stopwords = set(STOPWORDS)
stopwords.update(['said', 'br', ' '])
for kept_word in ('not', 'no', 'like'):
    # discard() does not raise when the word is not in the default list
    stopwords.discard(kept_word)

# Count whitespace-separated words; len() of the text itself would count characters
print('Total number of words in duplicate pair: {}'.format(len(text_p.split())))
print('Total number of words in non duplicate pair: {}'.format(len(text_n.split())))

In [None]:
# Word cloud built from the text of the duplicate pairs
wc = WordCloud(
    background_color='white',
    max_words=len(text_p),
    stopwords=stopwords,
).generate(text_p)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from the text of the non-duplicate pairs
wc = WordCloud(
    background_color='white',
    max_words=len(text_n),
    stopwords=stopwords,
).generate(text_n)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Pairwise scatter plots of four NLP features, coloured by class
pairplot_vars = ['ctc_min', 'cwc_min', 'csc_min', 'token_sort_ratio']
sns.pairplot(df[pairplot_vars + ['is_duplicate']], hue='is_duplicate', vars=pairplot_vars)
plt.show()

In [None]:
# token_sort_ratio per class: violin plot on the left, overlaid distributions on the right
fig, (ax_violin, ax_dist) = plt.subplots(1, 2, figsize=(10, 8))

sns.violinplot(x='is_duplicate', y='token_sort_ratio', data=df, ax=ax_violin)

# Duplicates (label 1) in red first, then non-duplicates (label 0) in blue
for label, colour in ((1, 'red'), (0, 'blue')):
    sns.distplot(df[df['is_duplicate'] == label]['token_sort_ratio'], color=colour, ax=ax_dist)
plt.show()

In [None]:
# fuzz_ratio per class: violin plot on the left, overlaid distributions on the right
fig, (ax_violin, ax_dist) = plt.subplots(1, 2, figsize=(10, 8))

sns.violinplot(x='is_duplicate', y='fuzz_ratio', data=df, ax=ax_violin)

# Duplicates (label 1) in red first, then non-duplicates (label 0) in blue
for label, colour in ((1, 'red'), (0, 'blue')):
    sns.distplot(df[df['is_duplicate'] == label]['fuzz_ratio'], color=colour, ax=ax_dist)
plt.show()

In [None]:
# The fifteen NLP feature columns fed to t-SNE
tsne_feature_cols = [
    'cwc_min', 'cwc_max', 'csc_min', 'csc_max', 'ctc_min', 'ctc_max',
    'last_word_eq', 'first_word_eq', 'abs_len_diff', 'mean_len',
    'token_set_ratio', 'token_sort_ratio', 'fuzz_ratio', 'fuzz_partial_ratio',
    'longest_substr_ratio',
]

# Embed only the first 10000 rows; the features are scaled to [0, 1] first.
# NOTE: this cell rebinds the notebook-level names `X` and `y`.
df_subsampled = df.head(10000)
X = MinMaxScaler().fit_transform(df_subsampled[tsne_feature_cols])
y = df_subsampled['is_duplicate'].values

tsne = TSNE(random_state=7).fit_transform(X)

In [None]:
# Scatter plot of the 2-D t-SNE embedding, coloured by class label.
# NOTE: this rebinds `df` to the embedding frame, so the feature frame used
# by the earlier cells is no longer available under that name afterwards.
df = pd.DataFrame({'x':tsne[:,0], 'y':tsne[:,1], 'label':y})

# `height` is the current name of the facet-size keyword; the older `size`
# spelling is no longer accepted by seaborn.
sns.lmplot(data=df, x='x', y='y', hue='label', fit_reg=False, height=8, palette='Set1', markers=['s', 'o'])
plt.title('Perplexity: {} and Max-iter {}'.format(30,1000))
plt.show()

In [None]:
# IDF-weighted sum of spaCy token vectors for every question1, cached in
# 'quora_tfidf_q1.csv' (one row per question, one column per vector dimension).
if not os.path.isfile('quora_tfidf_q1.csv'):
    df = pd.read_csv('quora_train.csv', encoding='latin-1')
    df = df.fillna('')
    questions = list(df['question1']) + list(df['question2'])
    tfidf = TfidfVectorizer()
    # Only the learned vocabulary and idf_ weights are needed, not the matrix
    tfidf.fit(questions)
    # get_feature_names() was removed from recent scikit-learn releases; use
    # it only when the replacement method is not available.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    word_idf = dict(zip(feature_names, tfidf.idf_))
    nlp = spacy.load('en_core_web_sm')
    # Used only to find the length of a token vector
    sample = nlp('Hi')

    vect_form_1 = []
    for que_1 in tqdm_notebook(list(df['question1'])):
        tokens_1 = nlp(que_1)
        vec_1 = np.zeros([len(sample[0].vector)])
        for token_1 in tokens_1:
            wv_1 = token_1.vector
            # TfidfVectorizer lower-cases its vocabulary by default, so the
            # token is lower-cased before the lookup; tokens that are not in
            # the vocabulary get weight 0.
            idf = word_idf.get(str(token_1).lower(), 0)
            vec_1 += wv_1*idf
        vect_form_1.append(vec_1)
    df['q1_vect_form'] = vect_form_1
    df_q1 = pd.DataFrame(df['q1_vect_form'].values.tolist())
    df_q1.to_csv('quora_tfidf_q1.csv', index=False)

In [None]:
# IDF-weighted sum of spaCy token vectors for every question2, cached in
# 'quora_tfidf_q2.csv' (one row per question, one column per vector dimension).
if not os.path.isfile('quora_tfidf_q2.csv'):
    df = pd.read_csv('quora_train.csv', encoding='latin-1')
    df = df.fillna('')
    questions = list(df['question1']) + list(df['question2'])
    tfidf = TfidfVectorizer()
    # Only the learned vocabulary and idf_ weights are needed, not the matrix
    tfidf.fit(questions)
    # get_feature_names() was removed from recent scikit-learn releases; use
    # it only when the replacement method is not available.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    word_idf = dict(zip(feature_names, tfidf.idf_))
    nlp = spacy.load('en_core_web_sm')
    # Used only to find the length of a token vector
    sample = nlp('Hi')

    vect_form_2 = []
    for que_2 in tqdm_notebook(list(df['question2'])):
        tokens_2 = nlp(que_2)
        vec_2 = np.zeros([len(sample[0].vector)])
        for token_2 in tokens_2:
            wv_2 = token_2.vector
            # TfidfVectorizer lower-cases its vocabulary by default, so the
            # token is lower-cased before the lookup; tokens that are not in
            # the vocabulary get weight 0.
            idf = word_idf.get(str(token_2).lower(), 0)
            vec_2 += wv_2*idf
        vect_form_2.append(vec_2)
    df['q2_vect_form'] = vect_form_2
    df_q2 = pd.DataFrame(df['q2_vect_form'].values.tolist())
    df_q2.to_csv('quora_tfidf_q2.csv', index=False)

In [16]:
# Join the basic features, the NLP features and both question-vector tables
# on 'id' and cache the result in 'quora_final_features.csv'.
if not os.path.isfile('quora_final_features.csv'):

    df_fe_wo_pp = pd.read_csv('quora_fe_without_preprocessing.csv', encoding='latin-1')

    # Drop the columns the basic-feature frame already carries, keeping 'id' as the join key
    df_nlp = pd.read_csv('quora_fe_nlp.csv', encoding='latin-1').drop(
        columns=['qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
    )

    # The vector tables have no id column; attach the ids of the basic-feature frame by row position
    df_q1 = pd.read_csv('quora_tfidf_q1.csv', encoding='latin-1')
    df_q1['id'] = df_fe_wo_pp['id']

    df_q2 = pd.read_csv('quora_tfidf_q2.csv', encoding='latin-1')
    df_q2['id'] = df_fe_wo_pp['id']

    df = (
        df_fe_wo_pp
        .merge(df_nlp, on='id', how='left')
        .merge(df_q1, on='id', how='left')
        .merge(df_q2, on='id', how='left')
    )

    df.to_csv('quora_final_features.csv', index=False)