In [94]:
import numpy as np
import pandas as pd
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from bs4 import BeautifulSoup
import distance
import warnings
warnings.filterwarnings('ignore')
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from tqdm.notebook import tqdm_notebook
from wordcloud import STOPWORDS

In [61]:
# Load the training split; the qid frequency tables are derived from this frame.
df_train = pd.read_csv('quora_train.csv', encoding='latin-1')

In [62]:
# Lookup tables: question id -> number of rows of the training split in which
# that id appears in the 'qid1' column (resp. the 'qid2' column).
qid1_count = df_train['qid1'].value_counts().to_dict()
qid2_count = df_train['qid2'].value_counts().to_dict()

In [63]:
# Load the test split for inspection; the feature cells further down re-read
# 'quora_test.csv' themselves instead of reusing this frame.
df = pd.read_csv('quora_test.csv', encoding='latin-1')

In [3]:
# Preview the first rows to check which columns were parsed from the CSV.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,375001,243548,505985,Why do I feel stressed with no reason behind?,"I feel stressed, pressured and lonely. Can any...",0
1,187485,285732,285733,Can I charge the two 12v DC batteries that pow...,I have a 2.5KVA inverter with two 12v 200AH ba...,0
2,49537,88159,88160,How do I delete the Facebook Messenger account...,How can I delete a Facebook messenger account ...,1
3,36302,66208,66209,What is a symbol?,What is symbolic programming?,0
4,77252,131935,131936,Is it possible to change your class schedule i...,Should high school classes be optional?,0


In [64]:
def get_qid1_counts(col):
    """Return the entry stored for question id `col` in `qid1_count`, or 0 when absent."""
    return qid1_count.get(col, 0)

In [72]:
def get_qid2_counts(col):
    """Return the entry stored for question id `col` in `qid2_count`, or 0 when absent."""
    return qid2_count.get(col, 0)

In [75]:
# Cache guard: the basic (no text pre-processing) features are only rebuilt
# when their CSV is missing.
if not os.path.isfile('quora_test_fe_without_preprocessing.csv'):
    df = pd.read_csv('quora_test.csv', encoding='latin-1').fillna('')

    def _unique_words(text):
        """Unique lower-cased, stripped pieces of `text` split on single spaces."""
        return {piece.lower().strip() for piece in text.split(' ')}

    def _piece_count(text):
        """Number of pieces `text` yields when split on single spaces."""
        return len(text.split(' '))

    def normalized_common_words(row):
        """common_words: count of unique words present in both questions."""
        in_both = _unique_words(row['question1']) & _unique_words(row['question2'])
        return 1.0 * len(in_both)

    def normalized_word_total(row):
        """total_words: unique-word count of question1 plus that of question2."""
        first_size = len(_unique_words(row['question1']))
        second_size = len(_unique_words(row['question2']))
        return 1.0 * (first_size + second_size)

    # qid1_freq / qid2_freq: frequency looked up for each question id
    df['qid1_freq'] = df['qid1'].apply(get_qid1_counts)
    df['qid2_freq'] = df['qid2'].apply(get_qid2_counts)

    # q1_len / q2_len: character length of each question
    df['q1_len'] = df['question1'].str.len()
    df['q2_len'] = df['question2'].str.len()

    # q1_n_words / q2_n_words: number of space-separated pieces per question
    df['q1_n_words'] = df['question1'].apply(_piece_count)
    df['q2_n_words'] = df['question2'].apply(_piece_count)

    # common_words, total_words and their ratio shared_words
    df['common_words'] = df.apply(normalized_common_words, axis=1)
    df['total_words'] = df.apply(normalized_word_total, axis=1)
    df['shared_words'] = df['common_words'] / df['total_words']

    # qid1+qid2_freq: sum of both frequencies
    df['qid1+qid2_freq'] = df['qid1_freq'] + df['qid2_freq']

    # qid1-qid2_freq: absolute difference of both frequencies
    df['qid1-qid2_freq'] = (df['qid1_freq'] - df['qid2_freq']).abs()

    df.to_csv('quora_test_fe_without_preprocessing.csv', index=False)

In [80]:
# Small constant added to the ratio denominators in get_token_features so that
# an empty word/stop-word set never causes a division by zero.
SAFE_DIV = 0.0001
# Stored as a frozenset rather than the list NLTK returns: get_token_features
# tests every token for membership several times per question pair, and a set
# lookup is constant-time while a list lookup scans the whole list.
STOP_WORDS = frozenset(stopwords.words('english'))

def pre_process(col):
    """Normalise one question for the token/fuzzy feature extraction.

    Steps, in order: convert to ``str`` and lower-case; expand number
    suffixes, contractions and currency/percent symbols; collapse long runs
    of zeros into ``m``/``k``; replace every non-word character with a space;
    pass the result through ``PorterStemmer().stem`` and
    ``BeautifulSoup(...).get_text()``.

    Parameters
    ----------
    col : any
        Raw cell value; it is converted with ``str()`` first, so non-string
        values are accepted.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(col).lower()

    # Applied strictly in this order: later rules rely on earlier ones
    # (e.g. "can't" must be handled before the generic "n't" rule).
    replacements = (
        (",000,000", "m"),
        (",000", "k"),
        ("′", "'"),
        ("’", "'"),
        ("won't", "will not"),
        ("cannot", "can not"),
        ("can't", "can not"),
        ("n't", " not"),
        ("what's", "what is"),
        ("it's", "it is"),
        ("'ve", " have"),
        ("i'm", "i am"),
        ("'re", " are"),
        ("he's", "he is"),
        ("she's", "she is"),
        ("'s", " own"),
        ("%", " percent "),
        ("₹", " rupee "),
        ("$", " dollar "),
        ("€", " euro "),
        ("'ll", " will"),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    text = re.sub(r"([0-9]+)000000", r"\1m", text)
    text = re.sub(r"([0-9]+)000", r"\1k", text)

    # r'\W' matches any non-alphanumeric character. It has to be a raw string:
    # '\W' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\W', ' ', text)

    # NOTE(review): stem() receives the whole sentence as one "word", not one
    # token at a time. Kept unchanged so the produced features do not shift;
    # confirm whether per-token stemming was intended.
    text = PorterStemmer().stem(text)

    # NOTE(review): '<' and '>' were already replaced by spaces above, so no
    # markup is left for BeautifulSoup to strip at this point. Call kept
    # unchanged for output compatibility.
    return BeautifulSoup(text).get_text()

In [81]:
def get_token_features(q1, q2):
    """Compute ten token-overlap features for a pair of question strings.

    Parameters
    ----------
    q1, q2 : str
        The two questions to compare.

    Returns
    -------
    list
        Ten values, by position:
        0 cwc_min, 1 cwc_max   - common non-stop words / min, max non-stop word count
        2 csc_min, 3 csc_max   - common stop words / min, max stop word count
        4 ctc_min, 5 ctc_max   - common tokens / min, max token count
        6 last_word_eq         - 1 if the last tokens are equal, else 0
        7 first_word_eq        - 1 if the first tokens are equal, else 0
        8 abs_len_diff         - absolute difference of the token counts
        9 mean_length          - mean of the two token counts
        All ten are 0.0 when either question has no tokens.

    Notes
    -----
    Tokens are produced with ``str.split()`` (any whitespace run is one
    separator). ``split(' ')`` never returns an empty list and yields ``''``
    tokens for repeated or trailing spaces, which made the empty-input guard
    unreachable and let ``last_word_eq`` compare ``'' == ''``.
    NOTE(review): features previously generated with ``split(' ')`` (for
    example for a training split) presumably need to be regenerated so both
    sides are computed the same way.
    """
    # create a list of size 10 and initialize each element with 0
    token_features = [0.0] * 10

    # convert sentence to tokens, ignoring empty pieces
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if not q1_tokens or not q2_tokens:
        return token_features

    # words other than stopwords
    q1_words = {word for word in q1_tokens if word not in STOP_WORDS}
    q2_words = {word for word in q2_tokens if word not in STOP_WORDS}

    # stopwords
    q1_stops = {word for word in q1_tokens if word in STOP_WORDS}
    q2_stops = {word for word in q2_tokens if word in STOP_WORDS}

    common_token_count = len(set(q1_tokens) & set(q2_tokens))
    common_words_count = len(q1_words & q2_words)
    common_stop_count = len(q1_stops & q2_stops)

    # SAFE_DIV keeps the denominators non-zero when a set is empty.
    # cwc_min / cwc_max
    token_features[0] = common_words_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_words_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)

    # csc_min / csc_max
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)

    # ctc_min / ctc_max (denominators use the full token lists, duplicates included)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)

    # last_word_eq
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])

    # first_word_eq
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    # abs_len_diff
    token_features[8] = abs(len(q1_tokens) - len(q2_tokens))

    # mean_length
    token_features[9] = (len(q1_tokens) + len(q2_tokens)) / 2

    return token_features

In [84]:
# longest_substr_ratio: len(longest common substring) / min(len(a), len(b))
def get_longest_substr_ratio(a, b):
    """Return the longest-common-substring length relative to the shorter input.

    Parameters
    ----------
    a, b : str
        The two strings to compare; lengths are measured with ``len()``.

    Returns
    -------
    int or float
        ``0`` when ``distance.lcsubstrings`` reports no common substring or
        when either input is empty; otherwise the length of one longest
        common substring divided by ``min(len(a), len(b))``.
    """
    common = list(distance.lcsubstrings(a, b))
    shorter = min(len(a), len(b))
    if not common or shorter == 0:
        return 0
    # The denominator must be the shorter input's length. Passing a literal 1
    # into min() as a third argument pins it to 1 for all non-empty inputs and
    # turns the "ratio" into the raw substring length.
    return len(common[0]) / shorter

In [85]:
def extract_features(df):
    """Add token-overlap and fuzzy-matching feature columns to `df` and return it.

    `df` is modified in place: its 'question1' and 'question2' columns are
    first overwritten with their `pre_process`-ed text, then the feature
    columns are appended in the order listed below.
    """
    for question_col in ('question1', 'question2'):
        df[question_col] = df[question_col].apply(pre_process)

    # One 10-element list per row, as returned by get_token_features.
    per_row = df.apply(
        lambda row: get_token_features(row['question1'], row['question2']),
        axis=1,
    )

    # Column names in the same positional order as the returned list.
    token_columns = (
        'cwc_min', 'cwc_max',
        'csc_min', 'csc_max',
        'ctc_min', 'ctc_max',
        'last_word_eq', 'first_word_eq',
        'abs_len_diff', 'mean_len',
    )
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in per_row]

    # Pairwise string-similarity scores, each computed row by row.
    pair_scorers = (
        ('token_set_ratio', fuzz.token_set_ratio),
        ('token_sort_ratio', fuzz.token_sort_ratio),
        ('fuzz_ratio', fuzz.QRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('longest_substr_ratio', get_longest_substr_ratio),
    )
    for column, scorer in pair_scorers:
        df[column] = df.apply(
            lambda row, scorer=scorer: scorer(row['question1'], row['question2']),
            axis=1,
        )

    return df

In [91]:
# Token/fuzzy features for the test split; skipped when the cached CSV exists.
if not os.path.isfile('quora_test_fe_nlp.csv'):
    df = extract_features(
        pd.read_csv('quora_test.csv', encoding='latin-1').fillna('')
    )
    df.to_csv('quora_test_fe_nlp.csv', index=False)

In [97]:
# IDF-weighted sum of spaCy token vectors for question1 of the test split.
# Skipped when the cached CSV already exists.
if not os.path.isfile('quora_test_tfidf_q1.csv'):
    df = pd.read_csv('quora_test.csv', encoding='latin-1')
    df = df.fillna('')

    df_train = pd.read_csv('quora_train.csv', encoding='latin-1')
    df_train = df_train.fillna('')

    # IDF weights are learned from the training questions only.
    questions = list(df_train['question1']) + list(df_train['question2'])
    tfidf = TfidfVectorizer()
    # fit() is enough: only the learned vocabulary and idf_ are used below.
    tfidf.fit(questions)
    # vocabulary_ maps term -> column index, which is also the index into idf_.
    # This avoids get_feature_names(), which was removed in scikit-learn 1.2.
    word_idf = {term: tfidf.idf_[index] for term, index in tfidf.vocabulary_.items()}
    # NOTE(review): TfidfVectorizer() lower-cases by default, while the lookup
    # below uses the token text as written, so capitalised tokens presumably
    # never match and get weight 0. Left unchanged; confirm the intent.
    nlp = spacy.load('en_core_web_sm')
    sample = nlp('Hi')
    # Length of one token vector, taken from a sample document.
    vector_size = len(sample[0].vector)

    vect_form_1 = []
    for que_1 in tqdm_notebook(list(df['question1'])):
        vec_1 = np.zeros([vector_size])
        for token_1 in nlp(que_1):
            # Tokens absent from the training vocabulary get weight 0.
            vec_1 += token_1.vector * word_idf.get(str(token_1), 0)
        vect_form_1.append(vec_1)
    df['q1_vect_form'] = vect_form_1
    df_q1 = pd.DataFrame(df['q1_vect_form'].values.tolist())
    df_q1.to_csv('quora_test_tfidf_q1.csv', index=False)

HBox(children=(IntProgress(value=0, max=80858), HTML(value='')))




In [98]:
# IDF-weighted sum of spaCy token vectors for question2 of the test split.
# Skipped when the cached CSV already exists.
if not os.path.isfile('quora_test_tfidf_q2.csv'):
    df = pd.read_csv('quora_test.csv', encoding='latin-1')
    df = df.fillna('')

    df_train = pd.read_csv('quora_train.csv', encoding='latin-1')
    df_train = df_train.fillna('')

    # IDF weights are learned from the training questions only.
    questions = list(df_train['question1']) + list(df_train['question2'])
    tfidf = TfidfVectorizer()
    # fit() is enough: only the learned vocabulary and idf_ are used below.
    tfidf.fit(questions)
    # vocabulary_ maps term -> column index, which is also the index into idf_.
    # This avoids get_feature_names(), which was removed in scikit-learn 1.2.
    word_idf = {term: tfidf.idf_[index] for term, index in tfidf.vocabulary_.items()}
    # NOTE(review): TfidfVectorizer() lower-cases by default, while the lookup
    # below uses the token text as written, so capitalised tokens presumably
    # never match and get weight 0. Left unchanged; confirm the intent.
    nlp = spacy.load('en_core_web_sm')
    sample = nlp('Hi')
    # Length of one token vector, taken from a sample document.
    vector_size = len(sample[0].vector)

    vect_form_2 = []
    for que_2 in tqdm_notebook(list(df['question2'])):
        vec_2 = np.zeros([vector_size])
        for token_2 in nlp(que_2):
            # Tokens absent from the training vocabulary get weight 0.
            vec_2 += token_2.vector * word_idf.get(str(token_2), 0)
        vect_form_2.append(vec_2)
    df['q2_vect_form'] = vect_form_2
    df_q2 = pd.DataFrame(df['q2_vect_form'].values.tolist())
    df_q2.to_csv('quora_test_tfidf_q2.csv', index=False)

HBox(children=(IntProgress(value=0, max=80858), HTML(value='')))




In [99]:
# Assemble the final test feature table from the cached intermediate CSVs;
# skipped when the output file already exists.
if not os.path.isfile('quora_test_final_features.csv'):
    df_fe_wo_pp = pd.read_csv('quora_test_fe_without_preprocessing.csv', encoding='latin-1')

    # Drop these columns from the NLP features before the join, so only
    # df_fe_wo_pp contributes them.
    df_nlp = pd.read_csv('quora_test_fe_nlp.csv', encoding='latin-1').drop(
        columns=['qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
    )

    # The vector files have no key column; 'id' is attached through index
    # alignment with df_fe_wo_pp (both frames carry read_csv's default index).
    df_q1 = pd.read_csv('quora_test_tfidf_q1.csv', encoding='latin-1')
    df_q1['id'] = df_fe_wo_pp['id']

    df_q2 = pd.read_csv('quora_test_tfidf_q2.csv', encoding='latin-1')
    df_q2['id'] = df_fe_wo_pp['id']

    # Left joins on 'id', in this order, so overlapping column names receive
    # pandas' default suffixes deterministically.
    df = (
        df_fe_wo_pp
        .merge(df_nlp, on='id', how='left')
        .merge(df_q1, on='id', how='left')
        .merge(df_q2, on='id', how='left')
    )

    df.to_csv('quora_test_final_features.csv', index=False)