In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
DATA_FPATH = "../data/train.csv"

In [3]:
raw_data = pd.read_csv(DATA_FPATH, index_col=0)
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   qid1          404290 non-null  int64 
 1   qid2          404290 non-null  int64 
 2   question1     404289 non-null  object
 3   question2     404288 non-null  object
 4   is_duplicate  404290 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


In [4]:
raw_data.sample(3)

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
22322,41913,41914,What are the best and iconic movie posters ?,Where can I get free movie posters?,0
48238,86043,86044,Medical Research: What percentage of people ar...,Why are some people angry about the use of lab...,0
306067,7087,34348,How can an adult male increase his height afte...,Can height be increased after age 21?,1


In [5]:
raw_data.isnull().sum()

qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [6]:
raw_data.dropna(inplace=True)

In [7]:
raw_data["is_duplicate"].value_counts(normalize=True)

0    0.630799
1    0.369201
Name: is_duplicate, dtype: float64

## Data preprocessing

In [8]:
qid1 = raw_data.loc[:, ["qid1", "question1"]].drop_duplicates()\
               .rename(columns = {"qid1": "qid", "question1": "question"})
qid2 = raw_data.loc[:, ["qid2", "question2"]].drop_duplicates()\
               .rename(columns = {"qid2": "qid", "question2": "question"})
questions = pd.concat([qid1, qid2], ignore_index=True).drop_duplicates().set_index("qid")

In [9]:
questions.shape

(537929, 1)

In [10]:
questions.sample(5)

Unnamed: 0_level_0,question
qid,Unnamed: 1_level_1
327953,My 14 year old daughter has had constant head ...
80100,Who invented the moment magnitude scale? What ...
174461,How does ISIS get its money?
78857,What is wau in math?
337757,What is the difference between a moral princip...


In [11]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

In [12]:
## initialise the inbuilt Stemmer
stemmer = PorterStemmer()

In [13]:
## We can also use Lemmatizer instead of Stemmer
lemmatizer = WordNetLemmatizer()

In [63]:
def preprocess(raw_text, flag):
    
    # change sentence to lower case
    sentence = raw_text.lower().strip()
    
    # Replace certain special characters with their string equivalents
    mappings = {'%': ' percent', '$': ' dollar ',
                '₹': ' rupee ', '€': ' euro ', '@': ' at'}
    for k, v in mappings.items():
        sentence = sentence.replace(k, v)
    
    # Removing special characters
    sentence = re.sub("\W", " ", sentence).strip()
    
    # tokenize into words
    tokens = sentence.split()
    
    # remove stop words                
    clean_tokens = [t for t in tokens if not t in stopwords.words("english")]
    
    # Stemming/Lemmatization
    if(flag == 'stem'):
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]
    
    return pd.Series([" ".join(clean_tokens), len(clean_tokens)])

In [64]:
# stemming_df = questions["question"].progress_apply(lambda x: preprocess(x, 'stem'))
# stemming_df.columns = ['clean_text_stem', 'text_length_stem']

In [65]:
qid11 = [918,
 7820,
 51974,
 81557,
 88243,
 138624,
 143896,
 146637,
 147282,
 148487,
 178302,
 180631,
 202835,
 203205,
 206835,
 253879,
 255112,
 258113,
 258424,
 260769,
 271885,
 278091,
 280965,
 283417,
 293284,
 313464,
 320317,
 326379,
 338821,
 348903,
 362259,
 363613,
 390281,
 392095,
 395737]

In [66]:
temp_qids = [1831,
 3007,
 6553,
 14396,
 15264,
 18608,
 18927,
 25026,
 33265,
 39204,
 41767,
 47035,
 53770,
 58523,
 58638,
 61064,
 65095,
 66222,
 71436,
 78296,
 92105,
 93561,
 95429,
 95567,
 83329,
 118624,
 119713,
 125095,
 131275,
 145814,
 138455,
 153131,
 156776,
 161071,
 166559,
 171925,
 175199,
 177864,
 178936,
 179423,
 182424,
 188110,
 193662,
 214814,
 218560,
 206339,
 35958,
 94981,
 227879,
 229194,
 231585,
 232475,
 93562,
 236468,
 13922,
 244609,
 246498,
 247989,
 251091,
 183668,
 262028,
 272161,
 273408,
 275164,
 279452,
 283053,
 288142,
 289688,
 300250,
 300509,
 302383,
 305752,
 87944,
 312495,
 312898,
 317100,
 317686,
 321298,
 323809,
 18928,
 332622,
 334697,
 339489,
 340361,
 348547,
 58524,
 118625,
 370965,
 233933,
 14397,
 376051,
 376791,
 391451,
 392793,
 82154,
 400296,
 411318,
 413963,
 97851,
 422330,
 424494,
 425114,
 426917,
 435267,
 333135,
 441233,
 443715,
 443899,
 326142,
 450926,
 451788,
 452592,
 455113,
 456631,
 465877,
 469891,
 472266,
 474349,
 475056,
 477519,
 490443,
 477703,
 492181,
 496099,
 499970,
 506098,
 507298,
 511793,
 512812,
 535899,
 1832,
 18609,
 26464,
 33219,
 33266,
 37899,
 38507,
 49823,
 53771,
 58639,
 60167,
 63632,
 65096,
 76269,
 77322,
 84068,
 114086,
 116374,
 117514,
 129800,
 147531,
 150748,
 151838,
 157839,
 165846,
 173754,
 175200,
 193663,
 195911,
 205092,
 208530,
 209607,
 209638,
 220694,
 224899,
 227880,
 228469,
 231586,
 231995,
 232476,
 236469,
 244610,
 246499,
 263829,
 276892,
 278282,
 283926,
 289404,
 292170,
 299077,
 301395,
 305753,
 317101,
 317687,
 326937,
 331164,
 334698,
 335646,
 339490,
 342017,
 348548,
 366276,
 369967,
 371911,
 373996,
 377891,
 390061,
 397309,
 403528,
 410901,
 419957,
 422331,
 430592,
 432058,
 435268,
 437592,
 438033,
 440206,
 444554,
 445715,
 446310,
 455114,
 460336,
 465878,
 466189,
 469892,
 471984,
 472957,
 476972,
 477520,
 479955,
 494464,
 520796,
 522000,
 522793,
 533355,
 533491,
 533614,
 535325]

In [67]:
# lemmatizing_df = questions["question"].loc[temp_qids].progress_apply(lambda x: preprocess(x, 'lemma'))
lemmatizing_df = questions["question"].progress_apply(lambda x: preprocess(x, 'lemma'))
lemmatizing_df.columns = ['clean_text_lemma', 'text_length_lemma']

100%|█████████████████████████████████████████████████████████████████████| 537929/537929 [37:21<00:00, 239.97it/s]


In [62]:
lemmatizing_df

Unnamed: 0_level_0,clean_text_lemma,text_length_lemma
qid,Unnamed: 1_level_1,Unnamed: 2_level_1
1831,18,1
3007,204 8 503 13 305 11 907 25 705,9
6553,,0
14396,9 10 2,3
15264,0 1,2
...,...,...
522793,a2a,1
533355,2 2 2 2 2 1 2,7
533491,,0
533614,0,1


In [45]:
preprocessed_questions = pd.concat([questions, lemmatizing_df], axis=1)

In [46]:
with open("../data/preprocessed_questions.pkl", "wb") as f:
    pickle.dump(preprocessed_questions, f)