In [3]:
import sys
# Extend the module search path two directory levels up. Presumably this is
# what lets the `src.preprocess` import in the imports cell resolve
# (i.e. "../.." is the project root) -- TODO confirm.
sys.path.append("../..")

In [None]:
# install the BeautifulSoup package
!pip install bs4

In [49]:
import os

import distance
import nltk
import pandas as pd
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.preprocessing import MinMaxScaler

from src.preprocess import clean_html, clean_punctuation, clean_uppercase, clean_lemmatize

In [5]:
# Fetch the NLTK stop-word corpus that the feature extraction reads via
# stopwords.words("english"); nothing is re-downloaded when it is up to date.
# NOTE(review): word_tokenize usually needs the "punkt" tokenizer data as well,
# which this notebook never downloads -- confirm it is present on a fresh machine.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dimitriskpl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Set the paths of the raw train and test datasets, and the paths where the preprocessed train and test datasets will be saved

In [21]:
# Raw input CSV files, read with pd.read_csv: the training pairs and the test pairs
PATH_TO_TRAIN_DATA = "../../bigdata2023duplicatedetection/q_3_2/data/train.csv"
PATH_TO_TEST_DATA = "../../bigdata2023duplicatedetection/q_3_2/data/test_without_labels.csv"

# Paths where the DataFrames are pickled (DataFrame.to_pickle) after
# preprocessing, feature extraction and scaling
PATH_TO_PREPROCESSED_TRAIN_DATA = "../../bigdata2023duplicatedetection/q_3_2/preprocessed_data/preprocessed_train_df.pkl"
PATH_TO_PREPROCESSED_TEST_DATA = "../../bigdata2023duplicatedetection/q_3_2/preprocessed_data/preprocessed_test_df.pkl"

In [22]:
# Load the raw train and test question pairs, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (PATH_TO_TRAIN_DATA, PATH_TO_TEST_DATA)
)

In [23]:
train_df.head(5)

Unnamed: 0,Id,Question1,Question2,IsDuplicate
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [24]:
test_df.head(5)

Unnamed: 0,Id,Question1,Question2
0,283003,What can someone do if they've lost the wirele...,What is the best USB wireless mouse that can b...
1,283004,Why India need to elect Prime minister?,Is prime minister of India elected or appointed?
2,283005,How can I make money online with free of cost?,How can I make money online for free?
3,283006,Does MDMA affect the first and higher order mo...,Do antipsychotics affect the first and higher ...
4,283007,"I am a Saudi National and have ""SR 3 million"" ...",Where should I invest money to get high returns?


In [29]:
# Report, per column, how many values are missing in the training set.
print('Total Null values per column in train dataset')
train_null_counts = train_df.isna().sum()
print(train_null_counts)

Total Null values per column in train dataset
Id             0
Question1      0
Question2      2
IsDuplicate    0
dtype: int64


In [31]:
# Discard every training row in which at least one value is missing
train_df = train_df.dropna(axis="index", how="any")

In [30]:
# Report, per column, how many values are missing in the test set.
print('Total Null values per column in test dataset')
# Kept as the last expression of the cell so the notebook displays the result.
test_df.isna().sum()

Total Null values per column in test dataset


Id           0
Question1    1
Question2    0
dtype: int64

In [33]:
# Class balance of the training labels: absolute counts and share of all rows.
total_rows = len(train_df)
is_duplicate = train_df['IsDuplicate']
cnt_duplicdates = int((is_duplicate == 1).sum())
cnt_non_duplicdates = int((is_duplicate == 0).sum())
print(f"Total duplicates pairs: {cnt_duplicdates} -> {(cnt_duplicdates/total_rows*100):.2f}%")
print(f"Total non duplicates pairs: {cnt_non_duplicdates} -> {(cnt_non_duplicdates/total_rows*100):.2f}%")

Total duplicates pairs: 105123 -> 37.15%
Total non duplicates pairs: 177879 -> 62.85%


For this task we apply a lighter preprocessing pipeline (HTML cleaning, punctuation cleaning, upper-case cleaning and lemmatization, with no stopword removal), aiming to retain as much potentially valuable information as possible for effectively comparing the questions.

In [12]:
def preprocess(df, modify_columns):
    """Run the text-cleaning pipeline over the selected columns of ``df``.

    The cleaning steps run in a fixed order -- HTML, punctuation, upper-case,
    lemmatization -- and a progress line is printed after each one. The result
    of every step is the input of the next.

    Args:
        df: Data holding the text columns (a DataFrame in this notebook).
        modify_columns: Column names passed unchanged to every cleaning step.

    Returns:
        The value returned by the last cleaning step.
    """
    pipeline = (
        (clean_html, "HTML clean done"),
        (clean_punctuation, "Punctation clean done"),
        (clean_uppercase, "Uppercase clean done"),
        (clean_lemmatize, "Lemmatize done"),
    )
    for clean_step, done_message in pipeline:
        df = clean_step(df, modify_columns)
        print(done_message)

    return df

In [34]:
# Clean both question columns of the training set.
train_df = preprocess(train_df, ["Question1", "Question2"])

HTML clean done
Punctation clean done
Uppercase clean done
Lemmatize done


In [36]:
# Replace Null values with "" instead of dropping them, so no test row is lost.
# Rebinding the result (rather than inplace=True) keeps the step explicit.
test_df = test_df.fillna("")

In [38]:
# Clean both question columns of the test set with the same pipeline.
test_df = preprocess(test_df, ["Question1", "Question2"])

HTML clean done
Punctation clean done
Uppercase clean done
Lemmatize done


In [39]:
def safe_divide(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is zero."""
    return numerator / denominator if denominator != 0 else 0

For each pair of questions we extract new features:

* **cwc_min**: The ratio of common non-stopwords to the smaller of the two questions' unique non-stopword counts
* **cwc_max**: The ratio of common non-stopwords to the larger of the two questions' unique non-stopword counts
* **csc_min**: The ratio of common stopwords to the smaller of the two questions' unique stopword counts
* **csc_max**: The ratio of common stopwords to the larger of the two questions' unique stopword counts
* **ctc_min**: The ratio of common tokens (including stopwords) to the total number of tokens in the shorter question
* **ctc_max**: The ratio of common tokens (including stopwords) to the total number of tokens in the longer question
* **last_word_eq**: 1 if both questions end with the same word, 0 otherwise
* **first_word_eq**: 1 if both questions start with the same word, 0 otherwise
* **abs_len_diff**: Absolute difference between the number of tokens of the two questions
* **mean_len**: Average number of tokens of the two questions
* **jaccard_sim**: Jaccard similarity of the two questions' sets of non-stopwords
* **word_overlap**: Total number of common non-stopwords
* **share_n_grams**: Total number of distinct 2-grams and 3-grams shared by the two questions
* **n_words_diff**: Absolute difference between the number of unique non-stopwords of the two questions
* **token_set_ratio**: Compares the unordered token sets of both questions to measure similarity
* **token_sort_ratio**: Compares the token sequences after sorting
* **fuzz_ratio**: A simple ratio of similarity between the two question strings, considering the sequence of characters
* **fuzz_partial_ratio**: Compares the similarity of partial strings
* **longest_substr_ratio**: The ratio of the length of the longest common substring to the length of the shorter question plus one

In [40]:
def get_token_features(q1, q2, stopwords):
    """Compute token-level similarity features for one pair of questions.

    Args:
        q1: Text of the first question.
        q2: Text of the second question.
        stopwords: Collection of stop words. Any iterable of strings works;
            unless it already is a set/frozenset it is converted to a set once
            per call, so membership is a hash lookup instead of a linear scan
            of a list for every token.

    Returns:
        A list of 16 numbers. Indices 0-13 hold, in order: cwc_min, cwc_max,
        csc_min, csc_max, ctc_min, ctc_max, last_word_eq, first_word_eq,
        abs_len_diff, mean_len, jaccard_sim, word_overlap, share_n_grams and
        n_words_diff. Indices 14 and 15 are never written and stay 0.0.
        If either question produces no tokens, all 16 entries are 0.0.
    """
    # Length 16 is kept for compatibility with existing callers even though
    # only the first 14 slots are filled.
    token_features = [0.0] * 16

    # Converting the sentences into tokens
    q1_tokens = word_tokenize(q1)
    q2_tokens = word_tokenize(q2)

    # Nothing can be compared when one side is empty: return the all-zero vector
    if not q1_tokens or not q2_tokens:
        return token_features

    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)

    q1_vocab = set(q1_tokens)
    q2_vocab = set(q2_tokens)

    # Unique non-stopwords / stopwords of each question
    q1_words, q2_words = q1_vocab - stopwords, q2_vocab - stopwords
    q1_stops, q2_stops = q1_vocab & stopwords, q2_vocab & stopwords

    common_word_count = len(q1_words & q2_words)
    common_stop_count = len(q1_stops & q2_stops)
    common_token_count = len(q1_vocab & q2_vocab)

    # Common non-stopwords relative to the smaller / larger non-stopword set
    token_features[0] = safe_divide(common_word_count, min(len(q1_words), len(q2_words)))
    token_features[1] = safe_divide(common_word_count, max(len(q1_words), len(q2_words)))

    # Common stopwords relative to the smaller / larger stopword set
    token_features[2] = safe_divide(common_stop_count, min(len(q1_stops), len(q2_stops)))
    token_features[3] = safe_divide(common_stop_count, max(len(q1_stops), len(q2_stops)))

    # Common tokens relative to the token count of the shorter / longer question
    token_features[4] = safe_divide(common_token_count, min(len(q1_tokens), len(q2_tokens)))
    token_features[5] = safe_divide(common_token_count, max(len(q1_tokens), len(q2_tokens)))

    # Last / first word of both questions is the same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    # Absolute difference and mean of the number of tokens
    token_features[8] = abs(len(q1_tokens) - len(q2_tokens))
    token_features[9] = (len(q1_tokens) + len(q2_tokens)) / 2

    # Jaccard similarity of the non-stopword sets
    token_features[10] = safe_divide(common_word_count, len(q1_words | q2_words))

    # Total common non-stopwords
    token_features[11] = common_word_count

    # Total shared distinct 2-grams and 3-grams
    n_grams_q1 = set(ngrams(q1_tokens, 2)) | set(ngrams(q1_tokens, 3))
    n_grams_q2 = set(ngrams(q2_tokens, 2)) | set(ngrams(q2_tokens, 3))
    token_features[12] = len(n_grams_q1 & n_grams_q2)

    # Absolute difference of the number of unique non-stopwords
    token_features[13] = abs(len(q1_words) - len(q2_words))

    return token_features

def get_longest_substr_ratio(a, b):
    """Return the longest-common-substring length relative to the shorter input.

    The length of one longest common substring of ``a`` and ``b`` (as found by
    ``distance.lcsubstrings``) is divided by ``min(len(a), len(b)) + 1``.
    When no common substring is reported the result is 0.
    """
    longest = next(iter(distance.lcsubstrings(a, b)), None)
    if longest is None:
        return 0
    shorter_len = min(len(a), len(b))
    return len(longest) / (shorter_len + 1)

def extract_features(df):
    """Add the pairwise similarity features of each question pair to ``df``.

    For every row, the "Question1" and "Question2" values are compared and the
    results are written to new columns: the 14 token-based features computed
    by ``get_token_features``, four fuzzywuzzy scores and the longest common
    substring ratio.

    Args:
        df: DataFrame with "Question1" and "Question2" text columns.

    Returns:
        The same DataFrame object, with the feature columns added in place.
    """
    # The stop-word list does not depend on the row, so it is loaded once and
    # turned into a frozenset for constant-time membership tests.
    english_stopwords = frozenset(stopwords.words("english"))

    token_features = df.apply(
        lambda row: get_token_features(row["Question1"], row["Question2"], english_stopwords),
        axis=1,
    )

    # Column names in the order in which get_token_features fills its result
    token_feature_names = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
        "jaccard_sim",
        "word_overlap",
        "share_n_grams",
        "n_words_diff",
    )
    for position, column in enumerate(token_feature_names):
        df[column] = [features[position] for features in token_features]

    # String-level similarity scores, each computed on the raw question texts
    pairwise_scorers = {
        # Compares the unordered token sets of both questions to measure similarity
        "token_set_ratio": fuzz.token_set_ratio,
        # Compares the token sequences after sorting
        "token_sort_ratio": fuzz.token_sort_ratio,
        # A simple ratio of similarity between the two question strings
        "fuzz_ratio": fuzz.QRatio,
        # Compares the similarity of partial strings
        "fuzz_partial_ratio": fuzz.partial_ratio,
        # Longest common substring length relative to the shorter question
        "longest_substr_ratio": get_longest_substr_ratio,
    }
    for column, scorer in pairwise_scorers.items():
        df[column] = df.apply(lambda row: scorer(row["Question1"], row["Question2"]), axis=1)

    return df
     

In [41]:
# Add the engineered similarity features to the training pairs.
train_df = extract_features(train_df)

In [42]:
# Add the engineered similarity features to the test pairs.
test_df = extract_features(test_df)

In [43]:
train_df.head(5)

Unnamed: 0,Id,Question1,Question2,IsDuplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,...,mean_len,jaccard_sim,word_overlap,share_n_grams,n_words_diff,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,what be the step by step guide to invest in sh...,what be the step by step guide to invest in sh...,0,1.0,0.833333,1.0,1.0,0.916667,0.785714,...,13.0,0.833333,5.0,21.0,1.0,100,93,93,100,0.982456
1,1,what be the story of kohinoor kohinoor diamond,what would happen if the indian government ste...,0,0.666667,0.25,0.666667,0.5,0.5,0.307692,...,10.5,0.222222,2.0,3.0,5.0,81,60,62,72,0.553191
2,2,how can i increase the speed of my internet co...,how can internet speed be increase by hack thr...,0,0.6,0.5,0.4,0.25,0.5,0.357143,...,12.0,0.375,3.0,1.0,1.0,73,63,37,44,0.181818
3,3,why be i mentally very lonely how can i solve it,find the remainder when math2324math be divide...,0,0.0,0.0,0.25,0.142857,0.111111,0.090909,...,10.0,0.0,0.0,0.0,2.0,28,25,20,26,0.081633
4,4,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0,0.4,0.2,1.0,0.666667,0.571429,0.307692,...,10.0,0.153846,2.0,0.0,5.0,67,47,36,55,0.153846


In [45]:
#changing columns to numeric type
num_cols = train_df.drop(columns=['Id', 'Question1', 'Question2']).columns
for i in num_cols:
    train_df[i] = train_df[i].apply(pd.to_numeric)

In [46]:
test_df.head(5)

Unnamed: 0,Id,Question1,Question2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,...,mean_len,jaccard_sim,word_overlap,share_n_grams,n_words_diff,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,283003,what can someone do if theyve lose the wireles...,what be the best usb wireless mouse that can b...,0.428571,0.333333,0.5,0.444444,0.411765,0.388889,0.0,...,17.5,0.230769,3.0,0.0,2.0,67,49,46,45,0.131579
1,283004,why india need to elect prime minister,be prime minister of india elect or appoint,0.8,0.8,0.0,0.0,0.571429,0.5,0.0,...,7.5,0.666667,4.0,1.0,0.0,81,67,40,42,0.384615
2,283005,how can i make money online with free of cost,how can i make money online for free,1.0,0.8,0.75,0.6,0.875,0.7,0.0,...,9.0,0.8,4.0,9.0,1.0,94,84,81,89,0.756757
3,283006,do mdma affect the first and high order moment...,do antipsychotic affect the first and high ord...,0.909091,0.909091,1.0,1.0,0.944444,0.944444,1.0,...,18.0,0.833333,10.0,29.0,0.0,97,91,93,94,0.919192
4,283007,i be a saudi national and have sr 3 million in...,where should i invest money to get high return,0.2,0.0625,0.5,0.181818,0.333333,0.103448,0.0,...,19.0,0.05,1.0,0.0,11.0,39,35,24,39,0.170213


Because some features are on different scales, we rescale each of them individually with a MinMaxScaler that is fitted on the train set and then applied to the test set

In [48]:
# Features that are not ratios in [0, 1] by construction and therefore get rescaled.
features_to_scale = [
    "abs_len_diff",
    "mean_len",
    "word_overlap",
    "share_n_grams",
    "n_words_diff",
    "token_set_ratio",
    "token_sort_ratio",
    "fuzz_ratio",
    "fuzz_partial_ratio",
]

In [50]:
# Fit one MinMaxScaler per feature on the training data only, then apply that
# same fitted scaler to the test data. Every fitted scaler is stored under its
# feature name so the identical transformation can be reused later.
features_scaler_dict = {}
for feature_to_scale in features_to_scale:
    scaler = MinMaxScaler()
    train_df[feature_to_scale] = scaler.fit_transform(train_df[feature_to_scale].values.reshape(-1, 1))
    test_df[feature_to_scale] = scaler.transform(test_df[feature_to_scale].values.reshape(-1, 1))
    features_scaler_dict[feature_to_scale] = scaler

In [52]:
train_df.head(5)

Unnamed: 0,Id,Question1,Question2,IsDuplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,...,mean_len,jaccard_sim,word_overlap,share_n_grams,n_words_diff,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,what be the step by step guide to invest in sh...,what be the step by step guide to invest in sh...,0,1.0,0.833333,1.0,1.0,0.916667,0.785714,...,0.096296,0.833333,0.227273,0.244186,0.014925,1.0,0.93,0.93,1.0,0.982456
1,1,what be the story of kohinoor kohinoor diamond,what would happen if the indian government ste...,0,0.666667,0.25,0.666667,0.5,0.5,0.307692,...,0.077778,0.222222,0.090909,0.034884,0.074627,0.81,0.6,0.62,0.72,0.553191
2,2,how can i increase the speed of my internet co...,how can internet speed be increase by hack thr...,0,0.6,0.5,0.4,0.25,0.5,0.357143,...,0.088889,0.375,0.136364,0.011628,0.014925,0.73,0.63,0.37,0.44,0.181818
3,3,why be i mentally very lonely how can i solve it,find the remainder when math2324math be divide...,0,0.0,0.0,0.25,0.142857,0.111111,0.090909,...,0.074074,0.0,0.0,0.0,0.029851,0.28,0.25,0.2,0.26,0.081633
4,4,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0,0.4,0.2,1.0,0.666667,0.571429,0.307692,...,0.074074,0.153846,0.090909,0.0,0.074627,0.67,0.47,0.36,0.55,0.153846


In [51]:
test_df.head(5)

Unnamed: 0,Id,Question1,Question2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,...,mean_len,jaccard_sim,word_overlap,share_n_grams,n_words_diff,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,283003,what can someone do if theyve lose the wireles...,what be the best usb wireless mouse that can b...,0.428571,0.333333,0.5,0.444444,0.411765,0.388889,0.0,...,0.12963,0.230769,0.136364,0.0,0.029851,0.67,0.49,0.46,0.45,0.131579
1,283004,why india need to elect prime minister,be prime minister of india elect or appoint,0.8,0.8,0.0,0.0,0.571429,0.5,0.0,...,0.055556,0.666667,0.181818,0.011628,0.0,0.81,0.67,0.4,0.42,0.384615
2,283005,how can i make money online with free of cost,how can i make money online for free,1.0,0.8,0.75,0.6,0.875,0.7,0.0,...,0.066667,0.8,0.181818,0.104651,0.014925,0.94,0.84,0.81,0.89,0.756757
3,283006,do mdma affect the first and high order moment...,do antipsychotic affect the first and high ord...,0.909091,0.909091,1.0,1.0,0.944444,0.944444,1.0,...,0.133333,0.833333,0.454545,0.337209,0.0,0.97,0.91,0.93,0.94,0.919192
4,283007,i be a saudi national and have sr 3 million in...,where should i invest money to get high return,0.2,0.0625,0.5,0.181818,0.333333,0.103448,0.0,...,0.140741,0.05,0.045455,0.0,0.164179,0.39,0.35,0.24,0.39,0.170213


Save the preprocessed datasets to the predefined paths

In [54]:
# Create the output directories if they are missing (to_pickle does not create
# them), then persist both preprocessed DataFrames.
for output_path in (PATH_TO_PREPROCESSED_TRAIN_DATA, PATH_TO_PREPROCESSED_TEST_DATA):
    output_dir = os.path.dirname(output_path)
    if output_dir:  # a bare file name has no directory part to create
        os.makedirs(output_dir, exist_ok=True)

train_df.to_pickle(PATH_TO_PREPROCESSED_TRAIN_DATA)
test_df.to_pickle(PATH_TO_PREPROCESSED_TEST_DATA)