In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv(r"questions.csv")

In [3]:
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
data.shape

(404351, 6)

In [5]:
new_data = data.sample(30000, random_state=42)

In [6]:
new_data.shape

(30000, 6)

In [7]:
new_data.isna().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [8]:
new_data.duplicated().sum()

0

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404351 entries, 0 to 404350
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404351 non-null  int64 
 1   qid1          404351 non-null  int64 
 2   qid2          404351 non-null  int64 
 3   question1     404350 non-null  object
 4   question2     404349 non-null  object
 5   is_duplicate  404351 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [10]:
new_data.drop(columns=['id','qid1','qid2'], inplace=True)

# Preprocessing

In [11]:
new_data['question1'] = new_data['question1'].str.lower()
new_data['question2'] = new_data['question2'].str.lower()

In [12]:
new_data.sample(5)

Unnamed: 0,question1,question2,is_duplicate
322449,how & why do i become a theoretical physicist?,what exactly does a theoretical physicist do?,0
312977,how exactly is curve fitting and hypothesis te...,what does fitting a model in machine learning ...,0
233659,where can i find cph4?,what is the power of cph4?,0
30513,how do you commit suicide?,what is the best way to commit suicide?,1
391462,which is the nearest local railway station to ...,which is the nearest railway station to reach ...,1


In [13]:
import string
exclude = string.punctuation

In [14]:
def remove_punctuation(text):
    text = text.translate(str.maketrans('', '', exclude))
    return text

In [15]:
new_data['question1'] = new_data['question1'].apply(remove_punctuation)
new_data['question2'] = new_data['question2'].apply(remove_punctuation)

In [16]:
new_data.sample(5)

Unnamed: 0,question1,question2,is_duplicate
382217,how were the hieroglyphics deciphered,how were hieroglyphics first decoded,1
264191,what are are the best apps for the windows phone,what are the top 5 apps for windows phone,1
39707,project sites computer science,how does being blind from birth affect your ab...,0
182134,what can we do after completing ba english,what should i do after completing a ba,0
265164,i am not ready to have a technical interview y...,should i reschedule my amazon phone interview ...,0


In [17]:
from nltk.corpus import stopwords
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [18]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append("")
        else:
            new_text.append(word)

    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [19]:
new_data['question1'] = new_data['question1'].apply(remove_stopwords)
new_data['question2'] = new_data['question2'].apply(remove_stopwords)

In [20]:
new_data.head(5)

Unnamed: 0,question1,question2,is_duplicate
120567,boggart work,would boggart boggart,0
324466,difference project manager product manager,differences project management business m...,0
398558,hotel jabalpur would safe unmarried couple...,hotel allahabad would safe unmarried coupl...,0
339914,stronger super saiyan 4 super saiyan god,gohan turn super saiyan 2,0
185732,fill address line 1 address line 2,register desired web address,0


In [21]:
import re

def remove_url(text):
    res = re.compile(r'https?://\S+|www\.\S+')
    return res.sub(r"",text)

In [22]:
new_data['question1'] = new_data['question1'].apply(remove_url)
new_data['question2'] = new_data['question2'].apply(remove_url)

In [23]:
new_data.sample(5)

Unnamed: 0,question1,question2,is_duplicate
27542,speak louder wearing earphones,actions speak louder words,0
249287,safety precautions handling shotguns propo...,safety precautions handling shotguns propo...,1
396850,good universities computer science around ...,best computer science schools,1
12681,blatant examples deus ex machina movies,ex machina 2015 movie position caleb w...,0
180,get deleted instagram chats,view deleted instagram dms,1


In [24]:
import requests

# Step 1: Load slang file
url = "https://raw.githubusercontent.com/rishabhverma17/sms_slang_translator/master/slang.txt"
response = requests.get(url)
lines = response.text.splitlines()

In [25]:
# Step 2: Build dictionary (CORRECT SPLIT)
chat_dict = {}
for line in lines:
    if "=" in line:
        slang, meaning = line.split("=", 1)
        chat_dict[slang.upper()] = meaning

In [26]:
chat_dict

{'A3': 'Anytime, Anywhere, Anyplace',
 'ADIH': 'Another Day In Hell',
 'AFK': 'Away From Keyboard',
 'AFAIK': 'As Far As I Know',
 'ASAP': 'As Soon As Possible',
 'ASL': 'Age, Sex, Location',
 'ATK': 'At The Keyboard',
 'ATM': 'At The Moment',
 'BAE': 'Before Anyone Else',
 'BAK': 'Back At Keyboard',
 'BBL': 'Be Back Later',
 'BBS': 'Be Back Soon',
 'BFN': 'Bye For Now',
 'B4N': 'Bye For Now',
 'BRB': 'Be Right Back',
 'BRUH': 'Bro',
 'BRT': 'Be Right There',
 'BSAAW': 'Big Smile And A Wink',
 'BTW': 'By The Way',
 'BWL': 'Bursting With Laughter',
 'CSL': 'Can’t Stop Laughing',
 'CU': 'See You',
 'CUL8R': 'See You Later',
 'CYA': 'See You',
 'DM': 'Direct Message',
 'FAQ': 'Frequently Asked Questions',
 'FC': 'Fingers Crossed',
 'FIMH': 'Forever In My Heart',
 'FOMO': 'Fear Of Missing Out',
 'FR': 'For Real',
 'FWIW': "For What It's Worth",
 'FYP': 'For You Page',
 'FYI': 'For Your Information',
 'G9': 'Genius',
 'GAL': 'Get A Life',
 'GG': 'Good Game',
 'GMTA': 'Great Minds Think Alik

In [27]:
# Step 3: Conversion function
def chat_conversion(text):
    new_text = []
    for word in text.split():
        key = word.upper()
        if key in chat_dict:
            new_text.append(chat_dict[key])
        else:
            new_text.append(word)
    return " ".join(new_text)

In [28]:
new_data['question1'] = new_data['question1'].apply(chat_conversion)
new_data['question2'] = new_data['question2'].apply(chat_conversion)

In [29]:
new_data.sample(5)

Unnamed: 0,question1,question2,is_duplicate
238820,improve c programming skills,improve java programming skills,0
173300,luck even exist,luck exist,1
249111,incresing prices petrola deasel every month …m...,siren always driver behind driver every cop car,0
193154,country need win medal olympics,country benefit winning lot medals olympics,0
292147,relationship pearson anova data analysis,calculate effect size comparing two models anova,0


In [30]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [31]:
def stemming(text):
    res = [ps.stem(i) for i in text.split()]
    return " ".join(res)

In [32]:
new_data['question1'] = new_data['question1'].apply(stemming)
new_data['question2'] = new_data['question2'].apply(stemming)

In [33]:
new_data.sample(5)

Unnamed: 0,question1,question2,is_duplicate
146278,digit india success,digit india mission go right direct,1
391044,new harri potter book harri potter curs child,dislik harri potter curs child,1
242603,best beach goa ashvem arambol ozran anjuna vagat,best place stay near ashwem beach goa,0
337414,best earphon 1000,best earphon 1k,1
342804,effect protest reform,protest reform consequ,1


In [34]:
decontract = pd.read_csv(r"contractions.csv")
decontract

Unnamed: 0,Contraction,Meaning
0,'aight,alright
1,ain't,is not
2,amn't,am not
3,aren't,are not
4,can't,cannot
...,...,...
143,y'all're,you all are
144,you'd,you would
145,you'll,you will
146,you're,you are


In [35]:

def decontracting_words(text):
    contractions_dict = dict(zip(decontract['Contraction'], decontract['Meaning']))
    new_text = []
    for word in text.split():
        if word in contractions_dict:
            new_text.append(contractions_dict[word])
        else:
            new_text.append(word)
    return " ".join(new_text)

In [36]:
new_data['question1'] = new_data['question1'].apply(decontracting_words)
new_data['question2'] = new_data['question2'].apply(decontracting_words)

In [37]:
new_data.sample(5)

Unnamed: 0,question1,question2,is_duplicate
390507,peopl kill,peopl kill peopl,0
349102,founder whohubcom,founder findingcluecom,0
54630,new year resolut 2017,best new year resolut 2017,1
190499,difficult build oper system,long would take creat oper system,1
41595,four point treati versail brat serv,brat describ four main point treati versail,1


In [38]:
new_data

Unnamed: 0,question1,question2,is_duplicate
120567,boggart work,would boggart boggart,0
324466,differ project manag product manag,differ project manag busi manag,0
398558,hotel jabalpur would safe unmarri coupl withou...,hotel allahabad would safe unmarri coupl witho...,0
339914,stronger super saiyan 4 super saiyan god,gohan turn super saiyan 2,0
185732,fill address line 1 address line 2,regist desir web address,0
...,...,...,...
353403,hire recruit help find job,becom recruit,0
168089,convert jainism,mean navkar mantra jainism,0
228275,one becom effect digit market,becom digit market expert,1
158461,result ban 500 1000 rupe note india,what balaji vishwanathan take ban 500 1000 rs ...,1


In [39]:
new_data['is_duplicate'].value_counts()

is_duplicate
0    18914
1    11086
Name: count, dtype: int64

In [40]:
X = new_data.iloc[:,:-1]
y = new_data.iloc[:,-1]

In [41]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [42]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(24000, 2)
(6000, 2)
(24000,)
(6000,)


In [43]:
X_train

Unnamed: 0,question1,question2
86697,get famou youtub,becom famou youtub vine
136860,jacobi brisset get mentor bill parcel sinc hig...,satisfi sex life
74870,actual happen po wuxi finger hold tai lung,actual happen goa beach
169844,cunnilingu harm,cunnilingu unhealthi
180548,tracer round legal public possess state alabama,tracer round legal public possess state califo...
...,...,...
275976,start busi india,start new busi india
401536,calcul weight steel billet,determin steel weight formula
57627,doraemon underr america,doraemon underr
29743,substitut condens milk evapor milk,substitut evapor milk sweeten condens milk


In [44]:
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(max_features=5000)

# questions = X_train['question1'].tolist() + X_train['question2'].tolist()
# cv.fit(questions)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)

questions = X_train['question1'].tolist() + X_train['question2'].tolist()
tfidf.fit(questions)

In [45]:
# q1_train = cv.transform(X_train['question1']).toarray()
# q2_train = cv.transform(X_train['question2']).toarray()

# print(q1_train.shape)
# print(q2_train.shape)

q1_train = tfidf.transform(X_train['question1']).toarray()
q2_train = tfidf.transform(X_train['question2']).toarray()

print(q1_train.shape)
print(q2_train.shape)

(24000, 5000)
(24000, 5000)


In [46]:
# # Transform test
# q1_test = cv.transform(X_test['question1']).toarray()
# q2_test = cv.transform(X_test['question2']).toarray()

# print(q1_test.shape)
# print(q2_test.shape)

# Transform test
q1_test = tfidf.transform(X_test['question1']).toarray()
q2_test = tfidf.transform(X_test['question2']).toarray()

print(q1_test.shape)
print(q2_test.shape)

(6000, 5000)
(6000, 5000)


In [47]:
X_train_final = np.hstack([q1_train, q2_train])
X_test_final = np.hstack([q1_test, q2_test])

print(X_train_final.shape)
print(X_test_final.shape)

(24000, 10000)
(6000, 10000)


In [56]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

xgb = XGBClassifier(n_estimartors=300, use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train_final, y_train)
y_pred_xgb = xgb.predict(X_test_final)
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))

Parameters: { "n_estimartors", "use_label_encoder" } are not used.



Accuracy: 0.7266666666666667
