In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt 
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
from string import punctuation


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#  DISPLAYING THE DATASET USING PANDAS

In [None]:
# Load the Quora question-pairs training set (a zipped CSV) into a DataFrame
# and display it as the cell output.
train_csv_path = "/kaggle/input/quora-question-pairs/train.csv.zip"
data = pd.read_csv(train_csv_path)
data

# SUMMARY OF THE DATAFRAME

In [None]:
# Print pandas' built-in summary of the loaded frame (one line per column).
data.info()

# CHECKING THE PRESENCE OF NULL VALUES

In [None]:
# Count the missing entries in every column (isna is the same method as isnull).
data.isna().sum()

# REPLACING NULL VALUES

In [None]:
# Replacing Null values
# A missing question is replaced by an empty string. Forward-filling would copy
# the previous row's (unrelated) question into the pair and fabricate a sample.
# The result is assigned back to the frame instead of calling
# fillna(..., inplace=True) on a column selection: that form is a chained
# in-place update, which newer pandas versions warn about and which does not
# reach the parent frame under copy-on-write.
text_columns = ["question1", "question2"]
data[text_columns] = data[text_columns].fillna("")
data

# Counting the unique values in "is_duplicate" column

In [None]:
# Tally how many rows carry each "is_duplicate" label and expose the result
# as a two-column frame: the label ("Values") and its row count ("Counts").
a = (
    data.groupby("is_duplicate")
    .size()
    .reset_index(name="Counts")
    .rename(columns={"is_duplicate": "Values"})
)
a

# Visualizing unique value counts of the column "is_duplicate" using pie chart

In [None]:
# Draw the label distribution as a pie chart on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(5, 30))
ax.pie(a["Counts"], labels=a["Values"])
# The legend entries are the raw counts; the wedge labels are the label values.
ax.legend(a["Counts"], fontsize=10)
ax.set_title("Unique value Counts in is_duplicate column", fontsize=25)

# Render the chart.
plt.show()

# RESULTS OF THE ABOVE VISUALIZATION 

In [None]:
# Summarize the class balance of the training pairs.
# is_duplicate == 1 marks a similar (duplicate) pair and is_duplicate == 0 a
# non-similar pair, so the "similar" figures must come from the rows with value 1.
similar_pairs = a.loc[a['Values'] == 1, 'Counts'].iloc[0]
non_similar_pairs = a.loc[a['Values'] == 0, 'Counts'].iloc[0]
total_pairs = similar_pairs + non_similar_pairs
similar_pct = round((similar_pairs / total_pairs) * 100, 2)
# Derived from the rounded value so the two percentages always add up to 100.
non_similar_pct = round(100 - similar_pct, 2)
print('Total number of question pairs for training : ',len(data))
print("Number of similar question pairs            : ", similar_pairs)
print("Percentage of similar question pairs        : ", similar_pct, "%")
print("Number of non-similar question pairs        : ", non_similar_pairs)
print("Percentage of non-similar question pairs    : ", non_similar_pct, "%")

# Concatenating the "qid1" and "qid2" columns and summarizing how often each question appears

In [None]:
# Stack both question-id columns into one Series so every appearance of a
# question is counted, whichever side of the pair it was on.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
qids = pd.concat([data["qid1"], data["qid2"]])

# Appearance count per distinct question id, computed once and reused below.
qid_counts = qids.value_counts()

# The short names f..j are kept: later cells read g and i.
f = len(qid_counts)                      # distinct questions
g = int((qid_counts == 1).sum())         # questions appearing exactly once
h = round((g/f)*100,2)
i = int((qid_counts > 1).sum())          # questions appearing more than once
j = round((i/f)*100,2)
print('Total number of questions in the training data         : ', f)
print('Number of questions that appear single time            : ', g)
print('Percentage of questions that appear single time        : ', h, '%')
print('Number of questions that appear multiple times         : ', i)
print('Percentage of questions that appear multiple times     : ', j, '%')
print('Maximum number of times a single question is repeated  : ', qid_counts.max())

# Counting the unique values in "qids" column

In [None]:
# Frequency table of question ids: one row per id ("Values") with the number
# of times it appears ("Counts"), most frequent first.
k = qids.value_counts().rename_axis("Values").reset_index(name="Counts")
k

# VISUALIZING QUESTION APPEARANCES USING STACK PLOT

In [None]:
# Creating Stack Plot
# k["Values"] holds question ids, so plotting it directly would put ids on the
# axis labelled "NUMBER OF QUESTIONS". Aggregate first: for every appearance
# count, how many questions have exactly that count.
# This cell expects the full frequency table in k, i.e. it must run before the
# cell that truncates k to its first 20 rows.
questions_per_count = k["Counts"].value_counts().sort_index()
plt.stackplot(questions_per_count.index, questions_per_count.values, color="red")
plt.xlabel("COUNTS")
plt.ylabel("NUMBER OF QUESTIONS")
plt.title("QUESTION APPEARANCE COUNTS VISUALIZATION")

# Displaying Stack Plot
plt.show()

# DISPLAYING THE 20 MOST FREQUENTLY APPEARING QUESTION IDS

In [None]:
# Keep only the first 20 rows of the frequency table (the most frequent ids,
# since the table is ordered by descending count) and display them.
k = k.iloc[:20]
k

# VISUALIZING TOP 20 UNIQUE VALUES HAVING HIGHER NUMBER OF COUNTS USING LINEAR PLOT WITH CUSTOMIZATIONS

In [None]:
# Line plot of the top-20 table on an explicit axes: appearance count on x,
# question id on y, with the line/marker styling gathered in one place.
fig, ax = plt.subplots(figsize=(15, 5))
line_style = dict(
    color='orange',
    linestyle='dashed',
    linewidth=3,
    marker='o',
    markerfacecolor='blue',
    markersize=12,
)
ax.plot(k["Counts"], k["Values"], **line_style)
ax.set_xlabel("COUNTS")
ax.set_ylabel("VALUES")
ax.set_title("VISUALIZING TOP 20 UNIQUE VALUES HAVING HIGHER NUMBER OF COUNTS")

# Render the plot.
plt.show()

# CREATING A NEW DATAFRAME

In [None]:
# Two-row frame comparing questions seen once (g) with questions seen several
# times (i); built from (label, count) records rather than a column dict.
l = pd.DataFrame([('Single', g), ('Multiple', i)], columns=['Values', 'Counts'])
l

# VISUALIZING COMPARISON BETWEEN SINGLE AND MULTIPLE TIMES REPEATED QUESTIONS USING DONUT PLOT

In [None]:
# Donut plot: a regular pie chart with a white disc drawn over its centre.
donut_hole = plt.Circle((0, 0), 0.5, color='white')
_, donut_ax = plt.subplots()
donut_ax.pie(l["Counts"], labels=l["Values"])
donut_ax.add_artist(donut_hole)
# Legend entries are the raw counts; the wedge labels are Single/Multiple.
donut_ax.legend(l["Counts"])
donut_ax.set_title("COMPARISON BETWEEN SINGLE AND MULITPLE TIMES REPEATED QUESTIONS")

# Render the plot.
plt.show()

# PREVIEWING A SAMPLE OF QUESTION PAIRS

In [None]:
# Preview some of comparisons between question pairs
# Dedicated names are used for the window and the loop variables: the original
# reused `a` and `i`, which overwrote the label-count frame and the
# multiple-appearance count that earlier cells rely on when re-run.
preview_start = 0
preview_rows = data.iloc[preview_start:preview_start + 10]
for first_question, second_question in zip(preview_rows["question1"], preview_rows["question2"]):
    print(first_question)
    print(second_question)
    print()

In [None]:
# English stop-word list from NLTK, held as a set for fast membership checks,
# then displayed as the cell output.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)
stop_words

# REMOVING PUNCTUATIONS AND STOP WORDS FROM QUESTIONS

In [None]:
def words(text, remove_stop_words=True, stem_words=False):
    """Clean one question: strip punctuation, lowercase, optionally filter and stem.

    Parameters
    ----------
    text : str or None
        Raw question text. ``None`` and float NaN (the marker pandas uses for a
        missing value) are treated as an empty question.
    remove_stop_words : bool, default True
        Drop every token that is in the notebook-level ``stop_words`` set.
    stem_words : bool, default False
        Reduce each remaining token to its stem with NLTK's English
        ``SnowballStemmer``. Previously this flag was accepted but ignored.

    Returns
    -------
    str
        The cleaned question as a single string (not a list). When tokens are
        filtered or stemmed they are re-joined with single spaces; otherwise
        the original spacing is kept.
    """
    # A missing value cannot be iterated character by character; treat it as empty.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove punctuation in one pass, then lowercase.
    text = text.translate(str.maketrans('', '', punctuation)).lower()

    if remove_stop_words or stem_words:
        tokens = text.split()
        if remove_stop_words:
            tokens = [w for w in tokens if w not in stop_words]
        if stem_words:
            stemmer = SnowballStemmer('english')
            tokens = [stemmer.stem(w) for w in tokens]
        text = " ".join(tokens)

    return text


In [None]:
def process(question_list, questions):
    """Append the cleaned form of every item in ``questions`` to ``question_list``.

    The list is extended in place and nothing is returned. Cleaning is
    delegated to ``words`` with its default options.
    """
    question_list.extend(words(question) for question in questions)


# Fresh result lists on every run, filled from the two question columns.
processed_question1, processed_question2 = [], []
process(processed_question1, data.question1)
process(processed_question2, data.question2)

# PREVIEWING A SAMPLE OF QUESTION PAIRS AFTER PROCESSING

In [None]:
# Preview some of comparisons between question pairs after processing
# Dedicated names are used for the window and the loop variables: the original
# reused `a` and `i`, which overwrote the label-count frame and the
# multiple-appearance count that earlier cells rely on when re-run.
preview_start = 0
preview_stop = preview_start + 10
cleaned_pairs = zip(
    processed_question1[preview_start:preview_stop],
    processed_question2[preview_start:preview_stop],
)
for cleaned_first, cleaned_second in cleaned_pairs:
    print(cleaned_first)
    print(cleaned_second)
    print()

# DEFINING TFIDF VECTORIZER

In [None]:
# Configure the TF-IDF vectorizer; the settings are collected in a dict so they
# can be read and tuned in one place before the vectorizer is built.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_settings = {
    "analyzer": 'word',
    "stop_words": 'english',
    "lowercase": True,
    "max_features": 300,
    "norm": 'l1',
}
tfidf = TfidfVectorizer(**tfidf_settings)

# CONCATENATING QUESTION COLUMNS

In [None]:
# Stack both question columns into one Series so the vectorizer can be fitted
# on every question at once.
# NOTE(review): this rebinds `words`, which earlier in the notebook names the
# text-cleaning function; re-running the `process` cell after this one fails
# because `words` is then a Series, not a callable.
question_columns = [data.question1, data.question2]
words = pd.concat(question_columns, axis=0)
words.head()

# FITTING AND TRANSFORMING QUESTIONS WITH TFIDF VECTORIZER

In [None]:
# Fit the vectorizer on the stacked questions, then vectorize each question
# column with that same fitted vectorizer.
tfidf.fit(words)
data_q1, data_q2 = (
    tfidf.transform(column) for column in (data.question1, data.question2)
)

# ASSIGNING INDEPENDENT AND DEPENDENT VARIABLES

In [None]:
# Features: element-wise absolute difference between the two TF-IDF matrices.
tfidf_difference = data_q1 - data_q2
x = abs(tfidf_difference)
# Target: the is_duplicate label column.
y = data['is_duplicate']

# SPLITTING INTO TRAIN AND TEST SET

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split
# identical across runs.
split_options = {"test_size": 0.2, "random_state": 42}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

# TRAINING MACHINE LEARNING MODEL

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the classifier, train it on the training split, and show the fitted
# estimator as the cell output.
lr = LogisticRegression(max_iter=1000)
lr = lr.fit(x_train, y_train)
lr

# PREDICTING THE TEST RESULTS

In [None]:
# Predict a label for every held-out pair and display the resulting array.
y_pred = lr.predict(x_test) 
y_pred 

# CONFUSION MATRIX

In [None]:
from sklearn.metrics import confusion_matrix

# scikit-learn lays the matrix out as cm[true_label][predicted_label], so for
# labels [0, 1] the flattened order is TN, FP, FN, TP:
#   cm[0][1] = true 0 predicted 1 -> false positive
#   cm[1][0] = true 1 predicted 0 -> false negative
# (the original assignments had these two swapped).
# labels=[0, 1] pins the 2x2 shape even if one class is absent from the split.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()
print("True Positive  : ", TP)
print("True Negative  : ", TN)
print("False Positive : ", FP)
print("False Negative : ", FN)

In [None]:
# Accuracy = correctly classified pairs / all evaluated pairs.
correct_predictions = TP + TN
total_predictions = TP + TN + FP + FN
Accuracy = correct_predictions / total_predictions
Accuracy