In [None]:
import pandas as pd
# Load the training split of the question-pairs data from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/quora-question-pairs/train.csv.zip')

In [None]:
df.head()  # preview the first rows to see which columns are available

In [None]:
df.shape  # (number of rows, number of columns)

In [None]:
df.info() # dtypes and non-null counts per column; some values are missing from the 'question2' column

In [None]:
df.question2.isnull().sum() # number of missing 'question2' values; very few rows are affected, and they are filled with '' further below rather than dropped.

**let's see if the classes are balanced or not**

In [None]:
df['is_duplicate'].value_counts()  # number of question pairs in each 'is_duplicate' class

**https://stackoverflow.com/questions/26476668/frequency-plot-in-python-pandas-dataframe**

In [None]:
import matplotlib.pyplot as plt
import pandas

# plt.subplots() returns a (figure, axes) tuple; drawing on an explicit axes
# object keeps the plot independent of pyplot's "current axes" state.
fig, ax = plt.subplots()

# Bar chart of how many question pairs fall into each 'is_duplicate' class.
class_counts = df['is_duplicate'].value_counts()
class_counts.plot.bar(ax=ax)

**https://stackoverflow.com/questions/34162443/why-do-many-examples-use-fig-ax-plt-subplots-in-matplotlib-pyplot-python**

In [None]:
df.groupby("is_duplicate").count() 
# Groups the whole frame by 'is_duplicate' and counts the non-null values of every other column within each group.

In [None]:
df[['id','is_duplicate']].groupby(['is_duplicate']).count()
# Selects only the 'id' and 'is_duplicate' columns first, then groups by 'is_duplicate' and counts the 'id' values per group.

In [None]:
df[['id','is_duplicate']].groupby(['is_duplicate']).count().plot.bar() 
# Selects the two columns 'id' and 'is_duplicate', groups by 'is_duplicate', counts the 'id' values per group and plots the counts as a bar chart.

In [None]:
df.groupby("is_duplicate")['id'].count().plot.bar()
# Groups the full frame by 'is_duplicate', then counts only the 'id' column in each group and plots the result.
# 'id' is expected to be unique for every question pair, so each bar is the number of pairs in that class.

In [None]:
df[['id','is_duplicate']].groupby('is_duplicate').count().plot.bar()
# Select only the columns that are required for the plot.
# Remember: plotting needs numeric data, so a column to count ('id') has to be kept next to the grouping key.
# df[['is_duplicate']].groupby('is_duplicate').count().plot.bar() would result in an error, because no column is left to count.

In [None]:
import seaborn as sns
sns.countplot(x = 'is_duplicate',data = df) # This is the simplest and most elegant option: seaborn does the counting itself

In [None]:
sns.countplot(x = df.is_duplicate) # Alternative: only one column is plotted, so pass that column directly instead of the whole DataFrame

In [None]:
# Share of each class as a percentage of all question pairs.
# Assumes 'is_duplicate' is a 0/1 label, so its mean is the fraction of pairs marked as duplicates.
duplicate_pct = round(df['is_duplicate'].mean() * 100, 2)
print('\n~> Question pairs are Similar (is_duplicate = 1):\n   {}%'.format(duplicate_pct))
# The other class is the complement. Round again so that float subtraction
# noise (e.g. 63.080000000000005) is not printed.
print('\n~> Question pairs are not Similar (is_duplicate = 0):\n   {}%'.format(round(100 - duplicate_pct, 2)))

**Number of Unique Questions**

In [None]:
import numpy as np
# Pool the ids of both question columns so every occurrence of a question is counted.
qids = pd.Series(df['qid1'].tolist() + df['qid2'].tolist())

# value_counts() maps each distinct question id to the number of times it occurs.
q_vals = qids.value_counts()

unique_questions = len(qids.unique())
print("Total Number of Unique Questions {}".format(unique_questions))

# Questions whose id occurs more than once across qid1/qid2.
repeated_questions = np.sum(q_vals > 1)
print("Number of Questions that appear more than once are {} ({:.2f}%)\n".format(
    repeated_questions, repeated_questions / unique_questions * 100))

print("Maximum Number of times one question is repeated is {}\n".format(q_vals.max()))


# print(type(q_vals),q_vals.shape) # two columns one containing category other containing it's occurence
# print(q_vals)
# q_vals = q_vals.values # Return Series as ndarray or ndarray-like depending on the dtype
# print(q_vals)

In [None]:
# Bar chart comparing the number of distinct questions with the number of
# questions that occur more than once.
x_axis = ['Unique Questions','Repeated Questions']
y_axis = [unique_questions,repeated_questions]

plt.figure(figsize=(10,6))
bar_ax = sns.barplot(x = x_axis,y= y_axis)  # draws on the axes of the figure created above
bar_ax.set_title(" Plot representing unique and repeated questions  ")
plt.show()

## Checking for Duplicates

In [None]:
# Collapse the frame to one row per distinct (qid1, qid2) combination; the
# 'is_duplicate' column of the result holds how many rows shared that combination.
pair_duplicates = df[['qid1','qid2','is_duplicate']].groupby(['qid1','qid2']).count().reset_index()
print(pair_duplicates.head())
# Every row lost by the grouping repeats a (qid1, qid2) pair that was already present.
print("Number of duplicate question pairs (repeated qid1/qid2 rows) :- ",df.shape[0] - pair_duplicates.shape[0])

## Number of occurrences of each Question

In [None]:
# Distribution of how often each question id occurs, drawn with a log-scaled y-axis.
appearance_counts = qids.value_counts()

plt.figure(figsize=(20, 5))
plt.hist(appearance_counts)  # an integer `bins` means equal-width bins; the default is 10
plt.yscale("log")
plt.title('Log-Histogram of question appearance counts')
plt.xlabel('Number of occurences of question')
plt.ylabel('Number of questions')

print('Maximum number of times a single question is repeated: {}\n'.format(appearance_counts.max()))

In [None]:
# Finer-grained histogram of the question appearance counts: 160 bins and larger fonts.
plt.figure(figsize = (20,6))
plt.hist(qids.value_counts(),bins = 160) 
plt.yscale("log",nonpositive = 'clip')# log-scaled y-axis; non-positive values are clipped to a very small positive number instead of being masked

plt.title('Log-Histogram of question appearance counts',fontsize = 15)

plt.xlabel('Number of occurences of Question',fontsize = 15)

plt.ylabel('Number of Questions',fontsize = 15)

print ('Maximum number of times a single Question is repeated: {}\n'.format(max(qids.value_counts()))) 

## Checking for Null Values

In [None]:
nan_rows = df[df.isnull().any(axis = 1)] # keep the rows that contain at least one null; axis = 1 evaluates any() across the columns of each row
print (nan_rows)

In [None]:
# Replace the null values with an empty string ('').
# Rows with a missing text column would normally be dropped; they are kept here
# because only a couple of rows are affected.
df = df.fillna('')
# Sanity check: no row should contain a null any more, so this frame must be empty.
# `axis` is passed by keyword because pandas >= 2.0 no longer accepts it positionally.
nan_rows = df[df.isnull().any(axis = 1)]
print (nan_rows)

<h2> Basic Feature Extraction (before cleaning) </h2>

Let us now construct a few features like:
 - **freq_qid1** = Frequency of qid1's
 - **freq_qid2** = Frequency of qid2's
 - **q1len** = Length of q1 (string length)
 - **q2len** = Length of q2 (string length)
 - **q1_n_words** = Number of words in Question 1
 - **q2_n_words** = Number of words in Question 2
 - **word_Common** = (Number of common unique words in Question 1 and Question 2)
 - **word_Total** = (Total num of words in Question 1 + Total num of words in Question 2)
 - **word_share** = (word_Common)/(word_Total)
 - **freq_q1+freq_q2** = sum total of frequency of qid1 and qid2
 - **freq_q1-freq_q2** = absolute difference of frequency of qid1 and qid2

In [None]:
df.head()  # look at the frame again before adding the feature columns

**https://pbpython.com/pandas_transform.html**

In [None]:
# transform('count') computes the count of each group and broadcasts it back to every row of that group,
# so each row receives the number of rows in which its qid1 (resp. qid2) value occurs in that same column.
df['freq_qid1'] = df.groupby('qid1')['qid1'].transform('count') # where count is the function that is applied to each group.
df['freq_qid2'] = df.groupby('qid2')['qid2'].transform('count') 

print(df[['freq_qid1','freq_qid2']]) #  the left-most column of the output is the index of the original DataFrame

In [None]:
df['q1len'] = df['question1'].str.len() # length of question1 in characters
df['q2len'] = df['question2'].str.len() # length of question2 in characters

In [None]:
def _n_space_tokens(text):
    """Return how many pieces `text` splits into on single spaces.

    Consecutive spaces yield empty pieces that are counted as well, and an
    empty string counts as one piece.
    """
    return len(text.split(" "))

# Number of words in each question.
df['q1_n_words'] = df['question1'].map(_n_space_tokens)
df['q2_n_words'] = df['question2'].map(_n_space_tokens)

In [None]:
df[['question1','q1len','freq_qid1','question2','q2len','freq_qid2']].head()  # spot-check the length and frequency features next to the question text

In [None]:
def normalized_word_Common(row):
    """Count the distinct words shared by the two questions of a row.

    Each question is split on single spaces; every piece is lower-cased and
    stripped of surrounding whitespace. Empty pieces (produced by consecutive,
    leading or trailing spaces, or by an empty question) are discarded so that
    they are never counted as a shared word.

    Despite the name, the result is a raw count, not a normalized ratio.

    Parameters
    ----------
    row : mapping
        Must provide string values under the keys 'question1' and 'question2'
        (e.g. a DataFrame row passed in by ``df.apply(..., axis=1)``).

    Returns
    -------
    float
        Number of distinct words that occur in both questions.
    """
    def _tokens(text):
        # Set of normalised, non-empty words of one question.
        cleaned = (piece.lower().strip() for piece in text.split(" "))
        return {word for word in cleaned if word}

    return 1.0 * len(_tokens(row['question1']) & _tokens(row['question2']))

df['word_Common'] = df.apply(normalized_word_Common,axis = 1) # axis = 1 passes each row to the function, giving one value per question pair

In [None]:
df.head()  # inspect the frame with the feature columns added so far