# Data Exploration

In [37]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import log_loss


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
pd.set_option('display.max_colwidth', 200)
# Any results you write to the current directory are saved as output.

In [6]:
class Lodash:
    """Minimal helper offering left-to-right function composition."""

    def flow(self, *args):
        """Build a callable that pipes its argument through ``args`` in order.

        Each function receives the previous function's return value; with no
        functions the returned callable hands its argument back unchanged.
        """
        def pipeline(payload):
            for step in args:
                payload = step(payload)
            return payload
        return pipeline
dash = Lodash()

## Training Dataset structure
*Id*: The id of the question pair

*qid1*: The id of the first question

*qid2*: The id of the second question

*is_duplicate*: Whether the two questions are considered to be the same (1) or not (0)

In [3]:
# Load the training pairs and preview the first ten rows.
train  =  pd.read_csv('../input/train.csv')
train.head(10)

## Checking the data

Basic functions to count for the whole dataframe.

In [4]:
def getNumberOfRows(df):
    """Return the length of ``df`` (its row count when given a DataFrame)."""
    rowCount = len(df)
    return rowCount

def getDuplicates(df):
    """Return the rows of ``df`` whose ``is_duplicate`` value equals 1."""
    markedAsDuplicate = df['is_duplicate'].eq(1)
    return df[markedAsDuplicate]

def getDuplicatesPercentage(df):
    """Return the percentage of rows flagged as duplicates, rounded to 2 decimals."""
    totalRows = getNumberOfRows(df)
    duplicateRows = getNumberOfRows(getDuplicates(df))
    percentage = duplicateRows*100/totalRows
    return round(percentage, 2)

def getQuestionsIds(df):
    """Return every id from ``qid1`` followed by every id from ``qid2`` as one list."""
    questionIds = df['qid1'].tolist()
    questionIds.extend(df['qid2'].tolist())
    return questionIds

def getTotalNumberOfQuestions(df):
    """Return how many question ids appear in ``df``, counting repeats."""
    allIds = getQuestionsIds(df)
    return len(allIds)

def getUniqueQuestionsIds(df):
    """Return the set of distinct question ids found in ``qid1`` and ``qid2``."""
    allIds = getQuestionsIds(df)
    return set(allIds)

def getTotalNumberOfUniqueQUestions(df):
    """Return the number of distinct question ids in ``df``."""
    uniqueIds = getUniqueQuestionsIds(df)
    return len(uniqueIds)

def getRepeatedQuestionsIds(df):
    """Return a list of the question ids that occur more than once across both id columns."""
    occurrences = pd.Series(getQuestionsIds(df)).value_counts()
    seenMoreThanOnce = occurrences[occurrences > 1]
    repeatedIds = set(seenMoreThanOnce.index.tolist())
    return list(repeatedIds)

def getNumberOfRepeatedQuestionsIds(df):
    """Return how many distinct question ids occur more than once in ``df``."""
    repeatedIds = getRepeatedQuestionsIds(df)
    return len(repeatedIds)

def getPercentageOfRepeatedQuestions(df):
    """Return the percentage of unique questions that appear more than once.

    The ratio is scaled to a percentage *before* rounding to two decimals,
    matching ``getDuplicatesPercentage``. Rounding first and multiplying by
    100 afterwards leaks floating-point noise into the report
    (e.g. ``0.29 * 100 == 28.999999999999996``).

    Returns 0.0 when ``df`` holds no questions, instead of dividing by zero.
    """
    uniqueQuestions = getTotalNumberOfUniqueQUestions(df)
    if uniqueQuestions == 0:
        return 0.0
    repeatedQuestions = getNumberOfRepeatedQuestionsIds(df)
    return round(repeatedQuestions*100/uniqueQuestions, 2)

In [8]:
def printReport(df):
    """Print pair and question counts for ``df``, one metric per line."""
    # Each entry pairs an output template with the function that computes its value;
    # values are computed and printed one at a time, in this order.
    reportLines = (
        ("Total Number of Questions Pairs: {:,}", getNumberOfRows),
        ("Total Number of Questions: {:,}", getTotalNumberOfQuestions),
        ("Total Number of Unique Questions: {:,}", getTotalNumberOfUniqueQUestions),
        ("Total Number of Repeated Questions: {:,}", getNumberOfRepeatedQuestionsIds),
        ("Percentage of Repeated Questions: {:,}%", getPercentageOfRepeatedQuestions),
    )
    for template, compute in reportLines:
        print(template.format(compute(df)))
    print('\n')

printReport(train)

## Number of duplicated pairs
The percentage of datapoints that are considered duplicated is 36.9%

In [9]:
# Count rows (question pairs), not cells: DataFrame.size is rows * columns,
# which inflates both printed totals by the number of columns.
numberOfQuestions = len(train)
numberOfDuplicatedPairs = int((train['is_duplicate'] == 1).sum())
percentageDuplicatedPairs = numberOfDuplicatedPairs*100/numberOfQuestions
print('The size of the training set is: {}'.format(numberOfQuestions))
print('Total number of duplicated pairs: {}'.format(numberOfDuplicatedPairs))
print('Percentage Duplicated pairs: {}'.format(percentageDuplicatedPairs))

In [25]:
# value_counts() orders by frequency, not by label; sort_index() guarantees that
# each bar is drawn at the x position of the label it counts.
count_duplicate = train['is_duplicate'].value_counts().sort_index()
plt.bar(count_duplicate.index, count_duplicate.values)
plt.xticks(count_duplicate.index)
plt.ylabel('Number of pairs')
plt.xlabel('Duplicate', fontsize=12)
plt.title('Balance of classes in training set')
plt.show()



### Quantiles size for Training Data, number of words

Because the model needs inputs of a fixed size, we need to check how long the questions are.


In [27]:
# Stack both question columns into a single Series of question texts.
questions_train = pd.concat([train[column] for column in ('question1', 'question2')])
# Length = number of pieces after splitting on single spaces (non-strings go through str()).
q_train_len = questions_train.map(lambda text: len(str(text).split(' ')))
quantiles_train = q_train_len.quantile(q=[0.25, 0.5, 0.75, 0.99])
print("Quantiles Train: ")
print(quantiles_train)

We can see that the median number of words per question is 10 and that 99 percent of the questions are at most 31 words long, so using 32 words as the input length for questions is good enough.

In [28]:
# Histogram of per-question word counts (q_train_len), with a log-scaled y axis.
plt.figure(figsize=(10, 5))
plt.hist(q_train_len, bins=100, range=[0,300])
plt.title('Length Training set')
# The plotted lengths are word counts, not character counts.
plt.xlabel('Number of words per question')
plt.ylabel('Number of questions')
# The `nonposy` keyword was removed from Matplotlib (renamed `nonpositive`);
# clipping non-positive values is the default, so no keyword is needed.
plt.yscale('log')

## Testing Data Set

In [39]:
# Load the test pairs and preview the first ten rows.
test  =  pd.read_csv('../input/test.csv')
test.head(10)

### Quantiles for words

For the test dataset 32 is still a good upper boundary for the number of words

In [30]:
# Stack both question columns into a single Series of question texts.
questions_test = pd.concat([test[column] for column in ('question1', 'question2')])
# Length = number of pieces after splitting on single spaces (non-strings go through str()).
q_test_len = questions_test.map(lambda text: len(str(text).split(' ')))
quantiles_test = q_test_len.quantile(q=[0.25, 0.5, 0.75, 0.99])
print("Quantiles test: ")
print(quantiles_test)

In [31]:
# Histogram of per-question word counts (q_test_len), with a log-scaled y axis.
plt.figure(figsize=(10, 5))
plt.hist(q_test_len, bins=100, range=[0,300])
# This figure plots the test questions, so the title must not say "Training".
plt.title('Length Test set')
# The plotted lengths are word counts, not character counts.
plt.xlabel('Number of words per question')
plt.ylabel('Number of questions')
# The `nonposy` keyword was removed from Matplotlib (renamed `nonpositive`);
# clipping non-positive values is the default, so no keyword is needed.
plt.yscale('log')

# BenchMarks

### Statistics for pairs with repeated questions.

Here we show that the percentage of duplicated pairs varies across the following mutually exclusive sets:

- No question is repeated: 22.4%
- One question is repeated: 15.71%
- The two questions are repeated: 71.6%

This is a bias introduced by the way the data was collected, but we can still use these percentages on our test set, if its distribution is similar, to get a good score.

In [33]:

def getRowsWithNoRepeatedQuestion(df):
    """Return the rows where both ``qid1`` and ``qid2`` appear only once in ``df``."""
    appearingOnce = set(getUniqueQuestionsIds(df)).difference(getRepeatedQuestionsIds(df))
    q1AppearsOnce = df['qid1'].isin(appearingOnce)
    q2AppearsOnce = df['qid2'].isin(appearingOnce)
    return df[q1AppearsOnce & q2AppearsOnce]

def getRowsWithAtLeastOneRepeatedQuestion(df):
    """Return the rows where exactly one of ``qid1`` / ``qid2`` is a repeated question.

    Despite the name, rows where *both* ids are repeated are excluded, so this
    set is disjoint from the no-repeated and two-repeated sets.

    The repeated ids are computed once. Every id in a row is either repeated
    or not, so "one repeated and the other non-repeated" is the XOR of the
    two membership masks; no complement set needs to be built.
    """
    repeatedQuestionsIds = set(getRepeatedQuestionsIds(df))
    q1IsRepeated = df['qid1'].isin(repeatedQuestionsIds)
    q2IsRepeated = df['qid2'].isin(repeatedQuestionsIds)

    return df[q1IsRepeated ^ q2IsRepeated]

def getRowsWithTwoRepeatedQuestions(df):
    """Return the rows where both ``qid1`` and ``qid2`` are repeated questions."""
    repeatedIds = getRepeatedQuestionsIds(df)
    bothRepeated = df['qid1'].isin(repeatedIds) & df['qid2'].isin(repeatedIds)
    return df[bothRepeated]

    

In [34]:
def printDuplicatePairsReport(df):
    """Print the number of pairs in ``df`` and the percentage flagged as duplicates."""
    pairCount = getNumberOfRows(df)
    print("Total Number of Questions Pairs: {:,}".format(pairCount))
    duplicateShare = getDuplicatesPercentage(df)
    print("Percentage of pairs Marked as duplicate: {:,}%\n".format(duplicateShare))

def printStatisticsForPairsWithRepeatedQuestions(df):
    """Print the duplicate-pair report for each repetition group of ``df``."""
    # (heading, row selector) for each group, reported in this order.
    groups = (
        ("## Statistics for rows with no repeated questions ##", getRowsWithNoRepeatedQuestion),
        ("## Statistics for rows with one repeated question ##", getRowsWithAtLeastOneRepeatedQuestion),
        ("## Statistics for rows with two repeated questions ##", getRowsWithTwoRepeatedQuestions),
    )
    for heading, selectRows in groups:
        print(heading)
        printDuplicatePairsReport(selectRows(df))

printStatisticsForPairsWithRepeatedQuestions(train)

## base score

A good base score is to use these percentages for each group in the test set.
To do this we need to:
 - Read the test file.
 - Create ids for each question.
 - generate a df similar to the train ds.
 - Assign the score according to the group the row belongs (no repeated question, one repeated questions, two repeated questions)
 - create the score to upload it.

In [35]:
#Read the file
#test = pd.read_csv('../input/test.csv', nrows= 10000)
# NOTE(review): this rebinds `test` to the first 100 rows of train.csv (which carry
# `is_duplicate` labels), replacing the frame loaded from test.csv earlier in the
# notebook. Cells that use `test` give different results depending on execution order.
test = pd.read_csv('../input/train.csv', nrows = 100)

# Share of rows flagged as duplicates in this sample; read as a global by
# calculate_and_print_benchmark_based_on_duplicate_percentage.
# NOTE(review): the constant is derived from the same rows it is later scored on.
duplicate_score = test['is_duplicate'].mean()

In [38]:
def classifyByRepetition(row, repeatedQuestions):
    """Return the predicted duplicate probability for ``row`` from its repetition group.

    Parameters
    ----------
    row : mapping with ``'question1'`` and ``'question2'`` entries.
    repeatedQuestions : container of the question texts that appear more than once.

    Returns
    -------
    float
        0.224 when neither question is repeated, 0.1571 when exactly one is,
        and 0.716 when both are. These are the duplicate rates reported for
        each group in the statistics section of this notebook.
    """
    q1IsRepeated = row['question1'] in repeatedQuestions
    q2IsRepeated = row['question2'] in repeatedQuestions

    if q1IsRepeated and q2IsRepeated:
        # 71.6% as a probability is 0.716. The former 0.0716 was off by a factor of ten,
        # and a rate below the 36.9% overall duplicate rate is impossible for this group
        # when the other two groups sit at 22.4% and 15.71%.
        return 0.716
    if q1IsRepeated or q2IsRepeated:
        return 0.1571
    return 0.224

def getRepeatedQuestions(df):
    """Return the set of question texts that occur more than once in ``df``.

    Occurrences are counted across ``question1`` and ``question2`` together.
    Missing values are not counted, because ``value_counts`` drops them.

    Uses ``pd.concat`` and plain index iteration: ``Series.append`` and
    ``Index.get_values`` no longer exist in current pandas releases.
    """
    questionCounts = pd.concat(
        [df['question1'], df['question2']], ignore_index=True
    ).value_counts()
    return set(questionCounts[questionCounts > 1].index)
    
def addIsDuplicatedColum(df):
    """Add a ``prediction_is_duplicate`` column to ``df`` (in place) and return ``df``.

    Each row's prediction comes from ``classifyByRepetition``. The set of repeated
    questions is built once up front: building it inside the per-row callback
    re-scans the whole frame for every row.
    """
    repeatedQuestions = getRepeatedQuestions(df)
    getIsDuplicateValue = lambda row: classifyByRepetition(row, repeatedQuestions)
    df['prediction_is_duplicate'] =  df.apply(getIsDuplicateValue, axis=1 )
    return df

def calculate_and_print_benchmark_based_on_duplicate_percentage(df):
    """Print the log loss of predicting the global ``duplicate_score`` for every row of ``df``."""
    labels = df['is_duplicate']
    # Same constant prediction for every row.
    constantPrediction = np.zeros_like(labels) + duplicate_score
    score = log_loss(labels, constantPrediction)
    print('Binary Cross-Entropy Score based on duplicates:', score)

def calculate_and_print_benchmark_based_on_repetition(df):
    """Print the log loss of the ``prediction_is_duplicate`` column against ``is_duplicate``."""
    labels = df['is_duplicate']
    predictions = df['prediction_is_duplicate']
    score = log_loss(labels, predictions)
    print('Binary Cross-Entropy Score based on questions repetition:', score)
    
def mainAddIsDuplicatedColumnAndTransformToCsv(df):
    """Add the repetition-based prediction column to ``df`` and print both benchmark scores."""
    scoredDf = addIsDuplicatedColum(df)
    for printBenchmark in (
        calculate_and_print_benchmark_based_on_duplicate_percentage,
        calculate_and_print_benchmark_based_on_repetition,
    ):
        printBenchmark(scoredDf)
    # Writing the submission file is currently disabled:
    #df.to_csv('base_submission.csv', index = False, columns=['test_id', 'is_duplicate'])

mainAddIsDuplicatedColumnAndTransformToCsv(test)


### Counting Open and yes/no questions

I would like to know how many questions classified as yes/no questions are in the dataset.
This might help to see if yes/no questions are easier to tackle than open questions.

In [16]:


def isOpenQuestion(q):
    """Classify a question by its first word.

    Returns
    -------
    int
        0 for an open question (what/how/why/...), 1 for a yes/no question
        (is/are/should/do/does/can and their negated forms), 2 otherwise.

    The comparison uses the whole first word rather than a raw string prefix,
    so "Donald ...", "Canada ...", "Island ..." or "However ..." are no longer
    mistaken for "do", "can", "is" or "how" questions. Apostrophes are dropped
    before the lookup, so "What's" and "Whats" both count as "what" questions.
    Non-string input is passed through ``str()`` first.
    """
    openQuestionInitializer = {
        'what', 'how', 'why', 'who', 'whom', 'whose', 'when', 'where', 'which',
        'whats', 'hows', 'whys', 'whos', 'whens', 'wheres',
    }
    yesNoQuestionsInitializers = {
        'is', 'are', 'should', 'do', 'does', 'can',
        'isnt', 'arent', 'shouldnt', 'dont', 'doesnt', 'cant', 'cannot',
    }

    text = str(q).lower().replace("'", '').replace('\u2019', '')

    # Skip leading whitespace/punctuation, then take the first run of alphanumerics.
    start = 0
    while start < len(text) and not text[start].isalnum():
        start += 1
    end = start
    while end < len(text) and text[end].isalnum():
        end += 1
    firstWord = text[start:end]

    if firstWord in openQuestionInitializer:
        return 0
    elif firstWord in yesNoQuestionsInitializers:
        return 1
    else:
        return 2

# Tag each question with the code returned by isOpenQuestion
# (0 = open, 1 = yes/no, 2 = neither); this adds two columns to `train` in place.
train['Q1TypeOfQuestion'] = train['question1'].apply(isOpenQuestion)
train['Q2TypeOfQuestion'] = train['question2'].apply(isOpenQuestion)

In [19]:
# Count type-1 (yes/no) questions across both question columns.
yesNoInQ1 = int((train['Q1TypeOfQuestion'] == 1).sum())
yesNoInQ2 = int((train['Q2TypeOfQuestion'] == 1).sum())
numberOfYesNoQuestions = yesNoInQ1 + yesNoInQ2
print("Number of yes/no questions: ", numberOfYesNoQuestions)

In [20]:
# Count type-0 (open) questions across both question columns.
openInQ1 = int((train['Q1TypeOfQuestion'] == 0).sum())
openInQ2 = int((train['Q2TypeOfQuestion'] == 0).sum())
numberOfOpenQuestions = openInQ1 + openInQ2
print("Number of Open questions: ", numberOfOpenQuestions)

In [21]:
# Count type-2 (neither open nor yes/no) questions across both question columns.
unclassifiedInQ1 = int((train['Q1TypeOfQuestion'] == 2).sum())
unclassifiedInQ2 = int((train['Q2TypeOfQuestion'] == 2).sum())
numberOfNonClassifiedQuestions = unclassifiedInQ1 + unclassifiedInQ2
print("Number of no classified questions: ", numberOfNonClassifiedQuestions)

From these numbers we can conclude that the number of open questions in the dataset is bigger than the number of yes/no questions: even if all the non-classified questions were yes/no questions, the number of open questions would still be far bigger.

### Creation of a correlation matrix based on the number of duplicates given the kind of question

In [22]:
# corrMat[i][j] = mean of is_duplicate over pairs whose first question has type i
# and whose second question has type j (types are the codes 0, 1, 2).
questionTypes = range(0, 3)
corrMat = np.array([
    [
        train[(train['Q1TypeOfQuestion'] == q1Type) & (train['Q2TypeOfQuestion'] == q2Type)].is_duplicate.mean()
        for q2Type in questionTypes
    ]
    for q1Type in questionTypes
])
sns.heatmap(corrMat, vmax=0.5, square=True, annot=True)

We can see from this matrix (each cell is the share of duplicate pairs for that combination of question types) that if the two questions are of the same type, the share of duplicates is higher than in other cases.