In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
# Show the full text of long string columns instead of truncating them.
# `None` means "no limit"; the legacy `-1` sentinel was deprecated in pandas 1.0
# and is no longer accepted by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [2]:
class Lodash:
    """Minimal lodash-style helper exposing left-to-right function composition."""

    def flow(self, *args):
        """Build a one-argument callable that pipes a value through ``args``.

        The functions run in the order given, each receiving the previous
        function's result: ``flow(f, g)(x)`` evaluates ``g(f(x))``. With no
        functions the returned callable hands its argument back unchanged.
        """
        def composed(payload):
            # Thread the running value through every step in turn.
            current = payload
            for step in args:
                current = step(current)
            return current

        return composed
# Shared instance; the helper functions in later cells compose steps via dash.flow(...).
dash = Lodash()

In [3]:
# Load the labelled training pairs from the competition input folder.
train = pd.read_csv('../input/train.csv')

## Checking the data

Basic helper functions that compute counts over the whole dataframe.

In [4]:
def getNumberOfRows(df):
    """Return how many rows ``df`` contains."""
    rowCount = len(df)
    return rowCount

def getDuplicates(df):
    """Return only the rows of ``df`` whose ``is_duplicate`` value equals 1."""
    isMarkedDuplicate = df['is_duplicate'].eq(1)
    return df[isMarkedDuplicate]

def getDuplicatesPercentage(df):
    """Return the percentage of rows in ``df`` marked as duplicate.

    A row counts as duplicate when its ``is_duplicate`` value equals 1.

    Returns:
        The percentage (0-100) rounded to two decimals, or ``0.0`` when ``df``
        has no rows (previously this raised ``ZeroDivisionError``).
    """
    totalRows = len(df)
    if totalRows == 0:
        # An empty subset has no duplicates; avoid dividing by zero.
        return 0.0
    # Count matches on the boolean mask instead of materialising a filtered copy.
    duplicateRows = int((df['is_duplicate'] == 1).sum())
    return round(duplicateRows * 100 / totalRows, 2)

def getQuestionsIds(df):
    """Return all question ids in ``df`` as one list.

    The list holds every ``qid1`` value followed by every ``qid2`` value, so an
    id appears once per occurrence.
    """
    firstIds = df['qid1'].tolist()
    secondIds = df['qid2'].tolist()
    return [*firstIds, *secondIds]

def getTotalNumberOfQuestions(df):
    """Count the question ids in ``df`` across both id columns, repeats included."""
    allIds = getQuestionsIds(df)
    return len(allIds)

def getUniqueQuestionsIds(df):
    """Return the set of distinct question ids found in ``df``."""
    allIds = getQuestionsIds(df)
    return set(allIds)

def getTotalNumberOfUniqueQUestions(df):
    """Count the distinct question ids found in ``df``."""
    distinctIds = getUniqueQuestionsIds(df)
    return len(distinctIds)

def getRepeatedQuestionsIds(df):
    """Return a list of the question ids that occur more than once in ``df``.

    Occurrences are counted over the combined ids returned by
    ``getQuestionsIds``; each repeated id appears once in the result.
    """
    occurrences = pd.Series(getQuestionsIds(df)).value_counts()
    seenMoreThanOnce = occurrences.gt(1)
    repeatedIds = occurrences[seenMoreThanOnce].index.tolist()
    return list(set(repeatedIds))



def getNumberOfRepeatedQuestionsIds(df):
    """Count the question ids that occur more than once in ``df``."""
    repeatedIds = getRepeatedQuestionsIds(df)
    return len(repeatedIds)

def getPercentageOfRepeatedQuestions(df):
    """Return the percentage of distinct questions in ``df`` that are repeated.

    Returns:
        The percentage (0-100) rounded to two decimals, or ``0.0`` when ``df``
        contains no questions.
    """
    uniqueQuestions = getTotalNumberOfUniqueQUestions(df)
    if uniqueQuestions == 0:
        # Nothing to compare against; avoid dividing by zero.
        return 0.0
    repeatedQuestions = getNumberOfRepeatedQuestionsIds(df)
    # Scale to a percentage *before* rounding. Rounding the ratio first and then
    # multiplying by 100 discarded the decimals and produced float artefacts
    # such as 7.000000000000001. This also matches getDuplicatesPercentage.
    return round(repeatedQuestions * 100 / uniqueQuestions, 2)

In [5]:
def printReport(df):
    """Print the headline counts and percentages for the question pairs in ``df``."""
    # Each entry pairs the output template with the helper that supplies its value.
    reportLines = (
        ("Total Number of Questions Pairs: {:,}", getNumberOfRows),
        ("Percentage of pairs Marked as duplicate: {:,}%", getDuplicatesPercentage),
        ("Total Number of Questions: {:,}", getTotalNumberOfQuestions),
        ("Total Number of Unique Questions: {:,}", getTotalNumberOfUniqueQUestions),
        ("Total Number of Repeated Questions: {:,}", getNumberOfRepeatedQuestionsIds),
        ("Percentage of Repeated Questions: {:,}%", getPercentageOfRepeatedQuestions),
    )
    for template, metric in reportLines:
        print(template.format(metric(df)))
    print('\n')

# Summarise the whole training set.
printReport(train)

### Statistics for pairs with repeated questions.

Here we show that the percentage of duplicate pairs varies across the following mutually exclusive sets of rows:

- No question is repeated: 22.4%
- One question is repeated: 15.71%
- The two questions are repeated: 71.6%

This is a bias introduced by the way the data was collected, but we can still use these percentages on our test set to get a good score, provided its distribution is similar.

In [6]:

def getRowsWithNoRepeatedQuestion(df):
    """Return the rows of ``df`` in which neither question id is repeated."""
    everyId = set(getUniqueQuestionsIds(df))
    repeatedIds = set(getRepeatedQuestionsIds(df))
    # Ids that are present in df but never occur more than once.
    seenOnce = everyId - repeatedIds
    firstSeenOnce = df['qid1'].isin(seenOnce)
    secondSeenOnce = df['qid2'].isin(seenOnce)
    return df[firstSeenOnce & secondSeenOnce]

def getRowsWithAtLeastOneRepeatedQuestion(df):
    """Return the rows of ``df`` in which exactly one of the two questions is repeated.

    Despite the name, rows where *both* questions are repeated are excluded, so
    the result does not overlap with ``getRowsWithTwoRepeatedQuestions``.
    """
    # Compute the repeated ids once; the original called the helper twice.
    repeatedQuestionsIds = getRepeatedQuestionsIds(df)
    q1IsRepeated = df['qid1'].isin(repeatedQuestionsIds)
    q2IsRepeated = df['qid2'].isin(repeatedQuestionsIds)

    # Every id in df is either repeated or not, so "one repeated and the other
    # not" is an exclusive-or; no separate set of non-repeated ids is needed.
    return df[q1IsRepeated ^ q2IsRepeated]

def getRowsWithTwoRepeatedQuestions(df):
    """Return the rows of ``df`` in which both question ids are repeated."""
    repeatedIds = getRepeatedQuestionsIds(df)
    firstIsRepeated = df['qid1'].isin(repeatedIds)
    secondIsRepeated = df['qid2'].isin(repeatedIds)
    return df[firstIsRepeated & secondIsRepeated]

    

In [7]:
def printDuplicatePairsReport(df):
    """Print the number of pairs in ``df`` and the percentage marked as duplicate."""
    pairCount = getNumberOfRows(df)
    print("Total Number of Questions Pairs: {:,}".format(pairCount))
    duplicateShare = getDuplicatesPercentage(df)
    print("Percentage of pairs Marked as duplicate: {:,}%\n".format(duplicateShare))

def printStatisticsForPairsWithRepeatedQuestions(df):
    """Print the duplicate-pair report separately for each repetition group of ``df``."""
    # Section header paired with the function that selects that group's rows.
    sections = (
        ("## Statistics for rows with no repeated questions ##", getRowsWithNoRepeatedQuestion),
        ("## Statistics for rows with one repeated question ##", getRowsWithAtLeastOneRepeatedQuestion),
        ("## Statistics for rows with two repeated questions ##", getRowsWithTwoRepeatedQuestions),
    )
    for header, selectRows in sections:
        print(header)
        printDuplicatePairsReport(selectRows(df))

# Report the duplicate percentage of each repetition group in the training set.
printStatisticsForPairsWithRepeatedQuestions(train)

## base score

A good baseline is to assign each pair in the test set the duplicate percentage of the group it belongs to.
To do this we need to:
 - Read the test file.
 - Create ids for each question.
 - Generate a dataframe similar to the train dataframe.
 - Assign the score according to the group the row belongs to (no repeated question, one repeated question, two repeated questions).
 - Write the scores to a file so it can be uploaded.

In [None]:
# Read the file
#test = pd.read_csv('../input/test.csv', nrows= 10000)
# NOTE(review): the first 100 rows of the *training* file are loaded under the
# name `test`, presumably because the benchmarks below need the `is_duplicate`
# labels -- confirm, and switch to test.csv before building a real submission.
test = pd.read_csv('../input/train.csv', nrows = 100)

# Fraction of loaded pairs marked as duplicate; read as a global by
# calculate_and_print_benchmark_based_on_duplicate_percentage.
duplicate_score = test['is_duplicate'].mean()

In [None]:
def classifyByRepetition(row, repeatedQuestions):
    """Return the predicted duplicate probability for one question pair.

    The probability depends only on how many of the pair's two questions are
    repeated, using the per-group duplicate rates quoted in the notebook text
    (22.4%, 15.71% and 71.6%).

    Args:
        row: Mapping-like object with ``question1`` and ``question2`` entries.
        repeatedQuestions: Container of the questions considered repeated.

    Returns:
        0.224 when neither question is repeated, 0.716 when both are, and
        0.1571 when exactly one is.
    """
    q1IsRepeated = row['question1'] in repeatedQuestions
    q2IsRepeated = row['question2'] in repeatedQuestions

    if not q1IsRepeated and not q2IsRepeated:
        return 0.224
    elif q1IsRepeated and q2IsRepeated:
        # 71.6% as a fraction. The previous value 0.0716 was off by a factor of
        # ten and contradicted the rate reported for this group.
        return 0.716
    else:
        return 0.1571

def getRepeatedQuestions(df):
    """Return the set of question texts that occur more than once in ``df``.

    Occurrences are counted over ``question1`` and ``question2`` together.
    """
    # pd.concat replaces Series.append, which no longer exists in pandas 2.x.
    allQuestions = pd.concat([df['question1'], df['question2']], ignore_index=True)
    counts = allQuestions.value_counts()
    # Iterating the index directly replaces the removed Index.get_values().
    return set(counts[counts > 1].index)
    
def addIsDuplicatedColum(df):
    """Add a ``prediction_is_duplicate`` column to ``df`` and return ``df``.

    Each row receives the value ``classifyByRepetition`` gives it, based on the
    repeated questions found in ``df`` itself. ``df`` is modified in place.
    """
    # Compute the repeated questions once. The original rebuilt this set inside
    # the per-row lambda, re-scanning the whole frame for every row.
    repeatedQuestions = getRepeatedQuestions(df)
    df['prediction_is_duplicate'] = df.apply(
        lambda row: classifyByRepetition(row, repeatedQuestions), axis=1)
    return df

def calculate_and_print_benchmark_based_on_duplicate_percentage(df):
    """Print the log loss of predicting the global ``duplicate_score`` for every row of ``df``."""
    labels = df['is_duplicate']
    # Same constant prediction for every row, built with one entry per label.
    constantPrediction = np.zeros_like(labels) + duplicate_score
    score = log_loss(labels, constantPrediction)
    print('Binary Cross-Entropy Score based on duplicates:', score)

def calculate_and_print_benchmark_based_on_repetition(df):
    """Print the log loss of ``prediction_is_duplicate`` against ``is_duplicate`` in ``df``."""
    labels = df['is_duplicate']
    predictions = df['prediction_is_duplicate']
    score = log_loss(labels, predictions)
    print('Binary Cross-Entropy Score based on questions repetition:', score)
    
def mainAddIsDuplicatedColumnAndTransformToCsv(df):
    """Add the repetition-based prediction to ``df`` and print both benchmark scores.

    The CSV export is currently commented out, so nothing is written to disk.
    """
    scored = addIsDuplicatedColum(df)
    benchmarks = (
        calculate_and_print_benchmark_based_on_duplicate_percentage,
        calculate_and_print_benchmark_based_on_repetition,
    )
    for printBenchmark in benchmarks:
        printBenchmark(scored)
    #df.to_csv('base_submission.csv', index = False, columns=['test_id', 'is_duplicate'])

# Score the loaded sample; this also adds `prediction_is_duplicate` to `test` in place.
mainAddIsDuplicatedColumnAndTransformToCsv(test)
