In [1]:
import functools
from collections import Counter
from collections import defaultdict

import numpy as np
import pandas as pd
from nltk.corpus import stopwords


def q1_freq(row):
    """Return how many entries the global ``q_dict`` holds for ``row['question1']``.

    ``q_dict`` is looked up at call time, so it must already exist at module
    level when this function runs.
    """
    partners = q_dict[row['question1']]
    return len(partners)


def q2_freq(row):
    """Return how many entries the global ``q_dict`` holds for ``row['question2']``.

    ``q_dict`` is looked up at call time, so it must already exist at module
    level when this function runs.
    """
    partners = q_dict[row['question2']]
    return len(partners)


def q1_q2_intersect(row):
    """Count the entries that ``q_dict`` lists for both questions of ``row``.

    The two ``q_dict`` lookups (for ``question1`` and ``question2``) are each
    converted to a set and the size of their intersection is returned.
    """
    partners_of_q1 = set(q_dict[row['question1']])
    partners_of_q2 = set(q_dict[row['question2']])
    return len(partners_of_q1 & partners_of_q2)


def get_weight(count, eps=10000, min_count=2):
    """Turn an occurrence count into a smoothed inverse-frequency weight.

    Counts below ``min_count`` get weight 0; every other count gets
    ``1 / (count + eps)``, so a larger ``eps`` flattens the weights.
    """
    return 0 if count < min_count else 1 / (count + eps)


def word_match_share(row, stops=None):
    """Share of the two questions' distinct non-stop words that they have in common.

    Parameters
    ----------
    row : mapping with ``'question1'`` and ``'question2'``
        Each value is iterated word by word.
    stops : container of words to ignore, optional
        ``None`` (the default) means "ignore nothing".

    Returns
    -------
    ``2 * |shared| / (|q1 words| + |q2 words|)`` over distinct non-stop words,
    or ``0`` when either question has no non-stop word left.
    """
    if stops is None:
        # The default used to crash on `word not in None`.
        stops = ()
    q1words = {word for word in row['question1'] if word not in stops}
    q2words = {word for word in row['question2'] if word not in stops}
    if not q1words or not q2words:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    # Each shared word is counted once per question, hence the factor of two.
    shared = len(q1words & q2words)
    return (shared + shared) / (len(q1words) + len(q2words))


def jaccard(row):
    """Jaccard similarity of the two questions' distinct words.

    Returns ``|intersection| / |union|``; when both questions are empty the
    denominator is forced to 1 so the result is 0 rather than a division error.
    """
    first = set(row['question1'])
    second = set(row['question2'])
    union_size = len(first | second)
    if union_size == 0:
        union_size = 1
    return len(first & second) / union_size


def common_words(row):
    """Number of distinct words that appear in both questions of ``row``."""
    first = set(row['question1'])
    second = set(row['question2'])
    return len(first & second)


def total_unique_words(row):
    """Number of distinct words across both questions of ``row`` combined."""
    vocabulary = set(row['question1'])
    vocabulary.update(row['question2'])
    return len(vocabulary)


def total_unq_words_stop(row, stops):
    """Number of distinct words across both questions that are not in ``stops``."""
    vocabulary = set(row['question1'])
    vocabulary.update(row['question2'])
    return sum(1 for word in vocabulary if word not in stops)


def wc_diff(row):
    """Absolute difference between the lengths of the two questions of ``row``."""
    len_q1 = len(row['question1'])
    len_q2 = len(row['question2'])
    return abs(len_q1 - len_q2)


def wc_ratio(row):
    """Ratio of the length of ``question2`` to the length of ``question1``.

    Returns ``np.nan`` when ``question2`` is empty and ``0.0`` when only
    ``question1`` is empty.

    NOTE(review): the ratio is always question2 / question1 and is therefore
    not symmetric; the zero check on question1 exists only to avoid dividing
    by zero. Kept as is because downstream features depend on these values.
    """
    len_q1 = float(len(row['question1']))
    len_q2 = len(row['question2'])
    if len_q2 == 0:
        return np.nan
    if len_q1 == 0:
        # Empty question1: 0 / len_q2, i.e. 0.0.
        return len_q1 / len_q2
    return len_q2 / len_q1


def wc_diff_unique(row):
    """Absolute difference between the two questions' distinct-word counts."""
    distinct_q1 = len(set(row['question1']))
    distinct_q2 = len(set(row['question2']))
    return abs(distinct_q1 - distinct_q2)


def wc_ratio_unique(row):
    """Ratio of distinct words in ``question2`` to distinct words in ``question1``.

    Returns ``np.nan`` when ``question2`` has no words and ``0.0`` when only
    ``question1`` has none.

    NOTE(review): the ratio is always question2 / question1 (not symmetric);
    the zero check on question1 only avoids dividing by zero.
    """
    distinct_q1 = float(len(set(row['question1'])))
    distinct_q2 = len(set(row['question2']))
    if distinct_q2 == 0:
        return np.nan
    if distinct_q1 == 0:
        # Empty question1: 0 / distinct_q2, i.e. 0.0.
        return distinct_q1 / distinct_q2
    return distinct_q2 / distinct_q1


def wc_diff_unique_stop(row, stops=None):
    """Absolute difference between the two questions' distinct non-stop word counts.

    Parameters
    ----------
    row : mapping with ``'question1'`` and ``'question2'``
    stops : container of words to ignore, optional
        ``None`` (the default) means "ignore nothing".
    """
    if stops is None:
        # The default used to crash on `x not in None`.
        stops = ()
    kept_q1 = sum(1 for word in set(row['question1']) if word not in stops)
    kept_q2 = sum(1 for word in set(row['question2']) if word not in stops)
    return abs(kept_q1 - kept_q2)


def wc_ratio_unique_stop(row, stops=None):
    """Ratio of distinct non-stop words in ``question2`` to those in ``question1``.

    Parameters
    ----------
    row : mapping with ``'question1'`` and ``'question2'``
    stops : container of words to ignore, optional
        ``None`` (the default) means "ignore nothing".

    Returns
    -------
    ``np.nan`` when ``question2`` has no non-stop word, ``0.0`` when only
    ``question1`` has none, otherwise ``count(question2) / count(question1)``.

    NOTE(review): the ratio is always question2 / question1 (not symmetric);
    that matches the existing feature values and is intentionally unchanged.
    """
    if stops is None:
        # The default used to crash on `x not in None`.
        stops = ()
    kept_q1 = float(sum(1 for word in set(row['question1']) if word not in stops))
    kept_q2 = sum(1 for word in set(row['question2']) if word not in stops)
    if kept_q2 == 0:
        return np.nan
    if kept_q1 == 0:
        # Nothing left in question1: 0 / kept_q2, i.e. 0.0.
        return kept_q1 / kept_q2
    return kept_q2 / kept_q1


def same_start_word(row):
    """1 if both questions begin with the same first element, 0 if not.

    Returns ``np.nan`` when either question is empty, since there is no first
    element to compare.
    """
    first_q, second_q = row['question1'], row['question2']
    if first_q and second_q:
        return int(first_q[0] == second_q[0])
    return np.nan


def char_diff(row):
    """Absolute difference in total characters between the two questions.

    Each question's words are concatenated without separators before the
    lengths are compared.
    """
    chars_q1 = len(''.join(row['question1']))
    chars_q2 = len(''.join(row['question2']))
    return abs(chars_q1 - chars_q2)


def char_ratio(row):
    """Ratio of ``question2``'s character count to ``question1``'s.

    Words are concatenated without separators before counting. Returns
    ``np.nan`` when ``question2`` has no characters and ``0.0`` when only
    ``question1`` has none.

    NOTE(review): the ratio is always question2 / question1 (not symmetric);
    the zero check on question1 only avoids dividing by zero.
    """
    chars_q1 = len(''.join(row['question1']))
    chars_q2 = len(''.join(row['question2']))
    if chars_q2 == 0:
        return np.nan
    if chars_q1 == 0:
        # Empty question1: 0 / chars_q2, i.e. 0.0.
        return chars_q1 / chars_q2
    return chars_q2 / chars_q1


def char_diff_unique_stop(row, stops=None):
    """Absolute difference in characters over each question's distinct non-stop words.

    Parameters
    ----------
    row : mapping with ``'question1'`` and ``'question2'``
    stops : container of words to ignore, optional
        ``None`` (the default) means "ignore nothing".

    Returns
    -------
    ``abs(chars1 - chars2)`` where each side sums the lengths of that
    question's distinct words that are not in ``stops``.
    """
    if stops is None:
        # The default used to crash on `x not in None`.
        stops = ()
    chars_q1 = sum(len(word) for word in set(row['question1']) if word not in stops)
    chars_q2 = sum(len(word) for word in set(row['question2']) if word not in stops)
    return abs(chars_q1 - chars_q2)


def tfidf_word_match_share_stops(row, stops=None, weights=None):
    """Weighted share of shared words between the two questions, ignoring stop words.

    Parameters
    ----------
    row : mapping with ``'question1'`` and ``'question2'``
        Each value is iterated word by word.
    stops : container of words to ignore, optional
        ``None`` (the default) means "ignore nothing".
    weights : mapping word -> numeric weight, optional
        Words missing from the mapping weigh 0. ``None`` (the default) is
        treated as an empty mapping, i.e. every word weighs 0.

    Returns
    -------
    ``0`` when either question has no non-stop word. Otherwise the summed
    weight of the shared words (counted once per question) divided by the
    summed weight of all distinct non-stop words of both questions. When that
    denominator is 0 the result is NaN.
    """
    if stops is None:
        # The default used to crash on `word not in None`.
        stops = ()
    if weights is None:
        # The default used to crash on `None.get`.
        weights = {}

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # weights are summed in the same order as before.
    q1words = dict.fromkeys(word for word in row['question1'] if word not in stops)
    q2words = dict.fromkeys(word for word in row['question2'] if word not in stops)
    if not q1words or not q2words:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared_weights = (
        [weights.get(w, 0) for w in q1words if w in q2words]
        + [weights.get(w, 0) for w in q2words if w in q1words]
    )
    total_weights = (
        [weights.get(w, 0) for w in q1words]
        + [weights.get(w, 0) for w in q2words]
    )

    # A zero denominator is an expected case; produce NaN quietly instead of
    # emitting a RuntimeWarning for every such row.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.sum(shared_weights) / np.sum(total_weights)


def tfidf_word_match_share(row, weights=None):
    """Weighted share of shared words between the two questions (stop words included).

    Parameters
    ----------
    row : mapping with ``'question1'`` and ``'question2'``
        Each value is iterated word by word.
    weights : mapping word -> numeric weight, optional
        Words missing from the mapping weigh 0. ``None`` (the default) is
        treated as an empty mapping, i.e. every word weighs 0.

    Returns
    -------
    ``0`` when either question has no words at all. Otherwise the summed
    weight of the shared words (counted once per question) divided by the
    summed weight of all distinct words of both questions. When that
    denominator is 0 the result is NaN.
    """
    if weights is None:
        # The default used to crash on `None.get`.
        weights = {}

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # weights are summed in the same order as before.
    q1words = dict.fromkeys(row['question1'])
    q2words = dict.fromkeys(row['question2'])
    if not q1words or not q2words:
        # No stop-word filtering happens here, so this only triggers for a
        # question that has no words at all.
        return 0

    shared_weights = (
        [weights.get(w, 0) for w in q1words if w in q2words]
        + [weights.get(w, 0) for w in q2words if w in q1words]
    )
    total_weights = (
        [weights.get(w, 0) for w in q1words]
        + [weights.get(w, 0) for w in q2words]
    )

    # A zero denominator is an expected case; produce NaN quietly instead of
    # emitting a RuntimeWarning for every such row.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.sum(shared_weights) / np.sum(total_weights)


def build_features(data, stops, weights):
    """Build the per-row question-pair feature table.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose ``question1`` and ``question2`` columns. The row functions
        iterate over and index these values, so they are presumably token
        lists (TODO confirm against the caller).
    stops : container of words
        Passed to every stop-word-aware feature.
    weights : mapping word -> weight
        Passed to the two weighted word-match features.

    Returns
    -------
    pandas.DataFrame
        One column per feature, in the order listed below, indexed like the
        result of ``data.apply(..., axis=1)``.
    """
    def per_row(func, **kwargs):
        # Deliberately no raw=True: with raw=True pandas hands the function a
        # bare ndarray, which cannot be indexed by the 'question1' /
        # 'question2' labels every row function relies on.
        return data.apply(functools.partial(func, **kwargs), axis=1)

    stop_kw = {'stops': stops}
    # (column name, row function, extra keyword arguments), in output order.
    # `common_words` is currently disabled and therefore not listed.
    feature_specs = [
        ('word_match', word_match_share, stop_kw),                          # 1
        ('tfidf_wm', tfidf_word_match_share, {'weights': weights}),         # 2
        ('tfidf_wm_stops', tfidf_word_match_share_stops,
         {'stops': stops, 'weights': weights}),                             # 3
        ('jaccard', jaccard, {}),                                           # 4
        ('wc_diff', wc_diff, {}),                                           # 5
        ('wc_ratio', wc_ratio, {}),                                         # 6
        ('wc_diff_unique', wc_diff_unique, {}),                             # 7
        ('wc_ratio_unique', wc_ratio_unique, {}),                           # 8
        ('wc_diff_unq_stop', wc_diff_unique_stop, stop_kw),                 # 9
        ('wc_ratio_unique_stop', wc_ratio_unique_stop, stop_kw),            # 10
        ('same_start', same_start_word, {}),                                # 11
        ('char_diff', char_diff, {}),                                       # 12
        ('char_diff_unq_stop', char_diff_unique_stop, stop_kw),             # 13
        ('total_unique_words', total_unique_words, {}),                     # 15
        ('total_unq_words_stop', total_unq_words_stop, stop_kw),            # 16
        ('char_ratio', char_ratio, {}),                                     # 17
    ]

    X = pd.DataFrame()
    for column, func, kwargs in feature_specs:
        X[column] = per_row(func, **kwargs)
    return X


# Load the raw CSVs; missing values are replaced by a single space.
df_train = pd.read_csv('./input/train.csv')
df_train = df_train.fillna(' ')
df_test = pd.read_csv('./input/test.csv')
df_test = df_test.fillna(' ')

# All (question1, question2) pairs from train and test, stacked under a fresh
# 0..n-1 index (drop=True discards the old index instead of keeping it as a column).
ques = pd.concat(
    [df_train[['question1', 'question2']], df_test[['question1', 'question2']]],
    axis=0,
).reset_index(drop=True)

# Map every question string to the set of questions it has been paired with.
q_dict = defaultdict(set)
for paired_q1, paired_q2 in zip(ques.question1, ques.question2):
    q_dict[paired_q1].add(paired_q2)
    q_dict[paired_q2].add(paired_q1)

# Pairing-based features. They must be computed while the question columns
# still hold the raw strings that key q_dict.
# No raw=True: the row functions index rows by column label, which a raw
# ndarray does not support.
for frame in (df_train, df_test):
    frame['q1_q2_intersect'] = frame.apply(q1_q2_intersect, axis=1)
    frame['q1_freq'] = frame.apply(q1_freq, axis=1)
    frame['q2_freq'] = frame.apply(q2_freq, axis=1)

# Replace each question string by its lower-cased, whitespace-split token list.
for frame in (df_train, df_test):
    for question_col in ('question1', 'question2'):
        frame[question_col] = frame[question_col].map(lambda x: str(x).lower().split())

test_leaky = df_test.loc[:, ['q1_q2_intersect', 'q1_freq', 'q2_freq']]
train_leaky = df_train.loc[:, ['q1_q2_intersect', 'q1_freq', 'q2_freq']]

# Word weights are derived from the training questions only.
stop_word = set(stopwords.words("english"))
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist())

words = [x for y in train_qs for x in y]
counts = Counter(words)
weight = {word: get_weight(count) for word, count in counts.items()}

train = build_features(df_train, stop_word, weight)
test = build_features(df_test, stop_word, weight)




In [30]:
# Preview only the first rows instead of rendering the whole feature frame.
test.head(10)

Unnamed: 0,word_match,tfidf_wm,tfidf_wm_stops,jaccard,wc_diff,wc_ratio,wc_diff_unique,wc_ratio_unique,wc_diff_unq_stop,wc_ratio_unique_stop,same_start,char_diff,char_diff_unq_stop,total_unique_words,total_unq_words_stop,char_ratio
0,0.266667,0.234251,0.274019,0.090909,3,1.272727,2,1.181818,3,1.500000,0.0,8,13,22,13,1.170213
1,0.500000,0.436043,0.480962,0.235294,7,0.500000,7,0.500000,2,0.714286,0.0,16,4,17,9,0.698113
2,0.444444,0.418727,0.468893,0.285714,8,0.428571,6,0.500000,3,0.500000,1.0,23,9,14,7,0.510638
3,0.000000,0.000000,0.000000,0.000000,1,0.750000,1,0.750000,0,1.000000,0.0,9,5,7,4,0.625000
4,0.800000,0.841273,1.000000,0.428571,2,1.500000,2,1.500000,1,0.666667,1.0,4,13,7,3,0.862069
5,0.200000,0.226283,0.222582,0.117647,0,1.000000,1,0.900000,2,0.666667,0.0,5,6,17,9,0.901961
6,0.600000,0.595715,0.624099,0.500000,1,1.100000,1,1.100000,2,1.500000,1.0,3,9,14,7,1.083333
7,0.363636,0.278213,0.285686,0.307692,1,0.944444,2,0.888889,0,1.000000,0.0,1,4,26,18,1.012346
8,0.571429,0.654703,0.588009,0.583333,2,1.222222,1,1.111111,1,1.333333,1.0,14,11,12,5,1.437500
9,0.235294,0.191632,0.220013,0.120000,11,0.500000,10,0.473684,3,0.700000,0.0,55,24,25,15,0.360465


In [3]:
from sklearn.metrics import roc_auc_score

In [5]:
# Re-read the training CSV; its `is_duplicate` column is the label used for
# the per-feature AUC check below.
result = pd.read_csv(
    './input/train.csv',
    encoding="ISO-8859-1",
)

In [6]:
# Column labels of the training feature frame (axes[1] is the column axis).
indexs = train.columns

In [27]:
# Replace NaN with the sentinel -1 in each of these training feature columns.
for nan_column in (
    'tfidf_wm_stops',
    'wc_ratio',
    'wc_ratio_unique',
    'wc_ratio_unique_stop',
    'same_start',
    'char_ratio',
):
    train[nan_column] = train[nan_column].fillna(-1)

In [29]:
# Replace NaN with the sentinel -1 in each of these test feature columns.
for nan_column in (
    'tfidf_wm_stops',
    'wc_ratio',
    'wc_ratio_unique',
    'wc_ratio_unique_stop',
    'same_start',
    'char_ratio',
):
    test[nan_column] = test[nan_column].fillna(-1)

In [28]:
# Print each feature's name followed by the ROC AUC obtained when that single
# feature column is used as the score against the `is_duplicate` labels.
for feature_name in indexs:
    feature_auc = roc_auc_score(result['is_duplicate'], train[feature_name])
    print(feature_name)
    print(feature_auc)

word_match
0.780553200628
tfidf_wm
0.772718636386
tfidf_wm_stops
0.770564687846
jaccard
0.741117231335
wc_diff
0.390990589736
wc_ratio
0.494672780503
wc_diff_unique
0.392914098644
wc_ratio_unique
0.495693160408
wc_diff_unq_stop
0.387059413197
wc_ratio_unique_stop
0.500162010156
same_start
0.602470422174
char_diff
0.378886442653
char_diff_unq_stop
0.370369884371
total_unique_words
0.326251497522
total_unq_words_stop
0.313208799888
char_ratio
0.509679342405


In [10]:
import math


In [26]:
# Print every NaN or infinite value found in the training `char_ratio` column.
for ratio_value in train['char_ratio']:
    if math.isnan(ratio_value) or math.isinf(ratio_value):
        print(ratio_value)

nan
nan


In [31]:
# For every feature column of the test frame, print the column name and then
# any NaN or infinite values it contains.
for feature_name in indexs:
    print(feature_name)
    for cell in test[feature_name]:
        if math.isnan(cell) or math.isinf(cell):
            print(cell)

word_match
tfidf_wm
tfidf_wm_stops
jaccard
wc_diff
wc_ratio
wc_diff_unique
wc_ratio_unique
wc_diff_unq_stop
wc_ratio_unique_stop
same_start
char_diff
char_diff_unq_stop
total_unique_words
total_unq_words_stop
char_ratio


In [33]:
# Write the training feature frame to CSV without the index column.
train.to_csv(
    './input/train_refeatured_2.csv',
    index=False,
)

In [34]:
# Write the test feature frame to CSV without the index column.
test.to_csv(
    './input/test_refeatured_2.csv',
    index=False,
)