In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from nltk.corpus import stopwords
from sklearn.metrics import log_loss
from scipy.optimize import minimize
# import xgboost as xgb
import lightgbm as lgb
from sklearn.cross_validation import train_test_split
import multiprocessing
import difflib

import dask.dataframe as dd
import dask.bag as db
from dask.dataframe.utils import make_meta
from dask.multiprocessing import get

stops = set(stopwords.words("english"))



In [2]:
train = pd.read_csv('./data/train.csv')#[:1000] #remove limit
test = pd.read_csv('./data/test.csv')#[:1000] #remove limit

In [3]:
def apply_parallel(sequence, func, partitions=4):
    bag = db.from_sequence(sequence, npartitions=partitions)
    res = bag.map(func).compute()
    return res

def get_words(question):
    words = {}
    for word in question.split():
        if word not in stops:
            words[word] = 1
    return words

In [4]:
def diff_ratios(st1, st2):
    seq = difflib.SequenceMatcher()
    seq.set_seqs(str1, str2)
    return seq.ratio()

def word_match_share(word_dicts):
    q1words, q2words = word_dicts
    if len(q1words) == 0 or len(q2words) == 0:
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2)) / (len(q1words) + len(q2words))
    return R

def get_weight(count, eps=500, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1. / (count + eps)

def tfidf_word_match_share(word_dicts):
    q1words, q2words = word_dicts
    if len(q1words) == 0 or len(q2words) == 0:
        return 0
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def get_unigrams(que):
    return [word for word in nltk.word_tokenize(que.lower()) if word not in stops]

def get_common_unigrams(row):
    return len(set(row["unigrams_ques1"]).intersection(set(row["unigrams_ques2"])))

def get_common_unigram_ratio(row):
    return float(row["zunigrams_common_count"]) / max(len( set(row["unigrams_ques1"]).union(set(row["unigrams_ques2"])) ),1)

def get_bigrams(que):
    return [i for i in nltk.ngrams(que, 2)]

def get_common_bigrams(row):
    return len( set(row["bigrams_ques1"]).intersection(set(row["bigrams_ques2"])) )

def get_common_bigram_ratio(row):
    return float(row["zbigrams_common_count"]) / max(len( set(row["bigrams_ques1"]).union(set(row["bigrams_ques2"])) ),1)

def get_features(train):
    print('== start get features')
    train['question1'] = train['question1'].astype(str)
    train['question2'] = train['question2'].astype(str)
    train['question1'] = train['question1'].str.lower()
    train['question2'] = train['question2'].str.lower()
    
    print('== receiving nouns')
    train['q1_nouns'] = apply_parallel(train['question1'], lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(x)) if t in ['N']]) # t[:1]
    train['q2_nouns'] = apply_parallel(train['question2'], lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(x)) if t in ['N']]) # t[:1]

    print('== receiving length')
    train['z_len1'] = train['question1'].map(len)
    train['z_len2'] = train['question2'].map(len)
    train['z_word_len1'] = train['question1'].map(lambda x: len(x.split()))
    train['z_word_len2'] = train['question2'].map(lambda x: len(x.split()))
    
    #train['z_noun_match'] = train.apply(lambda r: sum([1 for w in r.question1_nouns if w in r.question2_nouns]), axis=1)  #takes long
    #train['z_match_ratio'] = train.apply(lambda r: diff_ratios(r.question1, r.question2), axis=1)  #takes long

    word_dicts1 = apply_parallel(train['question1'], get_words)
    word_dicts2 = apply_parallel(train['question2'], get_words)
    word_dicts = list(zip(word_dicts1, word_dicts2))
#     del word_dicts1
#     del word_dicts2
    
    print('== receiving matches and ngrams')
    train['z_word_match'] = apply_parallel(word_dicts, word_match_share)    
    train['z_tfidf_word_match'] = apply_parallel(word_dicts, word_match_share)

    train['unigrams_ques1'] = apply_parallel(train['question1'], get_unigrams)
    train['unigrams_ques2'] = apply_parallel(train['question1'], get_unigrams)
    
    train['zunigrams_common_count'] = train.apply(get_common_unigrams, axis=1)
    train['zunigrams_common_ratio'] = train.apply(get_common_unigram_ratio, axis=1)
    train['bigrams_ques1'] = train['unigrams_ques1'].apply(get_bigrams)
    train['bigrams_ques2'] = train['unigrams_ques2'].apply(get_bigrams) 
    train['zbigrams_common_count'] = train.apply(get_common_bigrams, axis=1)
    train['zbigrams_common_ratio'] = train.apply(get_common_bigram_ratio, axis=1)
    print('== features received.')
    return train

In [13]:
train_qs = pd.Series(train['question1'].tolist() + train['question2'].tolist()).astype(str)
test_qs = pd.Series(test['question1'].tolist() + test['question2'].tolist()).astype(str)
words = (" ".join(train_qs)).lower().split() + (" ".join(test_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [5]:
%%time
train = get_features(train)
train = train.fillna(-1)

Wall time: 8min 40s


In [6]:
train.to_csv('feat_train.csv', index=False)

In [7]:
col = [c for c in train.columns if c[:1]=='z']

pos_train = train[train['is_duplicate'] == 1]
neg_train = train[train['is_duplicate'] == 0]
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
train = pd.concat([pos_train, neg_train])

x_train, x_valid, y_train, y_valid = train_test_split(train[col], train['is_duplicate'], test_size=0.2, random_state=0)

In [8]:
lgb_train = lgb.Dataset(x_train, y_train)
lgb_valid = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

In [9]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}


print('Start training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=5000,
                valid_sets=lgb_valid,
                early_stopping_rounds=5)
print('Done')

Start training...
[1]	valid_0's binary_logloss: 0.668217
Train until valid scores didn't improve in 5 rounds.
[2]	valid_0's binary_logloss: 0.645644
[3]	valid_0's binary_logloss: 0.625138
[4]	valid_0's binary_logloss: 0.60644
[5]	valid_0's binary_logloss: 0.589337
[6]	valid_0's binary_logloss: 0.573636
[7]	valid_0's binary_logloss: 0.559222
[8]	valid_0's binary_logloss: 0.545941
[9]	valid_0's binary_logloss: 0.533683
[10]	valid_0's binary_logloss: 0.522337
[11]	valid_0's binary_logloss: 0.511823
[12]	valid_0's binary_logloss: 0.502078
[13]	valid_0's binary_logloss: 0.493031
[14]	valid_0's binary_logloss: 0.484614
[15]	valid_0's binary_logloss: 0.476786
[16]	valid_0's binary_logloss: 0.469499
[17]	valid_0's binary_logloss: 0.462679
[18]	valid_0's binary_logloss: 0.456311
[19]	valid_0's binary_logloss: 0.450359
[20]	valid_0's binary_logloss: 0.444799
[21]	valid_0's binary_logloss: 0.439576
[22]	valid_0's binary_logloss: 0.434679
[23]	valid_0's binary_logloss: 0.430105
[24]	valid_0's bina

In [10]:
%%time
test = get_features(test)
test = test.fillna(-1)
print('done')

done
Wall time: 50min 20s


In [11]:
test.to_csv('feat_test.csv', index=False)

In [12]:
d_test = lgb.Dataset(test[col]) #xgb.DMatrix(test[col])
p_test = gbm.predict(test[col])

sub = pd.DataFrame()
sub['test_id'] = test['test_id']
sub['is_duplicate'] = p_test

#df['is_duplicate'] = df['is_duplicate'].map(lambda x: 0.000000000001 if x < 0.0001 else x)
#df['is_duplicate'] = df['is_duplicate'].map(lambda x: 0.999999999999 if x > 0.98 else x)

sub.to_csv('gbm_test.csv', index=False)
print('done')

done
