In [168]:
'''
"data/train.csv"
"data/pp1.csv"
"data/mat.csv"
"data/emb_mat.csv"
"data/feat_mat.csv"
"data/emb_mat_test.csv"

https://github.com/ryonion/kaggle_question_pair/tree/master/data

'''


import pandas as pd
from importlib import reload
import feature_generators as fg
import preproc as pp
import os
import numpy as np

from sklearn.metrics import roc_auc_score
reload(pp)
reload(fg)
import pickle
from keras.models import load_model
import xgboost as xgb
from sklearn.linear_model import LogisticRegression as lr

#### settings

In [169]:
data_needed = 80000

raw_file = "data/train.csv"
pp_file  = "data/pp1.csv"
mat_file = "data/mat.csv"
raw_drop = ["id", "question1", "question2"]
pos_tags = ["ADJ", "ADP", "ADV", "CONJ", "DET", "NOUN", "NUM", "PRT", "PRON", "VERB", "X"]

# pre-processing

In [165]:
raw = pd.read_csv(raw_file, nrows = data_needed)
try:
    pproc = pd.read_csv(pp_file)
    row_count = pproc.shape[0]
    if row_count < data_needed:
        print("Pre-Processing %d data..."%(data_needed - row_count))
        pproc = pproc.append(pp.stemmer(raw.loc[row_count:]))
        pproc.to_csv(pp_file, index=False)

except:
    # don't have preproessed data yet.
    print("Pre-Processing %d data..."%(data_needed))
    new_pp = pp.stemmer(raw)
    new_pp.to_csv(pp_file, index=False)
    
print("pre-processed data shape:", pd.read_csv(pp_file).shape)
pd.read_csv(pp_file).head(5)

Pre-Processing 40000 data...
pre-processed data shape: (80000, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,1,3,4,what is the story of kohinoor ( koh-i-noor ) d...,what would happen if the indian government sto...,0
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,3,7,8,why am i mentally very lonely ? how can i solv...,find the remainder when [ math ] 23^ { 24 } [ ...,0
4,4,9,10,"which one dissolve in water quikly sugar , sal...",which fish would survive in salt water ?,0


In [57]:
#os.remove(mat_file)

# generating feature matrix

In [166]:
def add_feature(feat_func, pp_file, mat_file):
    ff_name = pp.get_method_abbre(feat_func.__name__)
    try:
        df_pp = pd.read_csv(pp_file)
        mat = pd.read_csv(mat_file)
        
        try:
            # check if feature already exists
            start_idx = mat[ff_name].last_valid_index()
            print(start_idx+1 ,"vs", df_pp.shape[0])
            if start_idx + 1 < df_pp.shape[0]:
                print("extending %s col from start_idx: %d..."%(ff_name,start_idx+1))
                df_pp = df_pp.loc[start_idx+1:]
                sub = '_sub_'
                mat = mat.rename(index=str, columns={ff_name:sub})
                cp_col = mat[sub][:start_idx+1].append(df_pp.apply(feat_func, axis=1, raw=True))
                cp_col = cp_col.to_frame(ff_name)
                cp_col.reset_index(drop=True, inplace=True)
                mat.reset_index(drop=True, inplace=True)
                new_df = cp_col.join(mat)
                new_df = new_df.drop([sub], axis = 1)
                new_df.to_csv(mat_file, index=False)
                
        # if feature doesn't alreay exist -> add a new col
        except:
            print("adding a new feature col")
            new_col = df_pp.apply(feat_func, axis=1, raw=True)
            new_col = new_col.to_frame(ff_name)
            new_df = pd.concat([mat,new_col],axis=1)
            new_df.to_csv(mat_file, index=False)

    # mat_file does not exist
    except:
        print("creating mat...")
        mat = pd.DataFrame()
        mat[ff_name] = df_pp.apply(feat_func, axis=1, raw=True)
        mat.to_csv(mat_file, index=False)

    print(ff_name,"applied, Mat shape: ", pd.read_csv(mat_file).shape)
    

In [167]:
add_feature(fg.shared_percentage, pp_file, mat_file, )
print("---------------------------------------")
print("---------------------------------------")

add_feature(fg.longest_common_substr_prop, pp_file, mat_file)
print("---------------------------------------")
print("---------------------------------------")
add_feature(fg.is_first_word_identical, pp_file, mat_file)
print("---------------------------------------")
print("---------------------------------------")
add_feature(fg.is_in_same_cat, pp_file, mat_file)
print("---------------------------------------")
print("---------------------------------------")
add_feature(fg.dif_wc, pp_file, mat_file)
pd.read_csv(mat_file).head(5)

40000 vs 80000
extending sp col from start_idx: 40000...
sp applied, Mat shape:  (80000, 5)
---------------------------------------
---------------------------------------
40000 vs 80000
extending lcsp col from start_idx: 40000...
lcsp applied, Mat shape:  (80000, 5)
---------------------------------------
---------------------------------------
40000 vs 80000
extending ifwi col from start_idx: 40000...
ifwi applied, Mat shape:  (80000, 5)
---------------------------------------
---------------------------------------
40000 vs 80000
extending iisc col from start_idx: 40000...
iisc applied, Mat shape:  (80000, 5)
---------------------------------------
---------------------------------------
40000 vs 80000
extending dw col from start_idx: 40000...
dw applied, Mat shape:  (80000, 5)


Unnamed: 0,dw,iisc,ifwi,lcsp,sp
0,0.071429,1.0,1.0,0.912,0.5
1,0.185185,0.0,1.0,0.455172,0.444444
2,0.153846,0.0,1.0,0.149254,0.230769
3,0.1875,0.0,0.0,0.031496,0.125
4,0.333333,1.0,1.0,0.117647,0.25


### adding embedding measurements into feature matrix

In [82]:
fm = pd.read_csv(mat_file)
fm.head(3)

Unnamed: 0,sp,lcsp,ifwi,iisc,dw
0,0.5,0.912,1,1,0.071429
1,0.444444,0.455172,1,0,0.185185
2,0.230769,0.149254,1,0,0.153846


In [83]:
em = pd.read_csv("data/emb_mat.csv")
em.head(3)

Unnamed: 0,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRT,PRON,VERB,X,sim_avg
0,0.0,0.969626,1.0,0.0,1.0,0.883462,0.0,0.0,1.0,1.0,0.0,0.975515
1,0.0,0.0,0.0,0.0,1.0,0.667299,0.0,0.0,1.0,0.260665,0.0,0.731991
2,0.0,0.355251,1.0,0.0,0.0,0.694622,0.0,0.0,0.0,0.602308,0.0,0.663045


In [80]:
comb = fm.join(em)

In [84]:
comb.head(3)

Unnamed: 0,sp,lcsp,ifwi,iisc,dw,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRT,PRON,VERB,X,sim_avg
0,0.5,0.912,1,1,0.071429,0.0,0.969626,1.0,0.0,1.0,0.883462,0.0,0.0,1.0,1.0,0.0,0.975515
1,0.444444,0.455172,1,0,0.185185,0.0,0.0,0.0,0.0,1.0,0.667299,0.0,0.0,1.0,0.260665,0.0,0.731991
2,0.230769,0.149254,1,0,0.153846,0.0,0.355251,1.0,0.0,0.0,0.694622,0.0,0.0,0.0,0.602308,0.0,0.663045


In [85]:
comb.to_csv("data/feat_mat.csv", index = False)

# split data for training, validation, testing

In [188]:
data.shape

(40000, 17)

In [190]:
from sklearn.model_selection import train_test_split

data = pd.read_csv("data/feat_mat.csv")
targ = pd.read_csv(pp_file)

Y = targ['is_duplicate'][:40000]
X = data

XTR, xte, YTR, yte = train_test_split(X, Y, test_size=0.2, random_state=4242)
xtr, xva, ytr, yva = train_test_split(XTR, YTR, test_size=0.2, random_state=4242)

## Logistic regression

In [228]:
from sklearn.linear_model import LogisticRegression as lr

logi = lr(C=1, penalty='l1').fit(XTR,YTR)
predicted_LR = logi.predict(xte)
print('AUC:', roc_auc_score(yte, predicted_LR))
print('accuracy:', sum([int(int(l>0.5)==r) for l,r in zip(predicted_LR,yte)]), "/" ,len(predicted_LR), "=", \
      sum([int(int(l>0.5)==r) for l,r in zip(predicted_LR,yte)])/len(predicted_LR))

AUC: 0.6165172001250802
accuracy: 5272 / 8000 = 0.659


In [209]:
# grid search for best combination of C and penalty
from sklearn.grid_search import GridSearchCV
C_range = 10.**np.arange(-2, 3)
penalty_options = ['l1', 'l2']

parameters = {'C':C_range, 'penalty':penalty_options}
gs = GridSearchCV(lr(), parameters,cv=5, scoring='neg_log_loss')
gs.fit(XTR,YTR)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='neg_log_loss',
       verbose=0)

In [210]:
print(gs.best_score_)
print(gs.best_params_)

-0.5633145458600423
{'C': 1.0, 'penalty': 'l1'}


In [211]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

pl = Pipeline([
  ('feature_selection', SelectFromModel(lr(penalty="l1"))),
  ('classification', model)
])
pl.fit(XTR, YTR)

Pipeline(memory=None,
     steps=[('feature_selection', SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0...ty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [212]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pl, X, Y, cv=5)

In [213]:
scores.mean()

0.6638249572738275

In [219]:
#pickle.dump(pl, open("models/lr_40000_full_feature", 'wb'))

## extreme gradient boost tree

In [64]:
# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'error'
params['eta'] = 0.02
params['max_leaf_nodes'] = 20
params['subsample'] = 0.8

d_train = xgb.DMatrix(xtr, label=ytr)
d_valid = xgb.DMatrix(xva, label=yva)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=90, verbose_eval=10)

[0]	train-error:0.288789	valid-error:0.305156
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 90 rounds.
[10]	train-error:0.278672	valid-error:0.295
[20]	train-error:0.277734	valid-error:0.292812
[30]	train-error:0.274063	valid-error:0.293281
[40]	train-error:0.272383	valid-error:0.290156
[50]	train-error:0.270156	valid-error:0.288125
[60]	train-error:0.268047	valid-error:0.284531
[70]	train-error:0.265508	valid-error:0.284531
[80]	train-error:0.263906	valid-error:0.281875
[90]	train-error:0.2625	valid-error:0.283437
[100]	train-error:0.261094	valid-error:0.282813
[110]	train-error:0.260078	valid-error:0.2825
[120]	train-error:0.25957	valid-error:0.282187
[130]	train-error:0.258359	valid-error:0.282187
[140]	train-error:0.258008	valid-error:0.2825
[150]	train-error:0.257109	valid-error:0.282656
[160]	train-error:0.256563	valid-error:0.280781
[170]	train-error:0.256172	valid-error:0.28
[180]	train-er

In [65]:
d_te = xgb.DMatrix(xte)
p_te = bst.predict(d_te)
print('Original AUC:', roc_auc_score(yte, p_te))
pp_train = []
for i in p_te[:]:
    if i > 0.5:
        pp_train.append(1)
    else:
        pp_train.append(0)
print('accuracy:', sum([int(l==r) for l,r in zip(pp_train,yte)]), "/" ,len(pp_train), "=", \
      sum([int(l==r) for l,r in zip(pp_train,yte)])/len(pp_train))

Original AUC: 0.8028771909041527
accuracy: 5761 / 8000 = 0.720125


In [134]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

model = xgb.XGBClassifier(**params)

kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Accuracy: 72.55% (0.55%)


  if diff:


In [66]:
pickle.dump(bst, open("models/xgb_40000_full_feature", 'wb'))

## neural network

In [220]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 10), random_state=1)
nn.fit(XTR, YTR)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [222]:
p_nn = nn.predict(xte)
print('AUC:', roc_auc_score(yte, p_nn))
print('accuracy:', sum([int(int(l>0.5)==r) for l,r in zip(p_nn,yte)]), "/" ,len(p_nn), "=", \
      sum([int(int(l>0.5)==r) for l,r in zip(p_nn,yte)])/len(p_nn))

AUC: 0.6873816033955212
accuracy: 5565 / 8000 = 0.695625


In [223]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
scores = cross_val_score(nn, X, Y, cv=5)

In [224]:
scores.mean()

0.7051499387566397

In [226]:
#pickle.dump(nn, open("models/nn_40000_full_feature", 'wb'))

## recurrent neural network

In [21]:
import keras
from keras.models import Sequential

from keras.layers import LSTM
from keras.layers import Dense, Embedding, SimpleRNN

# ----------------training data----------------------

x_arr = xtr.values
y_arr = ytr

row = x_arr.shape[0]
col = x_arr.shape[1]

x_arr = x_arr.reshape(row,col,1)

# ----------------validation data----------------------

xva_arr = xva.values
yva_arr = yva.values

row = xva_arr.shape[0]
col = xva_arr.shape[1]
 
x_va = xva_arr.reshape(row,col,1)
y_va = yva_arr

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [22]:
rnn_model=Sequential()

# model.add(SimpleRNN(input_dim=1, output_dim=50))

rnn_model.add(LSTM(input_dim=1, output_dim=50))
rnn_model.add(Dense(output_dim=1, activation = "sigmoid"))
rnn_model.compile(loss="mse", optimizer="rmsprop")

adam = keras.optimizers.Adam(lr=0.001)
rnn_model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

rnn_model.fit(x_arr, y_arr, validation_data=(x_va, y_va), nb_epoch=10, batch_size=32)

  """
  """
  


Train on 25600 samples, validate on 6400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x209d6bc0a58>

In [23]:
xte_arr = xte.values
yte_arr = yte.values

row = xte_arr.shape[0]
col = xte_arr.shape[1]
 
xte_arr = xte_arr.reshape(row,col,1)
yte_arr = yte_arr    
    
scores = rnn_model.evaluate(xte_arr, yte_arr, verbose=1)
print("Elman RNN Test Accuracy: %.3f%%" % (scores[1]*100))

Elman RNN Test Accuracy: 67.062%


In [44]:
# rnn_model.save('models/rnn_40000_full_feature') 

## random forest

In [17]:
import mltools as ml
np.random.seed(0)  # Resetting the seed in case you ran other stuff.
n_bags = 20
bags = []   # self.learners
for l in range(n_bags):
    # Each boosted data is the size of the original data. 
    Xi, Yi = ml.bootstrapData(XTR.values, YTR.values, XTR.shape[0])

    # Train the model on that draw
    tree = ml.dtree.treeClassify(Xi, Yi, minParent=2**6,maxDepth=25, nFeatures=6)
    bags.append(tree)

In [6]:
class BaggedTree(ml.base.classifier):
    def __init__(self, learners):
        """Constructs a BaggedTree class with a set of learners. """
        self.learners = learners
    
    def predictSoft(self, X):
        """Predicts the probabilities with each bagged learner and average over the results. """
        n_bags = len(self.learners)
        preds = [self.learners[l].predictSoft(X) for l in range(n_bags)]
        return np.mean(preds, axis=0)

In [19]:
bt = BaggedTree(bags)

In [20]:
# test Accuracy
rf_te_pred = bt.predictSoft(xte.values)

pred = rf_te_pred[:,0]
targ = yte
bingo = sum([ int(t-p<0.5) for p,t in zip(pred,targ) ])
print("Validation Accuracy: %d/%d %.2f%%"%(bingo, len(targ), 100*bingo/len(targ)) )

Validation Accuracy: 6078/8000 75.97%


In [48]:
# pickle.dump(bt, open("models/rf_40000_full_feature", 'wb'))

### build stacked feature matrix

In [185]:
pl = pickle.load(open("models/lr_40000_full_feature", 'rb'))
bst = pickle.load(open("models/xgb_40000_full_feature", 'rb'))
rnn_model = load_model("models/rnn_40000_full_feature")
nn = pickle.load(open("models/nn_40000_full_feature", 'rb'))
bt = pickle.load(open("models/rf_40000_full_feature", 'rb'))

In [191]:
pl_pred = pl.predict(X)

In [192]:
bst_pred = bst.predict(xgb.DMatrix(X))

In [193]:
X_arr = X.values

row = X_arr.shape[0]
col = X_arr.shape[1]
 
X_arr = X_arr.reshape(row,col,1)
rnn_pred = rnn_model.predict(X_arr)

In [194]:
nn_pred = nn.predict(X)
bt_pred = bt.predictSoft(X.values)

### train a model to make final predictions based on level-1 predictions

In [195]:
simple_stack = np.concatenate([bt_pred[:,0].reshape(40000,1), nn_pred.reshape(40000,1),\
                              bst_pred.reshape(40000,1),pl_pred.reshape(40000,1),rnn_pred], axis = 1)

In [88]:
# pickle.dump(simple_stack, open("data/40000_stacked_pred", 'wb'))

In [151]:
s_xtr, s_xte, s_ytr, s_yte = train_test_split(simple_stack, Y, test_size=0.5, random_state=4242)

In [152]:
slr = lr(C = 0.5, penalty = 'l1').fit(s_xtr, s_ytr)

In [153]:
pred_slr = slr.predict(s_xte)

In [154]:
pred = pred_slr
targ = s_yte
bingo = sum([ int(t-p<0.5) for p,t in zip(pred,targ) ])
print("Validation Accuracy: %d/%d %.2f%%"%(bingo, len(targ), 100*bingo/len(targ)) )

Validation Accuracy: 18024/20000 90.12%


In [155]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(slr, simple_stack, Y, cv=5)

In [156]:
scores

array([0.84164479, 0.845375  , 0.84175   , 0.8495    , 0.84148019])

### train a model to make final predictions based on all features and level-1 predictions

In [196]:
full_stack = np.concatenate([X, bt_pred[:,0].reshape(40000,1), nn_pred.reshape(40000,1),\
                              bst_pred.reshape(40000,1),pl_pred.reshape(40000,1),rnn_pred], axis = 1)

In [197]:
full_stack.shape

(40000, 22)

In [289]:
s_xtr, s_xte, s_ytr, s_yte = train_test_split(full_stack, Y, test_size=0.2, random_state=4242)

In [290]:
fslr = lr(C = 0.5, penalty = 'l1').fit(s_xtr, s_ytr)

In [291]:
pred_fslr = fslr.predict(s_xte)

In [292]:
pred = pred_fslr
targ = s_yte
bingo = sum([ int(t-p<0.5) for p,t in zip(pred,targ) ])
print("Validation Accuracy: %d/%d %.2f%%"%(bingo, len(targ), 100*bingo/len(targ)) )

Validation Accuracy: 6800/8000 85.00%


In [248]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lr(C = 0.5, penalty = 'l1'), full_stack, Y, cv=5)

In [249]:
scores

array([0.84226972, 0.84575   , 0.84325   , 0.848875  , 0.84373047])

## use another subset of original training data for evaluation 

In [222]:
test = pd.read_csv(mat_file)[40000:]
test_emb = pd.read_csv("data/emb_mat_test.csv")
test_Y = pd.read_csv(pp_file)[40000:]

In [226]:
test.reset_index(drop=True, inplace=True)

In [232]:
test_comb = test.join(test_emb)

In [235]:
col_list = list(test_comb)
col_list[1], col_list[3] = col_list[3], col_list[1]
col_list[0], col_list[4] = col_list[4], col_list[0]
test_comb.columns = col_list

In [237]:
test_comb.head(2)

Unnamed: 0,sp,lcsp,ifwi,iisc,dw,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRT,PRON,VERB,X,sim_avg
0,0.176471,0.0,1.0,0.786517,0.470588,0.0,0.0,1.0,0.0,0.0,0.661447,0.0,0.0,0.0,1.0,0.0,0.887149
1,0.181818,0.0,1.0,0.817391,0.363636,1.0,0.922042,0.0,0.0,1.0,0.797719,0.0,0.0,1.0,1.0,0.0,0.953294


In [238]:
test_pl_pred = pl.predict(test_comb)
test_bst_pred = bst.predict(xgb.DMatrix(test_comb))

test_X_arr = test_comb.values
test_row = test_X_arr.shape[0]
test_col = test_X_arr.shape[1]
test_X_arr = test_X_arr.reshape(test_row,test_col,1)
test_rnn_pred = rnn_model.predict(test_X_arr)

test_nn_pred = nn.predict(test_comb)
test_bt_pred = bt.predictSoft(test_comb.values)

In [239]:
test_full_stack = np.concatenate([test_comb, test_bt_pred[:,0].reshape(40000,1), test_nn_pred.reshape(40000,1),\
                              test_bst_pred.reshape(40000,1),test_pl_pred.reshape(40000,1),test_rnn_pred], axis = 1)

In [284]:
test_pred_fslr = fslr.predict(test_full_stack)

In [285]:
pred = test_pred_fslr
targ = test_Y['is_duplicate']
bingo = sum([ int(t-p<0.5) for p,t in zip(pred,targ) ])
print("Validation Accuracy: %d/%d %.2f%%"%(bingo, len(targ), 100*bingo/len(targ)) )

Validation Accuracy: 26134/40000 65.33%
