In [1]:
import pickle
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
import xgboost as xgb
import numpy as np
import pandas as pd

data_folder = '/home/sidsvash26/kaggle_quora/data/'

# Pickled training feature blocks, listed in the exact column order in which
# they are concatenated into train_X below.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
train_feature_files = [
    'feats1_tfidf_train.sav',
    'feats2_match_train.sav',
    'feats3_glove_train.sav',
    'feats4_word2vec.sav',
    'feats6_whq_jaccard.sav',
    # Magic feats
    'feats8_kcore_v1.sav',
    'feats9_all_magic.sav',
    'feats10_locations.sav',
]

train_feature_blocks = []
for feature_file in train_feature_files:
    # The context manager closes each file as soon as it has been unpickled
    # instead of leaving the handle open until garbage collection.
    with open(data_folder + feature_file, 'rb') as fh:
        train_feature_blocks.append(pickle.load(fh))

# Keep the individual per-feature names bound, as before.
(train_X1, train_X2, train_X3, train_X4,
 train_X6, train_X8, train_X9, train_X10) = train_feature_blocks

#Concatenate all features
train_X = np.concatenate(train_feature_blocks, axis=1)




In [2]:
# Read train.csv and copy its `is_duplicate` column into a NumPy array
# that serves as the training target.
data = pd.read_csv(data_folder + 'train.csv')
train_y = np.array(data['is_duplicate'])

def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0):
    """Train an XGBoost binary-logistic model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and labels, passed straight to ``xgb.DMatrix``.
    test_X
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When given, ``test_X`` is used as a watch set
        with early stopping (100 rounds without improvement) and the
        log-loss of the predictions is computed.
    feature_names : optional
        Column names forwarded to every ``xgb.DMatrix`` built here. ``None``
        (the default) lets XGBoost use its own default naming.
    seed_val : int
        Value for the XGBoost ``seed`` parameter.

    Returns
    -------
    tuple
        ``(pred_test_y, loss, model)``. ``loss`` is the log-loss on
        ``test_y`` when labels were supplied, otherwise the placeholder ``1``.
    """
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": 0.02,
        "subsample": 0.7,
        "min_child_weight": 1,
        "colsample_bytree": 0.7,
        "max_depth": 4,
        "silent": 1,
        "seed": seed_val,
    }
    num_rounds = 1100
    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        # Labelled hold-out: monitor it and stop early when it stops improving.
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,
                          early_stopping_rounds=100, verbose_eval=10)
    else:
        # No labels: train for the full number of rounds.
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    # NOTE(review): whether predict() uses the best early-stopped iteration or
    # every trained round depends on the installed xgboost version -- confirm.
    pred_test_y = model.predict(xgtest)

    # Placeholder loss when there are no labels to score against.
    loss = 1
    if test_y is not None:
        loss = log_loss(test_y, pred_test_y)
    return pred_test_y, loss, model
        
# Re-sample the data: every row labelled 1 is kept once and every row
# labelled 0 is kept three times, which lowers the mean target rate.
# NOTE: train_X / train_y are overwritten here, so this step cannot be re-run
# without first reloading the original features and labels.
dup_rows = train_X[train_y == 1]
non_dup_rows = train_X[train_y == 0]

train_X = np.vstack([non_dup_rows, dup_rows, non_dup_rows, non_dup_rows])
# Labels follow the same block order as the stacked rows above.
train_y = np.array(
    [0] * len(non_dup_rows)
    + [1] * len(dup_rows)
    + [0] * len(non_dup_rows)
    + [0] * len(non_dup_rows)
)
del dup_rows, non_dup_rows
print("Mean target rate : ",train_y.mean())


# Train on the first of five shuffled folds only: four fifths of the rows are
# used for fitting and the remaining fifth is the early-stopping watch set.
kf = KFold(n_splits=5, shuffle=True, random_state=2016)
dev_index, val_index = next(iter(kf.split(range(train_X.shape[0]))))
dev_X, val_X = train_X[dev_index, :], train_X[val_index, :]
dev_y, val_y = train_y[dev_index], train_y[val_index]
preds, lloss, model = runXGB(dev_X, dev_y, val_X, val_y)

# Write the model inside a context manager so the file is flushed and closed
# deterministically rather than whenever the handle is garbage collected.
with open(data_folder + 'model9_feat123468910.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

Mean target rate :  0.163245999318
[0]	train-logloss:0.678481	test-logloss:0.678523
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 100 rounds.
[10]	train-logloss:0.558897	test-logloss:0.55931
[20]	train-logloss:0.474895	test-logloss:0.475608
[30]	train-logloss:0.413094	test-logloss:0.414048
[40]	train-logloss:0.366517	test-logloss:0.367694
[50]	train-logloss:0.331245	test-logloss:0.332608
[60]	train-logloss:0.303882	test-logloss:0.305417
[70]	train-logloss:0.282095	test-logloss:0.283791
[80]	train-logloss:0.265329	test-logloss:0.267149
[90]	train-logloss:0.251436	test-logloss:0.25336
[100]	train-logloss:0.240428	test-logloss:0.242474
[110]	train-logloss:0.231652	test-logloss:0.233788
[120]	train-logloss:0.22432	test-logloss:0.226555
[130]	train-logloss:0.218021	test-logloss:0.220351
[140]	train-logloss:0.213069	test-logloss:0.215462
[150]	train-logloss:0.209007	test-logloss:0.211455
[160]	train-l

In [3]:
#Submission Script

data_folder = '/home/sidsvash26/kaggle_quora/data/'
# Load test data features.
# For creation of the below pickles see Model4 code above.
# Original author's note: this code is run separately due to low RAM.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
test_feature_files = [
    'feats1_tfidf_test.sav',
    'feats2_match_test.sav',
    'feats3_glove_test.sav',
    'feats4_word2vec_test.sav',
    'feats6_whq_jaccard_for_test.sav',
    # magic feats
    'feats8_kcore_v1_for_test.sav',
    'feats9_all_magic_for_test.sav',
    'feats10_locations_for_test.sav',
]

test_feature_blocks = []
for feature_file in test_feature_files:
    # Close each file right after unpickling instead of leaking the handle.
    with open(data_folder + feature_file, 'rb') as fh:
        test_feature_blocks.append(pickle.load(fh))

# Keep the individual per-feature names bound, as before.
(test_X1, test_X2, test_X3, test_X4,
 test_X6, test_X8, test_X9, test_X10) = test_feature_blocks

# The file list above is already in the required column order.
test_X = np.concatenate(test_feature_blocks, axis=1)

# Load the pickled model; the context manager closes the file once it is read.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(data_folder + 'model9_feat123468910.sav', 'rb') as model_file:
    xg_model = pickle.load(model_file)

#Predictions using model
xgtest = xgb.DMatrix(test_X)
print('predicting values')
preds = xg_model.predict(xgtest)
print('predictions done!!')
#Load test ids
test_data = pd.read_csv(data_folder + 'sample_submission.csv')
ids = test_data.test_id

# Fail early with a clear message if predictions and ids do not line up.
if len(ids) != len(preds):
    raise ValueError(
        "Number of predictions (%d) does not match number of test ids (%d)"
        % (len(preds), len(ids))
    )

out_df = pd.DataFrame({"test_id": ids, "is_duplicate": preds})

# Select the columns in an explicit order. Rotating the column list only gave
# `test_id, is_duplicate` when pandas sorted dict keys alphabetically; with
# insertion-ordered dicts the same rotation writes the columns swapped.
list_col = ["test_id", "is_duplicate"]
out_df = out_df[list_col]

sub_folder = '/home/sidsvash26/kaggle_quora/submissions/'
out_df.to_csv(sub_folder + "model9_feat123468910.csv", index=False)

predicting values
predictions done!!


In [6]:
#