In [2]:
import time
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

from sklearn import model_selection
from sklearn import linear_model

from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#matplotlib.style.use('fivethirtyeight')


In [3]:
# Read the training pairs and discard every row that has a missing value in
# any column, then renumber the rows 0..n-1 so positional and label indexing agree.
trainDF = (
    pd.read_csv('../input/train.csv')
    .dropna(how="any")
    .reset_index(drop=True)
)


In [None]:
featureExtractionStartTime = time.time()

# Upper bound on the vocabulary size kept by the vectorizer.
maxNumFeatures = 300000

# bag of letter sequences (chars): binary presence of character 1- to 6-grams
BagOfWordsExtractor = CountVectorizer(max_df=0.999, min_df=50, max_features=maxNumFeatures,
                                      analyzer='char', ngram_range=(1,6),
                                      binary=True, lowercase=True)
# bag of words (alternative configuration, kept for reference)
#BagOfWordsExtractor = CountVectorizer(max_df=0.999, min_df=10, max_features=maxNumFeatures, 
#                                      analyzer='word', ngram_range=(1,6), stop_words='english', 
#                                      binary=True, lowercase=True)

# NOTE: `DataFrame.ix` was removed in pandas 1.0; plain column selection is the
# equivalent for a string column label and works on old and new pandas alike.
# The vocabulary is fitted on the de-duplicated union of both question columns
# so that both sides of a pair share one feature space.
BagOfWordsExtractor.fit(pd.concat((trainDF['question1'], trainDF['question2'])).unique())

trainQuestion1_BOW_rep = BagOfWordsExtractor.transform(trainDF['question1'])
trainQuestion2_BOW_rep = BagOfWordsExtractor.transform(trainDF['question2'])
# `lables` (sic) keeps its original spelling because later cells read this name.
lables = np.array(trainDF['is_duplicate'])

featureExtractionDurationInMinutes = (time.time()-featureExtractionStartTime)/60.0
print("feature extraction took %.2f minutes" % (featureExtractionDurationInMinutes))

In [None]:
from scipy.sparse import csr_matrix
import lightgbm as lgbm
from sklearn.metrics import fbeta_score, make_scorer

# Weights applied to the positive-class (a) and negative-class (b) terms of
# the log loss computed in `kappa`.
# NOTE(review): 0.37 / 0.165 look like the positive rates of the training data
# and of the target distribution respectively -- confirm against the data.
a = 0.165 / 0.37
b = (1 - 0.165) / (1 - 0.37)

def kappa(preds, y):
    """Class-weighted binary log loss, shaped as a LightGBM ``feval`` callback.

    Parameters
    ----------
    preds : array-like of float
        Predicted probabilities of the positive class, one per sample.
    y : object exposing ``get_label()``
        Dataset-like object whose ``get_label()`` returns the 0/1 targets
        aligned with ``preds``.

    Returns
    -------
    tuple
        ``('kappa', score, False)`` where ``score`` is the mean of
        ``-(a * y * log(p) + b * (1 - y) * log(1 - p))`` and ``False`` marks
        the metric as lower-is-better.
    """
    # Keep probabilities strictly inside (0, 1): a saturated prediction would
    # otherwise yield log(0) = -inf and, multiplied by a zero weight, nan.
    eps = 1e-15
    probs = np.clip(np.asarray(preds, dtype=float), eps, 1 - eps)
    targets = np.asarray(y.get_label(), dtype=float)

    # Vectorized over all samples; this callback runs once per boosting round.
    per_sample = a * targets * np.log(probs) + b * (1 - targets) * np.log(1 - probs)
    score = -np.sum(per_sample) / len(per_sample)

    return 'kappa', float(score),False


def run_lgb_native(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=500000,e_stoping_r=50):
    """Train a LightGBM binary classifier and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and 0/1 targets, passed to ``lgbm.Dataset``.
    test_X
        Features to predict on; also used as the validation set when
        ``test_y`` is given.
    test_y : optional
        Targets for ``test_X``. When provided, ``test_X``/``test_y`` are
        evaluated every round (logged every 10 rounds) and drive early stopping.
    feature_names : optional
        Accepted for interface compatibility; currently not used.
    seed_val : int
        Passed to LightGBM as the ``seed`` parameter.
    num_rounds : int
        Maximum number of boosting rounds.
    e_stoping_r : int
        Early-stopping patience in rounds; only applies when ``test_y`` is
        given. A falsy value (``0``/``None``) disables early stopping.

    Returns
    -------
    tuple
        ``(pred_test_y, model)``: the output of ``model.predict(test_X)`` and
        the trained booster.
    """
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        #'metric': 'binary_logloss',
        'num_leaves': 31,
        'learning_rate': 0.03,
        'feature_fraction': 0.6,
        'bagging_fraction': 0.6,
        'bagging_freq': 7,
        'verbose': 0,
        # Previously `seed_val` was accepted but never forwarded.
        'seed': seed_val,
        #'regression_l1':107
        #'scale_pos_weight':1.36,
        #'is_unbalance':True
    }

    # The training set is built identically whether or not validation data exists.
    lgb_train = lgbm.Dataset(train_X, train_y, free_raw_data=False)

    if test_y is not None:
        lgb_eval = lgbm.Dataset(test_X, test_y, reference=lgb_train,
                                free_raw_data=False)
        # `e_stoping_r` used to be ignored, so training always ran all
        # `num_rounds` rounds. It is now forwarded through the same
        # keyword-style `train` API that `verbose_eval` already relies on.
        # Per the LightGBM docs, every evaluated metric is checked for
        # early stopping, and `predict` then defaults to the best iteration.
        model = lgbm.train(params, lgb_train, num_boost_round=num_rounds,
                           feval=kappa, valid_sets=lgb_eval, verbose_eval=10,
                           early_stopping_rounds=e_stoping_r or None)
    else:
        # No validation data: nothing to monitor, train for the full budget.
        model = lgbm.train(params, lgb_train, num_boost_round=num_rounds, feval=kappa)

    pred_test_y = model.predict(test_X)
    return pred_test_y, model

In [None]:
# Element-wise comparison of the two bag-of-ngrams matrices: an entry is set
# wherever the two questions of a pair disagree on a feature. Casting to int
# and negating turns each disagreement into -1 and leaves agreements at 0.
questionPairMismatch = trainQuestion1_BOW_rep != trainQuestion2_BOW_rep
X = -questionPairMismatch.astype(int)
# Alternative feature set (disabled): additionally mark shared features with
# the element-wise product of the two representations.
#X = -(trainQuestion1_BOW_rep != trainQuestion2_BOW_rep).astype(int) + \
#      trainQuestion1_BOW_rep.multiply(trainQuestion2_BOW_rep)

# Targets: the is_duplicate labels extracted alongside the features.
y = lables

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
# Hold out 20% of the pairs for validation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Fit on the training part, using the held-out part both for evaluation
# during training and as the set to predict on.
preds, model = run_lgb_native(
    train_X=X_train,
    train_y=y_train,
    test_X=X_test,
    test_y=y_test,
    num_rounds=10000,
)