In [1]:
from scipy.sparse import csr_matrix
import lightgbm as lgbm
from sklearn.metrics import fbeta_score, make_scorer

a = 0.165 / 0.37
b = (1 - 0.165) / (1 - 0.37)

def kappa(preds, y):
    score = []
    for pp,yy in zip(preds, y.get_label()):
        score.append(a * yy * np.log (pp) + b * (1 - yy) * np.log(1-pp))
    score = -np.sum(score) / len(score)

    return 'kappa', float(score),False

fun_loss  = make_scorer(kappa, greater_is_better=False)

def runlgbm(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=500000,e_stoping_r=50): 
    t4_params = {
        'boosting_type': 'gbdt', 'objective': 'binary', 'nthread': -1, 'silent': True,
        'num_leaves': 6, 'learning_rate': 0.03, 'max_depth': 6,
        'max_bin': 255, 'subsample_for_bin': 50000,
        'subsample': 0.7, 'subsample_freq': 1, 'colsample_bytree': 0.7, 'reg_alpha':1, 'reg_lambda':0,
        'min_split_gain': 0.5, 'min_child_weight': 1, 'min_child_samples': 10, 'scale_pos_weight': 1.36}

    # they can be used directly to build a LGBMClassifier (which is wrapped in a sklearn fashion)
    model = lgbm.sklearn.LGBMClassifier(n_estimators=num_rounds, seed=0, **t4_params)
    
    if test_y is not None:
        model.fit(train_X,train_y,eval_set=[(train_X,train_y),(test_X, test_y)],verbose=100,early_stopping_rounds=e_stoping_r)
    else:
        model.fit(train_X,train_y)
    pred_test_y = model.predict_proba(test_X)
    return pred_test_y, model




In [2]:
import pandas as pd
import numpy as np
from scipy import sparse as ssp
from sklearn.model_selection import KFold
from sklearn.datasets import dump_svmlight_file,load_svmlight_file
from sklearn.utils import resample,shuffle
from sklearn.preprocessing import MinMaxScaler
seed=1024
np.random.seed(seed)
path = "../input/"
train = pd.read_csv(path+"train_porter.csv")


# tfidf
train_question1_tfidf = pd.read_pickle(path+'train_question1_tfidf.pkl')[:]
test_question1_tfidf = pd.read_pickle(path+'test_question1_tfidf.pkl')[:]

train_question2_tfidf = pd.read_pickle(path+'train_question2_tfidf.pkl')[:]
test_question2_tfidf = pd.read_pickle(path+'test_question2_tfidf.pkl')[:]


train_question1_porter_tfidf = pd.read_pickle(path+'train_question1_porter_tfidf.pkl')[:]
test_question1_porter_tfidf = pd.read_pickle(path+'test_question1_porter_tfidf.pkl')[:]

train_question2_porter_tfidf = pd.read_pickle(path+'train_question2_porter_tfidf.pkl')[:]
test_question2_porter_tfidf = pd.read_pickle(path+'test_question2_porter_tfidf.pkl')[:]


train_interaction = pd.read_pickle(path+'train_interaction.pkl')[:].reshape(-1,1)
test_interaction = pd.read_pickle(path+'test_interaction.pkl')[:].reshape(-1,1)

train_porter_interaction = pd.read_pickle(path+'train_porter_interaction.pkl')[:].reshape(-1,1)
test_porter_interaction = pd.read_pickle(path+'test_porter_interaction.pkl')[:].reshape(-1,1)


train_jaccard = pd.read_pickle(path+'train_jaccard.pkl')[:].reshape(-1,1)
test_jaccard = pd.read_pickle(path+'test_jaccard.pkl')[:].reshape(-1,1)

train_porter_jaccard = pd.read_pickle(path+'train_porter_jaccard.pkl')[:].reshape(-1,1)
test_porter_jaccard = pd.read_pickle(path+'test_porter_jaccard.pkl')[:].reshape(-1,1)

train_len = pd.read_pickle(path+"train_len.pkl")
test_len = pd.read_pickle(path+"test_len.pkl")
scaler = MinMaxScaler()
scaler.fit(np.vstack([train_len,test_len]))
train_len = scaler.transform(train_len)
test_len =scaler.transform(test_len)



In [3]:
abhitest=pd.read_csv("../rader/test_features.csv")

abhitrain=pd.read_csv("../rader/train_features.csv")

In [4]:
abhife=abhitrain.columns.tolist()
abhife.remove('question1')
abhife.remove('question2')

In [5]:
abhife=['fuzz_qratio','fuzz_WRatio','fuzz_partial_token_set_ratio','fuzz_partial_token_sort_ratio', 'fuzz_token_set_ratio', 'fuzz_token_sort_ratio', 'wmd', 'norm_wmd', 'cityblock_distance', 'canberra_distance', 'euclidean_distance', 'minkowski_distance', 'braycurtis_distance', 'skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec']
#abhitest[abhife].shape

In [6]:
abhitest[abhife]=abhitest[abhife].replace(np.inf, np.nan)
abhitest[abhife]=abhitest[abhife].fillna(0)

abhitrain[abhife]=abhitrain[abhife].replace(np.inf, np.nan)
abhitrain[abhife]=abhitrain[abhife].fillna(0)

In [7]:
path="../input/"
train_porter_w2vecsim =np.array(pd.read_pickle(path+'train_porter_w2vecsim.pkl'))[:].reshape(-1,1)
train_porter_w2vecdest = np.array(pd.read_pickle(path+'train_porter_w2vecdist.pkl'))[:].reshape(-1,1)

In [8]:
test_porter_w2vecsim = np.array(pd.read_csv(path+'wors2vecsim_test.csv')['w2v_sim']).reshape(-1,1)
test_porter_w2vecdest = np.array(pd.read_csv(path+'wors2vecsim_test.csv')['w2v_dist']).reshape(-1,1)

In [9]:
path="../input/"
train_cosinesim=pd.read_pickle(path+"train_porter_cosine_dist.pkl")[:].reshape(-1,1)
test_cosinesim=pd.read_pickle(path+"test_porter_cosine_dist.pkl")[:].reshape(-1,1)
y = train['is_duplicate'].values[:]

  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [10]:
path="../input/wordvecmodels/w2vecsimvectors/"
glove42B300d_train_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.42B.300d_train_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glove6B100d_train_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.6B.100d_train_porter_w2vecdist.pkl"))[:].reshape(-1,1)
glove6B200d_train_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.6B.200d_train_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glove840B300d_train_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.840B.300d_train_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glove6B50d_train_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.6B.50d_train_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glove6B100d_train_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.6B.100d_train_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glovetwitter27B200d_train_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.twitter.27B.200d_train_porter_w2vecdist.pkl"))[:].reshape(-1,1)
glovetwitter27B200d_train_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.twitter.27B.200d_train_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glove42B300d_train_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.42B.300d_train_porter_w2vecdist.pkl"))[:].reshape(-1,1)
glovetwitter27B100d_train_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.twitter.27B.100d_train_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glove6B200d_train_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.6B.200d_train_porter_w2vecdist.pkl"))[:].reshape(-1,1)
glove6B50d_train_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.6B.50d_train_porter_w2vecdist.pkl"))[:].reshape(-1,1)
glove840B300d_train_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.840B.300d_train_porter_w2vecdist.pkl"))[:].reshape(-1,1)
glovetwitter27B100d_train_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.twitter.27B.100d_train_porter_w2vecdist.pkl"))[:].reshape(-1,1)


In [11]:
#glove42B300d_test_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.42B.300d_test_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glove6B100d_test_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.6B.100d_test_porter_w2vecdist.pkl"))[:].reshape(-1,1)
glove6B200d_test_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.6B.200d_test_porter_w2vecsim.pkl"))[:].reshape(-1,1)
#glove840B300d_test_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.840B.300d_test_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glove6B50d_test_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.6B.50d_test_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glove6B100d_test_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.6B.100d_test_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glovetwitter27B200d_test_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.twitter.27B.200d_test_porter_w2vecdist.pkl"))[:].reshape(-1,1)
glovetwitter27B200d_test_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.twitter.27B.200d_test_porter_w2vecsim.pkl"))[:].reshape(-1,1)
#glove42B300d_test_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.42B.300d_test_porter_w2vecdist.pkl"))[:].reshape(-1,1)
glovetwitter27B100d_test_porter_w2vecsim=np.array(pd.read_pickle(path+"glove.twitter.27B.100d_test_porter_w2vecsim.pkl"))[:].reshape(-1,1)
glove6B200d_test_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.6B.200d_test_porter_w2vecdist.pkl"))[:].reshape(-1,1)
glove6B50d_test_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.6B.50d_test_porter_w2vecdist.pkl"))[:].reshape(-1,1)
#glove840B300d_test_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.840B.300d_test_porter_w2vecdist.pkl"))[:].reshape(-1,1)
glovetwitter27B100d_test_porter_w2vecdist=np.array(pd.read_pickle(path+"glove.twitter.27B.100d_test_porter_w2vecdist.pkl"))[:].reshape(-1,1)


In [17]:
#train_q1_svd=pd.read_pickle("../input/wordvecmodels/svd/train_q1_svd.pkl")
#train_q2_svd=pd.read_pickle("../input/wordvecmodels/svd/train_q2_svd.pkl")
from sklearn.preprocessing import StandardScaler
train_q1_svd=pd.read_pickle("../input/train_q1_svd.pkl")[:]
train_q2_svd=pd.read_pickle("../input/train_q2_svd.pkl")[:]
train_q1_svd = StandardScaler().fit_transform(train_q1_svd)
train_q2_svd = StandardScaler().fit_transform(train_q2_svd)


test_q1_svd=pd.read_pickle("../input/test_q1_svd.pkl")[:]
test_q2_svd=pd.read_pickle("../input/test_q2_svd.pkl")[:]

test_q1_svd = StandardScaler().fit_transform(test_q1_svd)
test_q2_svd = StandardScaler().fit_transform(test_q2_svd)

In [12]:
#glove840B300d_train_porter_w2vecvector_diff=pd.read_pickle(path+"glove.840B.300d_train_porter_w2vecvector_diff.pkl")
glocefeatures_train=np.concatenate([
    #glove42B300d_train_porter_w2vecsim,
    #glove6B100d_train_porter_w2vecdist,
    glove6B200d_train_porter_w2vecsim,
    #glove840B300d_train_porter_w2vecsim,
    glove6B50d_train_porter_w2vecsim,
    glove6B100d_train_porter_w2vecsim,
    #glovetwitter27B200d_train_porter_w2vecdist,
    glovetwitter27B200d_train_porter_w2vecsim,
    #glove42B300d_train_porter_w2vecdist,
    glovetwitter27B100d_train_porter_w2vecsim,
    #glove6B200d_train_porter_w2vecdist,
    #glove6B50d_train_porter_w2vecdist,
    #glove840B300d_train_porter_w2vecdist,
    glovetwitter27B100d_train_porter_w2vecdist
    ],axis=1)

glocefeatures_test=np.concatenate([
    
    #glove42B300d_test_porter_w2vecsim,
    #glove6B100d_test_porter_w2vecdist,
    glove6B200d_test_porter_w2vecsim,
    #glove840B300d_test_porter_w2vecsim,
    glove6B50d_test_porter_w2vecsim,
    glove6B100d_test_porter_w2vecsim,
    #glovetwitter27B200d_test_porter_w2vecdist,
    glovetwitter27B200d_test_porter_w2vecsim,
    #glove42B300d_test_porter_w2vecdist,
    glovetwitter27B100d_test_porter_w2vecsim,
    #glove6B200d_test_porter_w2vecdist,
    #glove6B50d_test_porter_w2vecdist,
    #glove840B300d_test_porter_w2vecdist,
    glovetwitter27B100d_test_porter_w2vecdist
    ],axis=1)

In [13]:
y = train['is_duplicate'].values[:]

In [14]:
X = ssp.hstack([
    train_porter_w2vecsim,
    train_porter_w2vecdest,
    glocefeatures_train,
    csr_matrix(abhitrain[abhife]),
    train_cosinesim,
    train_question1_tfidf,
    train_question2_tfidf,
    train_interaction,
    train_porter_interaction,
    train_jaccard,
    train_porter_jaccard,
    train_len,
    ]).tocsr()

X_t = ssp.hstack([
    test_porter_w2vecsim,
    test_porter_w2vecdest,
    glocefeatures_test,
    csr_matrix(abhitest[abhife]),
    test_cosinesim,
    test_question1_tfidf,
    test_question2_tfidf,
    test_interaction,
    test_porter_interaction,
    test_jaccard,
    test_porter_jaccard,
    test_len,
    ]).tocsr()

In [15]:
def run_lgb_native(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=500000,e_stoping_r=50):
   
    params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    #'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.03,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.6,
    'bagging_freq': 7,
    'verbose': 0,
    #'regression_l1':107
    #'scale_pos_weight':1.36,
       # 'is_unbalance':True
        }
    if test_y is not None:
        lgb_train = lgbm.Dataset(train_X, train_y,
                        free_raw_data=False)
        lgb_eval = lgbm.Dataset(test_X, test_y, reference=lgb_train,
                        free_raw_data=False)
        model = lgbm.train(params,lgb_train, num_boost_round=num_rounds, feval=kappa,valid_sets=lgb_eval,verbose_eval=10)
        #model.fit(train_X,train_y,eval_set=[(train_X,train_y),(test_X, test_y)],verbose=100,early_stopping_rounds=e_stoping_r,eval_metric=fun_loss)
    else:
        lgb_train = lgbm.Dataset(train_X, train_y,
                        free_raw_data=False)
        model=lgbm.train(params,lgb_train,num_boost_round=num_rounds, feval=kappa)
        
    pred_test_y = model.predict(test_X)
    return pred_test_y, model

def kappa1(preds, y):
    score = []
    for pp,yy in zip(preds, y.get_label()):
        score.append(a * yy * np.log (pp) + b * (1 - yy) * np.log(1-pp))
    score = -np.sum(score) / len(score)

    return 'kappa', float(score)
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=2000):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.02
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = "logloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20,verbose_eval=50,feval=kappa1)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds,feval=kappa1)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model



In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
preds, model = run_lgb_native(X_train, y_train,X_test,y_test,num_rounds=10000)

In [None]:
preds, model = run_lgb_native(X, y,X_t,num_rounds=800)

In [None]:
preds, model = run_lgb_native(X, y,X_t,num_rounds=1000)

In [None]:
om scipy.sparse import csr_matrix
path="/home/udit/ipython/notebook/quora/input/input/FMModel/stacknet/"
fromsparsetofile(path+"train1.sparse", X, deli1=" ", deli2=":",ytarget=y)    
fromsparsetofile(path+"test1.sparse", X_t, deli1=" ", deli2=":",ytarget=None)       

In [39]:
path="/home/udit/ipython/notebook/quora/input/input/FMModel/input/"
skf = KFold(n_splits=5, shuffle=True, random_state=seed).split(X)
for ind_tr, ind_te in skf:
    X_train = X[ind_tr]
    X_test = X[ind_te]

    y_train = y[ind_tr]
    y_test = y[ind_te]
    break

dump_svmlight_file(X,y,path+"X_tfidf.svm")
#del X
dump_svmlight_file(X_t,np.zeros(X_t.shape[0]),path+"X_t_tfidf.svm")
#del X_t

def oversample(X_ot,y,p=0.165):
    pos_ot = X_ot[y==1]
    neg_ot = X_ot[y==0]
    #p = 0.165
    scale = ((pos_ot.shape[0]*1.0 / (pos_ot.shape[0] + neg_ot.shape[0])) / p) - 1
    while scale > 1:
        neg_ot = ssp.vstack([neg_ot, neg_ot]).tocsr()
        scale -=1
    neg_ot = ssp.vstack([neg_ot, neg_ot[:int(scale * neg_ot.shape[0])]]).tocsr()
    ot = ssp.vstack([pos_ot, neg_ot]).tocsr()
    y=np.zeros(ot.shape[0])
    y[:pos_ot.shape[0]]=1.0
    print y.mean()
    return ot,y

X_train,y_train = oversample(X_train.tocsr(),y_train,p=0.165)
X_test,y_test = oversample(X_test.tocsr(),y_test,p=0.165)

X_train,y_train = shuffle(X_train,y_train,random_state=seed)

dump_svmlight_file(X_train,y_train,path+"X_train_tfidf.svm")
dump_svmlight_file(X_test,y_test,path+"X_test_tfidf.svm")

0.191269277687
0.191144081052


In [40]:
def fromsparsetofile(filename, array, deli1=" ", deli2=":",ytarget=None):    
    zsparse=csr_matrix(array)
    indptr = zsparse.indptr
    indices = zsparse.indices
    data = zsparse.data
    print(" data lenth %d" % (len(data)))
    print(" indices lenth %d" % (len(indices)))    
    print(" indptr lenth %d" % (len(indptr)))
    
    f=open(filename,"w")
    counter_row=0
    for b in range(0,len(indptr)-1):
        #if there is a target, print it else , print nothing
        if ytarget!=None:
             f.write(str(ytarget[b]) + deli1)     
             
        for k in range(indptr[b],indptr[b+1]):
            if (k==indptr[b]):
                if np.isnan(data[k]):
                    f.write("%d%s%f" % (indices[k],deli2,-1))
                else :
                    f.write("%d%s%f" % (indices[k],deli2,data[k]))                    
            else :
                if np.isnan(data[k]):
                     f.write("%s%d%s%f" % (deli1,indices[k],deli2,-1))  
                else :
                    f.write("%s%d%s%f" % (deli1,indices[k],deli2,data[k]))
        f.write("\n")
        counter_row+=1
        if counter_row%10000==0:    
            print(" row : %d " % (counter_row))    
    f.close()  
   