First we import some modules and set some paths.

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np

from util import dump_submission
from feature import get_feature, get_tokenizer
from model import train_model
from config import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load

Then we read the labelled training pairs, the unlabelled Spanish–English pairs, and the test pairs from their input files.

In [3]:
# Labelled pairs from the English training file: columns are read as en0, es0, en1, es1, label.
df_en_train = pd.read_csv(
    english_train_path,
    sep='\t',
    names=['en0', 'es0', 'en1', 'es1', 'label'],
)

# Labelled pairs from the Spanish training file: same five fields, language order swapped.
df_es_train = pd.read_csv(
    spanish_train_path,
    sep='\t',
    names=['es0', 'en0', 'es1', 'en1', 'label'],
)

# Unlabelled file: two columns, read as es and en.
df_es2en = pd.read_csv(
    unlabel_spanish_train_path,
    sep='\t',
    names=['es', 'en'],
)

# Test file: two Spanish columns and no label column.
df_test = pd.read_csv(
    test_path,
    sep='\t',
    names=['es0', 'es1'],
)

In [4]:
# Row counts of the Spanish training frame and of the test frame.
len(df_es_train), len(df_test)

(1400, 5000)

In [5]:
# Hand get_tokenizer all four Spanish sentence columns (training and test).
tokenizer = get_tokenizer([
    df_es_train['es0'],
    df_es_train['es1'],
    df_test['es0'],
    df_test['es1'],
])

## word2vec

Then we load the pretrained embedding vectors.

In [6]:
def get_coefs(word, *arr):
    """Turn the fields of one embedding line into a ``(word, vector)`` pair.

    ``word`` is returned unchanged; the remaining positional fields in ``arr``
    are converted to a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Read the pretrained vectors inside a context manager so the file handle is
# closed once the index is built (or if parsing fails part-way through).
with open(es_vec_path, encoding="utf8") as f:
    # The first line is a header, not a vector ('985667 300\n' in the recorded run).
    vec_header = f.readline()
    # Every remaining line is split on single spaces: first field is the word,
    # the rest are the vector components.
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in f)

# Echo the header as a top-level expression so the cell still displays it.
vec_header

'985667 300\n'

In [7]:
# Spot-check one entry: the vector stored under the key 'de'.
embeddings_index['de']

array([-0.13075  , -0.087659 , -0.11427  , -0.020641 ,  0.11753  ,
        0.19687  ,  0.054257 , -0.0028717,  0.062278 , -0.10023  ,
       -0.050123 , -0.026275 , -0.057605 , -0.13072  ,  0.10147  ,
        0.15849  ,  0.095493 ,  0.051555 ,  0.015874 , -0.046374 ,
        0.098467 ,  0.034867 ,  0.039933 , -0.1208   ,  0.065478 ,
       -0.0098815, -0.13914  , -0.043732 , -0.015622 ,  0.05665  ,
       -0.01476  , -0.0054753, -0.047127 , -0.21595  , -0.015154 ,
       -0.0034798,  0.058253 ,  0.036444 , -0.25157  ,  0.060459 ,
        0.23842  ,  0.017983 ,  0.10673  , -0.15889  ,  0.23043  ,
       -0.078636 ,  0.075394 , -0.18431  , -0.31417  ,  0.084773 ,
       -0.14912  ,  0.036904 , -0.1144   ,  0.025056 ,  0.058607 ,
        0.059822 , -0.17929  ,  0.028468 ,  0.16728  , -0.020946 ,
        0.019714 ,  0.0083937,  0.032227 ,  0.013204 ,  0.06393  ,
       -0.19616  , -0.043487 ,  0.10124  , -0.032762 ,  0.17206  ,
       -0.062339 , -0.10172  , -0.31708  ,  0.079012 , -0.1232

Then we compute the mean and standard deviation of this embedding matrix.

In [8]:
# np.stack expects a real sequence of arrays. A dict view is not one: NumPy 1.16+
# warns about it and newer releases raise, so materialise the values as a list first.
all_embs = np.stack(list(embeddings_index.values()))
# Scalar statistics taken over every component of every vector.
emb_mean,emb_std = all_embs.mean(), all_embs.std()
print('emb_mean:',emb_mean,'emb_std:',emb_std)

emb_mean: -0.007466377 emb_std: 0.2691353


## Feature

Now we begin to generate some features.

In [14]:
from feature import *

In [15]:
# Generate the features for the Spanish training frame.
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'embeddings_index' is not defined". Presumably get_feature
# (feature.py) reads a module-level embeddings_index that the notebook's own
# variable does not supply — confirm, and pass the embeddings in explicitly if so.
feature_train_es = get_feature(df_es_train, tokenizer)

NameError: name 'embeddings_index' is not defined

## Model

In [None]:
import lightgbm as lgb
import gc

def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='label', objective='binary', metrics='auc',
                 feval=None, early_stopping_rounds=50, num_boost_round=3000, verbose_eval=10, categorical_features=None):
    """Train one LightGBM booster on ``dtrain`` with early stopping on ``dvalid``.

    Args:
        params: LightGBM parameter overrides, merged on top of the defaults
            defined in this function.
        dtrain: Training data indexable by column name; must provide the
            ``predictors`` columns and the ``target`` column.
        dvalid: Validation data with the same columns as ``dtrain``.
        predictors: List of feature column names.
        target: Name of the label column.
        objective: Value stored under the ``objective`` parameter.
        metrics: Value stored under the ``metric`` parameter. It is also used
            as the lookup key and label when the best score is printed, so it
            has to be a single metric name given as a string.
        feval: Optional custom evaluation function forwarded to ``lgb.train``.
        early_stopping_rounds: Forwarded to ``lgb.train``.
        num_boost_round: Forwarded to ``lgb.train``.
        verbose_eval: Forwarded to ``lgb.train``. (It used to be ignored in
            favour of a hard-coded 10; the default keeps that behaviour.)
        categorical_features: Forwarded to ``lgb.Dataset`` as
            ``categorical_feature`` for both datasets.

    Returns:
        Tuple ``(booster, best_iteration)``.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.04,
        # 'is_unbalance' is deliberately not set here (replaced with scale_pos_weight).
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # minimum number of data needed in a child (min_data_in_leaf)
        'max_bin': 255,  # number of bucketed bins for feature values
        'subsample': 0.6,  # subsample ratio of the training instances
        'subsample_freq': 0,  # frequency of subsampling; <= 0 disables it
        'colsample_bytree': 0.3,  # subsample ratio of columns when constructing each tree
        'min_child_weight': 5,  # minimum sum of instance weight (hessian) needed in a child (leaf)
        'subsample_for_bin': 200000,  # number of samples for constructing bins
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split act as regularization
        'reg_alpha': 0.99,  # L1 regularization term on weights
        'reg_lambda': 0.9,  # L2 regularization term on weights
        'nthread': 8,
        'verbose': 1,
    }

    # Caller-supplied values win over the defaults above.
    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    # Drop this function's own references to the input frames before training.
    del dtrain
    del dvalid
    gc.collect()

    # Filled in by lgb.train with the per-iteration validation scores.
    evals_results = {}

    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=[xgvalid],
                     valid_names=['valid'],
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=verbose_eval,  # honour the caller's setting instead of a fixed 10
                     feval=feval)

    print("\nModel Report")
    print("bst1.best_iteration: ", bst1.best_iteration)
    # best_iteration is 1-based while the recorded score list is 0-based.
    print(metrics+":", evals_results['valid'][metrics][bst1.best_iteration-1])

    return (bst1,bst1.best_iteration)

In [None]:
# Row count of df_es_train, which the next cell splits at row 1200 into training and validation parts.
len(df_es_train)

In [None]:
# Parameter overrides handed to lgb_modelfit_nocv, which merges them over its own defaults.
params = dict(
    learning_rate=0.04,
    num_leaves=31,  # 2^max_depth - 1
    max_depth=-1,  # -1 means no limit
    min_child_samples=20,  # minimum number of data needed in a child (min_data_in_leaf)
    max_bin=255,  # number of bucketed bins for feature values
    subsample=0.6,  # subsample ratio of the training instances
    subsample_freq=0,  # frequency of subsampling; <= 0 disables it
    colsample_bytree=0.3,  # subsample ratio of columns when constructing each tree
    min_child_weight=5,  # minimum sum of instance weight (hessian) needed in a child (leaf)
    subsample_for_bin=200000,  # number of samples for constructing bins
    min_split_gain=0,  # lambda_l1, lambda_l2 and min_gain_to_split act as regularization
    reg_alpha=0.99,  # L1 regularization term on weights
    reg_lambda=0.9,  # L2 regularization term on weights
    scale_pos_weight=200,  # used in place of is_unbalance; NOTE(review): confirm 200 suits this label balance
)

# Feature columns the booster is trained on.
predictors = ['word2vec_dot']

# Rows before 1200 are used for fitting; the remaining rows are the validation set for early stopping.
bst, best_iteration = lgb_modelfit_nocv(
    params,
    df_es_train[:1200],
    df_es_train[1200:],
    predictors,
    objective='binary',
    metrics='auc',
    early_stopping_rounds=30,
    verbose_eval=True,
    num_boost_round=1000,
)

In [None]:
# Score the test frame with the booster, limited to its best iteration.
test_scores = bst.predict(df_test[predictors], num_iteration=best_iteration)

# NOTE(review): the column name 'is_attributed' matches no column used elsewhere
# in this notebook (labels are read as 'label') — confirm the submission schema.
sub = pd.DataFrame()
sub['is_attributed'] = test_scores

# NOTE(review): fileno is not defined in any visible cell — presumably it comes
# from `from config import *`; confirm.
sub.to_csv('sub_it%d.csv' % fileno, index=False, float_format='%.9f')
print("done...")