In [8]:
from pathlib import Path

import pandas as pd
from common.constants import NUM_PROCESSES, OUTPUT_DIR, TRAIN_CSV_PATH
from common.utils import compute_weights
from features.match import build_match_features
from features.length import build_length_features
from features.edit_distance import build_edit_distance_features
from texts.preprocessing import PreprocessingKey, StopwordsKey
from experiments.gbm_common import run_kfold

In [9]:
# LightGBM parameters forwarded to run_kfold further down.
# Binary objective scored with log loss; gradient-boosted trees with
# row bagging and per-tree feature sub-sampling, both at 80%.
model_params = dict(
    objective="binary",
    metric="binary_logloss",
    boosting="gbdt",
    num_leaves=64,
    bagging_fraction=0.8,
    bagging_freq=1,  # re-draw the bagged rows on every iteration
    feature_fraction=0.8,
    learning_rate=0.1,
    seed=1,  # fixed seed so the sampling above is repeatable
    num_threads=NUM_PROCESSES,
)

In [10]:
# Read the training CSV. na_filter=False switches off pandas' missing-value
# detection, so empty fields come back as "" instead of NaN.
trn_df = pd.read_csv(
    TRAIN_CSV_PATH,
    na_filter=False,
)
# Preview the first five rows as the cell output.
trn_df.head(n=5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [11]:
# (preprocessing, stopwords) pairs shared by the match and length builders.
token_configs = [
    (PreprocessingKey.NLTK_STEMMING, StopwordsKey.NLTK_STEMMED),
    (PreprocessingKey.SPACE_TOKENIZATION, StopwordsKey.NLTK),
]

# Match features first, then length features; within each family every
# token config is built with a third argument of 1 and then 2.
# NOTE(review): the third argument is presumably an n-gram size - confirm
# against the builders' signatures.
feature_blocks = [
    builder(preprocessing_key, stopwords_key, n)
    for builder in (build_match_features, build_length_features)
    for preprocessing_key, stopwords_key in token_configs
    for n in (1, 2)
]

# Edit-distance features are built without stopword removal.
feature_blocks += [
    build_edit_distance_features(preprocessing_key, StopwordsKey.NONE)
    for preprocessing_key in (
        PreprocessingKey.NLTK_STEMMING,
        PreprocessingKey.SPACE_TOKENIZATION,
    )
]

# Place all feature blocks side by side (column-wise), in build order.
features = pd.concat(feature_blocks, axis=1)

In [12]:
# Output directory for this experiment; create it (and any missing parents)
# without failing when it already exists.
save_dir = OUTPUT_DIR.joinpath("ch5_2_1_003")
save_dir.mkdir(parents=True, exist_ok=True)

In [13]:
# NOTE(review): 0.174 looks like the desired weighted share of positives
# (the LightGBM log of the run below reports pavg=0.174001) - confirm
# against compute_weights.
target_positive_rate = 0.174
weights = compute_weights(trn_df["is_duplicate"], target_positive_rate)

In [14]:
# 5-fold training run over the assembled features, using the per-row weights
# computed above; save_dir is handed to run_kfold as its output location.
run_kfold(
    features=features,
    trn_targets=trn_df["is_duplicate"],
    weights=weights,
    model_params=model_params,
    n_splits=5,
    save_dir=save_dir,
)

[LightGBM] [Info] Number of positive: 119411, number of negative: 204021
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11725
[LightGBM] [Info] Number of data points in the train set: 323432, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.174001 -> initscore=-1.557531
[LightGBM] [Info] Start training from score -1.557531
Training until validation scores don't improve for 20 rounds
[30]	trn's binary_logloss: 0.296645	val's binary_logloss: 0.301321
[60]	trn's binary_logloss: 0.282811	val's binary_logloss: 0.290881
[90]	trn's binary_logloss: 0.276169	val's binary_logloss: 0.287657
[120]	trn's binary_logloss: 0.271055	val's binary_logloss: 0.286008
[150]	trn's binary_logloss: 0.266529	val's binary_logloss: 0.286079
Early stopping, best iteration is:
[135]	trn's binary_logloss: 0.268681	val's binary_logloss: 0.28531
[LightGBM] [Info] Number of positive: 119411, number of negative: 204021
You can set `force_col_wise=true`