In [None]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from lightgbm.sklearn import LGBMClassifier

## Load Clean Dataset

In [None]:
# Project layout on the mounted Google Drive.
PROJECT_DIR = '/content/drive/MyDrive/Colab Projects/product-pair-matching/'
DATA_DIR = '{}data/'.format(PROJECT_DIR)
OUTPUTS_DIR = '{}outputs/'.format(PROJECT_DIR)

# Pre-computed image features (train / test).
train_image_df, test_image_df = (
    pd.read_csv(DATA_DIR + rel_path)
    for rel_path in ('clean/train_image_df.csv', 'clean/test_image_df.csv')
)
# Pre-computed text features (train / test).
train_text_df, test_text_df = (
    pd.read_csv(DATA_DIR + rel_path)
    for rel_path in ('clean/train_text_df.csv', 'clean/test_text_df.csv')
)

In [None]:
# Put image and text features side by side (pd.concat aligns them on the row index).
# 'Label' is removed from the image frame so the combined frame does not carry it
# twice -- presumably the text frame holds the copy used later as train_df['Label'].
train_df = pd.concat(
    [train_image_df.drop(columns='Label'), train_text_df],
    axis=1,
    join='outer',
)
test_df = pd.concat(
    [test_image_df, test_text_df],
    axis=1,
    join='outer',
)

# Every column except the target is a model input.
feat = [col for col in train_df.columns if col != 'Label']

# Set DEBUG = True to run the whole notebook on the first 100 rows only.
DEBUG = False
train_df = train_df.head(100) if DEBUG else train_df

## Hyperparameter search using RandomizedSearchCV

In [None]:
def params_search(X, y, features, FOLD=5, RANDOM_STATE=42, N_ITER=50):
    """Randomised hyperparameter search for a binary LightGBM classifier.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the columns listed in ``features``.
    y : array-like
        Binary target aligned with the rows of ``X``.
    features : list of str
        Columns of ``X`` used as model inputs.
    FOLD : int
        Number of cross-validation folds handed to ``RandomizedSearchCV``.
    RANDOM_STATE : int
        Seed for the random sampling of parameter combinations.
    N_ITER : int
        Number of parameter combinations to evaluate.

    Returns
    -------
    tuple
        ``(best_params, final_features, cv_results)`` where ``final_features``
        is a pandas Index of the features whose importance in the refitted
        best estimator is greater than zero, ordered by decreasing importance.
    """
    lgbm_default = LGBMClassifier(
        boosting_type='gbdt',
        n_estimators=500,  # library default: 100
        objective='binary',
        )

    # Candidate values; the trailing comment is the LightGBM default.
    lgbm_params = {
        'num_leaves': [8, 16, 32, 64],  # 31
        'max_depth': [3, 7, 14, 21],  # -1
        'learning_rate': [0.01, 0.1, 1],  # 0.1
        'min_data_in_leaf': [20, 40, 80],  # 20
        'min_sum_hessian_in_leaf': [1e-5, 1e-2, 1, 1e2, 1e4],  # 1e-3
        'bagging_fraction': [i / 10.0 for i in range(7, 11)],  # 1.0
        'bagging_freq': [0, 5, 10, 20, 30],  # 0
        'feature_fraction': [i / 10.0 for i in range(3, 7)],  # 1.0
        'lambda_l1': [0, 1e-3, 1e-1],  # 0
        'lambda_l2': [0, 1e-3, 1e-1]  # 0
        }

    print('PARAMETER GRID')
    print(lgbm_params)
    lgbm_search = RandomizedSearchCV(
        estimator=lgbm_default,
        param_distributions=lgbm_params,
        scoring='f1_macro',
        # n_jobs is left at its default, so the fits run sequentially.
        pre_dispatch='2*n_jobs',
        cv=FOLD,
        verbose=1,
        random_state=RANDOM_STATE,
        n_iter=N_ITER
        )

    print()
    print('SEARCHING...')
    lgbm_search.fit(X=X[features], y=y)
    print('Best parameter: {}'.format(lgbm_search.best_params_))
    print('Best score: {}'.format(lgbm_search.best_score_))

    print()
    print('FINAL FEATURE')
    # Index the importances with the columns the model was actually fitted on
    # (the `features` argument), not with a notebook-level global.
    feature_importance = pd.DataFrame(
        lgbm_search.best_estimator_.feature_importances_,
        index=list(features),
        columns=['importance']
    ).sort_values('importance', ascending=False)
    # Keep only the features the best estimator actually split on.
    final_features = feature_importance[feature_importance['importance'] > 0].index
    print('All feature: {}'.format(len(features)))
    print('CV feature: {}'.format(len(final_features)))

    return lgbm_search.best_params_, final_features, lgbm_search.cv_results_

In [None]:
# Run the randomised search on the full feature set (20 sampled candidates x 5 folds).
best_params, final_features, cv_result = params_search(
    X=train_df,
    y=train_df['Label'],
    features=feat,
    FOLD=5,
    N_ITER=20,
)

PARAMETER GRID
{'num_leaves': [8, 16, 32, 64], 'max_depth': [3, 7, 14, 21], 'learning_rate': [0.01, 0.1, 1], 'min_data_in_leaf': [20, 40, 80], 'min_sum_hessian_in_leaf': [1e-05, 0.01, 1, 100.0, 10000.0], 'bagging_fraction': [0.7, 0.8, 0.9, 1.0], 'bagging_freq': [0, 5, 10, 20, 30], 'feature_fraction': [0.3, 0.4, 0.5, 0.6], 'lambda_l1': [0, 0.001, 0.1], 'lambda_l2': [0, 0.001, 0.1]}

SEARCHING...
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 38.7min finished


Best parameter: {'num_leaves': 32, 'min_sum_hessian_in_leaf': 1e-05, 'min_data_in_leaf': 80, 'max_depth': 14, 'learning_rate': 0.1, 'lambda_l2': 0, 'lambda_l1': 0.001, 'feature_fraction': 0.4, 'bagging_freq': 20, 'bagging_fraction': 1.0}
Best score: 0.8540954492587722

FINAL FEATURE
All feature: 1331
CV feature: 1327


In [None]:
# Persist the search outcome so the (slow) search does not have to be repeated.
joblib.dump(
    value=(best_params, final_features, cv_result),
    filename=OUTPUTS_DIR + 'models/randomsearch_result.pkl',
)

['/content/drive/MyDrive/Colab Projects/product-pair-matching/outputs/models/randomsearch_result.pkl']

## Train Model

### Cross Validation

In [None]:
def lgb_f1_score(y_true, y_hat):
    """Custom LightGBM evaluation metric: macro-averaged F1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_hat : array-like
        Predicted scores; they are rounded to 0/1 before scoring.

    Returns
    -------
    tuple
        ``(metric_name, value, is_higher_better)``.
    """
    # f1_score needs hard class labels, not probabilities.
    y_hat = np.round(y_hat)
    # `average` must be passed by keyword: the third positional parameter of
    # f1_score is `labels`, so a positional 'macro' never selected macro averaging.
    return 'f1_macro', f1_score(y_true, y_hat, average='macro'), True

In [None]:
def cv_model(X, y, feat, params, stratified=False):
    """Train one LightGBM classifier per fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the columns listed in ``feat``.
    y : pandas.Series
        Binary target, positionally aligned with the rows of ``X``.
    feat : list-like of str
        Columns of ``X`` used as model inputs.
    params : dict
        Keyword arguments forwarded to ``LGBMClassifier``.
    stratified : bool
        Use ``StratifiedKFold`` instead of ``KFold`` (both 5 splits, shuffled,
        seed 42).

    Returns
    -------
    tuple
        ``(classifier, oof)``: the list of fitted per-fold models and an array
        with the out-of-fold class prediction for every row of ``X``.
    """
    if stratified:
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    else:
        folds = KFold(n_splits=5, shuffle=True, random_state=42)

    classifier = []
    oof = np.zeros(len(X))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X[feat].values, y)):
        # The splitter yields positions, so index everything with .iloc.
        X_train = X[feat].iloc[trn_idx]
        y_train = y.iloc[trn_idx]
        X_valid = X[feat].iloc[val_idx]
        y_valid = y.iloc[val_idx]

        print("Fold n°{}".format(fold_))
        # Class balance of each split (positives are the rows with label 1).
        for split_name, y_part in (('Train', y_train), ('Valid', y_valid)):
            n_rows = len(y_part)
            n_pos = np.sum(y_part)
            print('{0}: {1} data, 0: {2}, 1: {3}'.format(split_name, n_rows,
                                                         n_rows - n_pos, n_pos))

        clf = LGBMClassifier(**params)
        # Only the validation fold is monitored. With the training fold in
        # eval_set as well, the logged runs stopped at the iteration where the
        # *train* score peaked rather than the validation score.
        # NOTE: `early_stopping_rounds` / `verbose` are fit() keywords of the
        # LightGBM version this notebook was run with; LightGBM >= 4.0 expects
        # the equivalent callbacks instead.
        clf.fit(X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                eval_names=['valid'],
                eval_metric=lgb_f1_score,
                early_stopping_rounds=250,
                verbose=100
                )

        oof[val_idx] = clf.predict(X_valid, num_iteration=clf.best_iteration_)
        classifier.append(clf)
        print()

    print('OOF:', f1_score(y, np.round(oof)))
    return classifier, oof

In [None]:
# Start from a copy so the stored search result in `best_params` is left untouched.
params = dict(best_params)
params.update(
    objective='binary',
    # 'f1_macro' is not a LightGBM built-in metric name; the F1 score is supplied
    # through the `eval_metric` callable at fit time, so register no built-in
    # metric ('None' is the documented string for that).
    metric='None',
    n_estimators=1000,
    is_unbalance=True,
)

In [None]:
# Fit one classifier per fold with the tuned parameters, then store the fold models.
cv_classifier, oof = cv_model(
    X=train_df,
    y=train_df['Label'],
    feat=final_features,
    params=params,
    stratified=False,
)
print('Save model...')
joblib.dump(value=cv_classifier, filename=OUTPUTS_DIR + 'models/cv_classifier.pkl')

Fold n°0
Train: 8144 data, 0: 3481, 1: 4663
Valid: 2037 data, 0: 856, 1: 1181
Training until validation scores don't improve for 250 rounds.
[100]	train's f1_macro: 0.974956	valid's f1_macro: 0.856626
[200]	train's f1_macro: 0.999678	valid's f1_macro: 0.866282
[300]	train's f1_macro: 0.999785	valid's f1_macro: 0.869951
[400]	train's f1_macro: 0.999785	valid's f1_macro: 0.871068
Early stopping, best iteration is:
[204]	train's f1_macro: 0.999785	valid's f1_macro: 0.867556

Fold n°1
Train: 8145 data, 0: 3471, 1: 4674
Valid: 2036 data, 0: 866, 1: 1170
Training until validation scores don't improve for 250 rounds.
[100]	train's f1_macro: 0.974808	valid's f1_macro: 0.848428
[200]	train's f1_macro: 0.999572	valid's f1_macro: 0.873511
[300]	train's f1_macro: 0.999893	valid's f1_macro: 0.876026
[400]	train's f1_macro: 0.999893	valid's f1_macro: 0.873175
[500]	train's f1_macro: 0.999893	valid's f1_macro: 0.875965
Early stopping, best iteration is:
[294]	train's f1_macro: 0.999893	valid's f1_mac

['/content/drive/MyDrive/Colab Projects/product-pair-matching/outputs/models/cv_classifier.pkl']

### All Data

In [None]:
print('TRAIN CLASSIFIER WITH ALL DATA')
# A single model fitted on every training row (no hold-out, no early stopping).
all_classifier = LGBMClassifier(**params)
all_classifier.fit(X=train_df[final_features], y=train_df['Label'])

# The predictions are made on the same rows used for fitting, so the score
# printed below is a training-set F1, not a validation estimate.
val_preds = all_classifier.predict(train_df[final_features])
print('F1: {}'.format(f1_score(y_true=train_df['Label'], y_pred=val_preds)))

print('Save model...')
joblib.dump(value=all_classifier, filename=OUTPUTS_DIR + 'models/all_classifier.pkl')

TRAIN CLASSIFIER WITH ALL DATA
F1: 0.999828855040219
Save model...


['/content/drive/MyDrive/Colab Projects/product-pair-matching/outputs/models/all_classifier.pkl']

### Combine Model

In [None]:
print('ENSEMBLING MODEL')
# The fold models followed by five references to the all-data model, so the
# all-data model is counted five times when the predictions are averaged.
list_model = [*cv_classifier, *(all_classifier for _ in range(5))]
print('Save model...')
joblib.dump(value=list_model, filename=OUTPUTS_DIR + 'models/cv_and_all_classifier.pkl')

ENSEMBLING MODEL
Save model...


['/content/drive/MyDrive/Colab Projects/product-pair-matching/outputs/models/cv_and_all_classifier.pkl']

## Predict Test Data

In [None]:
def create_submission(temp_df, list_model, feat, csv=False, name='test.csv'):
    """Average the class predictions of several models into a submission frame.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Frame to predict on; it is copied, never modified.
    list_model : iterable
        Fitted models exposing ``predict``. Must not be empty.
    feat : list-like of str
        Columns passed to every model's ``predict``.
    csv : bool
        When True, also write the labels to ``name``.
    name : str
        Output CSV path (used only when ``csv`` is True).

    Returns
    -------
    pandas.DataFrame
        Single-column frame ``label`` (int) sharing the index of ``temp_df``.

    Raises
    ------
    ValueError
        If ``list_model`` is empty.
    """
    list_model = list(list_model)
    if not list_model:
        # Averaging zero predictions would only yield NaN and fail later on
        # the integer cast; fail early with a clear message instead.
        raise ValueError('list_model must contain at least one fitted model')

    df = temp_df.copy()
    # Number of missing values per row (vectorised instead of a row-wise apply).
    df['sum_na'] = df.isnull().sum(axis=1)

    preds = []
    for model in list_model:
        y_hat = model.predict(df[feat])
        preds.append(y_hat)
        print(y_hat)

    # Mean of the per-model predictions, rounded to the nearest class label.
    df['label'] = np.round(np.mean(preds, axis=0))
    submission = pd.DataFrame({'label': df['label'].astype(int)})
    if csv:
        submission['label'].to_csv(name, index=True, index_label='pair_index')
    return submission[['label']]

In [None]:
# Predict with every model in the ensemble, average the results and write the CSV.
sub_df = create_submission(
    temp_df=test_df,
    list_model=list_model,
    feat=final_features,
    csv=True,
    name='fine_tuning_all.csv',
)
sub_df.head()
# 0.85473 (presumably the leaderboard score of this submission)

[0 0 0 ... 0 1 0]
[0 0 0 ... 0 1 0]
[0 0 0 ... 0 1 0]
[0 0 0 ... 0 1 0]
[0 0 0 ... 0 1 0]
[0 0 0 ... 0 1 0]
[0 0 0 ... 0 1 0]
[0 0 0 ... 0 1 0]
[0 0 0 ... 0 1 0]
[0 0 0 ... 0 1 0]


Unnamed: 0,label
0,0
1,0
2,0
3,1
4,1


In [None]:
# We only reached about 0.85 (rank 16 on the private leaderboard, late submission).
# The feature extraction is a likely weak point.
# Suggestions:
# - Extract image features with a simpler pretrained model,
#   using only its first few layers.
# - Extract text features with a smaller embedding size (e.g. 64).
# Alternatively, there may be a silent bug (e.g. the data rows may have been shuffled).