## Libraries

In [None]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

## Parameters

In [None]:
# Number of KFold splits; also the divisor used when averaging the per-fold test predictions.
N_SPLITS = 10
# Seed shared by set_seed() and the KFold shuffle.
SEED = 0
# Forwarded to LGBMRegressor.fit as `early_stopping_rounds`.
EARLY_STOPPING_ROUNDS = 300
# Forwarded to LGBMRegressor.fit as `verbose`.
VERBOSE = 1000
# Keyword arguments unpacked into the LGBMRegressor constructor for every fold.
PARAMS = dict(
    n_estimators=1000,
    num_leaves=10,
    min_child_samples=120,
)

In [None]:
def set_seed(seed: int) -> None:
    """Seed Python's and NumPy's global random generators with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The two generators are independent, so each one is seeded separately.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed the random generators once, before any data is loaded or split.
set_seed(SEED)

## Datasets

In [None]:
# Folder holding the competition CSVs; the trailing slash lets file names be appended directly.
INPUT = "../input/petfinder-pawpularity-score/"
# Load the three tables in one pass; unpacking order matches the file-name order.
train, test, sample_submission = (
    pd.read_csv(INPUT + file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)
# Bare last expression: the notebook displays the (rows, columns) of the training table.
train.shape

In [None]:
# Display the first rows of the training table as loaded, before any features are added.
train.head()

## Feature engineering

In [None]:
def add_interaction_features(df):
    """Add the hand-crafted interaction columns to ``df`` in place and return it.

    The new columns are appended in a fixed order so that every frame passed
    through this function ends up with the same feature layout.

    NOTE(review): the formulas read like boolean logic over 0/1 flags, but the
    "or" features are plain sums, so they are counts rather than strict ORs
    (e.g. ``collage_or_info`` is 2 when both inputs are 1). This assumes the
    metadata columns are 0/1 indicators -- TODO confirm against the dataset.

    Args:
        df: Frame containing the ``Collage``, ``Info``, ``Occlusion``,
            ``Human``, ``Blur``, ``Eyes``, ``Group`` and ``Accessory`` columns.

    Returns:
        The same ``df`` object, with five extra columns.
    """
    collage_and_info = df['Collage'] * df['Info']
    not_blur = 1 - df['Blur']

    df['collage_and_info'] = collage_and_info
    df['collage_or_info'] = df['Collage'] + df['Info']
    df['occlusion_and_human'] = df['Occlusion'] * df['Human']
    df['not_blur_and_eyes'] = not_blur * df['Eyes']
    df['not_collage_and_info_or_not_blur_or_group_or_accessory'] = (
        (1 - collage_and_info) + not_blur + df['Group'] + df['Accessory']
    )
    return df


# One shared transformation for both frames, so train and test features cannot drift apart.
for frame in (train, test):
    add_interaction_features(frame)

In [None]:
# Display the first rows again to inspect the newly added feature columns.
train.head()

## Model

In [None]:
# Target column to regress on.
y_train = train['Pawpularity']
# Feature matrices: every column except the identifier (and, for train, the target).
X_train = train.drop(columns=['Pawpularity', 'Id'])
X_test = test.drop(columns=['Id'])

In [None]:
def _rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    # Take the square root explicitly instead of passing `squared=False`,
    # which is deprecated in recent scikit-learn releases.
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


# Shuffled K-fold split, seeded so the fold assignment is reproducible.
cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Out-of-fold predictions next to the true target, one row per training sample.
oof_df = pd.DataFrame({'Id': train['Id'], 'pred': np.zeros(train.shape[0]), 'Pawpularity': train['Pawpularity']})
# Running average of the per-fold predictions on the test set.
test_preds = np.zeros(X_test.shape[0])
pred_col = oof_df.columns.get_loc('pred')

for fold, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
    # KFold yields positional indices, so select rows by position (`iloc`);
    # label-based lookup is only equivalent on a default RangeIndex.
    X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]
    y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

    clf = LGBMRegressor(**PARAMS)
    # NOTE(review): LightGBM 4.x removed the `early_stopping_rounds` and
    # `verbose` fit keywords in favour of callbacks -- confirm the installed
    # version before upgrading.
    clf.fit(X_trn,
            y_trn,
            eval_set=[(X_val, y_val)],
            eval_metric='rmse',
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE)

    trn_preds = clf.predict(X_trn)
    val_preds = clf.predict(X_val)
    oof_df.iloc[val_idx, pred_col] = val_preds

    # Each fold contributes an equal share to the final test prediction.
    test_preds += clf.predict(X_test) / N_SPLITS

    # The reported metric is RMSE (the labels previously said "AUC").
    print(f"==== Fold {fold} ====")
    print(f"Trn RMSE: {_rmse(y_trn, trn_preds):.4f}")
    print(f"Val RMSE: {_rmse(y_val, val_preds):.4f}")

print("==== Results ====")
print(f"OOF RMSE: {_rmse(oof_df['Pawpularity'], oof_df['pred']):.4f}")
oof_df.to_csv('oof.csv', index=False)

## Submission

In [None]:
# Pair every test Id with its fold-averaged prediction and write the submission file.
submission = test[['Id']].assign(Pawpularity=test_preds)
submission.to_csv('submission.csv', index=False)