In [None]:
import os
import gc

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from tqdm.notebook import tqdm

# Plot styling: ggplot theme, default figure size 9 x 9 (width, height).
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [9, 9]
# DataFrame display: no cap on the number of columns shown (None), up to 350 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 350)

In [None]:
# Input directory whose contents are listed by the os.listdir(PATH) cell below.
# NOTE(review): presumably a parquet copy of the competition data (judging by
# the directory name); nothing else in the visible notebook reads PATH.
PATH = '../input/ubiquant-parquet'

In [None]:
# Display the entries of the input directory (os.listdir returns them in arbitrary order).
os.listdir(PATH)

In [None]:
# Force a garbage-collection pass; the cell displays the integer gc.collect() returns.
# NOTE(review): no large objects are created before this point in the visible
# notebook, so this call looks like a leftover — consider removing the cell.
gc.collect()

In [None]:
# Names of the model input columns: f_0 ... f_{N_FEATURES - 1}.
# These are the columns selected from each test batch in the prediction loop.
N_FEATURES = 300
features = [f'f_{i}' for i in range(N_FEATURES)]

## Modelling

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn import linear_model
from sklearn import metrics
from scipy.stats import pearsonr
import pickle

In [None]:
def pkl_save(filename, file):
    """Serialize an object to disk with pickle.

    Args:
        filename: Path of the file to write. It is opened in binary write
            mode, so any existing content is overwritten.
        file: The Python object to pickle (despite the name, this is the
            object being saved, not a file handle).

    Raises:
        OSError: If ``filename`` cannot be opened for writing.
        Exception: Whatever ``pickle.dump`` raises for an object that cannot
            be pickled (e.g. ``pickle.PicklingError``).
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which a manual open()/close() pair does not.
    with open(filename, 'wb') as output:
        pickle.dump(file, output)


def pkl_load(filename):
    """Load and return the object stored in a pickle file.

    Args:
        filename: Path of the pickle file to read (opened in binary mode).

    Returns:
        The unpickled Python object.

    Raises:
        OSError: If ``filename`` cannot be opened for reading.
        Exception: Whatever ``pickle.load`` raises for unreadable content
            (e.g. ``pickle.UnpicklingError`` or ``EOFError``).
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, 'rb') as pkl_file:
        return pickle.load(pkl_file)

In [None]:
# Load the pre-trained linear models, one pickle file per fold
# (linear0.pkl ... linear{FOLDS-1}.pkl).
# NOTE: unpickling executes code embedded in the file — only load model files
# from a dataset you trust.
FOLDS = 5
# Reuse the notebook's pkl_load helper instead of repeating open + pickle.load.
models = [
    pkl_load(f'../input/yhymodelstack/linear{fold}.pkl')
    for fold in range(FOLDS)
]

In [None]:
# Load the second model family, one pickle file per fold
# (lgbm_seed21_0.pkl ... lgbm_seed21_{FOLDS-1}.pkl).
# Judging by the file names these are LightGBM models, not XGBoost models.
# NOTE: unpickling executes code embedded in the file — only load model files
# from a dataset you trust.
# FOLDS is defined once, in the cell that loads the linear models, so both
# model families always use the same fold count.
# Reuse the notebook's pkl_load helper instead of repeating open + pickle.load.
models2 = [
    pkl_load(f'../input/yhymodelstack/lgbm_seed21_{fold}.pkl')
    for fold in range(FOLDS)
]

## Submission

In [None]:
import ubiquant
from sklearn.preprocessing import LabelEncoder
# Create the competition environment object.
# NOTE(review): Kaggle time-series APIs usually allow make_env() only once per
# kernel session — confirm; re-running this cell may require a kernel restart.
env = ubiquant.make_env()   # initialize the environment
# Iterator consumed by the prediction loop below, which unpacks each item as a
# (test_df, sample_prediction_df) pair.
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission


In [None]:

# Blend weights for the two model families (they sum to 1).
LGBM_WEIGHT = 0.55
LINEAR_WEIGHT = 0.45

for (test_df, sample_prediction_df) in iter_test:
    # Slice the feature columns once per batch instead of once per model.
    batch_features = test_df[features]

    # Average the per-fold predictions within each model family.
    linear_preds = np.mean(
        np.stack([models[fold].predict(batch_features) for fold in range(FOLDS)]),
        axis=0,
    )
    lgbm_preds = np.mean(
        np.stack([models2[fold].predict(batch_features) for fold in range(FOLDS)]),
        axis=0,
    )

    sample_prediction_df['target'] = lgbm_preds * LGBM_WEIGHT + linear_preds * LINEAR_WEIGHT
    # Zero out every non-finite prediction. Both +inf and -inf are mapped to
    # NaN first (replacing only np.inf would let -inf through), then NaN -> 0.0.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    sample_prediction_df['target'] = (
        sample_prediction_df['target']
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.0)
    )
    env.predict(sample_prediction_df)   # register your predictions
    

In [None]:
# Preview the submission file.
# NOTE(review): no code in this notebook writes "submission.csv" explicitly;
# presumably the competition environment creates it once the prediction loop
# has consumed the whole test iterator — confirm before relying on this cell.
pd.read_csv("submission.csv")