In [None]:
import os
import gc
import joblib
import random
import numpy as np
import pandas as pd
from pathlib import Path
from argparse import Namespace
from collections import defaultdict

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler

import lightgbm as lgb

from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from random import choices


SEED = 1111

# Seed every RNG the notebook touches so that Restart & Run All is repeatable.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)


import warnings
# NOTE(review): this hides *all* warnings, deprecation notices included.
warnings.filterwarnings('ignore')
# Use the fully-qualified option key: the short form 'max_columns' is matched as
# a pattern and becomes ambiguous (OptionError) on pandas releases that also
# register 'styler.render.max_columns'.
pd.set_option('display.max_columns', 64)

def seed_everything(seed: int = 42) -> None:
    """Seed every random number generator this notebook depends on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and, when it
    can be imported, TensorFlow's global generator.  ``PYTHONHASHSEED`` is
    exported as well.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only takes
    # effect in Python processes launched after this call.
    os.environ["PYTHONHASHSEED"] = str(seed)
    try:
        import tensorflow as tf
    except ImportError:
        # TensorFlow is not installed: there is nothing further to seed.
        return
    tf.random.set_seed(seed)
    
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to shrink its memory footprint.

    * Signed / unsigned integer columns are narrowed to the smallest integer
      type of the same signedness whose range contains the column's min and max
      (bounds are inclusive).
    * Float columns are narrowed to the smallest float type whose range contains
      the column's min and max.  This check is range-only: narrowing to
      ``float16`` / ``float32`` can lose precision.
    * ``object`` columns are converted to ``category``.
    * Every other dtype (bool, datetime, timedelta, category and other pandas
      extension dtypes) is left untouched, so the function can safely be run
      more than once on the same frame.

    A column is never widened, and a column whose min/max is NaN (empty or
    all-NaN) is left as is.

    Args:
        df: DataFrame to shrink.  It is modified in place.
        use_float16: When True (default, the historical behaviour) float columns
            may be narrowed all the way to ``float16``; pass False to stop at
            ``float32``.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, ordered from narrowest to widest.
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    if use_float16:
        floats = (np.float16, np.float32, np.float64)
    else:
        floats = (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast.  Bool, datetime
        # and extension dtypes have no meaningful numeric range check here.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits = floats, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_ints, np.iinfo
        else:
            candidates, limits = unsigned_ints, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # Remaining candidates would not save memory; never widen.
                break
            bounds = limits(candidate)
            # Comparisons with NaN are False, so NaN min/max never matches.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Notebook configuration, kept as plain module-level names (the earlier
# ``Namespace``-based ``args`` object is no longer used).
data_path = Path("../input/ubiquant-parquet/")
# NOTE(review): only referenced by the disabled sub-sampling code further down.
samples = 2_500_000

In [None]:
%%time
train = pd.read_parquet(data_path.joinpath("train_low_mem.parquet"))
assert train.isnull().any().sum() == 0, "null exists."

In [None]:
# Row sub-sampling (random sample or last-N rows) is disabled, so the whole
# training frame is used.  Display its (rows, columns) shape.
train.shape

In [None]:
# Every column whose name contains "f_" is treated as a model input.
features = [name for name in train.columns if "f_" in name]
# Shrink dtypes before training; no scaler is fitted in this notebook.
train = reduce_mem_usage(train)
gc.collect()
len(features)

In [None]:
# corr_matrix = train.filter(like="f_").corr().abs()
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# # Find features with correlation greater than 0.97
# to_drop = [column for column in upper.columns if any(upper[column] >= 0.97)]
# sorted(to_drop)

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y_true: Ground-truth values, forwarded to ``mean_squared_error``.
        y_pred: Predicted values, forwarded to ``mean_squared_error``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def feval_rmse(y_pred, lgb_train):
    """Evaluation callback that reports RMSE against a dataset's labels.

    Args:
        y_pred: Predictions to score.
        lgb_train: Object exposing ``get_label()`` (presumably a
            ``lightgbm.Dataset`` -- confirm against the caller).

    Returns:
        The tuple ``('rmse', value, False)``.  The trailing ``False`` looks like
        LightGBM's "higher is better" flag -- TODO confirm.
    """
    labels = lgb_train.get_label()
    score = rmse(labels, y_pred)
    return 'rmse', score, False

# Impute missing values with the per-column mean.  ``numeric_only=True`` keeps
# any non-numeric column (e.g. a category column produced by
# ``reduce_mem_usage``) out of the mean, which pandas >= 2.0 would otherwise
# reject with a TypeError.  ``inplace`` is kept to avoid copying the frame.
train.fillna(train.mean(numeric_only=True), inplace=True)

# Model inputs and regression target.  Selecting through ``features`` keeps the
# column set and order identical to what the model and the inference loop use.
X_train = train.loc[:, features]

y_train = train['target']
    

In [None]:
#features_importance = run()
#print(f"lgbm {args.folds} folds mean rmse: {rmse(train.target, train.preds)}, mean pearsonr: {pearsonr(train.target, train.preds)[0]}")

In [None]:
def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate,
    output_activation="tanh", loss=None,
):
    """Build and compile a feed-forward (MLP) Keras model.

    Architecture: input -> BatchNorm -> Dropout, then for every entry of
    ``hidden_units`` a Dense -> BatchNorm -> swish -> Dropout stack, followed by
    a Dense layer with ``num_labels`` units and ``output_activation``.

    Args:
        num_columns: Number of input features.
        num_labels: Number of output units.
        hidden_units: Width of each hidden Dense layer, in order.
        dropout_rates: Dropout rates; index 0 is applied to the input and index
            ``i + 1`` after hidden layer ``i``, so it needs
            ``len(hidden_units) + 1`` entries.
        label_smoothing: Unused.  Kept only so existing callers keep working
            (it belonged to a previous BinaryCrossentropy loss).
        learning_rate: Learning rate for the Adam optimizer.
        output_activation: Activation of the output layer.  Defaults to
            ``"tanh"`` (the original behaviour).
            NOTE(review): tanh bounds predictions to (-1, 1); pass ``"linear"``
            if the regression target can fall outside that interval.
        loss: Keras loss to compile with.  ``None`` (default) selects
            ``MeanAbsolutePercentageError`` (the original behaviour).
            NOTE(review): MAPE divides by ``|y_true|`` and is unstable for
            targets close to zero -- confirm it suits the target.

    Returns:
        The compiled ``tf.keras`` model, tracking RootMeanSquaredError.

    Raises:
        IndexError: If ``dropout_rates`` has fewer than
            ``len(hidden_units) + 1`` entries.
    """
    layers = tf.keras.layers

    inp = layers.Input(shape=(num_columns,))
    x = layers.BatchNormalization()(inp)
    x = layers.Dropout(dropout_rates[0])(x)
    for i, units in enumerate(hidden_units):
        x = layers.Dense(units)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation(tf.keras.activations.swish)(x)
        x = layers.Dropout(dropout_rates[i + 1])(x)

    x = layers.Dense(num_labels)(x)
    out = layers.Activation(output_activation)(x)

    if loss is None:
        loss = tf.keras.losses.MeanAbsolutePercentageError(
            name='mean_absolute_percentage_error'
        )

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=loss,
        # ``metrics`` is documented as a list; newer Keras releases reject a
        # bare metric object.
        metrics=[tf.keras.metrics.RootMeanSquaredError(name='root_mean_squared_error')],
    )

    return model


# Training hyper-parameters.
batch_size = 5000
epochs = 100
hidden_units = [150, 150, 150]
# One rate for the input dropout plus one per hidden layer.
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3

model = create_mlp(
    len(features), 1, hidden_units, dropout_rates, label_smoothing, learning_rate
    )

# Pass the configured values instead of repeating the literals, so changing
# ``batch_size`` / ``epochs`` above actually affects training.
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)



In [None]:
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    x_tt = test_df.loc[:, features].values
    # The model was built with a single output unit, so it returns an array of
    # shape (n_rows, 1); flatten it before assigning it to one column.
    preds = model(x_tt, training=False).numpy()
    sample_prediction_df['target'] = preds.reshape(-1)
    env.predict(sample_prediction_df)
    # Show only a preview so the cell output is not flooded on every batch.
    display(sample_prediction_df.head())