# Read Files

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datatable as dt
%whos

In [None]:
%%time
data = pd.read_csv('../input/tabular-playground-series-jan-2021/train.csv')
submission = dt.fread('../input/tabular-playground-series-jan-2021/test.csv').to_pandas()

In [None]:
# Preview the first rows of the training frame.
display(data.head())

In [None]:
# Target vector and raw feature matrix (everything except `id` and `target`)
# as plain NumPy arrays for the exploratory plots below.
y = data['target'].values
X = data.drop(columns=['id', 'target']).values
X.shape, y.shape

# EDA

In [None]:
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import scale

In [None]:
# Total number of missing entries across the whole feature matrix.
na_count = pd.isnull(X).sum()
print(f'Number of NA values in features: {na_count}')

In [None]:
# Histogram + KDE of the target on an explicit figure/axes pair.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# consider `sns.histplot(..., kde=True)` once the environment allows it.
fig = plt.figure(figsize=(12, 8))
ax = fig.gca()
ax.set_title('Distribution of target variable')
sns.distplot(y, ax=ax)
ax.axis('off')

In [None]:
%%time
plt.figure(figsize=(12,6))
sns.pairplot(data.drop(['id'], axis=1))

In [None]:
# Two side-by-side heatmaps of absolute Pearson correlations:
# left  - feature vs. feature, right - each feature vs. the target.
fig, (ax_features, ax_target) = plt.subplots(1, 2, figsize=(12, 6))

corr_mat = np.corrcoef(X.T)
ax_features.set_title('Features Corr Matrix')
sns.heatmap(abs(corr_mat), cmap='rocket_r', ax=ax_features)

# Stacking y on top of the feature rows puts the target in row/column 0, so
# column 0 (minus its first entry) holds the feature-target correlations.
target_corr = np.corrcoef(y, X.T)[1:, 0].reshape(-1, 1)
ax_target.set_title('Target vs Features')
sns.heatmap(abs(target_corr), cmap='rocket_r', ax=ax_target)

# Machine learning

## Pipeline

In [None]:
# Training-epoch budget.
# NOTE(review): not referenced by any cell visible here (the tuner search
# passes a literal `epochs=5`) - confirm whether it should be wired in.
EPOCHS = 100
# Absolute z-score limit: `_drop_outlier` keeps a row only if every column's
# |z| is below this value.
DROP_Z_SCORE = 2.3

In [None]:
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, FunctionTransformer, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from scipy import stats
import kerastuner as kt


In [None]:
def _drop_outlier(df):
    z_scores = stats.zscore(df)

    abs_z_scores = np.abs(z_scores)
    filtered_entries = (abs_z_scores < DROP_Z_SCORE).all(axis=1)
    return df[filtered_entries]

def _custom_eng(x):
    return x.drop(['id'],axis=1).values

In [None]:
# Feature pipeline: strip `id` -> Yeo-Johnson power transform -> standardise
# -> PCA with all components kept (no `n_components` given).
feature_steps = [
    ('feature_eng', FunctionTransformer(_custom_eng, check_inverse=False)),
    ('trnsfrmer', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler()),
    ('pca', PCA()),
]
feature_pipeline = Pipeline(steps=feature_steps)

In [None]:
# Bucket the continuous target into deciles so the 80/20 split can be
# stratified on it and both parts see a similar target distribution.
y_bins = pd.qcut(data['target'], q=10)
train, test = train_test_split(
    data,
    test_size=.2,
    random_state=42,
    stratify=y_bins,
)

In [None]:
# Filter each split independently with the z-score rule.
# NOTE(review): this rebinds `train`/`test`, so re-running the cell filters
# the already-filtered frames again. The frames passed in still carry `id`
# and `target`, so those columns take part in the z-score test as well.
train, test = (_drop_outlier(split) for split in (train, test))

In [None]:
# Row/column counts of both splits after outlier filtering.
train.shape, test.shape

In [None]:
# Separate the target from the pipeline inputs; the pipeline itself removes `id`.
train_pipe_before = train.drop(columns='target')
test_pipe_before = test.drop(columns='target')

# Fit the transformers on the training part only, then reuse them on the hold-out.
trainc = feature_pipeline.fit_transform(train_pipe_before)
testc = feature_pipeline.transform(test_pipe_before)

X_train = trainc
y_train = train['target'].values
X_test = testc
y_test = test['target'].values

[arr.shape for arr in (X_train, y_train, X_test, y_test)]

## Fit several models
### Keras feed-forward neural network

In [None]:
# Keras' `input_shape` describes a single sample (no batch axis), so keep only
# the per-sample dimensions of the training matrix instead of its full
# (n_samples, n_features) shape.
SHAPE = X_train.shape[1:]

In [None]:
def build_model(hp):
    """Build and compile the feed-forward regressor for one tuner trial.

    Architecture: three Dense(relu) + Dropout blocks followed by a single
    linear output unit, trained with MSE loss and reporting RMSE.

    Parameters
    ----------
    hp : kerastuner HyperParameters
        Source of the trial's hyperparameter values:
        ``units`` (10, 42 or 74), ``rate`` (dropout, 0.05-0.70 in steps of
        0.05) and ``learning_rate`` (1e-2, 1e-3 or 1e-4).

    Returns
    -------
    tf.keras.Model
        The compiled model.
    """
    # A hyperparameter is identified by its name, so requesting 'units' or
    # 'rate' once per layer yields the same value every time. They are
    # declared once here to make that sharing across layers explicit.
    units = hp.Int('units', min_value=10, max_value=100, step=32)
    # The step has to be smaller than the (0.05, 0.7) range; the previous
    # step of 10 left `min_value` as the only point on the sampling grid,
    # so the dropout rate was never actually tuned.
    rate = hp.Float('rate', min_value=.05, max_value=.7, step=.05)
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])

    model = tf.keras.models.Sequential()

    # Layer 1 - `input_shape` must not include the sample/batch axis. Taking
    # the trailing dimension works whether SHAPE is (n_features,) or the full
    # (n_samples, n_features) shape of the training matrix.
    model.add(tf.keras.layers.Dense(units, activation='relu', input_shape=tuple(SHAPE[-1:])))
    model.add(tf.keras.layers.Dropout(rate))

    # Layers 2 and 3
    for _ in range(2):
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        model.add(tf.keras.layers.Dropout(rate))

    # Last - single linear unit for regression
    model.add(tf.keras.layers.Dense(1, activation='linear'))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss='mse',
        metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )
    return model

# Random search over `build_model`'s hyperparameters, minimising validation
# RMSE; each of the 5 trials is trained 3 times.
rmse_objective = kt.Objective("val_root_mean_squared_error", direction="min")

tuner = kt.tuners.RandomSearch(
    build_model,
    objective=rmse_objective,
    max_trials=5,
    executions_per_trial=3,
    directory='fine_tune_results',
    project_name='playground',
)

tuner.search_space_summary()

In [None]:
%%time 
tuner.search(X_train, y_train,
             epochs=5,
             validation_data=(X_test, y_test))

model = tuner.get_best_models(num_models=1)[0]

### LightGBM

In [None]:
import lightgbm as lgb

# Show the installed LightGBM version.
lgb.__version__

In [None]:
# X_train holds the outputs of the PCA step, not the raw `cont*` columns, so
# the features are labelled as principal components. The count is taken from
# the matrix itself rather than hard-coded.
lgb_train = lgb.Dataset(
    X_train, y_train,
    feature_name=[f'pc{i}' for i in range(X_train.shape[1])],
)

params = {
    'boosting': 'gbdt',
    'objective': 'regression',
    # 'rmsle' is not a metric name LightGBM recognises; RMSE is the matching
    # evaluation metric for the L2 regression objective.
    'metric': 'rmse',
}

In [None]:
# Train the gradient-boosted model with LightGBM's default number of rounds.
lgb_model = lgb.train(params, lgb_train)

# Submission

In [None]:
# Keep the ids aside, then push the test set through the already-fitted
# feature pipeline (transform only, no refit).
id_col = submission['id']
print(submission.shape)

sub_ready = feature_pipeline.transform(submission)
print(sub_ready.shape)
sub_ready

In [None]:
# Equal-weight blend of both models. LightGBM's 1-D output is reshaped to a
# column so it lines up with the network's prediction array before averaging.
lgb_pred = lgb_model.predict(sub_ready).reshape(-1, 1)
nn_pred = model.predict(sub_ready)
predictions = (lgb_pred + nn_pred) / 2

In [None]:
# Assemble the submission table: one `target` column indexed by `id`.
subm = id_col.to_frame()
subm['target'] = predictions
subm = subm.set_index('id')
display(subm)

In [None]:
# Write the submission file; the frame's index is written as the first column.
subm.to_csv('./submission.csv')