In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory, then echo each one.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three competition CSV files in one pass: training data, test data, submission template.
train, test, sample_submission = map(pd.read_csv, (
    '../input/tabular-playground-series-feb-2021/train.csv',
    '../input/tabular-playground-series-feb-2021/test.csv',
    '../input/tabular-playground-series-feb-2021/sample_submission.csv',
))

In [None]:
# Distinct column dtypes first, then the summary statistics, each on its own output block.
print(set(train.dtypes), train.describe(), sep='\n')

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Group the column names by kind, using a substring of the column name as the marker.
categorical_cols = list(filter(lambda col: 'cat' in col, train.columns))
numerical_cols = list(filter(lambda col: 'cont' in col, train.columns))


In [None]:
# Columns that belong to neither the categorical nor the numerical group.
set(train.columns).difference(categorical_cols, numerical_cols)

In [None]:
# Missing values per column; the printed Series is True where a column has none.
nan_count = train.isnull().sum()
print(nan_count.eq(0))

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Column-wise preprocessing shared by every model in this notebook.
# Columns not listed in either group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        # Continuous features: SimpleImputer with its default settings.
        ('num', SimpleImputer(), numerical_cols),
        # Categorical features: one-hot encoding. handle_unknown='ignore' encodes a
        # category that was absent at fit time as all zeros instead of raising, so a
        # rare level that only shows up in the validation split or in the test set
        # cannot crash transform/predict.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
)

In [None]:
from sklearn.model_selection import train_test_split
# Separate the regression target from the features, then hold out 20% of the rows
# for validation with a fixed seed so the split is repeatable.
y = train['target']
X = train.drop(columns=['target'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Fit the shared preprocessor on the full feature frame and keep the encoded
# matrix for inspection. Note: this leaves `preprocessor` fitted on all of X.
tr = preprocessor.fit_transform(X)

In [None]:
# Row counts: full training frame, training split, validation split (one per line).
print(len(train), len(X_train), len(X_valid), sep='\n')

In [None]:
# Shape of the encoded matrix returned by the preprocessor: (rows, encoded columns).
tr.shape

In [None]:
# Print the category frequencies of every categorical column in the training set.
# A plain loop over categorical_cols replaces list(map(print, ...)): it no longer
# echoes a throwaway list of None values and no longer hard-codes the column count.
for cat_col in categorical_cols:
    print(train[cat_col].value_counts())

In [None]:
# For every categorical column, print the categories that occur in the test set
# but never in the training set (an empty set means nothing unseen).
# A plain loop over categorical_cols replaces list(map(print, ...)): it no longer
# echoes a throwaway list of None values and no longer hard-codes the column count.
for cat_col in categorical_cols:
    test_levels = set(test[cat_col].value_counts().index)
    train_levels = set(train[cat_col].value_counts().index)
    print(test_levels - train_levels)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
def evaluate_RFR(**kwargs):
    """Fit a random forest behind the shared preprocessor and score it on the hold-out split.

    Reads the notebook-level `preprocessor`, `X_train`, `y_train`, `X_valid`
    and `y_valid`.

    Args:
        **kwargs: forwarded unchanged to RandomForestRegressor.

    Returns:
        (score, model): the validation RMSE and the fitted RandomForestRegressor
        (the bare estimator, not the surrounding pipeline).
    """
    forest = RandomForestRegressor(**kwargs)
    fitted = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', forest)
    ]).fit(X_train, y_train)
    # RMSE = square root of the mean squared error on the validation rows.
    rmse = np.sqrt(mean_squared_error(y_valid, fitted.predict(X_valid)))
    return rmse, forest


In [None]:
# Random search over RandomForestRegressor hyperparameters.
test_models = 5

# Dedicated, seeded generator: the sampled configurations are identical on every
# run, so the printed scores can be reproduced after a kernel restart.
search_rng = np.random.RandomState(1)

# Log-uniform sampling, truncated to integers.
n_estimators = (10**search_rng.uniform(1, 2.5, test_models)).astype(int)  # 10 to 316
max_depth = (10**search_rng.uniform(0, 1.3, test_models)).astype(int)  # 1 to 19
min_samples_leaf = (10 ** search_rng.uniform(0, 1, test_models)).astype(int)  # 1 to 9

# Running best across all searches in this notebook (lower RMSE is better).
best_score = float('inf')
best_model = None

for args in zip(n_estimators, max_depth, min_samples_leaf):
    # Plain Python ints keep the printed configuration easy to read and copy.
    kwargs = {
        'n_estimators': int(args[0]),
        'max_depth': int(args[1]),
        'min_samples_leaf': int(args[2]),
    }
    print(kwargs)
    score, model = evaluate_RFR(**kwargs, random_state=1)
    print('score', score)
    print()
    if score < best_score:
        best_score = score
        best_model = model

print('best score:', best_score)

In [None]:
# Refit the shared preprocessor on the training split only, then reuse that fit
# to encode the validation split.
X_train_processed = preprocessor.fit_transform(X_train)
X_valid_processed = preprocessor.transform(X_valid)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

def evaluate_dense(lr, hidden_layers, batch_size=64, epochs=10):
    """Train a fully connected regressor on the preprocessed split and score it.

    Reads the notebook-level `X_train_processed` / `y_train` for fitting and
    `X_valid_processed` / `y_valid` for validation.

    Args:
        lr: learning rate passed to the Adam optimizer.
        hidden_layers: iterable of unit counts; one ReLU Dense layer is created
            per entry, in order.
        batch_size: mini-batch size passed to `model.fit`.
        epochs: maximum number of epochs (EarlyStopping with patience=3 may stop
            training sooner).

    Returns:
        (score, model): `score` is the square root of the lowest `val_loss`
        (mean squared error) recorded during training; `model` is the trained
        Keras model.
    """
    # Take the input width from the data instead of hard-coding it, so the network
    # still matches the features if the preprocessor emits a different number of columns.
    n_features = X_train_processed.shape[1]
    model = tf.keras.Sequential([
        layers.InputLayer((n_features,)),
        *[layers.Dense(int(hidden_layer), activation='relu') for hidden_layer in hidden_layers],
        layers.Dense(1)
    ])
    callbacks = [
        # Every call writes to the same 'model.h5', so the file on disk belongs to
        # whichever run improved its own val_loss most recently.
        tf.keras.callbacks.ModelCheckpoint(filepath='model.h5', monitor='val_loss', save_best_only=True),
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
    ]
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='mean_squared_error',
    )

    history = model.fit(
        X_train_processed, y_train,
        epochs=epochs,
        validation_data=(X_valid_processed, y_valid),
        callbacks=callbacks,
        batch_size=int(batch_size)
    )

    # The loss is MSE, so its square root is the validation RMSE.
    score = np.sqrt(min(history.history['val_loss']))
    return score, model
    


In [None]:
# model validation (rough)
test_models = 10

# Dedicated, seeded generator: the sampled configurations are identical on every
# run. Network training itself is not seeded here, so scores can still vary.
search_rng = np.random.RandomState(2)

lr = 10**search_rng.uniform(-5, 0, test_models)  # log-uniform learning rate
num_layers = search_rng.randint(1, 5, test_models)  # 1 to 4 hidden layers
hidden_layers = [search_rng.randint(2, 71, num_layer).tolist() for num_layer in num_layers]  # each with 2 to 70 units
epochs = [2]*test_models  # fast & rough testing
batch_size = (10 ** search_rng.uniform(1, 2.5, test_models)).astype(int)  # 10 to 316

for args in zip(lr, hidden_layers, epochs, batch_size):
    # Plain Python values keep the printed configuration easy to read and copy.
    kwargs = {
        'lr': float(args[0]),
        'hidden_layers': args[1],
        'batch_size': int(args[3]),
        'epochs': args[2]
    }
    print(kwargs)
    score, model = evaluate_dense(**kwargs)
    print('score', score)
    print()
    # best_score / best_model carry over from the earlier search cell.
    if score < best_score:
        best_score = score
        best_model = model

print('best score:', best_score)

In [None]:
# model validation (finer)
test_models = 10

# Dedicated, seeded generator: the sampled configurations are identical on every
# run. Network training itself is not seeded here, so scores can still vary.
search_rng = np.random.RandomState(3)

lr = 10**search_rng.uniform(-5, -1, test_models)  # log-uniform learning rate, narrower range
num_layers = search_rng.randint(2, 6, test_models)  # 2 to 5 hidden layers
hidden_layers = [search_rng.randint(2, 71, num_layer).tolist() for num_layer in num_layers]  # each with 2 to 70 units
epochs = [10]*test_models
batch_size = (10 ** search_rng.uniform(1.3, 2.5, test_models)).astype(int)  # 19 to 316

for args in zip(lr, hidden_layers, epochs, batch_size):
    # Plain Python values keep the printed configuration easy to read and copy.
    kwargs = {
        'lr': float(args[0]),
        'hidden_layers': args[1],
        'batch_size': int(args[3]),
        'epochs': args[2]
    }
    print(kwargs)
    score, model = evaluate_dense(**kwargs)
    print('score', score)
    print()
    # best_score / best_model carry over from the earlier search cells.
    if score < best_score:
        best_score = score
        best_model = model

print('best score:', best_score)

In [None]:
# Refit the shared preprocessor on all training rows (not only X_train) and
# encode them as the input for the final model.
X_processed = preprocessor.fit_transform(X)

In [None]:
# use all training data w/ best setup
# NOTE: X_processed includes the X_valid rows, so the val_loss reported here is
# measured on data the model also trains on. It is an optimistic figure and only
# drives the checkpoint / early-stopping callbacks; it is not a hold-out estimate.
model = tf.keras.Sequential([
    # Input width follows the encoded data instead of a hard-coded 70.
    layers.InputLayer((X_processed.shape[1],)),
    *[layers.Dense(hidden_layer, activation='relu') for hidden_layer in [34, 60, 66, 55]],
    layers.Dense(1)
])
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(filepath='model.h5', monitor='val_loss', save_best_only=True),
    tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
]
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0030348092545886673),
    loss='mean_squared_error',
)

history = model.fit(
    X_processed, y,
    epochs=10,
    # Encode the validation rows with the preprocessor as currently fitted (on all
    # of X), so they share the column layout of X_processed. The earlier
    # X_valid_processed came from a fit on X_train only.
    validation_data=(preprocessor.transform(X_valid), y_valid),
    callbacks=callbacks,
    batch_size=37
)

# The loss is MSE, so its square root is an RMSE.
score = np.sqrt(min(history.history['val_loss']))
print(score)

In [None]:
# The network trained on all training rows is the model used for the submission.
best_model = model
# Kept so the name stays available; it only bundles two already-fitted objects.
best_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', best_model)
    ])
# The Pipeline object itself was never fitted, and recent scikit-learn releases
# warn about (and are slated to reject) predict on an unfitted Pipeline. Applying
# the two fitted steps explicitly gives the same predictions without relying on that.
result = best_model.predict(preprocessor.transform(test))


In [None]:
# Inspect the raw predictions produced for the test set.
print(result)

In [None]:
# Preview the first five rows of the submission template.
sample_submission.head(n=5)

In [None]:
# Start from the test ids and attach the predictions, flattened to one value per row.
submission = test[['id']].assign(target=result.flatten())

In [None]:
# Preview the first five rows of the submission frame.
submission.head(n=5)

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)