In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from IPython import display

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import KBinsDiscretizer

# 1. Explore data

In [None]:
# Read the three competition CSVs from the input directory; the "id" column
# becomes the index of every frame.
home = "../input/tabular-playground-series-jan-2021/"
train_df, test_df, submission_df = (
    pd.read_csv(home + csv_name, index_col="id")
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Rich-render all three frames, in the same order they were loaded.
display.display(train_df, test_df, submission_df)

In [None]:
# Summary statistics of train_df, transposed so each original column is one row.
train_df.describe().transpose()

In [None]:
# Grid of pairwise plots across the columns of train_df (target included).
sns.pairplot(data=train_df)

In [None]:
# Floor the target at 5: every value below 5 is replaced by 5, in train_df itself.
# NOTE(review): this runs before the train/validation split, so the validation
# targets are clipped as well -- confirm that is intended.
train_df["target"] = train_df["target"].clip(lower=5)

In [None]:
# Separate the regression target from the remaining (feature) columns.
Y_train = train_df["target"]
X_train = train_df.drop(columns="target")

In [None]:
# Hold out 20% of the rows for validation.
# The split is drawn from train_df directly rather than from X_train/Y_train
# (which this cell overwrites), so re-running the cell never splits an
# already-split subset. The fixed random_state makes the partition -- and every
# score computed from it -- the same on each run.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_df.drop(columns="target"),
    train_df["target"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=10)
# X_train = pca.fit_transform(X_train)
# X_val = pca.transform(X_val)



# 2. Dummy Model

In [None]:
from sklearn.dummy import DummyRegressor

# Baseline: ignores the features and always predicts the median of Y_train.
dummy_model = DummyRegressor(strategy="median")
dummy_model.fit(X_train, Y_train)

# Report the error on the training split first, then on the held-out split.
for label, features, target in (
    ("Train MSE:", X_train, Y_train),
    ("Validation MSE:", X_val, Y_val),
):
    print(label, mean_squared_error(target, dummy_model.predict(features)))

# 3. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Plain linear regression on the unmodified feature columns.
model_lr = LinearRegression()
model_lr.fit(X_train, Y_train)

lr_train_mse = mean_squared_error(Y_train, model_lr.predict(X_train))
print("Train MSE:", lr_train_mse)
lr_val_mse = mean_squared_error(Y_val, model_lr.predict(X_val))
print("Validation MSE:", lr_val_mse)

# Reference numbers kept in the notebook (presumably from an earlier run):
# Train MSE: 0.5287369459854929
# Validation MSE: 0.5335127937563384

# 4. Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 500 depth-limited trees (max_depth=5) trained on all cores (n_jobs=-1);
# verbose=1 prints progress while fitting and predicting.
# random_state pins the bootstrap sampling so the scores are repeatable.
model_rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=5,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
model_rf.fit(X_train, Y_train)
print("Train MSE:", mean_squared_error(Y_train, model_rf.predict(X_train)))
print("Validation MSE:", mean_squared_error(Y_val, model_rf.predict(X_val)))
# oob_score=True makes fit() compute an out-of-bag score (R^2 for a regressor);
# report it rather than paying for it and discarding it.
print("OOB R^2:", model_rf.oob_score_)

# 5. Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# 500 boosting stages of depth-3 trees at learning_rate=0.1.
# The split criterion formerly spelled "mse" was renamed "squared_error" in
# scikit-learn 1.0 and the old spelling is rejected from 1.2 on, so the current
# name is used here (requires scikit-learn >= 1.0).
model_gbr = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.1,
    criterion="squared_error",
    max_depth=3,
)
model_gbr.fit(X_train, Y_train)
print("Train MSE:", mean_squared_error(Y_train, model_gbr.predict(X_train)))
print("Validation MSE:", mean_squared_error(Y_val, model_gbr.predict(X_val)))

In [None]:
def create_submission_file(model, test_df, path="result.csv"):
    """Predict on ``test_df`` with ``model`` and write a submission CSV.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(test_df)``.
    test_df : pandas.DataFrame
        Feature rows to predict on; its index supplies the ``id`` column.
    path : str, default "result.csv"
        Destination of the CSV file.

    Raises
    ------
    ValueError
        If the model does not yield exactly one prediction per row.

    The file has two columns, ``id`` and ``target``, and no index column.
    """
    # Some models return predictions shaped (n, 1) instead of (n,) -- e.g. the
    # Keras MLP in this notebook, whose last layer is Dense(1). Flatten so
    # "target" is always a plain one-dimensional column.
    predictions = np.asarray(model.predict(test_df)).ravel()
    if len(predictions) != len(test_df):
        raise ValueError(
            f"expected {len(test_df)} predictions, got {len(predictions)}"
        )
    result = pd.DataFrame({"id": test_df.index, "target": predictions})
    result.to_csv(path, index=False)

In [None]:
# Score the competition test rows with the fitted gradient-boosting model.
Y_pred_test = model_gbr.predict(X=test_df)

In [None]:
# Submission table: the test ids in an "id" column next to the predictions
# held in Y_pred_test; left as the last expression so the cell displays it.
result = pd.DataFrame({"id": test_df.index.tolist()}).assign(target=Y_pred_test)
result

In [None]:
# Write the submission table to result.csv, omitting the DataFrame's own index.
result.to_csv(path_or_buf="result.csv", index=False)

# 6. XGB Regressor

In [None]:
from xgboost import XGBRegressor

# Boosted trees with a small step size: 1000 rounds at learning_rate=0.01,
# trees up to depth 5.
xgb_params = dict(verbosity=1, max_depth=5, learning_rate=0.01, n_estimators=1000)
model_xgb = XGBRegressor(**xgb_params)
model_xgb.fit(X_train, Y_train)

print("Train MSE:", mean_squared_error(y_true=Y_train, y_pred=model_xgb.predict(X_train)))
print("Validation MSE:", mean_squared_error(y_true=Y_val, y_pred=model_xgb.predict(X_val)))

# Reference numbers kept in the notebook (presumably from an earlier run):
# Train MSE: 0.48734597414839487
# Validation MSE: 0.5006409058173809

# 7. MLP

In [None]:
import tensorflow as tf

# Fully connected regressor: four 128-unit SELU layers followed by a single
# linear output unit. The input width is taken from the number of feature columns.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation="selu", input_shape=[X_train.shape[1]]),
    #tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation="selu"),
    tf.keras.layers.Dense(128, activation="selu"),
    tf.keras.layers.Dense(128, activation="selu"),
    tf.keras.layers.Dense(1)
])

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
model.summary()

# Minimise mean squared error with Adam at a small learning rate; the held-out
# split is passed as validation data so its loss is reported after every epoch.
model.compile(
    loss="mean_squared_error",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
)
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    batch_size=512,
    epochs=10,
)

In [None]:
# NOTE(review): this notebook defines create_submission_file twice; this later
# definition is the one in effect for the call in the next cell.
def create_submission_file(model, test_df, path="result.csv"):
    """Predict on ``test_df`` with ``model`` and write a submission CSV.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(test_df)``.
    test_df : pandas.DataFrame
        Feature rows to predict on; its index supplies the ``id`` column.
    path : str, default "result.csv"
        Destination of the CSV file.

    Raises
    ------
    ValueError
        If the model does not yield exactly one prediction per row.

    The file has two columns, ``id`` and ``target``, and no index column.
    """
    # The Keras model passed in below ends in Dense(1), so its predictions come
    # back shaped (n, 1); flatten so "target" is always a one-dimensional column.
    predictions = np.asarray(model.predict(test_df)).ravel()
    if len(predictions) != len(test_df):
        raise ValueError(
            f"expected {len(test_df)} predictions, got {len(predictions)}"
        )
    result = pd.DataFrame({"id": test_df.index, "target": predictions})
    result.to_csv(path, index=False)

In [None]:
# Write the MLP's test predictions to result.csv.
# NOTE(review): this is the same file name the gradient-boosting cells wrote
# earlier, so that file is overwritten.
create_submission_file(model=model, test_df=test_df)