## In this Notebook, we will compare a number of regression models. It includes examples of how to use them.
## このNotebookでは、たくさんの回帰モデルを比較します。簡単な使い方の例も載せているので参考にしてください。

Regressors introduced in this Notebook. / このNotebookで紹介する回帰モデル  
  
Linear Regressor / 線形回帰  
  
Ridge / リッジ回帰(L2正則化)  
  
Lasso / ラッソ回帰(L1正則化)  
  
ElasticNet Regressor(= Ridge + Lasso) / ElasticNet回帰(= リッジ + ラッソ)  
  
K Nearest Neighbors / k近傍法  
  
AdaBoost  
  
DecisionTree / 決定木  
  
RandomForest / ランダムフォレスト  
  
ExtraTrees
  
XGBoost  
  
LightGBM  
  
Neural Network(=MLP) / ニューラルネットワーク(=多層パーセプトロン)

### Because I am a beginner, my code may not be very clean...  
### 初心者なのでコードが汚いかもしれません、すみません、、、

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use matplotlib's "ggplot" style sheet for the figures drawn in this notebook.
plt.style.use("ggplot")
import seaborn as sns
# Apply seaborn's "talk" plotting context on top of the style sheet.
sns.set_context('talk')

# Single seed shared by the row sampling, the train/validation split and the
# seeded models below, so that a re-run reproduces the same numbers.
SEED = 2021

# Preparation Data (データの準備)

In [None]:
# Read the competition's training table and keep a reproducible random subset
# of 50000 rows; the subset keeps the run time of the experiments below short.
train = pd.read_csv(
    "../input/tabular-playground-series-jan-2021/train.csv"
).sample(n=50000, random_state=SEED)

# Features: every column except the row identifier and the label.
X = train.drop(columns=["id", "target"])
# Label: the "target" column.
y = train["target"]

print(f"X.shape: {X.shape}", f"y.shape: {y.shape}", sep="\n")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled rows for validation. The fixed seed keeps the
# partition identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

print(f"X_train.shape: {X_train.shape}", f"X_val.shape: {X_val.shape}", sep="\n")

# Modeling (モデリング)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor
import xgboost as xgb
import lightgbm as lgb
from keras import Sequential, layers
from keras.callbacks import EarlyStopping

In [None]:
from sklearn.metrics import mean_squared_error as mse

def rmse(pred, true):
    """Return the root mean squared error between ``pred`` and ``true``.

    Both arguments are passed, in this order, to scikit-learn's
    ``mean_squared_error`` (imported above as ``mse``); the square root of
    that value is returned.
    """
    squared_error = mse(pred, true)
    return np.sqrt(squared_error)

### Linear Regression / 線形回帰

In [None]:
# Plain linear regression with default settings.
reg = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out validation rows.
pred_reg = reg.predict(X_val)
score_reg = rmse(pred_reg, y_val)
print(score_reg)

### Ridge / リッジ回帰

In [None]:
# Ridge regression (L2 regularisation) with penalty strength alpha=1.0.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

# Score the fitted model on the held-out validation rows.
pred_ridge = ridge.predict(X_val)
score_ridge = rmse(pred_ridge, y_val)
print(score_ridge)

### Lasso / ラッソ回帰

In [None]:
# Lasso regression (L1 regularisation); the solver is capped at 500 iterations.
lasso = Lasso(alpha=0.9, max_iter=500).fit(X_train, y_train)

# Score the fitted model on the held-out validation rows.
pred_lasso = lasso.predict(X_val)
score_lasso = rmse(pred_lasso, y_val)
print(score_lasso)

### ElasticNet

In [None]:
# ElasticNet combines the Ridge and Lasso penalties; l1_ratio=0.8 sets the mix
# between the two and the solver is capped at 1000 iterations.
en = ElasticNet(
    alpha=1.2,
    l1_ratio=0.8,
    max_iter=1000,
).fit(X_train, y_train)

# Score the fitted model on the held-out validation rows.
pred_en = en.predict(X_val)
score_en = rmse(pred_en, y_val)
print(score_en)

### K Nearest Neighbors / k近傍法

In [None]:
# k-nearest-neighbours regression with k=7, fitted on the features as they are
# (no scaling step is applied in this notebook).
knr = KNeighborsRegressor(n_neighbors=7).fit(X_train, y_train)

# Score the fitted model on the held-out validation rows.
pred_knr = knr.predict(X_val)
score_knr = rmse(pred_knr, y_val)
print(score_knr)

### AdaBoost

In [None]:
# AdaBoost with 100 estimators and a learning rate of 0.08, seeded for
# reproducibility.
abr = AdaBoostRegressor(
    n_estimators=100,
    learning_rate=0.08,
    random_state=SEED,
).fit(X_train, y_train)

# Score the fitted model on the held-out validation rows.
pred_abr = abr.predict(X_val)
score_abr = rmse(pred_abr, y_val)
print(score_abr)

### DecisionTree / 決定木

In [None]:
# Single regression tree limited to depth 6.
# random_state is fixed like for the other seeded models in this notebook:
# scikit-learn permutes the candidate features at every split, so without a
# seed two runs can pick different splits when several are equally good.
dt = DecisionTreeRegressor(max_depth=6, random_state=SEED)

dt.fit(X_train, y_train)

# Score the fitted tree on the held-out validation rows.
pred_dt = dt.predict(X_val)
score_dt = rmse(pred_dt, y_val)
print(score_dt)

### RandomForest / ランダムフォレスト

In [None]:
# Random forest: 100 trees, each limited to depth 6, seeded for reproducibility.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=6,
    random_state=SEED,
).fit(X_train, y_train)

# Score the fitted forest on the held-out validation rows.
pred_rf = rf.predict(X_val)
score_rf = rmse(pred_rf, y_val)
print(score_rf)

### ExtraTrees

In [None]:
# Extra-trees ensemble with the same size and depth settings as the random
# forest cell (100 trees, depth 6), seeded for reproducibility.
et = ExtraTreesRegressor(
    n_estimators=100,
    max_depth=6,
    random_state=SEED,
).fit(X_train, y_train)

# Score the fitted ensemble on the held-out validation rows.
pred_et = et.predict(X_val)
score_et = rmse(pred_et, y_val)
print(score_et)

### XGBoost

In [None]:
# Parameters passed to xgb.train below: histogram-based tree booster,
# squared-error objective, RMSE as the evaluation metric.
params_xgb = dict(
    booster="gbtree",
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    max_depth=6,
    eta=0.05,
    colsample_bytree=0.7,
    subsample=0.6,
    random_state=SEED,
)

In [None]:
# Wrap the training and validation splits in XGBoost's DMatrix container,
# each with its labels attached.
d_train, d_val = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [None]:
# Train with a very high round budget and let early stopping decide when to
# stop: training ends once the validation metric has not improved for 20
# rounds. Metrics for both sets are printed every 20 rounds. "val" is listed
# last in `evals` because XGBoost uses the last entry for early stopping.
model_xgb = xgb.train(
    params_xgb,
    d_train,
    num_boost_round=10000,
    evals=[(d_train, "train"), (d_val, "val")],
    early_stopping_rounds=20,
    verbose_eval=20,
)

In [None]:
# Predict with the trees up to and including the best early-stopping round.
# `best_iteration` is a 0-based index and the end of `iteration_range` is
# exclusive, hence the +1. `iteration_range` replaces the deprecated
# `ntree_limit` / `best_ntree_limit` pair, which current XGBoost releases no
# longer accept.
pred_xgb = model_xgb.predict(
    d_val,
    iteration_range=(0, model_xgb.best_iteration + 1),
)
score_xgb = rmse(pred_xgb, y_val)
print(score_xgb)

### LightGBM

In [None]:
# Parameters passed to lgb.train below: gradient-boosted trees with a
# regression objective, RMSE as the evaluation metric.
params_lgb = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "bagging_fraction": 0.8,
    # LightGBM only performs bagging when bagging_freq is non-zero; without it
    # the bagging_fraction above is silently ignored. 1 = re-sample the rows
    # at every iteration.
    "bagging_freq": 1,
    "feature_fraction": 0.7,
    "random_state": SEED,
}

In [None]:
# Wrap both splits in LightGBM datasets. The validation set references the
# training set so that it is constructed consistently with it.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_val = lgb.Dataset(data=X_val, label=y_val, reference=lgb_train)

In [None]:
# Train with a very high round budget and let early stopping end the run once
# the validation RMSE has not improved for 20 rounds; metrics are logged every
# 20 rounds.
# Early stopping and logging are configured through callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments were removed from
# `lightgbm.train` in current releases.
model_lgb = lgb.train(
    params=params_lgb,
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_val],
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=20),
    ],
)

In [None]:
# Predict with the number of iterations selected by early stopping, then score
# on the held-out validation rows.
pred_lgb = model_lgb.predict(
    X_val,
    num_iteration=model_lgb.best_iteration,
)
score_lgb = rmse(pred=pred_lgb, true=y_val)
print(score_lgb)

### Neural Network(MLP) / ニューラルネットワーク(多層パーセプトロン)

In [None]:
# Stop training once the validation MSE has not improved for 20 epochs.
# restore_best_weights=True rolls the network back to its best epoch; without
# it the model would be scored with the weights from 20 epochs after the best
# one, whereas the XGBoost and LightGBM models are scored at their best
# iteration.
callbacks = [
    EarlyStopping(monitor="val_mse", patience=20, restore_best_weights=True),
]

In [None]:
# Fully connected network: six ReLU hidden layers narrowing from 128 to 10
# units, followed by a single linear output unit for the regression target.
# The input width is the number of feature columns.
NN = Sequential([
    layers.Dense(128, activation="relu", input_shape=(X_train.shape[1], )),
    layers.Dense(64, activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(10, activation="relu"),
    layers.Dense(1, activation="linear"),
])

# MSE is both the training loss and the tracked metric; the metric is what the
# early-stopping callback monitors as "val_mse".
NN.compile(optimizer="adam", loss="mse", metrics=["mse"])

NN.summary()

In [None]:
# The epoch budget is deliberately huge; the early-stopping callback, watching
# the validation split passed here, decides when training actually ends.
history = NN.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=128,
    epochs=10000,
    callbacks=callbacks,
    verbose=2,
)

In [None]:
# Score the trained network on the held-out validation rows.
pred_nn = NN.predict(X_val)
score_nn = rmse(pred=pred_nn, true=y_val)
print(score_nn)

# Results Comparison / 結果を比較

In [None]:
# Display names, listed in the same order as the scores below.
# ("Ridge" was previously misspelled "Rdige" and showed up that way in the
# table and in the plot.)
regressors = pd.DataFrame(
    [
        "Linear Reg",
        "Ridge",
        "Lasso",
        "ElasticNet",
        "KNN",
        "AdaBoost",
        "DecisionTree",
        "RandomForest",
        "ExtraTrees",
        "XGBoost",
        "LightGBM",
        "NeuralNetwork",
    ],
    columns=["regressor"],
)

# Validation RMSE of each model, in the same order as the names above.
scores = pd.DataFrame(
    [
        score_reg,
        score_ridge,
        score_lasso,
        score_en,
        score_knr,
        score_abr,
        score_dt,
        score_rf,
        score_et,
        score_xgb,
        score_lgb,
        score_nn,
    ],
    columns=["RMSE"],
)

# Side-by-side table: one row per model, columns "regressor" and "RMSE".
results = pd.concat([regressors, scores], axis=1)
results

In [None]:
# Order the models from lowest to highest validation RMSE.
results_sorted = results.sort_values(by="RMSE")

# Horizontal bar chart with one bar per model.
plt.figure(figsize=(10, 6))
sns.barplot(data=results_sorted, x="RMSE", y="regressor")
plt.xlabel("RMSE", fontsize=20)
plt.ylabel("Regressor", fontsize=20)
# The x-axis is restricted to 0.6-0.8 so that small differences are visible;
# any part of a bar outside this window is not drawn.
plt.xlim(left=0.6, right=0.8)
plt.show()

======================================================================================================
## 参考になったらUpVoteしていただけると嬉しいです、、、  
## If you find it helpful, I'd appreciate an UpVote!  
======================================================================================================

質問・コメント等あれば"本当に"気軽にどうぞ！  
答えられる範囲で答えます。