In this notebook, you will learn how to make your first submission to the [Tabular Playground Series - Feb 2021 competition](https://www.kaggle.com/c/tabular-playground-series-feb-2021).

# Make the most of this notebook!

You can use the "Copy and Edit" button in the upper right of the page to create your own copy of this notebook and experiment with different models. You can run it as is and then see if you can make improvements.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
#XGBoost
import xgboost as xgb

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# %cd /content/drive/MyDrive/kaggle

# Read in the data files

In [None]:
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')

In [None]:
df_train = pd.read_csv(input_path/'train.csv', header=0)
display(df_train.head())

In [None]:
df_test = pd.read_csv(input_path/'test.csv', header=0)
display(df_test.head())

In [None]:
submission = pd.read_csv(input_path/'sample_submission.csv', header=0)
display(submission.head())

In [None]:
print(len(df_train))
print(len(df_test))

In [None]:
# NOTE(review): df_all is first assigned in the next cell (the train/test merge);
# on a fresh top-to-bottom run this cell raises NameError.
df_all

In [None]:
# Combine train and test into one frame so both receive identical preprocessing.
# TrainFlag records each row's origin so the frame can be split apart again.
df_train["TrainFlag"] = True
df_test["TrainFlag"] = False

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_all = pd.concat([df_train, df_test])
df_all.index = df_all["id"]
df_all.drop("id", axis = 1, inplace = True)

In [None]:
df_all

In [None]:
df_all = pd.get_dummies(df_all, drop_first=True)
df_all

In [None]:
# Split df_all back into the training and test frames using the TrainFlag column.
# The helper column is removed from both; the test frame additionally drops "target".
df_train = df_all[df_all["TrainFlag"]].drop(columns=["TrainFlag"])
df_test = df_all[~df_all["TrainFlag"]].drop(columns=["TrainFlag", "target"])

In [None]:
df_train

In [None]:
y=df_train['target']

In [None]:
# Split the data: 30% of the training rows are held out for validation (fixed seed).
# X and y are plain NumPy arrays taken from df_train.
y = df_train["target"].values
X = df_train.drop("target", axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [None]:
X_train

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)
dtest = xgb.DMatrix(df_test.values)

In [None]:
dtrain

In [None]:
# XGBoost training configuration.
params = {
        'objective': 'reg:squarederror',
        # 'silent' is no longer an XGBoost parameter; 'verbosity': 0 is the
        # replacement for the former silent=1.
        'verbosity': 0,
        'random_state': 1234,
        # Metric used during training (RMSE)
        'eval_metric': 'rmse',
    }
num_round = 500
# Evaluation sets reported during training: dtrain as 'train', the hold-out dvalid as 'eval'.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

In [None]:
# Train the XGBoost booster, reporting metrics on the watchlist sets;
# early_stopping_rounds=20 halts training once the evaluation metric stops improving.
model = xgb.train(params,
                    dtrain,# training data
                    num_round,# configured number of boosting rounds
                    early_stopping_rounds=20,
                    evals=watchlist,
                    )

In [None]:
# Predict on the competition test matrix, limited to the best iteration from early stopping.
# NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer XGBoost releases
# in favour of iteration_range — confirm against the installed version.
prediction_XG = model.predict(dtest, ntree_limit = model.best_ntree_limit)

# Optional rounding of the predictions (disabled)
# prediction_XG = np.round(prediction_XG)

In [None]:
_, ax = plt.subplots(figsize=(20, 30))
xgb.plot_importance(model,
                    ax=ax,
                    importance_type='weight',
                    show_values=True)
plt.show()

In [None]:
import lightgbm as lgb

In [None]:
# Build the LightGBM training and validation datasets from the hold-out split;
# the validation set references train_data.
train_data = lgb.Dataset(X_train, label=y_train)
eval_data = lgb.Dataset(X_test, label=y_test, reference= train_data)

In [None]:
params = {'metric': 'rmse',
#           'boosting_type': 'gbdt',
          'max_depth' : 9}


In [None]:
# Train LightGBM for at most 10000 rounds, validating on eval_data.
# NOTE(review): recent LightGBM releases dropped the early_stopping_rounds / verbose_eval
# keyword arguments in favour of callbacks — confirm against the installed version.
gbm = lgb.train(params,
                train_data,
                valid_sets=eval_data,
                num_boost_round=10000,
                early_stopping_rounds=100,
                verbose_eval=50)

In [None]:
# NOTE(review): df_test was already cut out of the one-hot-encoded df_all, so this second
# get_dummies call presumably changes nothing — confirm and remove if so.
df_test = pd.get_dummies(df_test, drop_first=True)

In [None]:
df_test

In [None]:
prediction_gbm = gbm.predict(df_test)

In [None]:
df_train

In [None]:
df_train = df_train.drop("target", axis=1).values

In [None]:
df_train

In [None]:
# Predict on the full training matrix. Booster.predict only accepts a DMatrix,
# so the raw NumPy array X is wrapped first.
# NOTE: this rebinds prediction_XG from test-set to training-set predictions.
prediction_XG = model.predict(xgb.DMatrix(X), ntree_limit = model.best_ntree_limit)

In [None]:
prediction_gbm = gbm.predict(df_train)

In [None]:
len(prediction_XG)

In [None]:
predicted

In [None]:
prediction_XG

In [None]:
model

In [None]:
gbm

In [None]:
# RMSE of prediction_XG against y. The square root is taken explicitly because the
# `squared=` keyword is not accepted by every scikit-learn release; arguments follow
# the documented (y_true, y_pred) order.
print(np.sqrt(mean_squared_error(y, prediction_XG)))
# print(np.sqrt(mean_squared_error(y, prediction_gbm)))

In [None]:
# RMSE of the 0.75 / 0.25 LightGBM / XGBoost blend against y.
# The original referenced train_oof_lgb, train_oof_xgb and target, none of which is
# assigned anywhere in the visible notebook; the in-notebook arrays are used instead.
# NOTE(review): these are in-sample predictions, not out-of-fold ones.
print(np.sqrt(mean_squared_error(y, 0.75*prediction_gbm + 0.25*prediction_XG)))

In [None]:
#関数の処理で必要なライブラリ
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def True_Pred_map(pred_df):
    """Scatter-plot predicted against true values, annotated with RMSE and R^2.

    pred_df is read through its 'true' and 'pred' columns. Both axes share the
    same limits (overall minimum and maximum of pred_df, padded by 0.1) and a
    red y = x reference line is drawn across that range. Nothing is returned;
    the figure is left on the current matplotlib state.
    """
    actual, estimated = pred_df['true'], pred_df['pred']
    rmse_value = np.sqrt(mean_squared_error(actual, estimated))
    r2_value = r2_score(actual, estimated)

    # One shared range for both axes and for the diagonal.
    lower = pred_df.min().min() - 0.1
    upper = pred_df.max().max() + 0.1

    plt.figure(figsize=(8,8))
    ax = plt.subplot(111)
    ax.scatter('true', 'pred', data=pred_df)
    ax.set_xlabel('True Value', fontsize=15)
    ax.set_ylabel('Pred Value', fontsize=15)
    ax.set_xlim(lower, upper)
    ax.set_ylim(lower, upper)

    # Perfect-prediction reference line.
    diagonal = np.linspace(lower, upper, 2)
    ax.plot(diagonal, diagonal, 'r-')

    plt.text(0.1, 0.9, 'RMSE = {}'.format(str(round(rmse_value, 5))), transform=ax.transAxes, fontsize=15)
    plt.text(0.1, 0.8, 'R^2 = {}'.format(str(round(r2_value, 5))), transform=ax.transAxes, fontsize=15)

In [None]:
# Pair the hold-out targets with hold-out predictions for plotting.
# The original used `predicted`, which is only assigned in a later cell and holds
# predictions for different rows than y_test. The two models are blended on the
# validation split instead, with 0.75 / 0.25 LightGBM / XGBoost weights.
valid_pred = 0.75*gbm.predict(X_test) + 0.25*model.predict(dvalid, ntree_limit = model.best_ntree_limit)
pred_df = pd.concat([pd.Series(y_test), pd.Series(valid_pred)], axis=1)
pred_df.columns = ['true', 'pred']

In [None]:
pred_df

In [None]:
True_Pred_map(pred_df)

In [None]:
prediction_XG

In [None]:
prediction_gbm

In [None]:
# Blend the two models for the competition test set (75% LightGBM, 25% XGBoost).
# Earlier cells rebind prediction_gbm / prediction_XG to training-set predictions,
# so the test-set predictions are recomputed here to keep `predicted` aligned with df_test.
prediction_gbm = gbm.predict(df_test)
prediction_XG = model.predict(dtest, ntree_limit = model.best_ntree_limit)
predicted = 0.75*prediction_gbm + 0.25*prediction_XG
predicted

In [None]:
submission = pd.DataFrame({"id": df_test.index, "target": predicted})

In [None]:
submission.to_csv('xg_gbm.csv',index=False)

In [None]:
file = pd.read_csv('xg_gbm.csv')

In [None]:
file

## We need to encode the categoricals.

There are different strategies to accomplish this, and different approaches will have different performance when using different algorithms. For this starter notebook, we'll use simple encoding.

In [None]:
# for c in train.columns:
#     if train[c].dtype=='object': 
#         lbl = LabelEncoder()
#         lbl.fit(list(train[c].values) + list(test[c].values))
#         train[c] = lbl.transform(train[c].values)
#         test[c] = lbl.transform(test[c].values)
        
# display(train.head())

## Pull out the target, and make a validation split

In [None]:
# # target = train.pop('target')
# X_train, X_test, y_train, y_test = train_test_split(train, target, train_size=0.70)

# How well can we do with a completely naive model?

We'll want any of our models to do (hopefully much!) better than this.

In [None]:
# # Let's get a benchmark score
# model_dummy = DummyRegressor(strategy='median')
# model_dummy.fit(X_train, y_train)
# y_dummy = model_dummy.predict(X_test)
# score_dummy = mean_squared_error(y_test, y_dummy, squared=False)
# print(f'{score_dummy:0.5f}')

# Simple Linear Regression

A simple linear regression doesn't do better than our dummy regressor! (Although simple categorical encoding really doesn't make sense for this approach!)

In [None]:
# # Simple Linear Regression
# model_simple_linear = LinearRegression(fit_intercept=False) # data is not centered, don't fit intercept
# model_simple_linear.fit(X_train, y_train)
# y_simple_linear = model_simple_linear.predict(X_test)
# score_simple_linear = mean_squared_error(y_test, y_simple_linear, squared=False)
# print(f'{score_simple_linear:0.5f}')

# This seems slow and repetitive. Can we automate it a bit?

In [None]:
# def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
#     plt.figure(figsize=figsize)
#     score = mean_squared_error(y, yhat, squared=False)
#     plt.scatter(y[:num_to_plot], yhat[:num_to_plot])
#     plt.plot(lims, lims)
#     plt.ylim(lims)
#     plt.xlim(lims)
#     plt.title(f'{name}: {score:0.5f}', fontsize=18)
#     plt.show()

In [None]:
# model_names = ["Dummy Median", "Linear",  "Lasso", "Random Forest"]

# models = [
#     DummyRegressor(strategy='median'),
#     LinearRegression(fit_intercept=False),
#     Lasso(fit_intercept=False),
#     RandomForestRegressor(n_estimators=50, n_jobs=-1)]

# for name, model in zip(model_names, models):
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     plot_results(name, y_test, y_pred)

# It looks like RandomForest did the best. Let's train it on all the data and make a submission!

In [None]:
# model = RandomForestRegressor(n_estimators=50, n_jobs=-1)
# model.fit(train, target)
# submission['target'] = model.predict(test)
# submission.to_csv('random_forest.csv')

## Now you should save your Notebook (blue button in the upper right), and then when that's complete go to the notebook viewer and make a submission to the competition. :-)

## There's lots of room for improvement. What things can you try to get a better score?

In [None]:
import operator, math, random, time
import numpy as np

from deap import algorithms, base, creator, tools, gp

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss

import matplotlib.pyplot as plt

# Sample data generation
# NOTE(review): nothing is generated here, so X_train / y_train / X_test / y_test are
# whatever earlier cells left behind. make_classification is imported above but not
# called in the visible code — presumably a classification dataset was meant to be
# created at this point; confirm before running the AUC-based code below.


# Compute the baseline score: 5-fold CV AUC on the training data and AUC on the test data
clf = LogisticRegression(penalty="l2", C=1.0)
base_train_auc = np.mean(cross_val_score(clf, X_train, y_train, scoring="roc_auc", cv=5))
clf.fit(X_train, y_train)
base_test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

def protectedDiv(left, right):
    """Element-wise ``left / right`` that yields 1.0 where ``right`` is (near) zero.

    A divisor counts as zero when its absolute value is below 1e-7. The result
    is a new float array with ``len(left)`` entries; positions matched by
    neither test (for example a NaN divisor) keep the initial 0.0.
    """
    tolerance = 1.0e-7
    magnitude = np.abs(right)
    divisible = magnitude >= tolerance

    quotient = np.zeros(len(left))
    quotient[divisible] = left[divisible] / right[divisible]
    quotient[magnitude < tolerance] = 1.0
    return quotient


# Random seed
random.seed(123)

# Define an individual as an expression tree whose single fitness value is maximised
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax)

# Compute the initial values
# The 5-fold CV AUC on the training data is the starting value of the score to beat
n_features = X_train.shape[1]
clf = LogisticRegression(penalty="l2", C=1.0)
prev_auc = np.mean(cross_val_score(clf, X_train, y_train, scoring="roc_auc", cv=5))

# Main loop
# results holds one row per accepted feature:
#   [feature count, 5-fold CV AUC on the training data, training AUC, test AUC]
# exprs holds the expression tree of each accepted feature
results = []
exprs = []
for i in range(100):
    # Primitives available to the expression trees. The set is rebuilt every
    # iteration because its arity follows the current n_features.
    pset = gp.PrimitiveSet("MAIN", n_features)
    pset.addPrimitive(operator.add, 2)
    pset.addPrimitive(operator.sub, 2)
    pset.addPrimitive(operator.mul, 2)
    pset.addPrimitive(protectedDiv, 2)
    pset.addPrimitive(operator.neg, 1)
    pset.addPrimitive(np.cos, 1)
    pset.addPrimitive(np.sin, 1)
    pset.addPrimitive(np.tan, 1)

    # Register the generators for expressions, individuals and populations
    toolbox = base.Toolbox()
    toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=3)
    toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("compile", gp.compile, pset=pset)

    # Fitness function
    def eval_genfeat(individual):
        """Return, as a 1-tuple, the 5-fold CV AUC with the candidate feature appended."""
        func = toolbox.compile(expr=individual)
        features_train = [X_train[:,i] for i in range(n_features)]
        new_feat_train = func(*features_train)
        X_train_tmp = np.c_[X_train, new_feat_train]
        return np.mean(cross_val_score(clf, X_train_tmp, y_train, scoring="roc_auc", cv=5)),

    # Evaluation, selection, crossover and mutation:
    # tournament selection of size 10, one-point crossover, and mutation that
    # inserts a random full subtree of depth 0-2
    toolbox.register("evaluate", eval_genfeat)
    toolbox.register("select", tools.selTournament, tournsize=10)
    toolbox.register("mate", gp.cxOnePoint)
    toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
    toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

    # Tree-size constraint: crossover and mutation may not produce trees taller than 5
    toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=5))
    toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=5))

    # Initial population and a hall of fame that keeps the single best individual
    pop = toolbox.population(n=300)
    hof = tools.HallOfFame(1)

    # Statistics reported for each generation
    stats_fit = tools.Statistics(lambda ind: ind.fitness.values)
    stats_size = tools.Statistics(len)
    mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
    mstats.register("avg", np.mean)
    mstats.register("std", np.std)
    mstats.register("min", np.min)
    mstats.register("max", np.max)

    # Run the evolution: crossover probability 0.5, mutation probability 0.1, 10 generations
    start_time = time.time()
    pop, log = algorithms.eaSimple(pop, toolbox, 0.5, 0.1, 10, stats=mstats, halloffame=hof, verbose=True)
    end_time = time.time()

    # Keep the best solution and its AUC.
    # The score is read from the hall-of-fame individual itself so that it always
    # belongs to best_expr; the final population's maximum can be lower when the
    # best individual did not survive to the last generation.
    best_expr = hof[0]
    best_auc = best_expr.fitness.values[0]

    # When the 5-fold CV AUC beats the previous step's AUC, append the generated
    # feature to the training and test data and update the best AUC
    if prev_auc < best_auc:
        # Append the generated feature
        func = toolbox.compile(expr=best_expr)
        features_train = [X_train[:,i] for i in range(n_features)]
        features_test = [X_test[:,i] for i in range(n_features)]
        new_feat_train = func(*features_train)
        new_feat_test = func(*features_test)
        X_train = np.c_[X_train, new_feat_train]
        X_test = np.c_[X_test, new_feat_test]

        # Training / test AUC after refitting (used for plotting)
        clf.fit(X_train, y_train)
        train_auc = roc_auc_score(y_train, clf.predict_proba(X_train)[:,1])
        test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

        # Update the best AUC and the feature count
        prev_auc = best_auc
        n_features += 1

        # Report and store the row for later output
        print(n_features, best_auc, train_auc, test_auc, end_time - start_time)
        results.append([n_features, best_auc, train_auc, test_auc])
        exprs.append(best_expr)

        # Stop once the feature count has reached 30
        if n_features >= 30:
            break

# Print the results
print()
print("### Results")
print("Baseline AUC train :", base_train_auc)
print("Baseline AUC test :", base_test_auc)
if results:
    print("Best AUC train :", results[-1][1])
    print("Best AUC test :", results[-1][3])
else:
    # No generated feature was accepted, so results[-1] would raise IndexError;
    # the unchanged baseline model is then also the best one.
    print("Best AUC train :", base_train_auc)
    print("Best AUC test :", base_test_auc)

# Plot the results
res = np.array(results)
# When no generated feature was accepted, results is empty, np.array([]) is 1-D and
# the column slices would raise IndexError, so only the baselines are drawn.
if results:
    plt.plot(res[:,0], res[:,1],"o-", color="b", label="train(5-fold CV)")
    plt.plot(res[:,0], res[:,3],"o-", color="r", label="test")
plt.plot(10, base_train_auc, "d", color="b", label = "train baseline(5-fold CV)")
plt.plot(10, base_test_auc, "d", color="r", label = "test baseline")
plt.xlim(9,31)
plt.grid(which="both")
plt.xlabel('n_features')
plt.ylabel('AUC')
plt.legend(loc="lower right")
plt.savefig("gp_featgen.png")

# Print the generated expression trees, one per accepted feature
print()
print("### Generated feature expression")
for expr in exprs:
    print(expr)