In [None]:
# 必要なライブラリのインストール
!pip install lightgbm
!pip install signate

In [None]:
# Run the signate CLI once up front. The very first invocation always fails, but it
# creates the `.signate` folder in the cloud environment, which the next cell writes into.
!signate list

In [None]:
# Place the API token in the cloud environment.
# The bracketed text below is a placeholder: substitute the contents of the SIGNATE API
# token you created before running this cell.
# NOTE(review): avoid saving or sharing the notebook with a real token pasted in here.
!echo '[作成したsignateのapi tokenを使用する]' > ~/.signate/signate.json

In [None]:
# Download the competition data (`-c 1` presumably selects the competition by id — confirm).
!signate download -c 1

In [None]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import lightgbm as lgb

In [None]:
# Load the training and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [None]:
# Date feature engineering
def dmtoint(df_all):
    """Replace the ``month``/``day`` columns with numeric date features.

    The data carries no year, so every row is placed in 2014 as a stand-in;
    only the relative passage of time needs to be expressed.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Frame with a ``month`` column of lower-case three-letter English
        abbreviations ("jan" ... "dec") and a ``day`` column holding the day
        of the month. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``month`` and ``day`` and with two columns
        appended: ``weekday`` (0 = Monday ... 6 = Sunday) and
        ``datetime_int`` (natural log of the date's int64 timestamp).

    Raises
    ------
    ValueError
        If ``month`` contains a label outside "jan" ... "dec", or if a
        month/day pair cannot be parsed as a 2014 calendar date.
    """
    month_dict = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6, "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12}
    month_int = df_all["month"].map(month_dict)

    # Fail early with a readable message instead of an obscure date-parsing error.
    unknown = month_int.isna()
    if unknown.any():
        bad_labels = sorted(df_all.loc[unknown, "month"].astype(str).unique())
        raise ValueError("unrecognised month label(s): {}".format(bad_labels))

    # The year is unknown, so 2014 is used as a placeholder.
    ymd_str = "2014-" + month_int.astype(int).astype(str) + "-" + df_all["day"].astype(str)
    index = pd.DatetimeIndex(pd.to_datetime(ymd_str).values)

    # Build the result on a fresh frame so the caller's data is left untouched.
    # Plain arrays are assigned positionally, independent of the frame's index labels.
    return df_all.drop(columns=["month", "day"]).assign(
        weekday=np.asarray(index.weekday),  # day of the week as an extra feature
        datetime_int=np.log(np.asarray(index.astype(np.int64))),
    )

In [None]:
# Build the training / prediction matrices

# --- Preprocessing switches ---------------------------------------------------
# Improvement 1: log-transform the large-valued numeric columns.
USE_LOG_FEATURES = False
# Improvement 2: rework the date columns (continuous numeric date + weekday dummies)
# instead of one-hot encoding the month.
USE_DATE_FEATURES = False

YES_NO = {"no": 0, "yes": 1}
BINARY_COLUMNS = ["housing", "loan", "default"]
CATEGORICAL_COLUMNS = ["job", "marital", "education", "contact", "poutcome"]


def _preprocess(df):
    """Return an encoded copy of a raw competition frame; the input is left untouched.

    Steps: map the yes/no columns to 1/0, optionally apply the two experimental
    transforms selected by ``USE_LOG_FEATURES`` / ``USE_DATE_FEATURES``, then
    one-hot encode the categorical columns.

    Raises
    ------
    ValueError
        If a yes/no column holds a non-null value other than "yes" or "no".
    """
    out = df.copy()

    for col in BINARY_COLUMNS:
        # `.map` produces numeric columns directly; `.replace` on string columns relies
        # on an implicit downcast that recent pandas versions deprecate.
        encoded = out[col].map(YES_NO)
        unexpected = encoded.isna() & out[col].notna()
        if unexpected.any():
            raise ValueError("column {!r} holds values other than 'yes'/'no': {}".format(
                col, sorted(out.loc[unexpected, col].astype(str).unique())))
        out[col] = encoded

    if USE_LOG_FEATURES:
        # NOTE(review): the shift uses this frame's own minimum, so train and test are
        # shifted by different amounts — same as the original experiment.
        out["balance"] = np.log(out["balance"] - out["balance"].min() + 1)
        out["duration"] = np.log(out["duration"] + 1)
        out["campaign"] = np.log(out["campaign"] + 1)
        out["pdays"] = np.log(out["pdays"] - out["pdays"].min() + 1)

    if USE_DATE_FEATURES:
        out = dmtoint(out)
        date_column = "weekday"
    else:
        date_column = "month"

    return pd.get_dummies(out, columns=CATEGORICAL_COLUMNS + [date_column])


train = _preprocess(train)
test = _preprocess(test)

Y = train["y"]
X = train.drop(columns=["id", "y"])
# One-hot encoding each frame separately can yield different column sets when a category
# occurs in only one of them; line the test matrix up with the training columns
# (missing dummies become 0, columns unseen during training are dropped).
Z = test.drop(columns=["id"]).reindex(columns=X.columns, fill_value=0)

In [None]:
# Training
dtrain = lgb.Dataset(X, label=Y)
gbm = lgb.train(
    # With empty params LightGBM falls back to its default regression objective.
    # `y` is assumed to be a 0/1 label (TODO confirm), so request the binary objective
    # explicitly; `predict` then returns probabilities.
    params={"objective": "binary"},
    train_set=dtrain
)
pred = gbm.predict(Z)

In [None]:
# Build the submission file
# The sample submission is read without a header row and used as the template.
answer = pd.read_csv('./submit_sample.csv', header=None)
# Store the predictions in column 1, positionally.
# NOTE(review): assumes the sample file's rows are in the same order as test.csv — confirm.
answer[1] = pred

# Written without index and without a header row, mirroring how the sample was read.
answer.to_csv('./answer.csv',  index=False, header=None)

In [None]:
# Submit the prediction file through the signate CLI.
!signate submit -c 1 './answer.csv'

In [None]:
# Feature selection: gain-based importance of every feature in the trained model
gain_values = gbm.feature_importance(importance_type='gain')
importance = pd.DataFrame({'importance': gain_values}, index=X.columns)

In [None]:
# Plot features from least to most important so the strongest ones sit at the top, and
# size the figure to the number of bars so the labels stay legible.
ax = importance.sort_values('importance').plot.barh(
    figsize=(8, max(4, 0.25 * len(importance))),
    legend=False,
)
ax.set(title='LightGBM feature importance', xlabel='importance (gain)', ylabel='feature');