# 最適化に必要なライブラリ
1. 前処理
1. ハイパーパラメータ
1. 評価関数

In [32]:
# 基本操作
import pandas as pd
import numpy as np

In [33]:
from mlbox.preprocessing import * # 前処理の最適化
from mlbox.optimisation import *
from mlbox.prediction import *

In [34]:
import featuretools as ft # 特徴量エンジニアリング
import featuretools.variable_types as vtypes

In [35]:
from sklearn.metrics import make_scorer
from sklearn import metrics

# 前処理の自動化

In [36]:
paths = ["./0_input/train.csv", "./0_input/test.csv"] #to modify
target_name = "y" #to modify

In [37]:
# Preprocessing
data = Reader(sep = ",").train_test_split(paths, target_name)


reading csv : train.csv ...
cleaning data ...
CPU time: 0.19073486328125 seconds

reading csv : test.csv ...
cleaning data ...
CPU time: 0.1282029151916504 seconds

> Number of common features : 17

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 9
> Number of numerical features: 8
> Number of training samples : 27100
> Number of test samples : 18050

> You have no missing values on train set...

> Task : classification
0.0    24988
1.0     2112
Name: y, dtype: int64

encoding target ...


In [38]:
data = Drift_thresholder().fit_transform(data)  #deleting non-stable variables


computing drifts ...
CPU time: 1.0722665786743164 seconds

> Top 10 drifts

('id', 0.3261392973597328)
('job', 0.013323940264333256)
('balance', 0.011248062475084364)
('contact', 0.007256787725772096)
('month', 0.007167981519150324)
('pdays', 0.007133620222628867)
('housing', 0.006552422033915484)
('age', 0.00620428289601449)
('duration', 0.0055432572497471355)
('campaign', 0.005348198423812622)

> Deleted variables : []
> Drift coefficients dumped into directory : save


In [39]:
# Encoding
Categorical_encoder(strategy = "dummification").get_params()

{'strategy': 'dummification', 'verbose': False}

# 特徴量エンジニアリング

In [40]:
# ダミー変数の生成
item_df = pd.DataFrame(data['train'])
item_df = pd.get_dummies(item_df)

tx_df = pd.DataFrame(data['test'])
tx_df = pd.get_dummies(tx_df)

In [41]:
es = ft.EntitySet(id='id')

In [42]:
es = es.entity_from_dataframe(entity_id='train',
                              dataframe=item_df,
                              index = 'id'
                             )

In [43]:
es = es.entity_from_dataframe(entity_id='test',
                              dataframe=tx_df,
                              index = 'id'
                             )

In [44]:
feature_train = ft.dfs(entityset=es, 
                       target_entity='train',
                       agg_primitives=['sum', 'std', 'max', 'skew', 'min', 'mean', 'count', 'percent_true', 'mode'], 
                       cutoff_time_in_index=True
                      )
feature_train = feature_train.reset_index(inplace=True)
feature_train

  agg_primitives: ['count', 'max', 'mean', 'min', 'mode', 'percent_true', 'skew', 'std', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


AttributeError: 'tuple' object has no attribute 'reset_index'

In [None]:
feature_test = ft.dfs(entityset=es, 
                      target_entity='test',
                      agg_primitives=['sum', 'std', 'max', 'skew', 'min', 'mean', 'count', 'percent_true', 'mode'], 
                      cutoff_time_in_index=True
                     )
feature_test = feature_test.reset_index(inplace=True)
feature_test

In [None]:
y = pd.read_csv("./0_input/train.csv")
y = ['y']
y = pd.DataFrame(y)

In [None]:
train = pd.concat([feature_train, y], axis=1)
train

# LightGBMの最適化

In [None]:
# Model(二値分類)
Clf_feature_selector()

In [None]:
Classifier().get_estimator()

In [None]:
AUC = make_scorer(metrics.auc, greater_is_better=True, needs_proba=True)

In [None]:
# Optimisation
opt = Optimiser(scoring = AUC, n_folds = 5)

params = {
     "ne__numerical_strategy" : int(0),
     "ce__strategy" : "label_encoding",
     "fs__threshold" : int(3),
     "stck__base_estimators" : [Regressor(strategy = "LightGBM")],
    #  "stck__base_estimators" : [Regressor(strategy = "RandomForest"), Regressor(strategy = "ExtraTrees")],
     "est__strategy" : "Linear"
}
opt.evaluate(params, data)

In [None]:
space = {    
        'est__strategy':{"search":"choice",
                                  "space":["LightGBM"]},    
        'est__n_estimators':{"search":"choice",
                                  "space":[150]},    
        'est__colsample_bytree':{"search":"uniform",
                                  "space":[0.8,0.95]},
        'est__subsample':{"search":"uniform",
                                  "space":[0.8,0.95]},
        'est__max_depth':{"search":"choice",
                                  "space":[5,6,7,8,9]},
        'est__learning_rate':{"search":"choice",
                                  "space":[0.07]} 
        }

# 最適化シミュレーション
best = opt.optimise(space, data, max_evals = int(2000)) #

In [None]:
# Prediction
Predictor().fit_predict(best, data)

# 提出処理
- コンペティションの規定に依る。

In [None]:
preds = pd.read_csv("save/"+"y"+"_predictions.csv")
result =  preds["y_predicted"].values
id = np.arange(1, 18051, 1)
result = pd.DataFrame(result, columns = list('1'))
id = pd.DataFrame(id, columns = list('0'))
submit = pd.concat([id, result], axis = 1)
print(submit)
submit.to_csv("./1_output/mlbox.csv", index = False)