# データを準備する
azuremlのデータを扱うために必要なazureml-dataprep等をインストールする(公式チュートリアルのままだとパッケージ間のバージョンの整合性が取れないため、 `--ignore-installed` オプションを利用している)

In [None]:
!pip install --ignore-installed azureml-dataprep azureml-sdk[automl,notebooks] matplotlib certifi

In [2]:
import azureml.dataprep as dprep
import azureml.core
import pandas as pd
import logging
import os
from azureml.core.workspace import Workspace
from sklearn.model_selection import train_test_split
from azureml.train.automl import AutoMLConfig
from azureml.core.experiment import Experiment

CSVファイルをdprepで読み込むと、データフレームではなく独自のデータフロー型に変換される

In [3]:
# Source CSV used throughout this notebook.
raw_csv_path = "./32_Shimane Prefecture_20171_20182.csv"

# Read the file with dprep; the result is dprep's own dataflow object,
# not a pandas DataFrame.
df_raw = dprep.auto_read_file(path=raw_csv_path)

In [4]:
# Preview the first five records of the raw dataflow to sanity-check the load.
display(df_raw.head(5))

Unnamed: 0,No,種類,地域,市区町村コード,都道府県名,市区町村名,地区名,最寄駅：名称,最寄駅：距離（分）,取引価格（総額）,...,今後の利用目的,前面道路：方位,前面道路：種類,前面道路：幅員（ｍ）,都市計画,建ぺい率（％）,容積率（％）,取引時点,改装,取引の事情等
0,1.0,宅地(土地),住宅地,32201.0,島根県,松江市,秋鹿町,秋鹿町,4,5200000.0,...,住宅,東,市道,11.0,市街化調整区域,70.0,200.0,平成29年第１四半期,,
1,2.0,林地,,32201.0,島根県,松江市,秋鹿町,,,25000.0,...,,,,,,,,平成30年第２四半期,,
2,3.0,宅地(土地と建物),住宅地,32201.0,島根県,松江市,上乃木,乃木,19,25000000.0,...,住宅,南,市道,6.0,第２種中高層住居専用地域,60.0,200.0,平成29年第４四半期,,
3,4.0,宅地(土地),住宅地,32201.0,島根県,松江市,上乃木,松江,30分?60分,2000000.0,...,その他,北,市道,2.3,第１種住居地域,60.0,200.0,平成30年第２四半期,,
4,5.0,宅地(土地と建物),住宅地,32201.0,島根県,松江市,上乃木,松江,30分?60分,28000000.0,...,住宅,南東,市道,4.0,第１種住居地域,60.0,200.0,平成30年第２四半期,,


カラムの内容を自動で分析する

In [5]:
# Create a column-type builder for the raw dataflow; it is used below to
# infer the columns' types and then produce a converted dataflow.
type_infer = df_raw.builders.set_column_types()

In [6]:
# Let the builder analyse the column contents.
# NOTE(review): presumably this populates the inferred type candidates on the
# builder; confirm against the azureml-dataprep documentation.
type_infer.learn()

In [7]:
# Turn the builder's result into a new dataflow; this is the object that gets
# profiled and packaged in the following cells.
type_converted_df = type_infer.to_dataflow()

In [8]:
# Show per-column summary statistics (type, min/max, missing counts,
# quantiles, ...) for the converted dataflow. As the last expression of the
# cell, the profile is rendered as the cell output.
type_converted_df.get_profile()

Unnamed: 0,Type,Min,Max,Count,Missing Count,Not Missing Count,Percent missing,Error Count,Empty count,0.1% Quantile,1% Quantile,5% Quantile,25% Quantile,50% Quantile,75% Quantile,95% Quantile,99% Quantile,99.9% Quantile,Mean,Standard Deviation,Variance,Skewness,Kurtosis
No,FieldType.DECIMAL,1.00,3132.00,3132.0,0.0,3132.0,0.0,0.0,0.0,3.63,313.7,312.5,783.5,1566.5,2349.5,2975.9,3101.18,3129.37,1566.5,904.27,817713.0,0.0,-1.2
種類,FieldType.STRING,中古マンション等,農地,3132.0,0.0,3132.0,0.0,0.0,0.0,,,,,,,,,,,,,,
地域,FieldType.STRING,,工業地,3132.0,0.0,3132.0,0.0,0.0,1218.0,,,,,,,,,,,,,,
市区町村コード,FieldType.DECIMAL,32201.00,32528.00,3132.0,0.0,3132.0,0.0,0.0,0.0,32201.0,32201.0,32201.0,32202.0,32203.0,32206.0,32502.72,32528.0,32528.0,32233.85,86.3,7447.32,2.65,5.39
都道府県名,FieldType.STRING,島根県,島根県,3132.0,0.0,3132.0,0.0,0.0,0.0,,,,,,,,,,,,,,
市区町村名,FieldType.STRING,仁多郡奥出雲町,鹿足郡津和野町,3132.0,0.0,3132.0,0.0,0.0,0.0,,,,,,,,,,,,,,
地区名,FieldType.STRING,あけぼの西町,（大字なし）,3132.0,0.0,3132.0,0.0,0.0,0.0,,,,,,,,,,,,,,
最寄駅：名称,FieldType.STRING,,高浜(島根),3132.0,0.0,3132.0,0.0,0.0,1354.0,,,,,,,,,,,,,,
最寄駅：距離（分）,FieldType.STRING,,9,3132.0,0.0,3132.0,0.0,0.0,1356.0,,,,,,,,,,,,,,
取引価格（総額）,FieldType.DECIMAL,1200.00,500000000.00,3132.0,0.0,3132.0,0.0,0.0,0.0,3800.8,94716.8,87947.37,541006.35,3263428.57,9240196.08,31496491.23,81016666.67,207360000.0,8545262.93,19720974.62,388916839997765.25,11.43,221.26


データフローをパッケージ化する

In [9]:
# Wrap the type-converted dataflow in a dprep package ...
package = dprep.Package([type_converted_df])

# ... and write it to "dflows.dprep" in the current working directory so the
# training section can reopen it. The call stays last in the cell so its
# return value is shown as the cell output.
file_path = os.path.join(os.getcwd(), "dflows.dprep")
package.save(file_path)

Package
  name: None
  path: /home/nbuser/library/dflows.dprep
  dataflows: [
    Dataflow {
      name: 32_Shimane Prefecture_20171_20182
      steps: 5
    },
  ]

# トレーニング
最初に、利用するWorkspaceの情報を取得する。
実行する時に一度Azureにログインを行う。

In [None]:
# Obtain the workspace handle; running this cell triggers a one-time
# interactive Azure login.
ws = Workspace.from_config()
experiment_name = 'automated-ml-regression'
project_folder = './automated-ml-regression'

# Collect the environment details worth recording alongside the experiment.
output = {
    'SDK version': azureml.core.VERSION,
    'Subscription ID': ws.subscription_id,
    'Workspace': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Project Directory': project_folder,
}

# Disable column-width truncation so long IDs are shown in full.
# Passing -1 was deprecated in pandas 1.0 and is rejected by pandas 2.x, where
# None is the supported "unlimited" value. Older pandas releases only accept
# integers here, so fall back to -1 when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Render the summary as a one-column table (keys become the row labels).
pd.DataFrame(data=output, index=['']).T

In [12]:
# Reopen the package saved during data preparation from the current
# working directory.
file_path = os.path.join(os.getcwd(), "dflows.dprep")
package_saved = dprep.Package.open(file_path)

# The package holds a single dataflow; take it as the prepared data.
dflow_prepared = package_saved.dataflows[0]

# Display the profile to confirm the reloaded dataflow matches what was saved.
dflow_prepared.get_profile()

Unnamed: 0,Type,Min,Max,Count,Missing Count,Not Missing Count,Percent missing,Error Count,Empty count,0.1% Quantile,1% Quantile,5% Quantile,25% Quantile,50% Quantile,75% Quantile,95% Quantile,99% Quantile,99.9% Quantile,Mean,Standard Deviation,Variance,Skewness,Kurtosis
No,FieldType.DECIMAL,1.00,3132.00,3132.0,0.0,3132.0,0.0,0.0,0.0,3.63,313.7,312.5,783.5,1566.5,2349.5,2975.9,3101.18,3129.37,1566.5,904.27,817713.0,0.0,-1.2
種類,FieldType.STRING,中古マンション等,農地,3132.0,0.0,3132.0,0.0,0.0,0.0,,,,,,,,,,,,,,
地域,FieldType.STRING,,工業地,3132.0,0.0,3132.0,0.0,0.0,1218.0,,,,,,,,,,,,,,
市区町村コード,FieldType.DECIMAL,32201.00,32528.00,3132.0,0.0,3132.0,0.0,0.0,0.0,32201.0,32201.0,32201.0,32202.0,32203.0,32206.0,32502.72,32528.0,32528.0,32233.85,86.3,7447.32,2.65,5.39
都道府県名,FieldType.STRING,島根県,島根県,3132.0,0.0,3132.0,0.0,0.0,0.0,,,,,,,,,,,,,,
市区町村名,FieldType.STRING,仁多郡奥出雲町,鹿足郡津和野町,3132.0,0.0,3132.0,0.0,0.0,0.0,,,,,,,,,,,,,,
地区名,FieldType.STRING,あけぼの西町,（大字なし）,3132.0,0.0,3132.0,0.0,0.0,0.0,,,,,,,,,,,,,,
最寄駅：名称,FieldType.STRING,,高浜(島根),3132.0,0.0,3132.0,0.0,0.0,1354.0,,,,,,,,,,,,,,
最寄駅：距離（分）,FieldType.STRING,,9,3132.0,0.0,3132.0,0.0,0.0,1356.0,,,,,,,,,,,,,,
取引価格（総額）,FieldType.DECIMAL,1200.00,500000000.00,3132.0,0.0,3132.0,0.0,0.0,0.0,3800.8,94716.8,87947.37,541006.35,3263428.57,9240196.08,31496491.23,81016666.67,207360000.0,8545262.93,19720974.62,388916839997765.25,11.43,221.26


In [13]:
# Columns used as model inputs (features).
feature_columns = [
    '市区町村コード',
    '間取り',
    '改装',
    '前面道路：幅員（ｍ）',
    '延床面積（㎡）',
]

# Restrict the prepared dataflow to the feature columns only.
dflow_X = dflow_prepared.keep_columns(feature_columns)

In [14]:
# Column the model is trained to predict (the total transaction price).
target_column = '取引価格（総額）'

# Restrict the prepared dataflow to the target column only.
dflow_y = dflow_prepared.keep_columns(target_column)

In [15]:
# Materialise the feature and target dataflows as pandas DataFrames.
x_df = dflow_X.to_pandas_dataframe()
y_df = dflow_y.to_pandas_dataframe()

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_df,
    y_df,
    test_size=0.2,
    random_state=223,
)

# Show the training target flattened to a 1-D array (the form passed to
# AutoML later in the notebook).
y_train.values.flatten()

array([1.1e+04, 1.9e+04, 8.4e+06, ..., 3.4e+07, 8.2e+05, 2.3e+07])

In [16]:
# Keyword settings that are unpacked into AutoMLConfig further down.
automl_settings = dict(
    iteration_timeout_minutes=10,
    iterations=30,
    primary_metric='spearman_correlation',
    preprocess=True,
    verbosity=logging.INFO,
    n_cross_validations=5,
)

In [17]:
# AutoML is given plain arrays: the feature matrix and a flattened 1-D target.
training_features = x_train.values
training_target = y_train.values.flatten()

# Regression task configuration; the remaining options come from
# automl_settings.
automated_ml_config = AutoMLConfig(
    task='regression',
    debug_log='automated_ml_errors.log',
    path=project_folder,
    X=training_features,
    y=training_target,
    **automl_settings,
)

In [18]:
# Create (or look up) the named experiment in the workspace.
experiment = Experiment(workspace=ws, name=experiment_name)

# Submit the AutoML configuration; show_output=True streams the per-iteration
# progress table into the cell output.
local_run = experiment.submit(automated_ml_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_ae8b2755-4234-448b-8dbd-0d7a9a13955a
********************************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
SAMPLING %: Percent of the training data to sample.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
********************************************************************************************************************

 ITERATION   PIPELINE                                       SAMPLING %  DURATION      METRIC      BEST
         0   MaxAbsScaler RandomForest                      100.0000    0:00:48       0.7276    0.7276
         1   StandardScalerWrapper DecisionTree             100.0000    0:01:12       0.7324    0.7324
         2   StandardScalerWrapper LightGBM                 100