# mlflow tutorial
### 参考URL
- https://www.mlflow.org/docs/latest/tracking.html
- https://blog.imind.jp/entry/2019/06/14/223224
- https://qiita.com/masa26hiro/items/574c48d523ed76e76a3b
- https://qiita.com/ike_dai/items/3121a8cc8398c6ec6a33

In [1]:
import mlflow
import pandas as pd

# set tracking uri

In [2]:
# Choose where MLflow stores its tracking data, then read the setting back.
# Without an explicit URI the store is created right under the current
# directory; once a URI is set, MLflow keeps looking under that location.
# Remote stores such as S3 can apparently be used too, which looks handy for
# team work (probably git as well), and since Databricks develops MLflow a
# Databricks workspace can seemingly be specified as well.
# Experiment results apparently need to live in a folder named "mlruns".
tracking_uri = '../mlflow/test2/mlruns/'
mlflow.set_tracking_uri(tracking_uri)
mlflow.tracking.get_tracking_uri()

'../mlflow/test2/mlruns/'

# create experiment

In [3]:
# Create the experiment and keep its ID (the return value) for later use.
# Experiments can be split up in many ways (per model, per feature set,
# per project, ...).
# create_experiment fails when the name is already taken, so look the
# experiment up first; this keeps the cell safe to run more than once.
experiment_name = 'compare_max_depth'
existing_experiment = mlflow.tracking.MlflowClient().get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    # Reuse the ID of the experiment that already exists under this name.
    experiment_id = existing_experiment.experiment_id

  if not isinstance(key, collections.Hashable):


In [4]:
# Select the experiment to work in; it is created automatically if it does not exist.
mlflow.set_experiment('compare_max_depth')

# run!

In [5]:
# Fetch the list of runs; before the first run has been executed it is of course empty.
mlflow.search_runs()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time


In [9]:
# Build a tiny DataFrame and save it as a local CSV file to use as an artifact.
d = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(data=d)
csv_path = './test.csv'
df.to_csv(csv_path)

# Record one run containing the artifact, a parameter and a metric.
with mlflow.start_run():
    # log_artifact logs a local file or directory as an artifact
    # (not tried yet, but things like pickle files should presumably work too).
    mlflow.log_artifact(csv_path)
    mlflow.log_param('param1', 3)
    mlflow.log_metric('metric1', 0.3)

In [7]:
# Fetch the list of runs again; the logged results can now be inspected.
# The Tracking UI (see "Viewing the Tracking UI" in the quickstart below)
# apparently shows the same information in a much nicer interface.
# https://www.mlflow.org/docs/latest/quickstart.html#quickstart
mlflow.search_runs()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.metric1,params.param1,tags.mlflow.user,tags.mlflow.source.git.commit,tags.mlflow.source.type,tags.mlflow.source.name
0,5338b53abf2b45a6bfe48c1e6f6e6ea5,1,FINISHED,../mlflow/test2/mlruns/1/5338b53abf2b45a6bfe48...,2020-03-15 05:16:06.466000+00:00,2020-03-15 05:16:06.589000+00:00,0.1,1,s.imazeki,97bf8b5f00284af63c34ae660e047ad3cf497369,LOCAL,/Users/s.imazeki/.pyenv/versions/anaconda3-5.3...


In [47]:
# Switching to a different experiment also changes which runs are listed.
mlflow.set_experiment('test_runs2')
mlflow.search_runs()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time


# download artifacts

In [14]:
# List the artifacts that were logged for a given run.
tracking = mlflow.tracking.MlflowClient()
tracking.list_artifacts(run_id='9175be40c10d4a6491cfb8536fbc6ea8')

[<FileInfo: file_size=23, is_dir=False, path='test.csv'>]

In [15]:
# When no destination is given, the artifact ends up inside the artifact URI location.
tracking.download_artifacts('9175be40c10d4a6491cfb8536fbc6ea8', 'test.csv')

'/Users/s.imazeki/Documents/kaggle/NCAA/mlflow/1/9175be40c10d4a6491cfb8536fbc6ea8/artifacts/test.csv'

In [16]:
# Same download, but with a destination directory ('./') passed as the third argument.
tracking.download_artifacts('9175be40c10d4a6491cfb8536fbc6ea8', 'test.csv', './')

'/Users/s.imazeki/Documents/kaggle/NCAA/notebooks/test.csv'

# tips的な

In [33]:
# Runs that failed or are no longer needed can be deleted.
# In practice this would probably be done by filtering on status and deleting the failed runs.
# Passing an existing run_id to start_run apparently lets you overwrite that run
# (specifying an ID that does not exist gives "not found").
tracking = mlflow.tracking.MlflowClient()
tracking.delete_run('067f32de2308491997e44d1c958842c9')

In [12]:
# Look up an experiment by its ID and read its name.
tracking = mlflow.tracking.MlflowClient()
experiment = tracking.get_experiment('1')
experiment.name

'compare_max_depth'

In [19]:
# MlflowClient: "interface to MLflow experiments and runs".
# Not fully understood yet, but the client in the tracking module seems to
# expose more operations than the top-level functions (rough impression).
tracking = mlflow.tracking.MlflowClient()
experiment = tracking.get_experiment_by_name('test_runs')
# Read the ID of the experiment that was found.
experiment.experiment_id

'1'

In [20]:
# List the experiments known to the tracking store.
# NOTE(review): newer MLflow releases appear to replace list_experiments with
# search_experiments — confirm against the installed version.
tracking.list_experiments()

[<Experiment: artifact_location='../mlflow/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='../mlflow/1', experiment_id='1', lifecycle_stage='active', name='test_runs', tags={}>]

In [61]:
# Record a run with three params and one metric; 'test3' is the newly added param.
with mlflow.start_run():
    for param_name, param_value in (('test1', 1), ('test2', 2), ('test3', 3)):
        mlflow.log_param(param_name, param_value)
    mlflow.log_metric('metric', 100)

In [62]:
# When a param is added partway through, the runs logged before that show None for it.
mlflow.search_runs()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.metric,params.test3,params.test2,params.test1,tags.mlflow.source.git.commit,tags.mlflow.source.name,tags.mlflow.user,tags.mlflow.source.type
0,d39bc244caf24a62aa8973a9c3aed564,1,FINISHED,../mlflow/1/d39bc244caf24a62aa8973a9c3aed564/a...,2020-03-01 08:49:31.100000+00:00,2020-03-01 08:49:31.303000+00:00,100.0,3.0,2,1,97bf8b5f00284af63c34ae660e047ad3cf497369,/Users/s.imazeki/.pyenv/versions/anaconda3-5.3...,s.imazeki,LOCAL
1,0a41f715048f48618c44bae34de12294,1,FINISHED,../mlflow/1/0a41f715048f48618c44bae34de12294/a...,2020-03-01 08:12:21.442000+00:00,2020-03-01 08:12:21.618000+00:00,100.0,,2,1,97bf8b5f00284af63c34ae660e047ad3cf497369,/Users/s.imazeki/.pyenv/versions/anaconda3-5.3...,s.imazeki,LOCAL
2,ab9ec5df0bf94bad84a3b60601909d18,1,FINISHED,../mlflow/1/ab9ec5df0bf94bad84a3b60601909d18/a...,2020-03-01 07:55:00.316000+00:00,2020-03-01 07:55:00.473000+00:00,100.0,,2,1,97bf8b5f00284af63c34ae660e047ad3cf497369,/Users/s.imazeki/.pyenv/versions/anaconda3-5.3...,s.imazeki,LOCAL
3,efade88198304041aea82447b08367d0,1,FINISHED,../mlflow/1/efade88198304041aea82447b08367d0/a...,2020-03-01 07:54:32.705000+00:00,2020-03-01 08:34:01.761000+00:00,100.0,,2,1,97bf8b5f00284af63c34ae660e047ad3cf497369,/Users/s.imazeki/.pyenv/versions/anaconda3-5.3...,s.imazeki,LOCAL


In [63]:
# Log the same param key ('test2') twice within one run to see what happens.
# NOTE(review): newer MLflow releases may reject changing the value of an
# already-logged param — confirm against the installed version.
with mlflow.start_run():
    mlflow.log_param('test1', 1)
    mlflow.log_param('test2', 2)
    mlflow.log_param('test2', 200) # overwrites the value logged on the previous line
    mlflow.log_metric('metric', 100)

In [64]:
# The value is overwritten: the run list shows the last value logged for the key.
mlflow.search_runs()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.metric,params.test2,params.test1,params.test3,tags.mlflow.source.git.commit,tags.mlflow.source.name,tags.mlflow.user,tags.mlflow.source.type
0,db1d2c705ed94b619b99ce163d3538f5,1,FINISHED,../mlflow/1/db1d2c705ed94b619b99ce163d3538f5/a...,2020-03-01 08:51:26.196000+00:00,2020-03-01 08:51:26.370000+00:00,100.0,200,1,,97bf8b5f00284af63c34ae660e047ad3cf497369,/Users/s.imazeki/.pyenv/versions/anaconda3-5.3...,s.imazeki,LOCAL
1,d39bc244caf24a62aa8973a9c3aed564,1,FINISHED,../mlflow/1/d39bc244caf24a62aa8973a9c3aed564/a...,2020-03-01 08:49:31.100000+00:00,2020-03-01 08:49:31.303000+00:00,100.0,2,1,3.0,97bf8b5f00284af63c34ae660e047ad3cf497369,/Users/s.imazeki/.pyenv/versions/anaconda3-5.3...,s.imazeki,LOCAL
2,0a41f715048f48618c44bae34de12294,1,FINISHED,../mlflow/1/0a41f715048f48618c44bae34de12294/a...,2020-03-01 08:12:21.442000+00:00,2020-03-01 08:12:21.618000+00:00,100.0,2,1,,97bf8b5f00284af63c34ae660e047ad3cf497369,/Users/s.imazeki/.pyenv/versions/anaconda3-5.3...,s.imazeki,LOCAL
3,ab9ec5df0bf94bad84a3b60601909d18,1,FINISHED,../mlflow/1/ab9ec5df0bf94bad84a3b60601909d18/a...,2020-03-01 07:55:00.316000+00:00,2020-03-01 07:55:00.473000+00:00,100.0,2,1,,97bf8b5f00284af63c34ae660e047ad3cf497369,/Users/s.imazeki/.pyenv/versions/anaconda3-5.3...,s.imazeki,LOCAL
4,efade88198304041aea82447b08367d0,1,FINISHED,../mlflow/1/efade88198304041aea82447b08367d0/a...,2020-03-01 07:54:32.705000+00:00,2020-03-01 08:34:01.761000+00:00,100.0,2,1,,97bf8b5f00284af63c34ae660e047ad3cf497369,/Users/s.imazeki/.pyenv/versions/anaconda3-5.3...,s.imazeki,LOCAL


In [None]:
# To log several params at once, collect them in a dict and hand it to log_params.
params = dict(test1=1, test2=2)

with mlflow.start_run():
    mlflow.log_params(params)
    mlflow.log_metric('metric', 100)

In [25]:
# Start a run and remember its ID.
with mlflow.start_run() as first_run:
    run_id = first_run.info.run_id

# A run can be resumed by passing its ID back to start_run.
with mlflow.start_run(run_id=run_id):
    print('ok')