In [9]:
import os
import json
from azureml.core import Workspace, Dataset, Datastore
from azureml.data.datapath import DataPath

### Create word_to_index.json

In [11]:
# Build the vocabulary: give every distinct space-separated token found in the
# first tab-separated field of each line of data/data.txt an integer id, then
# save the mapping as JSON.
path = 'data/data.txt'
# Ids 0 and 1 are reserved for the padding and unknown-word tokens.
word_to_index = {"[PAD]": 0, "[UNK]": 1}
index = 2
with open(path, 'r', encoding='utf-8') as f:
    # Iterate over the file lazily instead of materialising it with readlines().
    for line in f:
        # Drop the line terminator before splitting so that a line without a
        # tab does not leak "\n" into its last token.
        text = line.rstrip('\n').split('\t')[0]
        for word in text.split(' '):
            if word not in word_to_index:
                word_to_index[word] = index
                index += 1

with open('data/word_to_index.json', 'w', encoding='utf-8') as f:
    json.dump(word_to_index, f)

### Prepare data for batch inference

We need to create one file per input sample.

In [12]:
# Write the text field of each of the first `num` lines of the source data
# file to its own file, named by its zero-based position, inside `dir_`.
dir_ = 'data_for_batch_inference'
os.makedirs(dir_, exist_ok=True)
num = 200
with open(path, 'r', encoding='utf-8') as f:
    # zip() against range(num) stops after `num` lines, so the rest of the
    # file is never read.
    lines = [line.split('\t')[0] for _, line in zip(range(num), f)]
for i, line in enumerate(lines):
    # Use a dedicated name here: rebinding `path` would make a re-run of this
    # cell read the last sample file instead of the source data file.
    sample_path = os.path.join(dir_, str(i))
    with open(sample_path, 'w', encoding='utf-8') as f:
        f.write(line)

### Prepare your workspace

In [13]:
# Load the Azure ML workspace described by the local config.json file.
workspace = Workspace.from_config(path='config.json')
# Bare expression so the notebook displays the workspace that was loaded.
workspace

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental')

### Datastore

In [14]:
# Upload the local 'data' directory to the 'my_dataset' folder of the
# workspace's 'workspaceblobstore' datastore.
path_on_datastore = 'my_dataset'
datastore = Datastore.get(
    workspace=workspace,
    datastore_name='workspaceblobstore',
)
# The upload call is the cell's last expression, so its return value is shown.
datastore.upload(
    src_dir='data',
    target_path=path_on_datastore,
    overwrite=True,
    show_progress=True,
)

Uploading an estimated of 5 files
Uploading data\char_to_index.json
Uploading data\data.txt
Uploading data\data_origin.txt
Uploading data\label.txt
Uploading data\word_to_index.json
Uploaded data\label.txt, 1 files out of an estimated total of 5
Uploaded data\char_to_index.json, 2 files out of an estimated total of 5
Uploaded data\word_to_index.json, 3 files out of an estimated total of 5
Uploaded 3 files


--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\50657\miniconda3\envs\tmp2\lib\site-packages\urllib3\connectionpool.py", line 677, in urlopen
    chunked=chunked,
  File "C:\Users\50657\miniconda3\envs\tmp2\lib\site-packages\urllib3\connectionpool.py", line 392, in _make_request
    conn.request(method, url, **httplib_request_kw)
  File "C:\Users\50657\miniconda3\envs\tmp2\lib\http\client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\Users\50657\miniconda3\envs\tmp2\lib\http\client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\Users\50657\miniconda3\envs\tmp2\lib\http\client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "C:\Users\50657\miniconda3\envs\tmp2\lib\http\client.py", line 1065, in _send_output
    self.send(chunk)
  File "C:\Users\50657\miniconda3\envs\tmp2\lib\http\client.py", lin

$AZUREML_DATAREFERENCE_d86a6dd4621144198fcf2d7be8847eb9

### Register the dataset


In [15]:
# Register the files under `path_on_datastore` as a file dataset named
# 'THUCNews'; create_new_version=True is passed to register().
dataset_name = 'THUCNews'
# NOTE(review): "filtering and filtering" looks like duplicated wording
# (possibly "screening and filtering") - confirm before changing the text.
description = (
    'THUCNews dataset is generated by filtering and filtering historical data '
    'of Sina News RSS subscription channel from 2005 to 2011'
)
datastore_path = [
    DataPath(datastore=datastore, path_on_datastore=path_on_datastore),
]
data = Dataset.File.from_files(path=datastore_path)
# The register call is the cell's last expression, so its return value is shown.
data.register(
    workspace=workspace,
    name=dataset_name,
    description=description,
    create_new_version=True,
)

{
  "source": [
    "('workspaceblobstore', 'my_dataset')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "0c7ae634-ad70-40d5-88cb-9e40c3572999",
    "name": "THUCNews",
    "version": 2,
    "description": "THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011",
    "workspace": "Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental')"
  }
}

### Datastore (batch-inference data)

In [16]:
# Upload the local 'data_for_batch_inference' directory to the
# 'my_dataset_for_batch_inference' folder of the 'workspaceblobstore' datastore.
# This rebinds `path_on_datastore` and `datastore`; the registration cell for
# the batch-inference dataset reads these values.
path_on_datastore = 'my_dataset_for_batch_inference'
datastore = Datastore.get(
    workspace=workspace,
    datastore_name='workspaceblobstore',
)
# The upload call is the cell's last expression, so its return value is shown.
datastore.upload(
    src_dir='data_for_batch_inference',
    target_path=path_on_datastore,
    overwrite=True,
    show_progress=True,
)

Uploading an estimated of 200 files
Uploading data_for_batch_inference\0
Uploading data_for_batch_inference\1
Uploading data_for_batch_inference\10
Uploading data_for_batch_inference\100
Uploading data_for_batch_inference\101
Uploading data_for_batch_inference\102
Uploading data_for_batch_inference\103
Uploading data_for_batch_inference\104
Uploading data_for_batch_inference\105
Uploading data_for_batch_inference\106
Uploading data_for_batch_inference\107
Uploading data_for_batch_inference\108
Uploading data_for_batch_inference\109
Uploading data_for_batch_inference\11
Uploading data_for_batch_inference\110
Uploading data_for_batch_inference\111
Uploading data_for_batch_inference\112
Uploading data_for_batch_inference\113
Uploading data_for_batch_inference\114
Uploading data_for_batch_inference\115
Uploading data_for_batch_inference\116
Uploading data_for_batch_inference\117
Uploading data_for_batch_inference\118
Uploading data_for_batch_inference\119
Uploading data_for_batch_inference

$AZUREML_DATAREFERENCE_6fca592c224f449c8f06f51e2cfb74a9

### Register the batch-inference dataset


In [17]:
# Register the files under `path_on_datastore` as a file dataset named
# 'THUCNews_For_Batch_Inference'; create_new_version=True is passed to register().
dataset_name = 'THUCNews_For_Batch_Inference'
# NOTE(review): "filtering and filtering" looks like duplicated wording
# (possibly "screening and filtering") - confirm before changing the text.
description = (
    'THUCNews dataset is generated by filtering and filtering historical data '
    'of Sina News RSS subscription channel from 2005 to 2011'
)
datastore_path = [
    DataPath(datastore=datastore, path_on_datastore=path_on_datastore),
]
data = Dataset.File.from_files(path=datastore_path)
# The register call is the cell's last expression, so its return value is shown.
data.register(
    workspace=workspace,
    name=dataset_name,
    description=description,
    create_new_version=True,
)

{
  "source": [
    "('workspaceblobstore', 'my_dataset_for_batch_inference')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "6d939c01-f8a8-49f9-a233-cf9ce2c55ccf",
    "name": "THUCNews_For_Batch_Inference",
    "version": 1,
    "description": "THUCNews dataset is generated by filtering and filtering historical data of Sina News RSS subscription channel from 2005 to 2011",
    "workspace": "Workspace.create(name='fundamental3', subscription_id='4f455bd0-f95a-4b7d-8d08-078611508e0b', resource_group='fundamental')"
  }
}