# Install dependencies

In [12]:
%%bash
# Install the third-party packages this notebook imports.
# NOTE(review): none of the versions are pinned, so a re-run may pick up different releases.
# wds-client -> `wds_client` (Workspace Data Service client); --upgrade pulls the newest release.
pip install wds-client --upgrade
# chess -> `chess.pgn`, used to read PGN headers.
pip install chess
# terra-notebook-utils -> `terra_notebook_utils.azure_auth`; --user installs into the per-user site-packages.
pip install terra-notebook-utils --user




[notice] A new release of pip is available: 23.1.1 -> 23.3.2
[notice] To update, run: pip install --upgrade pip





[notice] A new release of pip is available: 23.1.1 -> 23.3.2
[notice] To update, run: pip install --upgrade pip





[notice] A new release of pip is available: 23.1.1 -> 23.3.2
[notice] To update, run: pip install --upgrade pip


In [13]:
import chess.pgn
import requests
import os
import json
import wds_client
from terra_notebook_utils import azure_auth

# Constants and helper functions

In [14]:
# Base domain shared by the two service URLs built below.
DOMAIN = 'dsde-prod.broadinstitute.org'
# Workspace service endpoint; get_sas_token() posts to a path underneath it.
WSM_BASE_URL = f'https://workspace.{DOMAIN}/api/workspaces/v1'
# Leonardo apps endpoint; get_wds_url() queries it for the workspace's WDS app.
LEO_BASE_URL = f'https://leonardo.{DOMAIN}/api/apps/v2'
# Workspace identifiers are read from the environment; a missing variable raises KeyError here.
WORKSPACE_ID = os.environ['WORKSPACE_ID']
WORKSPACE_STORAGE_CONTAINER_ID = os.environ['WORKSPACE_STORAGE_CONTAINER_ID']
WORKSPACE_STORAGE_CONTAINER_URL = os.environ['WORKSPACE_STORAGE_CONTAINER_URL']
# Name of the PGN file: the blob name inside the storage container and the local file opened for parsing.
FILE_NAME = 'lichess_robitto_2024-01-21.pgn'
# Passed as the version argument of the WDS batch_write_records call
# (presumably the WDS API version — confirm against the WDS docs).
VERSION = 'v0.2'

def get_token():
    """Return the Azure access token supplied by ``azure_auth``."""
    token = azure_auth.get_azure_access_token()
    return token

def headers():
    """Build the HTTP headers for an authenticated JSON request.

    The bearer token is obtained through get_token() on every call.

    Returns:
        dict with the ``Authorization`` and ``Accept`` headers.
    """
    bearer = f"Bearer {get_token()}"
    return dict(Authorization=bearer, Accept="application/json")

def get_sas_token():
    """Get a SAS token for the workspace storage container.

    Posts to the workspace service's ``getSasToken`` endpoint for
    WORKSPACE_STORAGE_CONTAINER_ID and returns the ``token`` field of the
    JSON response.

    Returns:
        str: the SAS token, ready to be appended to a blob URL after a ``?``.

    Raises:
        requests.HTTPError: if the service answers with anything other than 200.
            Failing loudly here prevents the error body from being spliced into
            a blob URL as if it were a token.
    """
    uri = f"{WSM_BASE_URL}/{WORKSPACE_ID}/resources/controlled/azure/storageContainer/{WORKSPACE_STORAGE_CONTAINER_ID}/getSasToken"
    response = requests.post(uri, headers=headers())
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Failed to get SAS token (HTTP {response.status_code}): {response.text}",
            response=response,
        )
    return response.json()['token']

def get_wds_url():
    """Get the URL of this workspace's Workspace Data Service (WDS) app.

    Queries Leonardo for the app named ``wds-<WORKSPACE_ID>`` and returns the
    ``wds`` entry of its ``proxyUrls``.

    Returns:
        str: the WDS proxy URL.

    Raises:
        requests.HTTPError: if Leonardo answers with anything other than 200.
            Failing loudly here prevents the error body from being used as the
            API client's host.
    """
    uri = f"{LEO_BASE_URL}/{WORKSPACE_ID}/wds-{WORKSPACE_ID}"
    response = requests.get(uri, headers=headers())
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Failed to get WDS url (HTTP {response.status_code}): {response.text}",
            response=response,
        )
    return response.json()['proxyUrls']['wds']

# Copy the PGN file to disk

In [15]:
pgn_file_base = f"{WORKSPACE_STORAGE_CONTAINER_URL}/{FILE_NAME}?{get_sas_token()}"
pgn_file = f"{pgn_file_base}?{get_sas_token()}"
! azcopy copy '{pgn_file}' .

INFO: Scanning...
INFO: Any empty folders will not be processed, because source and/or destination doesn't have full folder support

failed to perform copy command due to error: cannot start job due to error: cannot list files due to reason -> github.com/Azure/azure-storage-blob-go/azblob.newStorageError, /home/vsts/go/pkg/mod/github.com/!azure/azure-storage-blob-go@v0.15.0/azblob/zc_storage_error.go:42
===== RESPONSE ERROR (ServiceCode=AuthenticationFailed) =====
Description=403 Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature., Details: (none)
   HEAD https://lz88a1ce71eb2a5df44002f0.blob.core.windows.net/sc-0805d3a9-a309-415b-af9c-f7b5de0c01ea/lichess_robitto_2024-01-21.pgn?rscd=100182905304480635229%3Fsv%3D2021-12-02&se=2024-01-23t19%3A33%3A55z&sig=-REDACTED-&sp=racwdlt&spr=https&sr=c&st=2024-01-23t18%3A18%3A55z&sv=2021-12-02&timeout=901
   User-Agent: [AzCopy/10.18.1 Azure-Storage/0.15 (go1.19.8; lin

# Prepare Workspace Data Service clients

In [16]:
# Shared API client: every request carries the bearer token fetched once, here.
# NOTE(review): the token is not refreshed afterwards — re-run this cell if calls start failing with auth errors.
api_client = wds_client.ApiClient(header_name='Authorization', header_value="Bearer " + get_token())
# Point the client at this workspace's WDS instance, as reported by get_wds_url().
api_client.configuration.host = get_wds_url()

# Set up one client per WDS API group, all sharing api_client.
# records_client is the one used for the batch upload; the others are not used in the cells shown here.
records_client = wds_client.RecordsApi(api_client)
generalInfo_instance = wds_client.GeneralWDSInformationApi(api_client)
schema_instance = wds_client.SchemaApi(api_client)
client_instance = wds_client.InstancesApi(api_client)

# Parse PGN file and upload to WDS in batches

In [17]:
table_name = 'games'
# PGN header tags copied into each WDS record; a tag missing from a game is stored as "?".
cols = [
    'Event',
    'Site',
    'Date',
    'White',
    'Black',
    'Result',
    'WhiteElo',
    'BlackElo',
    'WhiteRatingDiff',
    'BlackRatingDiff',
    'Variant',
    'TimeControl',
    'ECO',
    'Opening',
    'Termination'
]
batch_size = 1000
batch_upsert = []
i = 0
last_updated = 0


def _upload_batch(operations, first, last):
    """Send one batch of upsert operations to WDS and print the service's reply.

    Args:
        operations: list of wds_client BatchOperation objects to write.
        first: 1-based number of the first game in the batch (log line only).
        last: 1-based number of the last game in the batch (log line only).
    """
    print(f"Uploading games {first}-{last}...")
    print(records_client.batch_write_records(WORKSPACE_ID, VERSION, table_name, operations))


# The context manager closes the PGN file even if parsing or an upload fails.
with open(FILE_NAME) as pgn:
    while True:
        # File position before the game is read; stored with the record
        # (presumably so the game can be re-read later via seek()).
        offset = pgn.tell()
        # Deliberately not named `headers`: that would overwrite the headers()
        # helper in the notebook namespace and break get_sas_token()/get_wds_url()
        # for every cell run after this one.
        game_headers = chess.pgn.read_headers(pgn)
        if game_headers is None:
            # read_headers returned None: no further game in the file.
            break

        attrs = {'offset': offset, 'pgn_file': pgn_file_base}
        for c in cols:
            attrs[c] = game_headers.get(c, "?")
        # The running game index (as a string) is the record id.
        record = wds_client.models.BatchRecordRequest(str(i), table_name, attrs)
        batch_upsert.append(wds_client.models.BatchOperation('upsert', record))
        i += 1
        if i % batch_size == 0:
            _upload_batch(batch_upsert, last_updated + 1, i)
            batch_upsert.clear()
            last_updated = i

# Flush the final partial batch. Skipped when nothing is pending (zero games, or a
# game count that is an exact multiple of batch_size), which would otherwise print
# a bogus range such as "1001-1000" and send an empty write.
if batch_upsert:
    _upload_batch(batch_upsert, last_updated + 1, i)
    batch_upsert.clear()
    

Uploading games 1-1000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 1001-2000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 2001-3000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 3001-4000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 4001-5000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 5001-6000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 6001-7000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 7001-8000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 8001-9000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 9001-10000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 10001-11000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 11001-12000...
{'message': 'Huzzah', 'records_modified': 1000}
Uploading games 12001-13000...
{'message': 'Huzzah', 'records_modified': 1