# Data Collection

### Authorize EMSI Skills calls


In [17]:
import json
import os
from requests import request
from datetime import datetime, timedelta
from time import sleep


def authorize():
    """Request a fresh EMSI access token with the client-credentials grant.

    Reads the client id and secret from ``../config.json`` (keys
    ``emsi.id`` and ``emsi.secret``) and posts them to the EMSI auth
    endpoint.

    Returns:
        A dict with the bearer token under ``'token'`` and, under
        ``'expires'``, the local time until which this notebook treats the
        token as usable (55 minutes from now).

    Raises:
        requests.HTTPError: if the auth endpoint answers with an error status.
        KeyError: if the config file or the response lacks an expected key.
    """
    with open('../config.json', 'r') as auth:
        obj = json.load(auth)
    clientId = obj['emsi']['id']
    clientSecret = obj['emsi']['secret']

    url = 'https://auth.emsicloud.com/connect/token'

    # Pass the form fields as a dict so requests URL-encodes each value;
    # concatenating them by hand breaks on secrets containing '&', '=', '+'
    # or '%'.
    payload = {
        'client_id': clientId,
        'client_secret': clientSecret,
        'grant_type': 'client_credentials',
        'scope': 'emsi_open',
    }
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}

    response = request('POST', url, data=payload, headers=headers)
    # Surface rejected credentials as an HTTP error instead of a KeyError
    # on the missing 'access_token' field below.
    response.raise_for_status()
    token = response.json()['access_token']
    # NOTE(review): 55 minutes is presumably a safety margin below the
    # server-side token lifetime -- confirm against the EMSI auth docs.
    return {'token': token, 'expires': datetime.now() + timedelta(minutes=55)}


def renew(tokenObj=None):
    """Return a usable token object, re-authorizing only when necessary.

    Args:
        tokenObj: A dict as returned by ``authorize()`` (keys ``'token'``
            and ``'expires'``). ``None``, an empty dict, or a dict without
            an ``'expires'`` key all trigger a fresh authorization.

    Returns:
        ``tokenObj`` itself while its ``'expires'`` time is still in the
        future, otherwise the result of a new ``authorize()`` call.
    """
    if tokenObj is None:
        return authorize()

    try:
        # The token is reusable only while its expiry lies in the future.
        stillValid = tokenObj['expires'] > datetime.now()
    except KeyError:
        # No expiry recorded: treat the token as unusable.
        stillValid = False

    return tokenObj if stillValid else authorize()


# Obtain the initial access token: called without an argument, renew() has no
# expiry to check and falls through to authorize().
token = renew()


In [18]:
def header(token):
    """Build the Authorization header for an API request.

    Args:
        token: Mapping holding the bearer token string under ``'token'``.

    Returns:
        A one-entry dict suitable for the ``headers`` argument of a request.
    """
    credentials = 'Bearer ' + token['token']
    return {'Authorization': credentials}


# Root URL of the skills API; the '/skills' endpoint paths used below are
# appended to it.
baseurl = 'https://emsiservices.com/skills/versions/latest'


### Fetch all skills


In [13]:
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100):
    """Draw a single-line text progress bar on stdout, overwriting in place.

    Args:
        iteration: Number of steps completed so far.
        total: Total number of steps; must be non-zero.
        prefix: Text printed before the bar.
        suffix: Text printed after the percentage.
        decimals: Number of decimal places shown in the percentage.
        length: Width of the bar in characters.

    The bar is always exactly ``length`` characters wide: completed steps are
    drawn as ``=`` ending in a ``>`` head, remaining steps as ``-``. A final
    newline is printed once ``iteration == total`` so later output starts on
    a fresh line.

    Raises:
        ZeroDivisionError: if ``total`` is 0.
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 *
                                                     (iteration / float(total)))

    # Clamp so out-of-range iterations cannot stretch or shrink the bar.
    filledLength = max(0, min(length, int(length * iteration // total)))

    # Only draw the '>' head when at least one cell is filled; otherwise the
    # bar would come out one character wider than `length`.
    filled = '=' * (filledLength - 1) + '>' if filledLength > 0 else ''
    bar = filled + '-' * (length - filledLength)

    # Leading and trailing carriage returns keep the cursor at the line start
    # so the next call redraws over this one.
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r')

    if iteration == total:
        print()


In [14]:
# Fetch the skill listing once; the 'data' field of the JSON response is kept
# and each of its entries is later read for its 'id'.
skillIds = request('GET', baseurl+'/skills',
                   headers=header(token)).json()['data']


In [19]:
# Download the detail record of every skill into ../results/skills.json.
# The run is resumable: after each skill the full list is rewritten and the
# index of that skill is stored in ../results/index.tmp, so an interrupted
# run continues with the next skill instead of starting over.
skills = []

lastIndex = 0    # index of the last skill a previous run completed
resumeFrom = 0   # index of the first skill this run still has to download
with open('../results/index.tmp', 'a+') as indexFile:
    # 'a+' creates the file when missing but starts at its end; rewind to read.
    indexFile.seek(0)
    try:
        lastIndex = int(indexFile.read())
    except ValueError:
        # Empty or unparsable checkpoint: keep lastIndex at 0.
        pass

    with open('../results/skills.json', 'a+', encoding='utf8') as skillFile:
        skillFile.seek(0)
        try:
            skills = json.load(skillFile)
        except json.JSONDecodeError:
            print('No json in skills.json')
            lastIndex = 0
        else:
            # Trust the checkpoint only if the saved list holds exactly the
            # skills 0..lastIndex and its last entry is the skill that the
            # index points at; anything else means the two files disagree.
            checkpointIsValid = (
                isinstance(skills, list)
                and 0 <= lastIndex < len(skillIds)
                and len(skills) == lastIndex + 1
                and skills[-1]['id'] == skillIds[lastIndex]['id']
            )
            if checkpointIsValid:
                # Skill lastIndex is already stored; continue with the next
                # one so it is not downloaded and appended a second time.
                resumeFrom = lastIndex + 1
            else:
                lastIndex = 0
                skills = []

        for i, skill in enumerate(skillIds):
            if i < resumeFrom:
                continue

            response = request('GET',
                               baseurl+'/skills/'+skill['id'],
                               headers=header(token))
            # Stop on HTTP errors rather than failing later on a missing
            # 'data' field.
            response.raise_for_status()
            skills.append(response.json()['data'])

            # Rewrite the whole list so the file is valid JSON after every
            # step. Files are opened in append mode, so after truncating to
            # zero the write lands at the (new) end of the file.
            skillFile.truncate(0)
            skillFile.write(json.dumps(skills, indent=2))
            # Flush now: otherwise the data stays in the buffer until the
            # next truncate() flushes and immediately erases it.
            skillFile.flush()

            indexFile.truncate(0)
            indexFile.write(str(i))
            indexFile.flush()

            printProgressBar(i+1,
                             len(skillIds),
                             suffix='downloaded, skill '+skill['id']+' '+str(i+1)+'/'+str(len(skillIds)),
                             decimals=3)

            token = renew(token)
            sleep(0.2)  # 5 requests per second max

# All skills were downloaded; the checkpoint is no longer needed.
os.remove('../results/index.tmp')




[**Preprocess data**](./preprocessing.ipynb)