# Find UUIDs we want in the cache

Before running this notebook, start your local database with the anonymous data import!

In [None]:
from sqlalchemy import create_engine
import pandas as pd
import mysql.connector

# Turn off pandas' chained-assignment check and remove the row limit so that
# frames shown as cell output are rendered completely.
pd.set_option("mode.chained_assignment", None)
pd.options.display.max_rows = None

# SQLAlchemy engine for the local "serlo" MySQL database.
engine = create_engine("mysql+mysqlconnector://root:secret@localhost:3306/serlo")

In [None]:
# Ids of all entities that are not trashed and whose type is one of the
# supported entity types, see
# https://github.com/serlo/database-layer/blob/main/server/src/uuid/model/entity/entity_type.rs#L10-L25
entities = pd.read_sql(
    con=engine,
    sql="""
    SELECT entity.id
    FROM entity
    JOIN uuid ON uuid.id = entity.id
    JOIN type ON type.id = entity.type_id
    WHERE trashed = 0
    AND type.name IN (
        "applet", "article", "course", "course-page", "event", 
        "text-exercise", "text-exercise-group", "grouped-text-exercise", "text-solution", "video"
    )
    """,
)
entities

In [None]:
# Ids of the current revisions of all entities that are not trashed, have a
# current revision and whose type is one of the supported entity types.
current_revisions = pd.read_sql(
    con=engine,
    sql="""
    SELECT entity.current_revision_id AS id
    FROM entity
    JOIN uuid ON uuid.id = entity.id
    JOIN type ON type.id = entity.type_id
    WHERE trashed = 0
    AND current_revision_id IS NOT NULL
    AND type.name IN (
        "applet", "article", "course", "course-page", "event", 
        "text-exercise", "text-exercise-group", "grouped-text-exercise", "text-solution", "video"
    )
    """,
)
current_revisions

In [None]:
# Ids of all taxonomy terms that are not trashed.
taxonomies = pd.read_sql(
    con=engine,
    sql="""
    SELECT term_taxonomy.id
    FROM term_taxonomy
    JOIN uuid ON uuid.id = term_taxonomy.id
    WHERE trashed = 0
    """,
)
taxonomies

# Define functions to make GraphQL queries for the UUIDs

Here you have to switch depending on whether you want to refill the cache of the production or the staging environment:

In [None]:
# GraphQL endpoint that all requests of this notebook are sent to.
# Swap which of the two assignments is commented out to target production
# instead of staging.
# GRAPHQL_API = "https://api.serlo.org/graphql"
GRAPHQL_API = "https://api.serlo-staging.dev/graphql"

import requests

def api_call(query, variables=None):
    """Send a GraphQL request to ``GRAPHQL_API`` and return the decoded reply.

    Args:
        query: GraphQL document to execute.
        variables: Optional mapping with the GraphQL variables of the query.
            When omitted, an empty mapping is sent.

    Returns:
        The JSON body of the HTTP response as decoded by ``requests``.
    """
    # Build a fresh dict per call rather than sharing one mutable default
    # object between all calls.
    payload = {
        "query": query,
        "variables": {} if variables is None else variables,
    }
    response = requests.post(
        GRAPHQL_API,
        headers={"Content-Type": "application/json"},
        json=payload,
    )

    return response.json()

# Quick check of the configured endpoint: ask the API for its version.
api_call(" query { version }")

In [None]:
def get_uuid(uuid):
    """Query the GraphQL API for the ``__typename`` of the object with id ``uuid``.

    Returns the decoded response of ``api_call`` unchanged.
    """
    query = "query($uuid: Int!) { uuid(id: $uuid) { __typename }}"
    variables = {"uuid": uuid}
    return api_call(query, variables)

# Try the helper once with id 1.
get_uuid(1)

In [None]:
def update_cache(row_with_id):
    """Request one UUID through the GraphQL API and print the outcome.

    Args:
        row_with_id: Row-like object (e.g. a pandas ``Series``) whose
            ``"id"`` entry holds the UUID to request.
    """
    uuid = int(row_with_id["id"])
    result = get_uuid(uuid)

    # "data" may be absent or null and its "uuid" entry may be absent or
    # null; none of these cases should raise, because an exception here would
    # abort the processing of all remaining rows of the frame.
    data = result.get("data") or {}
    if data.get("uuid") is not None:
        print(f"Uuid updated: {uuid}")
    else:
        print(f"No uuid:      {uuid}")

# Query UUIDs to get them back into the cache

We process the frames with the UUIDs in parallel so it doesn't take forever. 
Making GraphQL queries is an I/O-bound task, so you could try using even more workers than you have CPU cores.

In [None]:
from tqdm.notebook import tqdm
import concurrent.futures

# Cut the two long frames at their midpoint so that the halves can be
# processed in parallel. For more speed, try cutting them into even more
# parts and passing a higher max_workers value to ThreadPoolExecutor.
half_entities, half_revisions = len(entities) // 2, len(current_revisions) // 2
uuid_frames = [
    entities.iloc[:half_entities],
    entities.iloc[half_entities:],
    current_revisions.iloc[:half_revisions],
    current_revisions.iloc[half_revisions:],
    taxonomies,
]
# Every frame is handled by the same function.
functions = [update_cache for _ in uuid_frames]

# A single progress bar for all frames; its total is the combined row count.
total_iterations = sum(map(len, uuid_frames))
progress_bar = tqdm(total=total_iterations)

def process_dataframe(data_frame, func):
    """Call ``func`` on every row of ``data_frame``, one row at a time.

    After each call the module-level ``progress_bar`` is advanced by one.
    """
    records = (record for _, record in data_frame.iterrows())
    for record in records:
        func(record)
        progress_bar.update(1)

# Process every frame in its own task. The futures are kept so that an
# exception raised inside a worker is reported instead of vanishing silently.
try:
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_dataframe, uuid_frame, func)
            for uuid_frame, func in zip(uuid_frames, functions)
        ]
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                # Best effort: report the failure and let the other frames finish.
                print(f"Processing a frame failed: {error!r}")
finally:
    # Close the bar even when the run is interrupted.
    progress_bar.close()