An attempt to format this notebook with `black`. It does NOT work: `black` corrupts the notebook, so the commands below are left commented out.

# TODO

In [12]:
# !pip install black[jupyter] --quiet

# from google.colab import drive
# drive.mount("/content/drive")

# # %cd /content/drive/MyDrive/'Colab Notebooks'
# !black /content/drive/MyDrive/'Colab Notebooks'/TGStat.ipynb


### 1. Install `pyppeteer`

In [13]:
%%capture
!pip install pyppeteer

### 2. Install the missing library that Pyppeteer's local copy of Chromium needs to run in the Linux/Colab environment; without it, Chromium crashes on startup

In [14]:
!apt install libxtst6

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libxtst6 is already the newest version (2:1.2.3-1).
0 upgraded, 0 newly installed, 0 to remove and 13 not upgraded.


### 3. Fix `asyncio` in the notebook environment

In [15]:
import nest_asyncio

# Patch asyncio for the notebook environment.
# NOTE(review): presumably required because the notebook kernel already runs
# an event loop, which would otherwise reject the nested run_until_complete()
# call made later in this notebook — confirm against the nest_asyncio docs.
nest_asyncio.apply()

### 4. The program

In [16]:
import os
import asyncio
import json
import csv
import string

from pyppeteer import launch
from pyppeteer.page import Page


def get_suffix(selector: str) -> str:
  """
  Chooses which DOM element property should be read for a CSS selector.

  A selector whose text ends with " a" (a descendant anchor) maps to "href",
  so the link target is read; any other selector maps to "innerText".
  """
  targets_anchor = selector.endswith(" a")
  return "href" if targets_anchor else "innerText"


async def get_instructions(directory: str) -> list:
  """
  Parses the ScrapeMate JSON export file in a given directory
  and returns its decoded content (the selectors and column names).

  A file counts as an export when its name starts with "ScrapeMate." and
  ends with ".json". When several exports are present, the one whose name
  sorts first is loaded, so the choice is the same on every run.

  Args:
    directory: path of the directory to search (not searched recursively).

  Returns:
    Whatever top-level value the JSON file holds, as produced by json.load.
    The `list` annotation is kept from the original signature — TODO confirm
    it against the actual ScrapeMate export format.

  Raises:
    FileNotFoundError: if the directory contains no ScrapeMate export.

  TODO: add the ability to load more than one file; currently, it loads
    only the first matching file
  """
  # os.listdir() yields names in arbitrary order; sorting makes "first" stable.
  for file in sorted(os.listdir(directory)):
    if file.startswith("ScrapeMate.") and file.endswith(".json"):
      # JSON is UTF-8 by specification; do not depend on the locale default.
      with open(os.path.join(directory, file), encoding="utf-8") as f:
        return json.load(f)

  # Fail here with a clear message instead of handing None to the caller.
  raise FileNotFoundError(
    f"No ScrapeMate.*.json export file found in {directory!r}"
  )


def remove_special_chars(text: str, special_chars: str) -> str:
  """
  Returns `text` with every character that occurs in `special_chars`
  replaced by an underscore, e.g. to build a file name that is safe on Windows.
  """
  return "".join("_" if ch in special_chars else ch for ch in text)

async def _collect_fields(page: Page, fields: list) -> dict:
  """Reads every field's selector on the currently loaded page into {column name: [values]}."""
  data: dict = {}
  for field in fields:
    name = field["name"]
    selector = field["selector"]

    data[name] = []
    elements = await page.querySelectorAll(selector)
    for element in elements:
      text = await page.evaluate(f'(element) => element.{get_suffix(selector)}', element)
      if text is None:
        # The property is undefined on this element; keep an empty cell so
        # the column stays aligned with the others instead of crashing.
        text = ""
      # The encode/decode round trip rejects text that cannot be stored as UTF-8.
      data[name].append(text.strip("\n").strip(" ").encode("utf-8", "strict").decode())

  return data


def _write_csv(file_name: str, data: dict) -> None:
  """Writes the collected columns to a CSV file: one header row, then one row per scraped item."""
  # newline="" is what the csv module requires to avoid blank rows on Windows.
  with open(file_name, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(data.keys())
    # zip() stops at the shortest column, so rows are only as many as the
    # field with the fewest matches.
    writer.writerows(zip(*data.values()))


async def scrape(instructions: list, page: Page):
  """
  Visits every URL of every instruction entry and saves the scraped
  fields of each page to its own CSV file in the current directory.

  Args:
    instructions: the decoded ScrapeMate export. Either a mapping whose values
      are entries, or a plain sequence of entries; each entry must provide
      "urls" (list of page URLs) and "fields" (list of dicts with "name" and
      "selector").
    page: the Pyppeteer page used for navigation and DOM queries.

  The CSV file name is the URL with punctuation and spaces replaced by "_",
  plus the ".csv" extension. Nothing is written when there are no entries.
  """
  entries = instructions.values() if isinstance(instructions, dict) else instructions

  for entry in entries:
    urls = entry["urls"]
    fields = entry["fields"]

    # The URL loop is nested inside the entry loop so that every entry is
    # scraped with its own fields, not only the last one.
    for url in urls:
      await page.goto(url, {"waitUntil": "domcontentloaded"})

      data = await _collect_fields(page, fields)

      file_name = remove_special_chars(url, string.punctuation + " ") + ".csv"
      _write_csv(file_name, data)


async def main(directory: str = "/content"):
  """
  Launches headless Chromium, loads the scraping instructions found in
  `directory` and scrapes every page they describe.

  Args:
    directory: where to look for the ScrapeMate export file; defaults to
      "/content", the path the notebook originally hard-coded.
  """
  browser = await launch(headless=True, args=["--no-sandbox"])

  try:
    page = await browser.newPage()

    instructions = await get_instructions(directory)
    await scrape(instructions, page)
  finally:
    # Close the browser even when loading or scraping fails, so that no
    # Chromium process is left running behind the notebook.
    await browser.close()

# Run main() to completion on the current event loop.
# NOTE(review): presumably this only works inside the notebook because
# nest_asyncio.apply() was executed in an earlier cell — confirm.
asyncio.get_event_loop().run_until_complete(main())


### If Chromium still crashes on startup, run this command to find any other missing dependencies and install them with `!apt install`

In [17]:
# !ldd /root/.local/share/pyppeteer/local-chromium/588429/chrome-linux/chrome | grep "not found"


### 5. Test the output `.csv`

In [18]:
import pandas as pd

# Load one CSV file to inspect it. The name has the form the scraper gives a
# URL (punctuation and spaces replaced by "_"), and the path is hard-coded.
df = pd.read_csv("/content/https___uz_tgstat_com_en_ratings_channels_public_sort_ci.csv")

# Bare last expression: the notebook renders the DataFrame as the cell output.
df


Unnamed: 0,name,url,subscribers,post_reach,citation_index
0,Xushnudbek.uz,https://uz.tgstat.com/en/channel/@xushnudbek/stat,489 194,187.1k,3 418.8
1,Shavkat Mirziyoyev_press-service,https://uz.tgstat.com/en/channel/@shmirziyoyev...,172 569,3 418.8,3 153.8
2,Kun.uz | Расмий канал,https://uz.tgstat.com/en/channel/@kunuzofficia...,1 310 412,98.6k,3 134.7
3,Huquqiy axborot,https://uz.tgstat.com/en/channel/@huquqiyaxbor...,245 005,3 153.8,2 595.8
4,Prezident matbuot kotibi | Sherzod Asadov,https://uz.tgstat.com/en/channel/@Press_Secret...,75 787,381.7k,2 406.6
...,...,...,...,...,...
95,TEZKOR RUSTILI,https://uz.tgstat.com/en/channel/@RUSTILI_DARS...,30 540,1 230.4,1 038.4
96,Qora Axborotlar | Rasmiy Kanal,https://uz.tgstat.com/en/channel/@Qora_Axborot...,73 078,14.9k,1 037.4
97,Rumiy Aforizmlari📚,https://uz.tgstat.com/en/channel/@Rumiy_Aforiz...,74 081,1 225.5,1 031.7
98,TIBBIYOT TV,https://uz.tgstat.com/en/channel/@TIBBIYOT_TV/...,598 966,15.4k,1 028.7
