In [1]:
import os

# Later cells use paths relative to the parent folder (e.g. "data/inputs/..."),
# so step up one level when the kernel was started inside the notebooks/ folder.
# Compare the final path component instead of substring-matching the whole
# path: a substring test also fires when any ancestor directory merely contains
# "notebooks" in its name, and would then climb one more level on every re-run
# of this cell.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

In [2]:
import time
from tqdm.notebook import tqdm

In [None]:
# NOTE(review): this wildcard import presumably supplies AzureCosmos as well as
# the `pd`, `hashlib`, CosmosResourceExistsError and CosmosHttpResponseError
# names used by later cells (none of them is imported explicitly anywhere in
# this notebook) — confirm against azure_cosmos_db and prefer explicit imports.
from azure_cosmos_db import *

# Client object reused by every cell below.
azure_cosmos = AzureCosmos()

# Kept as the last expression so its return value is rendered as cell output.
azure_cosmos.list_containers()

# Events, Dice & Shotgun Scraper

In [4]:
# --- Run configuration -------------------------------------------------------
# Spreadsheet holding the URLs to upload (relative to the working directory).
INPUT_FILE_PATH = "data/inputs/Shotgun Analysis - Non Duplicates.csv"

# Scraper identifier; also used as the Cosmos container name further down.
SCRAPER_NAME = "shotgun_links"

# Placeholder only: the loading cell replaces this with the input file's stem.
SHEET_NAME = ""

In [None]:
# Select the target on the shared client: DATABASE_ID is set to "Scraper" and
# CONTAINER_NAME to the scraper configured above.
azure_cosmos.DATABASE_ID = "Scraper"
azure_cosmos.CONTAINER_NAME = SCRAPER_NAME

# NOTE(review): initialize_cosmosdb() presumably reads the two attributes set
# above, so they must be assigned before this call — confirm in azure_cosmos_db.
azure_cosmos.container = azure_cosmos.initialize_cosmosdb()

In [None]:
# Load the input spreadsheet into `df` and label it with the file's stem.
# SHEET_NAME becomes the file name without its extension; it is stored on every
# uploaded record and is part of the record id hash.
SHEET_NAME, extension = os.path.splitext(os.path.basename(INPUT_FILE_PATH))

# Match the extension case-insensitively so "FILE.CSV" loads like "file.csv".
file_type = extension.lower()
if file_type == ".csv":
    df = pd.read_csv(INPUT_FILE_PATH)
elif file_type == ".xlsx":
    # NOTE(review): the worksheet name is hard-coded — confirm "Sheet4" is right
    # for every workbook fed into this notebook.
    df = pd.read_excel(INPUT_FILE_PATH, sheet_name="Sheet4")
else:
    # Fail loudly: without this branch `df` is either undefined (fresh kernel)
    # or silently left over from a previous run with a different input file.
    raise ValueError(
        f"Unsupported input file type {extension!r} for {INPUT_FILE_PATH!r}; "
        "expected .csv or .xlsx"
    )

print(f"Total: {df.shape}")
# Reassign instead of mutating in place; the resulting frame is the same.
df = df.drop_duplicates()
print(f"Unique: {df.shape}")

print(SHEET_NAME)
# Debug toggle: uncomment to try the upload on a handful of rows first.
# df = df.sample(5)
df.head()

In [None]:
# Upload one record per URL from the sheet to the configured Cosmos container.
max_retries = 3  # attempts per URL while Cosmos keeps answering 429
success = 0      # records created now, plus records that already existed

# Blank cells are read by pandas as NaN (a float), which has no .replace();
# drop them up front so a single empty row cannot abort the whole upload.
urls = df["Non Duplicates Organisers"].dropna().to_list()
skipped = df.shape[0] - len(urls)
if skipped:
    print(f"[INFO] Skipping {skipped} rows without a URL.")

for url in tqdm(urls, desc=f"Uploading {len(urls)} URLs to {SCRAPER_NAME}"):
    secure_url = url.replace("http://", "https://")
    # The id is a hash of sheet name + URL, so the same pair always maps to the
    # same id; an already-stored record is reported via CosmosResourceExistsError
    # below and counted as a success.
    hash_key = SHEET_NAME + secure_url
    data = {
        "id": hashlib.sha256(hash_key.encode()).hexdigest(),
        "url": secure_url,
        "processed": False,
        "sheet_name": SHEET_NAME
    }

    for _ in range(max_retries):
        try:
            azure_cosmos.create_conversation(conversation_data=data)
            success += 1
            break
        except CosmosResourceExistsError:
            success += 1
            print(f"[INFO] Record already exists in {azure_cosmos.CONTAINER_NAME}, skipping insertion.")
            break
        except CosmosHttpResponseError as e:
            if e.status_code != 429:
                # Not a rate limit: retrying will not help, so report which URL
                # failed and move on to the next one.
                print(f"[ERROR] Upload failed for {secure_url}: {e}")
                break
            # Rate limited: wait for the delay given in the response header
            # (milliseconds, defaulting to one second) before the next attempt.
            retry_after = int(e.headers.get("x-ms-retry-after-ms", 1000))/1000
            print(f"[WARNNING] Rate limit exeeded. Retrying in {retry_after}")
            time.sleep(retry_after)
    else:
        # The loop ended without `break`: every attempt was rate limited.
        print(f"[ERROR] Gave up on {secure_url} after {max_retries} rate-limited attempts.")

# Totals are based on the URLs actually attempted (blank rows are reported above).
print(f"Out of {len(urls)} URLs, {success} were uploaded and {len(urls)-success} failed to upload")