In [None]:
import shutil
import requests

# Browser-like request headers passed to every session.get() call made by
# download_file(). Insertion order is kept exactly as listed.
headers = {
    # Client identification and content negotiation.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0",
    "Accept": "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br, identity",
    # Request context: the download presents itself as coming from tiktok.com.
    "Referer": "https://www.tiktok.com/",
    "Origin": "https://www.tiktok.com",
    "DNT": "1",
    "Sec-Fetch-Dest": "video",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-site",
    # Ask for the whole resource as a byte range starting at offset 0.
    "Range": "bytes=0-",
    "Connection": "keep-alive",
    # Empty in this notebook.
    "Cookie": "",
    "TE": "trailers",
}

# Single requests.Session created at module level; download_file() issues all
# of its GET requests through it.
session = requests.Session()

def download_file(url, filepath, timeout=30):
    """Stream ``url`` to ``filepath`` using the shared ``session`` and ``headers``.

    The body is written to ``<filepath>.part`` first and only moved to
    ``filepath`` once the transfer has completed, so an interrupted download
    never leaves a truncated file under the final name.

    Args:
        url: Address of the resource to fetch.
        filepath: Destination as a ``str`` or path-like object.
        timeout: Seconds passed to ``session.get`` as its ``timeout``
            (default 30), so a stalled connection raises instead of hanging.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection problems or timeouts.
        OSError: The destination could not be written or renamed.
    """
    import os  # local import: this cell only imports shutil and requests

    final_path = os.fspath(filepath)
    partial_path = final_path + ".part"
    chunk_size = 64 * 1024

    with session.get(url, headers=headers, stream=True, timeout=timeout) as r:
        # Fail before touching the disk: otherwise an error page would be
        # saved as if it were the video.
        r.raise_for_status()

        try:
            with open(partial_path, 'wb') as f:
                # iter_content() applies requests' content decoding
                # (gzip/deflate), which copying r.raw directly does not.
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        except BaseException:
            # Do not leave a half-written temp file behind; the original
            # error is what the caller needs to see.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            raise

    # os.replace overwrites an existing destination on every platform.
    os.replace(partial_path, final_path)

In [None]:
import json
import time
import base64
import ffmpeg

from tqdm import tqdm
from pathlib import Path
from haralyzer import HarParser
from collections import OrderedDict


def embed_json_in_video(video_path, json_data, output_path):
    """Write ``video_path`` to ``output_path`` with ``json_data`` attached as metadata.

    ``json_data`` is serialised with ``json.dumps`` and passed to ffmpeg as the
    ``comment`` metadata entry. The output is requested with ``codec='copy'``
    and an existing ``output_path`` is overwritten.

    Args:
        video_path: Path of the input video.
        json_data: JSON-serialisable object to store.
        output_path: Path of the file ffmpeg should produce.
    """
    comment_tag = 'comment=' + json.dumps(json_data)

    source = ffmpeg.input(video_path)
    job = source.output(output_path, codec='copy', metadata=comment_tag)
    job.run(overwrite_output=True)


def _load_har_json(content):
    """Return the parsed JSON payload of a HAR ``response.content`` object.

    A HAR body is base64-encoded only when ``content["encoding"]`` is
    ``"base64"``; otherwise ``text`` holds the body as-is. A body that is not
    flagged as base64 but does not parse as JSON is retried as base64, which
    keeps captures working that were previously decoded unconditionally.
    """
    text = content["text"]
    if content.get("encoding") != "base64":
        try:
            return json.loads(text)
        except ValueError:
            # Not plain JSON: fall through to the base64 attempt below.
            pass
    return json.loads(base64.b64decode(text).decode("utf-8"))


def extract_video_data(har_file, black_list=None):
    """Collect downloadable videos from the ``item_list`` responses in a HAR file.

    Args:
        har_file: Path of the HAR capture (read as UTF-8 JSON).
        black_list: Optional collection of integer music ids; items whose
            ``music.id`` converts to one of them are left out. Defaults to
            no filtering.

    Returns:
        An ``OrderedDict`` mapping ``tt_[user]_[music]_[video].mp4`` file
        names to ``(play_url, item)`` tuples, ordered by the item's
        ``stats.playCount`` from highest to lowest.

    Responses that cannot be decoded and items missing a required field are
    reported with ``print`` and skipped; they do not abort the extraction.
    """
    blocked_music_ids = [] if black_list is None else black_list

    with open(har_file, 'r', encoding='utf-8') as f:
        entries = HarParser(json.load(f)).har_data["entries"]

    videos = {}
    play_counts = {}
    for entry in entries:
        url = entry["request"]["url"]
        content = entry["response"]["content"]
        if "item_list" not in url or "text" not in content:
            continue

        try:
            # A missing or null "itemList" is treated as an empty page.
            item_list = _load_har_json(content).get("itemList") or []
        except Exception as e:
            print(f"Skipping response from {url}: {e!r}")
            continue

        for item in item_list:
            # Handled per item so one malformed record does not discard the
            # rest of the response.
            try:
                video_url = item["video"].get("playAddr")
                if not video_url:
                    continue

                user_id  = item["author"]["uniqueId"]
                music_id = item["music"]["id"]
                video_id = item["id"]
                # Read here so the sort below can never hit a missing key.
                plays    = item["stats"]["playCount"]

                if int(music_id) in blocked_music_ids:
                    continue

                filename = f"tt_[{user_id}]_[{music_id}]_[{video_id}].mp4"
                videos[filename] = (video_url, item)
                play_counts[filename] = plays
            except Exception as e:
                print(f"Skipping malformed item in {url}: {e!r}")

    videos = OrderedDict(
        sorted(videos.items(), key=lambda pair: play_counts[pair[0]], reverse=True)
    )
    print(f"{len(videos)} videos")
    return videos

In [None]:
# For every HAR capture in the source folder: download each listed video into
# a folder named after its author, then embed the item's JSON into the file.
for har_file in sorted(Path("...").glob(r"*.har")):
    pbar = tqdm(extract_video_data(har_file=har_file).items())

    for filename, (url, item) in pbar:
        folder = Path(item["author"]["uniqueId"])
        folder.mkdir(exist_ok=True, parents=True)

        file_path = folder/filename
        output_path = folder/f"meta_{filename}"

        # Already fetched on an earlier run; a direct existence check avoids
        # listing the whole folder for every file.
        if file_path.exists():
            continue

        try:
            pbar.set_description(filename)
            # No delay between downloads; add a time.sleep(...) here if
            # requests start being rejected.
            download_file(url, file_path)
        except Exception as e:
            print(f"Download failed for {filename}: {e}")
            # The file did not exist before this attempt, so anything present
            # now is a partial download that would otherwise be skipped as
            # "already downloaded" on the next run.
            if file_path.exists():
                file_path.unlink()
            continue

        try:
            embed_json_in_video(str(file_path), item, str(output_path))
            # replace() overwrites the plain download in a single step, so a
            # failure here cannot leave the folder without the video.
            output_path.replace(file_path)
        except Exception as e:
            print(f"Embedding metadata failed for {filename}: {e}")
            # Keep the plain download; drop any leftover intermediate file.
            if output_path.exists():
                output_path.unlink()