In [1]:
from pathlib import Path
import csv
import re
def preprocess_imdb(
    input_dir: str = "./data",
    output_dir: str = "./data/processed",
    max_principals: int = 15_000_000,
):
    """
    Convert raw IMDb TSV dumps into TSV files shaped for the ERD schema.

    Files written to ``output_dir`` (each only when its source file exists):
    - titles.tsv, title_genres.tsv  <- title.basics.tsv
    - people.tsv                    <- name.basics.tsv
    - ratings.tsv                   <- title.ratings.tsv
    - principals.tsv                <- title.principals.tsv
    - aka_titles.tsv                <- title.akas.tsv
    - episodes.tsv                  <- title.episode.tsv

    Child tables (ratings, principals, aka_titles, episodes) keep only rows
    whose title / person keys were seen in title.basics.tsv / name.basics.tsv,
    so a missing parent file leaves the dependent outputs header-only.

    Args:
        input_dir: Directory holding the raw ``*.tsv`` dumps.
        output_dir: Directory for the converted files; created if missing.
        max_principals: Maximum number of rows written to principals.tsv.
            ``None`` disables the cap. The default matches the limit that
            used to be hard-coded.

    Side effects:
        Raises the process-wide ``csv.field_size_limit`` and prints one
        progress or warning line per table.
    """
    import sys

    # csv.field_size_limit can raise OverflowError for sys.maxsize, so shrink
    # the requested limit by 10x until the call is accepted.
    max_field_size = sys.maxsize
    while True:
        try:
            csv.field_size_limit(max_field_size)
            break
        except OverflowError:
            max_field_size = max_field_size // 10

    in_path = Path(input_dir)
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # Matches: tab + '"' + text without quotes/tabs, up to the next tab or end
    # of line, i.e. a field opened by a quote that is never closed.
    quote_fix = re.compile(r'\t"([^"\t]*)(?=\t|$)')
    # Matches: first column + tab + \N, i.e. a missing name in name.basics.
    people_fix = re.compile(r'^([^\t]*\t)\\N')

    def cleaned_lines(f, clean_people=False):
        """Yield the lines of ``f`` with stray opening quotes removed.

        Without this, csv's default quoting treats the lone quote as the start
        of a quoted field and swallows the following tabs and rows.
        NOTE(review): other unbalanced-quote shapes are still interpreted by
        csv's default quoting rules.
        """
        for line in f:
            # \t"text  ->  \ttext
            line = quote_fix.sub(r'\t\1', line)

            if clean_people:
                # Replace a missing primary name with a placeholder value.
                line = people_fix.sub(r'\1JakisTyp', line)

            yield line

    def export_child(src_name, dst_name, out_header, src_columns, keep, limit=None):
        """Copy rows of ``src_name`` accepted by ``keep`` into ``dst_name``.

        ``src_columns`` lists the source columns to emit, in the order of
        ``out_header``; a column absent from the source is written as \\N.
        ``limit`` caps the number of rows written (None means no cap).
        """
        src = in_path / src_name
        if not src.exists():
            print(f"[preprocess] WARNING: {src_name} not found")
            return

        with src.open("r", encoding="utf-8") as fin, \
             (out_path / dst_name).open("w", encoding="utf-8", newline="") as fout:

            # Every source goes through cleaned_lines so that a stray quote
            # cannot merge several input rows into one field.
            reader = csv.DictReader(cleaned_lines(fin), delimiter="\t")
            w = csv.writer(fout, delimiter="\t", lineterminator="\n")
            w.writerow(out_header)

            written = 0
            for row in reader:
                if not keep(row):
                    continue
                if limit is not None and written >= limit:
                    break
                w.writerow([row.get(col, "\\N") for col in src_columns])
                written += 1

        print(f"[preprocess] {Path(dst_name).stem} OK")

    # ---------- T I T L E S  +  T I T L E _ G E N R E S ----------
    valid_tconsts = set()
    basics_file = in_path / "title.basics.tsv"
    if basics_file.exists():
        titles_out = out_path / "titles.tsv"
        genres_out = out_path / "title_genres.tsv"

        with basics_file.open("r", encoding="utf-8") as fin, \
             titles_out.open("w", encoding="utf-8", newline="") as ftitles, \
             genres_out.open("w", encoding="utf-8", newline="") as fgenres:

            reader = csv.DictReader(cleaned_lines(fin), delimiter="\t")
            wtitles = csv.writer(ftitles, delimiter="\t", lineterminator="\n")
            wgenres = csv.writer(fgenres, delimiter="\t", lineterminator="\n")

            # Headers follow the target schema.
            wtitles.writerow([
                "tconst",
                "title_type",
                "primary_title",
                "original_title",
                "is_adult",
                "start_year",
                "end_year",
                "runtime_minutes",
            ])
            wgenres.writerow(["title_id", "genre"])

            for row in reader:
                tconst = row.get("tconst", "\\N")

                # Remember real title keys; child tables are filtered on them.
                if tconst and tconst != "\\N":
                    valid_tconsts.add(tconst)

                wtitles.writerow([
                    tconst,
                    row.get("titleType", "\\N"),
                    row.get("primaryTitle", "\\N"),
                    row.get("originalTitle", "\\N"),
                    row.get("isAdult", "\\N"),
                    row.get("startYear", "\\N"),
                    row.get("endYear", "\\N"),
                    row.get("runtimeMinutes", "\\N"),
                ])

                # The comma-separated genres column becomes one row per genre.
                genres = row.get("genres", "\\N")
                if genres and genres != "\\N":
                    for g in genres.split(","):
                        g = g.strip()
                        if g:
                            wgenres.writerow([tconst, g])
        print("[preprocess] titles + title_genres OK")
    else:
        print("[preprocess] WARNING: title.basics.tsv not found")

    # ---------- P E O P L E ----------
    name_file = in_path / "name.basics.tsv"
    valid_nconsts = set()
    if name_file.exists():
        people_out = out_path / "people.tsv"
        with name_file.open("r", encoding="utf-8") as fin, \
             people_out.open("w", encoding="utf-8", newline="") as fout:

            reader = csv.DictReader(cleaned_lines(fin, clean_people=True), delimiter="\t")
            w = csv.writer(fout, delimiter="\t", lineterminator="\n")

            w.writerow(["nconst", "primary_name", "birth_year", "death_year", "primary_profession"])

            for row in reader:
                nconst = row.get("nconst", "\\N")
                # Remember real person keys; principals are filtered on them.
                if nconst and nconst != "\\N":
                    valid_nconsts.add(nconst)
                w.writerow([
                    nconst,
                    row.get("primaryName", "\\N"),
                    row.get("birthYear", "\\N"),
                    row.get("deathYear", "\\N"),
                    row.get("primaryProfession", "\\N"),
                ])
        print("[preprocess] people OK")
    else:
        print("[preprocess] WARNING: name.basics.tsv not found")

    # ---------- R A T I N G S ----------
    export_child(
        "title.ratings.tsv",
        "ratings.tsv",
        ["title_id", "average_rating", "num_votes"],
        ["tconst", "averageRating", "numVotes"],
        keep=lambda row: row.get("tconst", "\\N") in valid_tconsts,
    )

    # ---------- P R I N C I P A L S ----------
    export_child(
        "title.principals.tsv",
        "principals.tsv",
        ["title_id", "ordering", "person_id", "category", "job", "characters"],
        ["tconst", "ordering", "nconst", "category", "job", "characters"],
        keep=lambda row: (
            row.get("nconst", "\\N") in valid_nconsts
            and row.get("tconst", "\\N") in valid_tconsts
        ),
        limit=max_principals,
    )

    # ---------- A K A _ T I T L E S ----------
    export_child(
        "title.akas.tsv",
        "aka_titles.tsv",
        [
            "title_id", "ordering", "aka_title",
            "region", "language", "types", "attributes", "is_original_title",
        ],
        [
            "titleId", "ordering", "title",
            "region", "language", "types", "attributes", "isOriginalTitle",
        ],
        keep=lambda row: row.get("titleId", "\\N") in valid_tconsts,
    )

    # ---------- E P I S O D E S ----------
    export_child(
        "title.episode.tsv",
        "episodes.tsv",
        ["episode_id", "parent_id", "season_number", "episode_number"],
        ["tconst", "parentTconst", "seasonNumber", "episodeNumber"],
        # Both the episode and its parent series must be known titles.
        keep=lambda row: (
            row.get("tconst", "\\N") in valid_tconsts
            and row.get("parentTconst", "\\N") in valid_tconsts
        ),
    )

    print("[preprocess] DONE. Pliki w:", out_path)


In [16]:
from pathlib import Path
import csv


def generate_files(input_dir: str, max_child_rows: int):
    """
    Build a row-limited subset of the preprocessed IMDb files that keeps
    referential integrity.

    - input_dir: directory with the output of preprocess_imdb
      (titles.tsv, people.tsv, and so on)
    - max_child_rows: maximum number of rows in EACH child table:
        ratings, aka_titles, principals, episodes, title_genres

    The files are written to a sibling directory named
    ``processed_<max_child_rows>`` (e.g. processed_1000):
        titles.tsv, people.tsv, ratings.tsv, principals.tsv,
        aka_titles.tsv, episodes.tsv, title_genres.tsv
    Every key value in them carries the 'NEW' prefix. Parent tables
    (titles, people) contain only the rows referenced by the copied
    child rows.
    """
    source_dir = Path(input_dir)
    target_dir = source_dir.parent / f"processed_{max_child_rows}"
    target_dir.mkdir(parents=True, exist_ok=True)

    KEY_PREFIX = "NEW"

    # Original (un-prefixed) keys referenced by the copied child rows.
    selected_title_ids = set()
    selected_person_ids = set()

    def is_real_key(value):
        """True for a non-empty value other than the null marker."""
        return bool(value) and value != "\\N"

    def render_row(record, header, key_cols):
        """Return the output row for ``record``, prefixing real key values."""
        cells = []
        for col in header:
            cell = record.get(col, "\\N")
            if col in key_cols and is_real_key(cell):
                cell = KEY_PREFIX + cell
            cells.append(cell)
        return cells

    # --- child tables ---------------------------------------------
    def copy_child_table(
        filename: str,
        key_columns_title=None,
        key_columns_person=None,
        limit: int = None,
    ):
        """
        Copy up to ``limit`` rows of a child table and record the keys they use.

        filename: name of the TSV file
        key_columns_title: names of columns holding FKs to titles
        key_columns_person: names of columns holding FKs to people
        limit: maximum number of rows (None => unlimited)
        """
        title_cols = key_columns_title or []
        person_cols = key_columns_person or []
        key_cols = set(title_cols) | set(person_cols)

        source_file = source_dir / filename
        if not source_file.exists():
            print(f"[generate_files] WARNING: {filename} not found, pomijam")
            return

        with source_file.open("r", encoding="utf-8") as fin, \
             (target_dir / filename).open("w", encoding="utf-8", newline="") as fout:

            reader = csv.DictReader(fin, delimiter="\t")
            header = reader.fieldnames
            if header is None:
                print(f"[generate_files] WARNING: {filename} ma pusty nagłówek?")
                return

            writer = csv.writer(fout, delimiter="\t", lineterminator="\n")
            writer.writerow(header)

            written = 0
            for record in reader:
                if limit is not None and written >= limit:
                    break

                # Collect the original IDs so the parent rows can be pulled later.
                for columns, bucket in (
                    (title_cols, selected_title_ids),
                    (person_cols, selected_person_ids),
                ):
                    for col in columns:
                        key = record.get(col, "\\N")
                        if is_real_key(key):
                            bucket.add(key)

                writer.writerow(render_row(record, header, key_cols))
                written += 1

        print(f"[generate_files] {filename} OK (zapisano {written} wierszy)")

    # --- parent tables --------------------------------------------
    def copy_parent_table(filename, key_column, wanted_ids):
        """Copy the rows of a parent table whose key is in ``wanted_ids``."""
        source_file = source_dir / filename
        if not source_file.exists():
            print(f"[generate_files] WARNING: {filename} not found")
            return

        with source_file.open("r", encoding="utf-8") as fin, \
             (target_dir / filename).open("w", encoding="utf-8", newline="") as fout:

            reader = csv.DictReader(fin, delimiter="\t")
            writer = csv.writer(fout, delimiter="\t", lineterminator="\n")

            header = reader.fieldnames
            writer.writerow(header)

            kept = 0
            for record in reader:
                if record.get(key_column, "\\N") not in wanted_ids:
                    continue
                writer.writerow(render_row(record, header, {key_column}))
                kept += 1

        print(f"[generate_files] {filename} OK (zapisano {kept} wierszy)")

    # 1) Child tables: (file, FK columns -> titles, FK columns -> people).
    child_tables = (
        ("ratings.tsv", ["title_id"], []),
        ("aka_titles.tsv", ["title_id"], []),
        ("title_genres.tsv", ["title_id"], []),
        ("episodes.tsv", ["episode_id", "parent_id"], []),
        ("principals.tsv", ["title_id"], ["person_id"]),
    )
    for child_file, fk_title_cols, fk_person_cols in child_tables:
        copy_child_table(
            child_file,
            key_columns_title=fk_title_cols,
            key_columns_person=fk_person_cols,
            limit=max_child_rows,
        )

    # 2) Parent tables, restricted to the keys collected above.
    copy_parent_table("titles.tsv", "tconst", selected_title_ids)
    copy_parent_table("people.tsv", "nconst", selected_person_ids)

    print(f"[generate_files] DONE. Nowe pliki w: {target_dir}")


In [2]:
# Convert the raw IMDb dumps in ./data into schema-shaped TSV files under ./data/processed.
# NOTE(review): the saved output of this cell reports "data\processed2", which does not
# match output_dir below - the output looks stale; re-run the cell to refresh it.
preprocess_imdb(input_dir="./data", output_dir="./data/processed")

[preprocess] titles + title_genres OK
[preprocess] people OK
[preprocess] ratings OK
[preprocess] principals OK
[preprocess] aka_titles OK
[preprocess] episodes OK
[preprocess] DONE. Pliki w: data\processed2


In [18]:
# Subset capped at 100 rows per child table; written next to the input as processed_100.
generate_files(input_dir="./data/processed", max_child_rows=100)

[generate_files] ratings.tsv OK (zapisano 100 wierszy)
[generate_files] aka_titles.tsv OK (zapisano 100 wierszy)
[generate_files] title_genres.tsv OK (zapisano 100 wierszy)
[generate_files] episodes.tsv OK (zapisano 100 wierszy)
[generate_files] principals.tsv OK (zapisano 100 wierszy)
[generate_files] titles.tsv OK (zapisano 234 wierszy)
[generate_files] people.tsv OK (zapisano 39 wierszy)
[generate_files] DONE. Nowe pliki w: data\processed_100


In [19]:
# Subset capped at 1,000 rows per child table; written next to the input as processed_1000.
generate_files(input_dir="./data/processed", max_child_rows=1000)

[generate_files] ratings.tsv OK (zapisano 1000 wierszy)
[generate_files] aka_titles.tsv OK (zapisano 1000 wierszy)
[generate_files] title_genres.tsv OK (zapisano 1000 wierszy)
[generate_files] episodes.tsv OK (zapisano 1000 wierszy)
[generate_files] principals.tsv OK (zapisano 1000 wierszy)
[generate_files] titles.tsv OK (zapisano 2387 wierszy)
[generate_files] people.tsv OK (zapisano 243 wierszy)
[generate_files] DONE. Nowe pliki w: data\processed_1000


In [20]:
# Subset capped at 10,000 rows per child table; written next to the input as processed_10000.
generate_files(input_dir="./data/processed", max_child_rows=10000)

[generate_files] ratings.tsv OK (zapisano 10000 wierszy)
[generate_files] aka_titles.tsv OK (zapisano 10000 wierszy)
[generate_files] title_genres.tsv OK (zapisano 10000 wierszy)
[generate_files] episodes.tsv OK (zapisano 10000 wierszy)
[generate_files] principals.tsv OK (zapisano 10000 wierszy)
[generate_files] titles.tsv OK (zapisano 24931 wierszy)
[generate_files] people.tsv OK (zapisano 1663 wierszy)
[generate_files] DONE. Nowe pliki w: data\processed_10000


In [None]:
# Subset capped at 100,000 rows per child table (this cell has no recorded execution yet).
generate_files(input_dir="./data/processed", max_child_rows=100000)

In [None]:
# Subset capped at 1,000,000 rows per child table (this cell has no recorded execution yet).
generate_files(input_dir="./data/processed", max_child_rows=1000000)