In [1]:
import polars as pl
import pandas as pd
import pyarrow
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def read_tsv_with_polars(file_path, column_names, column_types, null_values=['\\N']):
    """
    Reads a TSV file into a Polars DataFrame with specified column names and types.

    Parameters:
    file_path (str): Path to the TSV file.
    column_names (list): List of column names.
    column_types (list): List of Polars data types for the columns.
    null_values (list, optional): List of strings to be treated as null values. Defaults to ['\\N'].

    Returns:
    pl.DataFrame: Polars DataFrame with the TSV data.
    """
    # Set the format string lengths for display
    pl.Config.set_fmt_str_lengths(50)

    # Read the TSV file
    df = pl.read_csv(
        file_path,
        separator='\t',
        has_header=False,
        new_columns=column_names,
        dtypes=column_types,
        ignore_errors=True,
        null_values=null_values,
        skip_rows=1
    )
    return df

In [10]:
names = read_tsv_with_polars('./raw_data/name.basics.tsv', ['nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession', 'knownForTitles'], [pl.Utf8, pl.Utf8, pl.Int32, pl.Int32, pl.Utf8, pl.Utf8])


In [9]:
names.shape

(13123690, 6)

In [6]:
principals = read_tsv_with_polars('./raw_data/title.principals.tsv', ['const', 'ordering', 'nconst', 'job', 'characters'], [pl.Utf8, pl.Int32, pl.Utf8, pl.Utf8, pl.Utf8, pl.Utf8])

In [11]:
principals.shape

(58658557, 6)

In [12]:
actors_total = principals.join(names, on='nconst', how='inner')

In [23]:
actors_total.shape

(58658404, 11)

In [14]:
actors_total.head()

const,ordering,nconst,job,characters,column_6,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
str,i32,str,str,str,str,str,i32,i32,str,str
"""tt0000001""",1,"""nm1588970""","""self""",,"""[""Self""]""","""Carmencita""",1868.0,1910,"""soundtrack""","""tt0057728,tt0000001"""
"""tt0000001""",2,"""nm0005690""","""director""",,,"""William K.L. Dickson""",1860.0,1935,"""cinematographer,director,producer""","""tt1496763,tt1428455,tt0308254,tt0219560"""
"""tt0000001""",3,"""nm0374658""","""cinematographer""","""director of photography""",,"""William Heise""",1847.0,1910,"""cinematographer,director,producer""","""tt0285863,tt0229665,tt0241715,tt0241393"""
"""tt0000002""",1,"""nm0721526""","""director""",,,"""Émile Reynaud""",1844.0,1918,"""director,animation_department,producer""","""tt16763674,tt0000003,tt16763740,tt13125956"""
"""tt0000002""",2,"""nm1335271""","""composer""",,,"""Gaston Paulin""",,1921,"""composer""","""tt0000004,tt0000003,tt2184231,tt13125956"""


In [16]:
movies_df = pl.read_parquet('./processed_data/movies_with_ratings.parquet')

In [21]:
movies_df.head()

const,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
str,str,str,str,i32,i32,i32,i32,str,f32,i32
"""tt0004972""","""movie""","""The Birth of a Nation""","""The Birth of a Nation""",0,1915,,195,"""Drama,History,War""",6.1,25999
"""tt0006864""","""movie""","""Intolerance""","""Intolerance: Love's Struggle Throughout the Ages""",0,1916,,163,"""Drama,History""",7.7,16507
"""tt0009968""","""movie""","""Broken Blossoms""","""Broken Blossoms or The Yellow Man and the Girl""",0,1919,,90,"""Drama,Romance""",7.2,10924
"""tt0010323""","""movie""","""The Cabinet of Dr. Caligari""","""Das Cabinet des Dr. Caligari""",0,1920,,76,"""Horror,Mystery,Thriller""",8.0,68792
"""tt0012349""","""movie""","""The Kid""","""The Kid""",0,1921,,68,"""Comedy,Drama,Family""",8.2,132794


In [17]:
moves_with_actors = movies_df.join(actors_total, on='const', how='inner')

In [18]:
moves_with_actors.shape

(107416, 21)

In [22]:
moves_with_actors.filter(pl.col('const') == 'tt0004972')

const,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,ordering,nconst,job,characters,column_6,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
str,str,str,str,i32,i32,i32,i32,str,f32,i32,i32,str,str,str,str,str,i32,i32,str,str
"""tt0004972""","""movie""","""The Birth of a Nation""","""The Birth of a Nation""",0,1915,,195,"""Drama,History,War""",6.1,25999,10,"""nm0376221""","""editor""",,,"""Joseph Henabery""",1888,1976,"""director,actor,assistant_director""","""tt0004972,tt0006864,tt0007931,tt0021098"""
"""tt0004972""","""movie""","""The Birth of a Nation""","""The Birth of a Nation""",0,1915,,195,"""Drama,History,War""",6.1,25999,1,"""nm0001273""","""actress""",,"""[""Elsie - Stoneman's Daughter""]""","""Lillian Gish""",1893,1993,"""actress,writer,soundtrack""","""tt0038499,tt0048424,tt0014605,tt0094315"""
"""tt0004972""","""movie""","""The Birth of a Nation""","""The Birth of a Nation""",0,1915,,195,"""Drama,History,War""",6.1,25999,2,"""nm0550615""","""actress""",,"""[""Flora Cameron - The Pet Sister""]""","""Mae Marsh""",1894,1968,"""actress""","""tt0004972,tt0006826,tt0006864,tt0014604"""
"""tt0004972""","""movie""","""The Birth of a Nation""","""The Birth of a Nation""",0,1915,,195,"""Drama,History,War""",6.1,25999,3,"""nm0910400""","""actor""",,"""[""Col. Ben Cameron aka The Little Colonel""]""","""Henry B. Walthall""",1878,1936,"""actor,assistant_director""","""tt0004972,tt0018097,tt0022753,tt0007478"""
"""tt0004972""","""movie""","""The Birth of a Nation""","""The Birth of a Nation""",0,1915,,195,"""Drama,History,War""",6.1,25999,4,"""nm0178270""","""actress""",,"""[""Margaret Cameron - The Elder Sister""]""","""Miriam Cooper""",1891,1976,"""actress,casting_director,costume_department""","""tt0004972,tt0006864,tt0012515,tt0012665"""
"""tt0004972""","""movie""","""The Birth of a Nation""","""The Birth of a Nation""",0,1915,,195,"""Drama,History,War""",6.1,25999,5,"""nm0000428""","""director""",,,"""D.W. Griffith""",1875,1948,"""director,writer,producer""","""tt0010484,tt0004972,tt0009968,tt0006864"""
"""tt0004972""","""movie""","""The Birth of a Nation""","""The Birth of a Nation""",0,1915,,195,"""Drama,History,War""",6.1,25999,6,"""nm0228746""","""writer""","""adapted from his novel: ""The Clansman: An Histori…",,"""Thomas Dixon Jr.""",1864,1946,"""writer,actor,director""","""tt0014257,tt0004972,tt3869584,tt0006664"""
"""tt0004972""","""movie""","""The Birth of a Nation""","""The Birth of a Nation""",0,1915,,195,"""Drama,History,War""",6.1,25999,7,"""nm0940488""","""writer""",,,"""Frank E. Woods""",1860,1939,"""writer,miscellaneous,producer""","""tt1334605,tt0016016,tt0004972,tt0014766"""
"""tt0004972""","""movie""","""The Birth of a Nation""","""The Birth of a Nation""",0,1915,,195,"""Drama,History,War""",6.1,25999,8,"""nm0106922""","""composer""",,,"""Joseph Carl Breil""",1870,1926,"""music_department,composer,soundtrack""","""tt0005997,tt0004972,tt0020771,tt0016220"""
"""tt0004972""","""movie""","""The Birth of a Nation""","""The Birth of a Nation""",0,1915,,195,"""Drama,History,War""",6.1,25999,9,"""nm0005658""","""cinematographer""",,,"""G.W. Bitzer""",1872,1944,"""cinematographer,director,camera_department""","""tt0431889,tt0009968,tt0006864,tt0004972"""
