In [10]:
import polars as pl
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq

In [11]:
def convert_tsv_to_parquet(tsv_file_path, parquet_dir, separator=None):
    """
    Converts a delimited text file (TSV or CSV) to a Parquet file.

    Args:
        tsv_file_path: Path to the delimited input file. The first row is
            read as the header.
        parquet_dir: Directory that receives the Parquet file. It is created
            when it does not exist yet.
        separator: Field delimiter passed to polars. When None, a tab is used
            for inputs ending in ``.tsv``/``.tab`` and a comma for any other
            extension.

    Returns:
        Path of the written file: the input's base name with its final
        extension swapped for ``.parquet``, located inside ``parquet_dir``.
    """
    # splitext only touches the final extension, so '.tsv' inputs get a
    # '.parquet' name too and a '.csv' inside the stem is left alone.
    stem, extension = os.path.splitext(os.path.basename(tsv_file_path))
    parquet_file_path = os.path.join(parquet_dir, stem + '.parquet')

    if separator is None:
        separator = '\t' if extension.lower() in ('.tsv', '.tab') else ','

    # An empty parquet_dir means "current directory"; makedirs('') would raise.
    if parquet_dir:
        os.makedirs(parquet_dir, exist_ok=True)

    # ignore_errors=True is kept from the original: polars keeps reading when
    # some values fail to parse instead of aborting the whole conversion.
    df = pl.read_csv(tsv_file_path, separator=separator, has_header=True, ignore_errors=True)
    df.write_parquet(parquet_file_path)

    return parquet_file_path

In [12]:
# path = convert_tsv_to_parquet("../pipeline/data/raw_data/csv/rotten_tomatoes_movies.csv", "../pipeline/data/raw_data/parquet/")

In [13]:
# Load the rotten_tomatoes_movies Parquet file into a polars DataFrame.
df = pl.read_parquet("../pipeline/data/raw_data/parquet/rotten_tomatoes_movies.parquet")

In [14]:
# Show the column names of the loaded table.
df.columns

['id',
 'title',
 'audienceScore',
 'tomatoMeter',
 'rating',
 'ratingContents',
 'releaseDateTheaters',
 'releaseDateStreaming',
 'runtimeMinutes',
 'genre',
 'originalLanguage',
 'director',
 'writer',
 'boxOffice',
 'distributor',
 'soundMix']

In [34]:
# Re-type the theatrical release date column as a Date (replaces the column in df)
df = df.with_columns([pl.col("releaseDateTheaters").cast(pl.Date)])

# Order the rows by theatrical release date and print the result
df_sorted = df.sort(by="releaseDateTheaters")
print(df_sorted)

shape: (143_258, 16)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ id        ┆ title     ┆ audienceS ┆ tomatoMet ┆ … ┆ writer    ┆ boxOffice ┆ distribut ┆ soundMix │
│ ---       ┆ ---       ┆ core      ┆ er        ┆   ┆ ---       ┆ ---       ┆ or        ┆ ---      │
│ str       ┆ str       ┆ ---       ┆ ---       ┆   ┆ str       ┆ str       ┆ ---       ┆ str      │
│           ┆           ┆ i64       ┆ i64       ┆   ┆           ┆           ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ space-zom ┆ Space     ┆ 50        ┆ null      ┆ … ┆ George    ┆ null      ┆ null      ┆ null     │
│ bie-bingo ┆ Zombie    ┆           ┆           ┆   ┆ Ormrod,Jo ┆           ┆           ┆          │
│           ┆ Bingo!    ┆           ┆           ┆   ┆ hn        ┆           ┆           ┆          │
│           ┆           ┆           ┆           ┆   ┆ Sabotta   ┆     

In [36]:
# Read title.basics.parquet with pyarrow, then convert the Arrow table to a polars DataFrame.
# NOTE(review): presumably the IMDb title.basics dataset, going by the variable name — confirm.
table = pq.read_table("../pipeline/data/raw_data/parquet/title.basics.parquet")
imdb = pl.from_arrow(table)

In [54]:
# Keep only the rows whose titleType is 'movie'. This overwrites imdb, so the
# unfiltered table has to be reloaded from Parquet to get the other rows back.
imdb = imdb.filter(pl.col("titleType") == 'movie')
# Display the (rows, columns) shape of the filtered table.
imdb.shape

(667364, 9)

In [51]:
# Inner-join imdb to df on the title text: imdb.primaryTitle == df.title.
# left_on and right_on must name the same number of columns, so exactly one
# key is given on each side.
# NOTE(review): a title-only key pairs any rows that share a title — confirm
# whether a second key (e.g. release year) is needed to disambiguate.
joined = imdb.join(df, left_on="primaryTitle", right_on="title", how="inner")

In [52]:
# Count the joined rows per primaryTitle, ordered by ascending count
(
    joined.get_column("primaryTitle")
    .value_counts()
    .sort(by="count")
)

primaryTitle,count
str,u32
"""Crooks in Cloi…",1
"""Punish Me""",1
"""A River Runs, …",1
"""The Thread of …",1
"""iGirlfriend""",1
"""Mister Buddwin…",1
"""Complex World""",1
"""Intern""",1
"""I Love You Bab…",1
"""The Silent Twi…",1
