In [2]:
import polars as pl
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq

In [3]:
def convert_tsv_to_parquet(tsv_file_path, parquet_dir, separator=','):
    """
    Converts a delimited text file (CSV/TSV) with a header row to a Parquet file.

    Args:
        tsv_file_path: Path to the delimited input file.
        parquet_dir: Directory the Parquet file is written to; created if it
            does not exist yet.
        separator: Field delimiter passed to the reader. Defaults to ','
            (the value that used to be hard-coded); pass a tab character
            for genuine TSV input.

    Returns:
        The path of the written Parquet file: the input's base name with its
        final extension replaced by '.parquet', inside `parquet_dir`.
    """

    # Swap only the final extension, whatever it is (.csv, .tsv, ...), rather
    # than string-replacing '.csv' anywhere in the file name.
    stem, _ = os.path.splitext(os.path.basename(tsv_file_path))
    parquet_file_path = os.path.join(parquet_dir, stem + '.parquet')

    # os.makedirs('') raises, so only create a directory when one was given.
    if parquet_dir:
        os.makedirs(parquet_dir, exist_ok=True)

    # NOTE(review): ignore_errors=True is kept from the original; it makes the
    # reader tolerate values it cannot parse instead of raising - confirm that
    # silently losing such values is acceptable for this dataset.
    df = pl.read_csv(tsv_file_path, separator=separator, has_header=True, ignore_errors=True)
    df.write_parquet(parquet_file_path)

    return parquet_file_path

In [4]:
# path = convert_tsv_to_parquet("../pipeline/data/raw_data/csv/rotten_tomatoes_movies.csv", "../pipeline/data/raw_data/parquet/")

In [7]:
# Load the raw Rotten Tomatoes movies table from its Parquet file
df = pl.read_parquet(
    "../dagster/data/raw_data/parquet/rotten_tomatoes_movies.parquet"
)

In [8]:
# Show the column names of the loaded frame
df.columns

['id',
 'title',
 'audienceScore',
 'tomatoMeter',
 'rating',
 'ratingContents',
 'releaseDateTheaters',
 'releaseDateStreaming',
 'runtimeMinutes',
 'genre',
 'originalLanguage',
 'director',
 'writer',
 'boxOffice',
 'distributor',
 'soundMix']

In [22]:
# Derive release-year columns from the two release-date columns.
# - `releaseDateStreaming` is parsed from "YYYY-MM-DD" strings.
# - `releaseDateTheaters` is cast to Date here, so this cell no longer relies on
#   another cell having already converted it on `df`; the cast leaves the
#   column unchanged when it is already a Date.
rt = df.with_columns([
    pl.col('releaseDateTheaters').cast(pl.Date),
    pl.col('releaseDateStreaming').str.strptime(pl.Date, '%Y-%m-%d').dt.year().alias('stream_year'),
    pl.col('releaseDateTheaters').cast(pl.Date).dt.year().alias('theater_year'),
])

print(rt.shape)
# Persist the enriched table next to the raw Parquet file.
rt.write_parquet("../dagster/data/raw_data/parquet/rt.parquet", compression="snappy")
rt.head()

(143258, 18)


id,title,audienceScore,tomatoMeter,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,writer,boxOffice,distributor,soundMix,stream_year,theater_year
str,str,i64,i64,str,str,date,str,i64,str,str,str,str,str,str,str,i32,i32
"""space-zombie-b…","""Space Zombie B…",50.0,,,,,"""2018-08-25""",75,"""Comedy, Horror…","""English""","""George Ormrod""","""George Ormrod,…",,,,2018.0,
"""the_green_gras…","""The Green Gras…",,,,,,"""2020-02-11""",114,"""Drama""","""English""","""Tiffany Edward…","""Tiffany Edward…",,,,2020.0,
"""love_lies""","""Love, Lies""",43.0,,,,,,120,"""Drama""","""Korean""","""Park Heung-Sik…","""Ha Young-Joon,…",,,,,
"""the_sore_loser…","""Sore Losers""",60.0,,,,,"""2020-10-23""",90,"""Action, Myster…","""English""","""John Michael M…","""John Michael M…",,,,2020.0,
"""dinosaur_islan…","""Dinosaur Islan…",70.0,,,,,"""2017-03-27""",80,"""Fantasy, Adven…","""English""","""Will Meugniot""","""John Loy""",,,,2017.0,


In [10]:
# Re-type the theatrical release date as a Date, rebinding `df` to the result
df = df.with_columns(pl.col("releaseDateTheaters").cast(pl.Date))

# Order the rows by theatrical release date and print the sorted frame
df_sorted = df.sort(by="releaseDateTheaters")
print(df_sorted)

shape: (143_258, 16)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ id        ┆ title     ┆ audienceS ┆ tomatoMet ┆ … ┆ writer    ┆ boxOffice ┆ distribut ┆ soundMix │
│ ---       ┆ ---       ┆ core      ┆ er        ┆   ┆ ---       ┆ ---       ┆ or        ┆ ---      │
│ str       ┆ str       ┆ ---       ┆ ---       ┆   ┆ str       ┆ str       ┆ ---       ┆ str      │
│           ┆           ┆ i64       ┆ i64       ┆   ┆           ┆           ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ space-zom ┆ Space     ┆ 50        ┆ null      ┆ … ┆ George    ┆ null      ┆ null      ┆ null     │
│ bie-bingo ┆ Zombie    ┆           ┆           ┆   ┆ Ormrod,Jo ┆           ┆           ┆          │
│           ┆ Bingo!    ┆           ┆           ┆   ┆ hn        ┆           ┆           ┆          │
│           ┆           ┆           ┆           ┆   ┆ Sabotta   ┆     