In [17]:
import polars as pl
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq

In [18]:
def convert_tsv_to_parquet(tsv_file_path, parquet_dir, separator=','):
    """
    Converts a delimited text file to a Parquet file.

    Despite the function name, the default delimiter is a comma (CSV);
    pass a tab character as `separator` to read a real TSV file.

    Args:
        tsv_file_path: Path of the delimited text file to read. Its first
            row is treated as the header.
        parquet_dir: Directory the Parquet file is written into.
        separator: Field delimiter handed to `pl.read_csv`. Defaults to ','.

    Returns:
        The path of the written Parquet file: `parquet_dir` joined with the
        input's base name, its final extension replaced by `.parquet`.
    """

    # Swap only the final extension. A plain str.replace('.csv', '.parquet')
    # would rewrite every '.csv' occurrence in the name and would leave names
    # with any other extension (e.g. '.tsv') untouched.
    stem, _ = os.path.splitext(os.path.basename(tsv_file_path))
    parquet_file_path = os.path.join(parquet_dir, stem + '.parquet')

    # ignore_errors=True is a deliberate best-effort read: polars keeps going
    # when it hits values it cannot parse instead of raising.
    df = pl.read_csv(tsv_file_path, separator=separator, has_header=True, ignore_errors=True)
    df.write_parquet(parquet_file_path)

    return parquet_file_path

In [35]:
# One-off conversion of the raw CSV to Parquet, currently disabled.
# NOTE(review): this call points at ../pipeline/, while the read further down
# uses ../dagster/ - confirm which location is current before re-enabling.
# path = convert_tsv_to_parquet("../pipeline/data/raw_data/csv/rotten_tomatoes_movies.csv", "../pipeline/data/raw_data/parquet/")
# Let polars display up to 100 characters of each string value.
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [20]:
# Load the Rotten Tomatoes movies table from its Parquet file into `df`.
df = pl.read_parquet("../dagster/data/raw_data/parquet/rotten_tomatoes_movies.parquet")

In [21]:
# Show the column names of the loaded movies table.
df.columns

['id',
 'title',
 'audienceScore',
 'tomatoMeter',
 'rating',
 'ratingContents',
 'releaseDateTheaters',
 'releaseDateStreaming',
 'runtimeMinutes',
 'genre',
 'originalLanguage',
 'director',
 'writer',
 'boxOffice',
 'distributor',
 'soundMix']

In [41]:
# Add a `year` column to every title: the year of the theatrical release when
# that date is present, otherwise the year of the streaming release.
RELEASE_DATE_FORMAT = '%Y-%m-%d'

# Only rows with year > MIN_YEAR_EXCLUSIVE are kept.
# NOTE(review): year 1 looks like a placeholder date in the source data - confirm.
MIN_YEAR_EXCLUSIVE = 1


def _release_year(column_name):
    """Build an expression that parses a date-string column and extracts its year."""
    return pl.col(column_name).str.strptime(pl.Date, RELEASE_DATE_FORMAT).dt.year()


rt = df.with_columns(
    # coalesce picks the first non-null value per row, i.e. the theatrical
    # year when there is one and the streaming year otherwise.
    pl.coalesce(
        [
            _release_year('releaseDateTheaters'),
            _release_year('releaseDateStreaming'),
        ]
    ).alias('year')
).filter(
    # Rows with no year at all (both dates missing) are dropped here too,
    # because filter keeps only rows where the predicate is true.
    pl.col('year') > MIN_YEAR_EXCLUSIVE
)

print(rt.shape)
rt.write_parquet("../dagster/data/raw_data/parquet/rt.parquet", compression="snappy")
rt.head()

(84282, 17)


id,title,audienceScore,tomatoMeter,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,writer,boxOffice,distributor,soundMix,year
str,str,i64,i64,str,str,str,str,i64,str,str,str,str,str,str,str,i32
"""space-zombie-bingo""","""Space Zombie Bingo!""",50.0,,,,,"""2018-08-25""",75,"""Comedy, Horror, Sci-fi""","""English""","""George Ormrod""","""George Ormrod,John Sabotta""",,,,2018
"""the_green_grass""","""The Green Grass""",,,,,,"""2020-02-11""",114,"""Drama""","""English""","""Tiffany Edwards""","""Tiffany Edwards""",,,,2020
"""the_sore_losers_1997""","""Sore Losers""",60.0,,,,,"""2020-10-23""",90,"""Action, Mystery & thriller""","""English""","""John Michael McCarthy""","""John Michael McCarthy""",,,,2020
"""dinosaur_island_2002""","""Dinosaur Island""",70.0,,,,,"""2017-03-27""",80,"""Fantasy, Adventure, Animation""","""English""","""Will Meugniot""","""John Loy""",,,,2017
"""adrift_2018""","""Adrift""",65.0,69.0,"""PG-13""","""['Injury Images', 'Brief Drug Use', 'Thematic Elements', 'Language', 'Partial Nudity', 'Peril']""","""2018-06-01""","""2018-08-21""",120,"""Adventure, Drama, Romance""","""English""","""Baltasar Kormákur""","""Aaron Kandell,Jordan Kandell,David Branson Smith""","""$31.4M""","""STX Films""",,2018


In [40]:
# Sanity check: list the ids of any titles whose derived year is exactly 1.
# NOTE(review): `rt` has already been restricted to year > 1 where it is
# built, so this selection is empty by construction.
one = rt.filter(
    pl.col('year') == 1
)
print(one.get_column('id'))

shape: (0,)
Series: 'id' [str]
[
]
