In [10]:
import polars as pl
import pandas as pd
import numpy as np
from utils.functions import read_tsv_with_polars

In [11]:
# Load title.ratings.tsv with the three named columns and explicit dtypes.
ratings_df = read_tsv_with_polars(
    "../dagster/data/raw_data/csv/imdb/title.ratings.tsv",
    ['const', 'averageRating', 'numVotes'],
    [pl.Utf8, pl.Float32, pl.Int32],
)


In [12]:
# Preview the first ten rows of the ratings table.
ratings_df.head(n=10)

const,averageRating,numVotes
str,f32,i32
"""tt0000001""",5.7,2014
"""tt0000002""",5.7,272
"""tt0000003""",6.5,1936
"""tt0000004""",5.4,179
"""tt0000005""",6.2,2707
"""tt0000006""",5.0,184
"""tt0000007""",5.4,843
"""tt0000008""",5.4,2160
"""tt0000009""",5.3,207
"""tt0000010""",6.8,7382


In [13]:
# Column name -> dtype for title.basics.tsv, kept in file order so the two
# lists derived below line up position by position.
# NOTE(review): these dtypes were chosen by hand; confirm them against the
# actual contents of each column.
basics_schema = {
    'const': pl.Utf8,
    'titleType': pl.Utf8,
    'primaryTitle': pl.Utf8,
    'originalTitle': pl.Utf8,
    'isAdult': pl.Int32,
    'startYear': pl.Int32,
    'endYear': pl.Int32,
    'runtimeMinutes': pl.Int32,
    'genres': pl.Utf8,
}
column_names = list(basics_schema.keys())
column_types = list(basics_schema.values())

basics_df = read_tsv_with_polars(
    '../dagster/data/raw_data/csv/imdb/title.basics.tsv',
    column_names,
    column_types,
)

In [14]:
# Count how many titles fall under each titleType value.
basics_df.get_column("titleType").value_counts()

titleType,count
str,u32
"""tvEpisode""",7982934
"""tvSeries""",254628
"""tvSpecial""",45127
"""tvMiniSeries""",51763
"""short""",969727
"""movie""",667364
"""tvShort""",10143
"""video""",285553
"""tvPilot""",1
"""videoGame""",36964


In [15]:
# Keep only rows whose titleType is "movie".
# The condition is a polars expression, matching the later filters in this
# notebook, rather than a boolean Series computed from basics_df. An
# expression is evaluated against whichever frame it is applied to, so it
# cannot silently go out of sync with the frame being filtered.
filter_condition = pl.col('titleType') == "movie"

movies = basics_df.filter(filter_condition)

In [16]:
# Sanity-check the filtered frame: print its (rows, columns) shape, then
# show the column names as the cell's displayed value.
movies_shape = movies.shape
print(movies_shape)
movies.columns

(667364, 9)


['const',
 'titleType',
 'primaryTitle',
 'originalTitle',
 'isAdult',
 'startYear',
 'endYear',
 'runtimeMinutes',
 'genres']

In [28]:
# Attach ratings to movies (inner join on the shared `const` key), order by
# vote count with the most-voted titles first, and rename `genres` to `genre`.
movie_ratings = (
    movies
    .join(ratings_df, on="const", how="inner")
    .sort(by="numVotes", descending=True)
    .rename({"genres": "genre"})
)

# The rename does not affect the shape, so it can be reported after the chain.
print(movie_ratings.shape)
print(movie_ratings.head())
# Nothing is written to parquet here; the write happens in the last cell,
# after the start-year and vote-count filters have been applied.


(301971, 11)
shape: (5, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ const     ┆ titleType ┆ primaryTi ┆ originalT ┆ … ┆ runtimeMi ┆ genre     ┆ averageRa ┆ numVotes │
│ ---       ┆ ---       ┆ tle       ┆ itle      ┆   ┆ nutes     ┆ ---       ┆ ting      ┆ ---      │
│ str       ┆ str       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ str       ┆ ---       ┆ i32      │
│           ┆           ┆ str       ┆ str       ┆   ┆ i32       ┆           ┆ f32       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ tt0111161 ┆ movie     ┆ The       ┆ The       ┆ … ┆ 142       ┆ Drama     ┆ 9.3       ┆ 2835969  │
│           ┆           ┆ Shawshank ┆ Shawshank ┆   ┆           ┆           ┆           ┆          │
│           ┆           ┆ Redemptio ┆ Redemptio ┆   ┆           ┆           ┆           ┆          │
│           ┆           ┆ n         ┆ n         ┆   ┆          

In [18]:
# Discard titles that have no start year, then tally the remaining titles
# per start year, ordered by count.
movie_ratings = movie_ratings.filter(pl.col('startYear').is_not_null())
movie_ratings.get_column('startYear').value_counts(sort=True)

startYear,count
i32,u32
2019,10485
2018,10368
2017,10293
2022,10081
2016,9811
2015,9423
2014,9201
2021,8904
2013,8573
2020,8436


In [19]:
# Select the titles whose start year is 2023 and preview the first rows.
released_in_2023 = pl.col('startYear') == 2023
movies_2023 = movie_ratings.filter(released_in_2023)
movies_2023.head()

const,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
str,str,str,str,i32,i32,i32,i32,str,f32,i32
"""tt15398776""","""movie""","""Oppenheimer""","""Oppenheimer""",0,2023,,180,"""Biography,Drama,History""",8.4,568407
"""tt1517268""","""movie""","""Barbie""","""Barbie""",0,2023,,114,"""Adventure,Comedy,Fantasy""",6.9,434968
"""tt6791350""","""movie""","""Guardians of the Galaxy Vol. 3""","""Guardians of the Galaxy Vol. 3""",0,2023,,150,"""Action,Adventure,Comedy""",7.9,355855
"""tt9362722""","""movie""","""Spider-Man: Across the Spider-Verse""","""Spider-Man: Across the Spider-Verse""",0,2023,,140,"""Action,Adventure,Animation""",8.6,320793
"""tt10366206""","""movie""","""John Wick: Chapter 4""","""John Wick: Chapter 4""",0,2023,,169,"""Action,Crime,Thriller""",7.7,311134


In [20]:
# Keep only titles with strictly more than this many votes.
min_votes = 10_000
movie_ratings = movie_ratings.filter(pl.col("numVotes") > min_votes)

# Show the shape that remains after the vote-count cut.
movie_ratings.shape

(10916, 11)

In [21]:
# DataFrame.sort returns a new frame and leaves the original untouched, so
# the sorted result has to be captured. Previously it was discarded and the
# parquet file was written in the old order instead of by rating.
# The sorted frame gets its own name so `movie_ratings` is not rebound and
# re-running this cell gives the same result.
movies_by_rating = movie_ratings.sort('averageRating', descending=True)

# Persist the highest-rated-first table.
movies_by_rating.write_parquet('../dagster/data/raw_data/parquet/imdb.movies.parquet')