In [21]:
import polars as pl
import pandas as pd
import numpy as np
from utils.functions import read_tsv_with_polars

In [23]:
ratings_df = read_tsv_with_polars("../pipeline/data/raw_data/csv/imdb/title.ratings.tsv", ['const', 'averageRating', 'numVotes'], [pl.Utf8, pl.Float32, pl.Int32])


In [25]:
ratings_df.head(10)

const,averageRating,numVotes
str,f32,i32
"""tt0000001""",5.7,2014
"""tt0000002""",5.7,272
"""tt0000003""",6.5,1936
"""tt0000004""",5.4,179
"""tt0000005""",6.2,2707
"""tt0000006""",5.0,184
"""tt0000007""",5.4,843
"""tt0000008""",5.4,2160
"""tt0000009""",5.3,207
"""tt0000010""",6.8,7382


In [26]:
column_names = ['const', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 
                'startYear', 'endYear', 'runtimeMinutes', 'genres']

# Define column types
# Adjust these based on the actual data in each column
column_types = [pl.Utf8, pl.Utf8, pl.Utf8, pl.Utf8, pl.Int32, 
                pl.Int32, pl.Int32, pl.Int32, pl.Utf8]

basics_df = read_tsv_with_polars('../pipeline/data/raw_data/csv/imdb/title.basics.tsv', column_names, column_types)

In [27]:
basics_df["titleType"].value_counts()

titleType,count
str,u32
"""short""",969727
"""movie""",667364
"""tvEpisode""",7982934
"""tvPilot""",1
"""tvMiniSeries""",51763
"""tvSpecial""",45127
"""tvShort""",10143
"""video""",285553
"""tvSeries""",254628
"""tvMovie""",143915


In [28]:
filter_condition = basics_df['titleType'] == "movie"

movies = basics_df.filter(filter_condition)

In [29]:
print(movies.shape)
movies.columns

(667364, 9)


['const',
 'titleType',
 'primaryTitle',
 'originalTitle',
 'isAdult',
 'startYear',
 'endYear',
 'runtimeMinutes',
 'genres']

In [32]:
movie_ratings = movies.join(ratings_df, on="const", how="inner")

movie_ratings = movie_ratings.sort(by="numVotes", descending=True)
print(movie_ratings.shape)
movie_ratings["originalTitle"].value_counts()


(301971, 11)


originalTitle,count
str,u32
"""The Yellow Balloon""",1
"""Blef doskonaly""",1
"""The Final Scream""",1
"""The Great Train Robbery""",1
"""Song One""",1
"""Baradaram Khosro""",1
"""Et hjørne af paradis""",1
"""Pál Adrienn""",1
"""Unoponchash Batash""",1
"""WTFry""",1


In [29]:
movie_ratings = movie_ratings.filter(movie_ratings['startYear'].is_not_null())
movie_ratings['startYear'].value_counts(sort=True)

startYear,count
i32,u32
2019,10485
2018,10368
2017,10293
2022,10081
2016,9811
2015,9423
2014,9201
2021,8904
2013,8573
2020,8436


In [30]:
movies_2023 = movie_ratings.filter(pl.col('startYear') == 2023)
movies_2023.head()

const,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
str,str,str,str,i32,i32,i32,i32,str,f32,i32
"""tt0070596""","""movie""","""Socialist Realism""","""El realismo socialista""",0,2023,,78.0,"""Drama""",7.5,52
"""tt0122511""","""movie""","""The Gnomes Great Adventure""","""The Gnomes Great Adventure""",0,2023,,74.0,"""Adventure,Animation,Comedy""",6.1,62
"""tt0164115""","""movie""","""Nine Ball""","""Nine Ball""",0,2023,,,,6.0,35
"""tt0221503""","""movie""","""A Question of Suspense""","""A Question of Suspense""",0,2023,,62.0,"""Crime,Drama""",5.7,112
"""tt0347992""","""movie""","""Death Ray on Coral Island""","""Shanhu dao shang de shi guang""",0,2023,,,"""Sci-Fi,Thriller""",4.8,26


In [31]:
movie_ratings = movie_ratings.filter(pl.col("numVotes") > 10000)

movie_ratings.shape

(10916, 11)

In [None]:
movie_ratings.sort('averageRating', descending=True)

movie_ratings.write_parquet('./data/movies_with_ratings.parquet')