In [19]:
import polars as pl
import pandas as pd
import pyarrow
import matplotlib.pyplot as plt
import numpy as np

In [20]:
def read_tsv_with_polars(file_path, column_names, column_types, null_values=['\\N']):
    """
    Load a tab-separated file into a Polars DataFrame with caller-chosen column names and dtypes.

    The first line of the file is skipped (``skip_rows=1`` with ``has_header=False``)
    and the columns are named from ``column_names`` instead.

    Side effect: each call sets the global Polars display option for string
    column width to 50 characters.

    Parameters:
    file_path (str): Location of the TSV file to read.
    column_names (list): Names to assign to the columns, in file order.
    column_types (list): Polars dtypes, forwarded to ``pl.read_csv`` as ``dtypes``.
    null_values (list, optional): Strings that are read as null. Defaults to ['\\N'].
        The default list object is shared between calls, so do not mutate it.

    Returns:
    pl.DataFrame: The parsed contents of the file.
    """
    # Collect the reader options in one place; building the dict has no side effects.
    reader_options = {
        'separator': '\t',
        'has_header': False,
        'new_columns': column_names,
        'dtypes': column_types,
        # Forwarded as-is: asks Polars to keep reading past values it cannot parse.
        'ignore_errors': True,
        'null_values': null_values,
        'skip_rows': 1,
    }

    # Widen string columns in rendered output before the frame is displayed.
    pl.Config.set_fmt_str_lengths(50)

    return pl.read_csv(file_path, **reader_options)

In [21]:
# Load the ratings table: title id, average rating, and vote count.
ratings_df = read_tsv_with_polars(
    file_path="../pipeline/data/raw_data/tsv/title.ratings.tsv",
    column_names=['const', 'averageRating', 'numVotes'],
    column_types=[pl.Utf8, pl.Float32, pl.Int32],
)


In [22]:
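# Kept for reference only: this cast is redundant because averageRating is already read as pl.Float32.
# ratings_df = ratings_df.with_columns(pl.col("averageRating").cast(pl.Float32, strict=False))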

In [23]:
# Preview the first ten rows to sanity-check the parsed names and dtypes.
ratings_df.head(n=10)

const,averageRating,numVotes
str,f32,i32
"""tt0000001""",5.7,2014
"""tt0000002""",5.7,272
"""tt0000003""",6.5,1936
"""tt0000004""",5.4,179
"""tt0000005""",6.2,2707
"""tt0000006""",5.0,184
"""tt0000007""",5.4,843
"""tt0000008""",5.4,2160
"""tt0000009""",5.3,207
"""tt0000010""",6.8,7382


In [24]:
# Column name -> Polars dtype for title.basics.tsv, in file order.
# Adjust these based on the actual data in each column.
basics_schema = {
    'const': pl.Utf8,
    'titleType': pl.Utf8,
    'primaryTitle': pl.Utf8,
    'originalTitle': pl.Utf8,
    'isAdult': pl.Int32,
    'startYear': pl.Int32,
    'endYear': pl.Int32,
    'runtimeMinutes': pl.Int32,
    'genres': pl.Utf8,
}

# Split the mapping into the two parallel lists the reader expects
# (dicts preserve insertion order, so the lists stay aligned).
column_names = list(basics_schema.keys())
column_types = list(basics_schema.values())

basics_df = read_tsv_with_polars(
    '../pipeline/data/raw_data/tsv/title.basics.tsv',
    column_names,
    column_types,
)

In [25]:
# Count how many rows fall under each title type.
basics_df.get_column("titleType").value_counts()

titleType,count
str,u32
"""tvMiniSeries""",51763
"""tvEpisode""",7982934
"""tvShort""",10143
"""tvMovie""",143915
"""videoGame""",36964
"""video""",285553
"""tvSpecial""",45127
"""movie""",667364
"""short""",969727
"""tvSeries""",254628


In [26]:
# Boolean mask over basics_df: True where the row's titleType is exactly "movie".
filter_condition = basics_df.get_column('titleType') == "movie"

# Keep only the rows selected by the mask.
movies = basics_df.filter(filter_condition)

In [27]:
# List the column names of the filtered frame (movies is a row filter of basics_df).
movies.columns

['const',
 'titleType',
 'primaryTitle',
 'originalTitle',
 'isAdult',
 'startYear',
 'endYear',
 'runtimeMinutes',
 'genres']

In [35]:
# Attach ratings to each movie (inner join on the title id drops movies that have no
# rating row) and order the result from most-voted to least-voted.
movie_ratings = (
    movies
    .join(ratings_df, on="const", how="inner")
    .sort(by="numVotes", descending=True)
)
movie_ratings.head()


const,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
str,str,str,str,i32,i32,i32,i32,str,f32,i32
"""tt0111161""","""movie""","""The Shawshank Redemption""","""The Shawshank Redemption""",0,1994,,142,"""Drama""",9.3,2835969
"""tt0468569""","""movie""","""The Dark Knight""","""The Dark Knight""",0,2008,,152,"""Action,Crime,Drama""",9.0,2817391
"""tt1375666""","""movie""","""Inception""","""Inception""",0,2010,,148,"""Action,Adventure,Sci-Fi""",8.8,2500739
"""tt0137523""","""movie""","""Fight Club""","""Fight Club""",0,1999,,139,"""Drama""",8.8,2271670
"""tt0109830""","""movie""","""Forrest Gump""","""Forrest Gump""",0,1994,,142,"""Drama,Romance""",8.8,2210564


In [29]:
# Drop rows whose startYear is null, then tabulate rows per year (most frequent first).
movie_ratings = movie_ratings.filter(pl.col('startYear').is_not_null())
movie_ratings.get_column('startYear').value_counts(sort=True)

startYear,count
i32,u32
2019,10485
2018,10368
2017,10293
2022,10081
2016,9811
2015,9423
2014,9201
2021,8904
2013,8573
2020,8436


In [30]:
# Select the rows whose startYear equals 2023 and preview them.
start_year_is_2023 = pl.col('startYear') == 2023
movies_2023 = movie_ratings.filter(start_year_is_2023)
movies_2023.head()

const,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
str,str,str,str,i32,i32,i32,i32,str,f32,i32
"""tt0070596""","""movie""","""Socialist Realism""","""El realismo socialista""",0,2023,,78.0,"""Drama""",7.5,52
"""tt0122511""","""movie""","""The Gnomes Great Adventure""","""The Gnomes Great Adventure""",0,2023,,74.0,"""Adventure,Animation,Comedy""",6.1,62
"""tt0164115""","""movie""","""Nine Ball""","""Nine Ball""",0,2023,,,,6.0,35
"""tt0221503""","""movie""","""A Question of Suspense""","""A Question of Suspense""",0,2023,,62.0,"""Crime,Drama""",5.7,112
"""tt0347992""","""movie""","""Death Ray on Coral Island""","""Shanhu dao shang de shi guang""",0,2023,,,"""Sci-Fi,Thriller""",4.8,26


In [31]:
# Keep only rows with strictly more than 10,000 votes, then report (rows, columns).
has_enough_votes = pl.col("numVotes") > 10000
movie_ratings = movie_ratings.filter(has_enough_votes)

movie_ratings.shape

(10916, 11)

In [32]:
# Convert to pandas for plotting with matplotlib.
movies_pd = movie_ratings.to_pandas()

# Histogram of log-transformed data.
# numVotes was filtered to > 10000 in an earlier cell, so the natural log is
# always defined here (no zeros or negatives reach np.log).
movies_pd['log_transformed'] = np.log(movies_pd['numVotes'])

# Use a single-axes figure. The previous plt.subplot(1, 2, 2) drew into the
# right-hand panel of a 1x2 grid whose left panel was commented out, leaving
# half of the 12x6 figure blank.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(movies_pd['log_transformed'], bins=50)
ax.set_title('Histogram of Log-Transformed Data')
ax.set_xlabel('log(numVotes)')
ax.set_ylabel('Number of movies')
plt.show()

NameError: name 'plt' is not defined

In [None]:
# DataFrame.sort returns a new frame instead of sorting in place, so the result
# must be bound; otherwise the file below is written in the old numVotes order.
movie_ratings = movie_ratings.sort('averageRating', descending=True)

# Persist the rating-sorted movies.
# NOTE(review): the path is relative to the notebook's working directory and
# ./data is assumed to exist already - confirm before running on a fresh checkout.
movie_ratings.write_parquet('./data/movies_with_ratings.parquet')