In [1]:
cd .. 

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [2]:
import pandas as pd
import polars as pl
from pathlib import Path

In [None]:
from src.table_integration.utils_joins import execute_join

In [3]:
import polars as pl
from catboost import CatBoostRegressor, CatBoostError
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from tqdm import tqdm
import numpy as np

In [4]:
data_dir = "data/source_tables/ken_datasets/the-movies-dataset/"

In [5]:
tab_path = Path(data_dir, "movies-prepared.parquet")
movies = pl.read_parquet(tab_path)

In [6]:
ratings = pl.read_csv(Path(data_dir, "ratings_small.csv"))

In [7]:
ratings = ratings.with_columns(
    pl.col("movieId").cast(pl.Utf8).alias("movieId")
)

In [8]:
ratings.lazy().join(
    movies.lazy(), left_on="movieId", right_on="id"
).select(
    pl.col(["userId","movieId", "rating", "timestamp", "col_to_embed"])
    ).collect().write_parquet(Path(data_dir, "ratings-prepared.parquet"))

In [9]:
movies

adult,budget,genres,id,original_language,original_title,popularity,production_companies,production_countries,release_date,runtime,spoken_languages,status,title,video,vote_average,vote_count,col_to_embed,target
str,str,str,str,str,str,str,str,str,str,f64,str,str,str,bool,f64,f64,str,f64
"""False""","""30000000""","""Animation""","""862""","""en""","""Toy Story""","""21.946943""","""Pixar Animatio…","""US""","""1995""",81.0,"""English""","""Released""","""Toy Story""",false,7.7,5415.0,"""<Toy_Story>""",8.572353
"""False""","""65000000""","""Adventure""","""8844""","""en""","""Jumanji""","""17.015539""","""TriStar Pictur…","""US""","""1995""",104.0,"""English""","""Released""","""Jumanji""",false,6.9,2413.0,"""<Jumanji>""",8.419621
"""False""","""60000000""","""Action""","""949""","""en""","""Heat""","""17.924927""","""Regency Enterp…","""US""","""1995""",170.0,"""English""","""Released""","""Heat""",false,7.7,1886.0,"""<Heat_(1995_fi…",8.272855
"""False""","""35000000""","""Action""","""9091""","""en""","""Sudden Death""","""5.23158""","""Universal Pict…","""US""","""1995""",106.0,"""English""","""Released""","""Sudden Death""",false,5.5,174.0,"""<Sudden_Death_…",7.80855
"""False""","""0""","""Family""","""21032""","""en""","""Balto""","""12.140733""","""Universal Pict…","""US""","""1995""",78.0,"""English""","""Released""","""Balto""",false,7.1,423.0,"""<Balto_(film)>…",7.054932
"""False""","""44000000""","""History""","""10858""","""en""","""Nixon""","""5.092""","""Hollywood Pict…","""US""","""1995""",192.0,"""English""","""Released""","""Nixon""",false,7.1,72.0,"""<Nixon_(film)>…",7.136142
"""False""","""98000000""","""Action""","""1408""","""en""","""Cutthroat Isla…","""7.284477""","""Le Studio Cana…","""FR""","""1995""",119.0,"""English""","""Released""","""Cutthroat Isla…",false,5.7,137.0,"""<Cutthroat_Isl…",7.000752
"""False""","""52000000""","""Drama""","""524""","""en""","""Casino""","""10.137389""","""Universal Pict…","""FR""","""1995""",178.0,"""English""","""Released""","""Casino""",false,7.8,1343.0,"""<Casino_(1995_…",8.064879
"""False""","""4000000""","""Crime""","""5""","""en""","""Four Rooms""","""9.026586""","""Miramax Films""","""US""","""1995""",98.0,"""English""","""Released""","""Four Rooms""",false,6.5,539.0,"""<Four_Rooms>""",6.633468
"""False""","""60000000""","""Action""","""11517""","""en""","""Money Train""","""7.337906""","""Columbia Pictu…","""US""","""1995""",103.0,"""English""","""Released""","""Money Train""",false,5.4,224.0,"""<Money_Train>""",7.549385


In [10]:
credits = pl.read_csv(Path(data_dir, "credits.csv"))

In [11]:
credits

cast,crew,id
str,str,i64
"""[{'cast_id': 1…","""[{'credit_id':…",862
"""[{'cast_id': 1…","""[{'credit_id':…",8844
"""[{'cast_id': 2…","""[{'credit_id':…",15602
"""[{'cast_id': 1…","""[{'credit_id':…",31357
"""[{'cast_id': 1…","""[{'credit_id':…",11862
"""[{'cast_id': 2…","""[{'credit_id':…",949
"""[{'cast_id': 1…","""[{'credit_id':…",11860
"""[{'cast_id': 2…","""[{'credit_id':…",45325
"""[{'cast_id': 1…","""[{'credit_id':…",9091
"""[{'cast_id': 1…","""[{'credit_id':…",710


# Create metadata

In [12]:
from src.utils.data_structures import RawDataset
dataset_path = Path(data_dir, "ratings-prepared.parquet")
dataset_source = "ken_datasets"
metadata_dest = "data/metadata/extra"
ds = RawDataset(dataset_path, dataset_source, metadata_dest)
ds.save_metadata_to_json()


In [68]:
tab_path = Path(data_dir, "movies-prepared.parquet")
movies = pl.read_parquet(tab_path)
tab_path = Path(data_dir, "ratings-prepared.parquet")
ratings = pl.read_parquet(tab_path)


In [69]:
def measure_rmse(y_true, y_pred, squared=False):
    rmse = mean_squared_error(y_true, y_pred, squared=squared)
    return rmse


def measure_r2(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    return r2


In [72]:
src_df = execute_join(movies, ratings, left_on=["col_to_embed"], right_on=["col_to_embed"], how="left", dedup=True)

target_column = "target"
iterations = 1000
cat_features = [k for k, v in src_df.schema.items() if v == pl.Utf8]

y = src_df[target_column].to_pandas()
df = src_df.drop(target_column).to_pandas()
df = df.fillna("null")

k_fold = KFold(n_splits=5)

r2_list = []
rmse_list = []


if len(df) < 5:
    raise ValueError

for train_indices, test_indices in k_fold.split(df):
    X_train = df.iloc[train_indices]
    y_train = y[train_indices]
    X_test = df.iloc[test_indices]
    y_test = y[test_indices]

    model = CatBoostRegressor(cat_features=cat_features, 
                                iterations=iterations,
                                )
    model.fit(X_train, y_train, verbose=0)
    y_pred = model.predict(X_test)
    rmse = measure_rmse(y_test, y_pred)
    r2score = measure_r2(y_test, y_pred)

    r2_list.append(r2score)
    rmse_list.append(rmse)
    
print("rmse %.2f" % np.mean(rmse))
print("r2 %.2f" % np.mean(r2_list))

rmse 0.47
r2 0.62


In [74]:
src_df = execute_join(movies, ratings, left_on=["col_to_embed"], right_on=["col_to_embed"], how="left", dedup=False)

target_column = "target"
iterations = 1000
cat_features = [k for k, v in src_df.schema.items() if v == pl.Utf8]

y = src_df[target_column].to_pandas()
df = src_df.drop(target_column).to_pandas()
df = df.fillna("null")

k_fold = KFold(n_splits=5)

r2_list = []
rmse_list = []


if len(df) < 5:
    raise ValueError

for train_indices, test_indices in k_fold.split(df):
    X_train = df.iloc[train_indices]
    y_train = y[train_indices]
    X_test = df.iloc[test_indices]
    y_test = y[test_indices]

    model = CatBoostRegressor(cat_features=cat_features, 
                                iterations=iterations,
                                )
    model.fit(X_train, y_train, verbose=0)
    y_pred = model.predict(X_test)
    rmse = measure_rmse(y_test, y_pred)
    r2score = measure_r2(y_test, y_pred)

    r2_list.append(r2score)
    rmse_list.append(rmse)
    
print("rmse %.2f" % np.mean(rmse))
print("r2 %.2f" % np.mean(r2_list))

rmse 0.88
r2 0.54


In [73]:
src_df = movies
target_column = "target"
iterations = 1000
cat_features = [k for k, v in src_df.schema.items() if v == pl.Utf8]
y = src_df[target_column].to_pandas()
df = src_df.drop(target_column).to_pandas()
df = df.fillna("null")

k_fold = KFold(n_splits=5)

r2_list = []
rmse_list = []


if len(df) < 5:
    raise ValueError

for train_indices, test_indices in k_fold.split(df):
    X_train = df.iloc[train_indices]
    y_train = y[train_indices]
    X_test = df.iloc[test_indices]
    y_test = y[test_indices]

    model = CatBoostRegressor(cat_features=cat_features, 
                                iterations=iterations,
                                )
    model.fit(X_train, y_train, verbose=0)
    y_pred = model.predict(X_test)
    rmse = measure_rmse(y_test, y_pred)
    r2score = measure_r2(y_test, y_pred)

    r2_list.append(r2score)
    rmse_list.append(rmse)

print("rmse %.2f" % np.mean(rmse))
print("r2 %.2f" % np.mean(r2_list))

rmse 1.12
r2 0.48
