### 1 Importing libraries, Data and data conversion
#### 1.1 Download data & set up files
Download the 100K MovieLens dataset from:
https://grouplens.org/datasets/movielens/latest/

Create a folder named `data` one level above this notebook's folder (the code below reads from `../data/100k`), and inside it create a subfolder called `100k`.
Unzip the downloaded 100k archive and place all of its files in this subfolder.

#### 1.2 pip install

In [None]:

# Installs this notebook's dependencies into the kernel's environment.
# Comment the line below out if duckdb and pandas are already installed.
%pip install duckdb pandas


Note: you may need to restart the kernel to use updated packages.


#### 1.3 Importing libraries and folder paths

In [None]:
import duckdb, pandas as pd
from pathlib import Path
import os
 
# Folder holding the MovieLens 100k CSV files, relative to the notebook's
# working directory.
DATA_DIR = Path("..") / "data" / "100k"

# Source CSV files of the dataset.
links_csv = DATA_DIR / "links.csv"
movies_csv = DATA_DIR / "movies.csv"
ratings_csv = DATA_DIR / "ratings.csv"
tags_csv = DATA_DIR / "tags.csv"

# Directory where the converted Parquet files are stored.
# parents=True keeps this cell from failing with FileNotFoundError when the
# data/100k folder has not been created yet; exist_ok=True makes it re-runnable.
PARQUET_DIR = DATA_DIR / "parquet"
PARQUET_DIR.mkdir(parents=True, exist_ok=True)

#### 1.4 Creating a function to create Parquet files

In [None]:
def safe_copy_to_parquet(csv_path, parquet_path, sql_select):
    """Export the result of a SQL query to a Parquet file, replacing any old copy.

    Args:
        csv_path: Source CSV associated with the export. The function does not
            read it (the query text names its own source); the parameter is
            kept so existing callers keep working.
        parquet_path: Destination file, as a ``str`` or ``Path``. If a file
            already exists there it is deleted before the export runs.
        sql_select: SELECT statement whose result is exported. It is inserted
            into the COPY statement verbatim, so it must come from trusted code.

    Returns:
        None. Progress messages are printed to stdout.
    """
    # Accept plain strings as well as Path objects.
    parquet_path = Path(parquet_path)

    if parquet_path.exists():
        print(f"Deleting existing file: {parquet_path}")
        parquet_path.unlink()

    print(f"Creating parquet file: {parquet_path}")

    # Double any single quote so the path remains one valid SQL string literal
    # even when a folder or file name contains an apostrophe.
    target_literal = str(parquet_path).replace("'", "''")

    duckdb.sql(f"""
    COPY (
        {sql_select}
    ) TO '{target_literal}'
    (FORMAT 'parquet');
    """)

    print(f"✔ Finished writing {parquet_path}\n")


#### 1.5 Creating Parquet files
##### 1.5.1 movies.parquet

In [None]:
# converter movies
safe_copy_to_parquet(
    movies_csv,
    PARQUET_DIR / "movies.parquet",
    f"SELECT * FROM read_csv_auto('{movies_csv}')"
)

Creating parquet file: ..\data\100k\parquet\movies.parquet
✔ Finished writing ..\data\100k\parquet\movies.parquet



##### 1.5.2 ratings.parquet

In [None]:
# ratings parquet with correct types
duckdb.sql(f"""
COPY (
    SELECT
        userId::INT     AS userId,
        movieId::INT    AS movieId,
        rating::DOUBLE  AS rating,
        to_timestamp(CAST(timestamp AS BIGINT)) AS timestamp
    FROM read_csv_auto('{ratings_csv}')
) TO '{PARQUET_DIR / "ratings.parquet"}'
(FORMAT 'parquet');
""")

##### 1.5.3 tags.parquet

In [None]:
# tags parquet with correct types
duckdb.sql(f"""
COPY (
    SELECT
        userId::INT     AS userId,
        movieId::INT    AS movieId,
        tag,
        to_timestamp(CAST(timestamp AS BIGINT)) AS timestamp
    FROM read_csv_auto('{tags_csv}')
) TO '{PARQUET_DIR / "tags.parquet"}'
(FORMAT 'parquet');
""")

##### 1.5.4 links.parquet

In [None]:
# converter links
safe_copy_to_parquet(
    links_csv,
    PARQUET_DIR / "links.parquet",
    f"SELECT * FROM read_csv_auto('{links_csv}')"
)

Creating parquet file: ..\data\100k\parquet\links.parquet
✔ Finished writing ..\data\100k\parquet\links.parquet



### 2.0 Creating tables in DuckDB

In [None]:
# Open (or create) the on-disk DuckDB database in the current working directory.
con = duckdb.connect("movielens100K.duckdb")

# Load every Parquet file into a table of the same name.
# The path is built outside the SQL f-string: reusing double quotes inside a
# single-line f"..." expression is a SyntaxError before Python 3.12.
for table_name in ("movies", "ratings", "tags", "links"):
    parquet_file = PARQUET_DIR / f"{table_name}.parquet"
    con.sql(
        f"CREATE OR REPLACE TABLE {table_name} AS "
        f"SELECT * FROM read_parquet('{parquet_file}')"
    )


In [None]:
# show final state
con.sql("SHOW TABLES").df()

Unnamed: 0,name
0,links
1,movies
2,ratings
3,tags


In [None]:
# Show row counts for all tables in the current DuckDB connection
for table in con.sql("SHOW TABLES").df()["name"]:
    count = con.sql(f"SELECT COUNT(*) AS cnt FROM {table}").df()["cnt"][0]
    print(f"Table '{table}': {count} rows")

Table 'links': 9742 rows
Table 'movies': 9742 rows
Table 'ratings': 100836 rows
Table 'tags': 3683 rows


### Close the connection (when done)

In [None]:
# Close the DuckDB connection opened in section 2.0; `con` cannot be used for
# further queries after this cell runs.
con.close()
print("Connection closed.")


Connection closed.
