# **Data Preprocessing**

### 1. Download

In [1]:
import kagglehub

# Fetch the dataset through kagglehub. The returned value is used by the
# loading cells below as the prefix of each raw ".dat" file path.
data_path = kagglehub.dataset_download(
    "odedgolden/movielens-1m-dataset",
)

### 2. Load and Explore

In [2]:
import os
import pandas as pd

# Create the output directory up front; exist_ok makes a re-run harmless.
os.makedirs("data", exist_ok=True)

# `names=` supplies the column labels, so the first line of the file is read
# as data. The multi-character "::" separator is why engine="python" is set.
columns = ["movieid", "title", "genre"]
movies = pd.read_csv(
    data_path + "/movies.dat",
    sep="::",
    names=columns,
    engine="python",
    encoding="latin1",
)

# Turn each "A|B|C" genre string into a list of its "|"-separated parts.
movies["genre"] = movies["genre"].map(lambda raw_genres: raw_genres.split("|"))

# Persist the parsed table, then preview it.
movies.to_parquet("data/movies.parquet", engine="pyarrow")
movies.head()

Unnamed: 0,movieid,title,genre
0,1,Toy Story (1995),"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [3]:
# `names=` supplies the column labels, so the first line of the file is read
# as data. The multi-character "::" separator is why engine="python" is set.
columns = ["userid", "movieid", "rating", "timestamp"]
ratings = pd.read_csv(
    data_path + "/ratings.dat",
    sep="::",
    names=columns,
    engine="python",
    encoding="latin1",
)

# Persist the parsed table, then preview it.
ratings.to_parquet("data/ratings.parquet", engine="pyarrow")
ratings.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# `names=` supplies the column labels, so the first line of the file is read
# as data. The multi-character "::" separator is why engine="python" is set.
columns = ["userid", "gender", "age", "occupation", "zipcode"]
users = pd.read_csv(
    data_path + "/users.dat",
    sep="::",
    names=columns,
    engine="python",
    encoding="latin1",
)

# Persist the parsed table, then preview it.
users.to_parquet("data/users.parquet", engine="pyarrow")
users.head()

Unnamed: 0,userid,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


### 3. Feature Engineering

In [5]:
import pandas as pd

# Reload the three tables from the parquet files written by the loading cells.
users = pd.read_parquet("data/users.parquet")
ratings = pd.read_parquet("data/ratings.parquet")
movies = pd.read_parquet("data/movies.parquet")

# Left joins keep every rating row: user attributes are attached on "userid"
# first, then movie attributes on "movieid".
df = pd.merge(
    pd.merge(ratings, users, how="left", on="userid"),
    movies,
    how="left",
    on="movieid",
)
df.head()

Unnamed: 0,userid,movieid,rating,timestamp,gender,age,occupation,zipcode,title,genre
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),[Drama]
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),"[Animation, Children's, Musical]"
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),"[Musical, Romance]"
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),[Drama]
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)","[Animation, Children's, Comedy]"


In [6]:
# Replace the gender labels with their integer category codes. Categories are
# inferred from the column's values.
df["gender"] = pd.Categorical(df["gender"]).codes
df["gender"].value_counts()

gender
1    753769
0    246440
Name: count, dtype: int64

In [7]:
# Rescaled rating: (rating - 1) / 4.
df["rating_norm"] = df["rating"].sub(1).div(4)
# Binary target: 1 when the rating is strictly greater than 3, otherwise 0.
df["rating_binary"] = df["rating"].gt(3).astype(int)
df.head()

Unnamed: 0,userid,movieid,rating,timestamp,gender,age,occupation,zipcode,title,genre,rating_norm,rating_binary
0,1,1193,5,978300760,0,1,10,48067,One Flew Over the Cuckoo's Nest (1975),[Drama],1.0,1
1,1,661,3,978302109,0,1,10,48067,James and the Giant Peach (1996),"[Animation, Children's, Musical]",0.5,0
2,1,914,3,978301968,0,1,10,48067,My Fair Lady (1964),"[Musical, Romance]",0.5,0
3,1,3408,4,978300275,0,1,10,48067,Erin Brockovich (2000),[Drama],0.75,1
4,1,2355,5,978824291,0,1,10,48067,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1.0,1


In [8]:
# Remove the timestamp and the user attribute columns that later cells do not read.
df = df.drop(["timestamp", "age", "occupation", "zipcode"], axis="columns")

In [9]:
from sklearn.preprocessing import LabelEncoder

# Flatten the per-row genre collections into one list of labels.
all_genres = [label for row_genres in df["genre"] for label in row_genres]
# Distinct labels in sorted order.
unique_genres = sorted(set(all_genres))
# fit() returns the encoder itself, so construction and fitting can be chained.
genre_encoder = LabelEncoder().fit(unique_genres)


def encode_genre_list(genre_list, encoder):
    """Encode a collection of genre labels as a plain Python list.

    Args:
        genre_list: Labels handed unchanged to ``encoder.transform``.
        encoder: Object with a ``transform`` method whose result provides
            ``.tolist()`` (here, the fitted ``LabelEncoder``).

    Returns:
        ``encoder.transform(genre_list)`` converted with ``.tolist()``.
    """
    encoded = encoder.transform(genre_list)
    return encoded.tolist()


# Encode each row's genres; `args` forwards the fitted encoder as the second
# positional argument of encode_genre_list.
df["genre_list"] = df["genre"].apply(encode_genre_list, args=(genre_encoder,))
df.head()

Unnamed: 0,userid,movieid,rating,gender,title,genre,rating_norm,rating_binary,genre_list
0,1,1193,5,0,One Flew Over the Cuckoo's Nest (1975),[Drama],1.0,1,[7]
1,1,661,3,0,James and the Giant Peach (1996),"[Animation, Children's, Musical]",0.5,0,"[2, 3, 11]"
2,1,914,3,0,My Fair Lady (1964),"[Musical, Romance]",0.5,0,"[11, 13]"
3,1,3408,4,0,Erin Brockovich (2000),[Drama],0.75,1,[7]
4,1,2355,5,0,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1.0,1,"[2, 3, 4]"


In [10]:
# Drop the raw columns; their derived counterparts (genre_list, rating_norm,
# rating_binary) stay in the frame.
df = df.drop(columns=["genre", "title", "rating"])
# Shuffle all rows (frac=1). The fixed random_state makes the row order, and
# therefore the saved parquet file, identical on every Restart & Run All.
df_shuffled = df.sample(frac=1, random_state=42)
df_shuffled.head()

Unnamed: 0,userid,movieid,gender,rating_norm,rating_binary,genre_list
980082,5915,1100,1,0.5,0,"[0, 13]"
984880,5950,2779,1,0.5,0,[4]
309006,1837,2020,1,0.75,1,"[7, 13]"
337341,1984,1263,1,0.75,1,"[7, 16]"
225966,1369,1294,1,0.75,1,"[4, 16]"


In [11]:
# Write the shuffled, preprocessed frame to disk.
df_shuffled.to_parquet(
    path="data/preprocessed.parquet",
    engine="pyarrow",
)