In [1]:
import numpy as np
import pandas as pd

In [2]:
# Locations of the raw, tab-separated user and movie feature tables.
csv_dir = "../data/csv"
user_features_path = f"{csv_dir}/users.tsv"
movie_features_path = f"{csv_dir}/movies.tsv"

### prepare users

In [22]:
# Load the raw user table; read_table parses tab-separated text by default.
users_df = pd.read_table(user_features_path)
users_df

Unnamed: 0,user_id,gender,age_group,region,address
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [23]:
# Remove the 'address' column, which is not used as a feature below.
# Reassign instead of mutating in place, and tolerate the column already
# being absent, so re-running this cell is a no-op rather than a KeyError.
users_df = users_df.drop(columns=['address'], errors='ignore')
users_df

Unnamed: 0,user_id,gender,age_group,region
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15
3,4,M,45,7
4,5,M,25,20
...,...,...,...,...
6035,6036,F,25,15
6036,6037,F,45,1
6037,6038,F,56,1
6038,6039,F,45,0


In [24]:
# Expand each categorical user attribute into one indicator column per
# observed value, then cast the whole frame to int so indicators are 0/1.
user_categorical_cols = ['gender', 'age_group', 'region']
users_encoded = pd.get_dummies(users_df, columns=user_categorical_cols)
users_df = users_encoded.astype(int)
users_df

Unnamed: 0,user_id,gender_F,gender_M,age_group_1,age_group_18,age_group_25,age_group_35,age_group_45,age_group_50,age_group_56,...,region_11,region_12,region_13,region_14,region_15,region_16,region_17,region_18,region_19,region_20
0,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,3,0,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,6036,1,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6036,6037,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,6038,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6038,6039,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### prepare movies

In [27]:
# Load the raw movie table; read_table parses tab-separated text by default.
movies_df = pd.read_table(movie_features_path)
movies_df

Unnamed: 0,movie_id,title,year,fantasy,drama,action,horror,musical,crime,children's,...,documentary,thriller,western,sci-fi,romance,mystery,film-noir,adventure,comedy,animation
0,1,Toy Story,1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
1,2,Jumanji,1995,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,4,Waiting to Exhale,1995,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,Father of the Bride Part II,1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents,2000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3879,3949,Requiem for a Dream,2000,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland,2000,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House,2000,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Label every movie with the decade of its release year (e.g. 1995 -> "1990s").
decade_labels = ((movies_df['year'] // 10) * 10).astype(int).astype(str) + "s"
movies_df['decade'] = decade_labels

# Build one 0/1 indicator column per decade label, named "decade_<label>".
decade_one_hot = pd.get_dummies(movies_df['decade'], prefix='decade').astype(int)

# Append the indicator columns to the right of the existing movie columns.
movies_df = pd.concat((movies_df, decade_one_hot), axis="columns")

# Show the first rows of the widened frame.
movies_df.head()

Unnamed: 0,movie_id,title,year,fantasy,drama,action,horror,musical,crime,children's,...,decade_1910s,decade_1920s,decade_1930s,decade_1940s,decade_1950s,decade_1960s,decade_1970s,decade_1980s,decade_1990s,decade_2000s
0,1,Toy Story,1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,2,Jumanji,1995,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,3,Grumpier Old Men,1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,Waiting to Exhale,1995,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,Father of the Bride Part II,1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [29]:
# Drop the text/raw columns now that the decade indicators have been added.
movie_cols_to_drop = ['title', 'year', 'decade']
movies_df = movies_df.drop(movie_cols_to_drop, axis="columns")
movies_df

Unnamed: 0,movie_id,fantasy,drama,action,horror,musical,crime,children's,war,documentary,...,decade_1910s,decade_1920s,decade_1930s,decade_1940s,decade_1950s,decade_1960s,decade_1970s,decade_1980s,decade_1990s,decade_2000s
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3879,3949,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3880,3950,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3881,3951,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### save CSVs

In [30]:
# Output locations for the processed user and movie feature tables.
processed_dir = "../data/csv"
user_features_proc_path = processed_dir + "/users_proc.csv"
movie_features_proc_path = processed_dir + "/movies_proc.csv"

In [32]:
# Write both processed frames to disk without the DataFrame index.
for frame, out_path in (
    (users_df, user_features_proc_path),
    (movies_df, movie_features_proc_path),
):
    frame.to_csv(out_path, index=False)