In this notebook, the following is done:
1. We read the MovieLens 100k files (ratings, users and movies)
2. One-hot encode what we can: genre, occupation, age, gender and other stuff
3. Drop unimportant columns like zipcode, dates and other stuff
4. Create tensors from the dataframes in order to use them later as feature tensors

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
sns.set_theme()

In [8]:
# Folder that every input file of this notebook is read from.
raw_movielens_folder = "./../data/interim/ml-100k" # even though the folder is named "interim", the data in it is raw

In [9]:
# Names of the genre flag columns; they are appended to the column names
# passed to read_csv when u.item is loaded, so their order matters.
genres = [
    "unknown", "Action", "Adventure", "Animation", "Children",
    "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
    "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
    "Sci-Fi", "Thriller", "War", "Western",
]

In [65]:
# Load the three tables. All files are read with header=None, so the column
# names are supplied here.
df_ratings = pd.read_csv(
    f"{raw_movielens_folder}/u.data",
    sep="\t",
    header=None,
    names=["user_id", "movie_id", "rating", "timestamp"],
)
df_users = pd.read_csv(
    f"{raw_movielens_folder}/u.user",
    sep="|",
    header=None,
    names=["user_id", "age", "gender", "occupation", "zipcode"],
    index_col=0,  # user_id becomes the index
)

# In the interim folder this file has every "||" replaced by "|".
df_movies = pd.read_csv(
    f"{raw_movielens_folder}/u.item",
    sep="|",
    header=None,
    names=["movie_id", "name", "video_date", "link", *genres],
    encoding="latin-1",
    index_col=0,  # movie_id becomes the index
).drop(columns=["link"])  # the link column is not used

In [67]:
# Remove the columns that are not used as features.
# Movies: neither the title nor the date column is used.
df_movies = df_movies.drop(columns=["name", "video_date"])
# Ratings: the timestamp is not used.
df_ratings = df_ratings.drop(columns=["timestamp"])
# Users: the zipcode is not used.
df_users = df_users.drop(columns=["zipcode"])

In [68]:
# One-hot encode gender, then keep only the "F" indicator under the name
# "gender": the column is 1 where gender was "F" and 0 otherwise.
df_users = (
    pd.get_dummies(df_users, columns=["gender"], dtype="int")
    .drop(columns=["gender_M"])
    .rename(columns={"gender_F": "gender"})
)

In [69]:
# One-hot encode the occupation: the "occupation" column is replaced by one
# 0/1 integer column per distinct value, named "occupation_<value>".
df_users = pd.get_dummies(df_users, columns=['occupation'], dtype="int") # one hot encoding the occupation

In [70]:
# Encode the age into categories.
# The edges below are used with pd.cut's default right-closed intervals and
# include_lowest=True, giving [0, 17], (17, 24], (24, 34], (34, 49], (49, inf).
bins = [0, 17, 24, 34, 49, float("inf")]

# One label per interval, in the same order as the intervals
labels = ["Under 18", "18-24", "25-34", "35-49", "Above 49"]

# Add the categorical age_group column next to the numeric age
df_users = df_users.assign(
    age_group=pd.cut(
        df_users["age"],
        bins=bins,
        labels=labels,
        include_lowest=True,
    )
)

In [71]:
# One-hot encode the age group: "age_group" is replaced by 0/1 integer
# columns named "age_group_<label>".
df_users = pd.get_dummies(df_users, columns=['age_group'], dtype="int")

In [72]:
# Select the last column of df_movies; the [-1:] slice keeps it as a
# one-element Index, so exactly ONE column is affected below.
last_1_columns = df_movies.columns[-1:]

# Cast that last column to int.
# NOTE(review): presumably read_csv did not parse this column as int — confirm
# why only this column needs the cast.
df_movies[last_1_columns] = df_movies[last_1_columns].astype(int)

In [73]:
# Show the first ratings row to check which columns remain.
df_ratings.head(1)

Unnamed: 0,user_id,movie_id,rating
0,196,242,3


In [74]:
# Spot-check one random user row. No random_state is passed, so the row shown
# changes from run to run.
df_users.sample(1)

Unnamed: 0_level_0,age,gender,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,occupation_healthcare,...,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer,age_group_Under 18,age_group_18-24,age_group_25-34,age_group_35-49,age_group_Above 49
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
588,18,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [75]:
# Show the first movie row to check which columns remain.
df_movies.head(1)

Unnamed: 0_level_0,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [76]:
# The numeric age is no longer needed: the age_group one-hot columns replace it.
df_users = df_users.drop(columns=["age"])

In [77]:
# Persist the cleaned ratings table.
# NOTE(review): df_ratings was read without index_col and to_csv writes the
# index by default, so the file gets the positional row index as an unnamed
# first column. Readers need index_col=0 (or pass index=False here after
# checking the downstream consumers).
df_ratings.to_csv("./../data/interim/ratings_cleaned.csv")

In [78]:
# Persist the cleaned user and movie feature tables. Both frames were read
# with index_col=0, so their id index is written as the first CSV column.
df_users.to_csv("./../data/interim/users_cleaned.csv")
df_movies.to_csv("./../data/interim/movies_cleaned.csv")

Here we create a tensor from each of those 3 matrices (users, movies and ratings)

In [79]:
import torch

In [80]:
# Feature matrix of the users: one row per row of df_users, one column per
# feature column. The user_id index itself is not part of the array.
# NOTE(review): presumably row i corresponds to user_id i + 1 — verify that
# the user file is sorted by id without gaps.
user_features = df_users.to_numpy()
users_tensor = torch.tensor(user_features)
torch.save(users_tensor, "./../data/interim/users.pt")

In [81]:
# Feature matrix of the movies: one row per row of df_movies, one column per
# remaining column. The movie_id index itself is not part of the array.
# NOTE(review): presumably row i corresponds to movie_id i + 1 — verify that
# the movie file is sorted by id without gaps.
movie_features = df_movies.to_numpy()
movies_tensor = torch.tensor(movie_features)
torch.save(movies_tensor, "./../data/interim/movies.pt")

In [56]:
# Look at the first ratings rows before building the rating matrix.
# NOTE(review): the saved output of this cell looks stale — its execution
# count is out of sequence and it shows user_id as the index, while df_ratings
# is read without index_col. Re-run the notebook top to bottom to refresh it.
df_ratings.head()

Unnamed: 0_level_0,movie_id,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
196,242,3
186,302,3
22,377,1
244,51,2
166,346,1


In [82]:
# Dense user x movie matrix with ratings; 0 means "no rating".
# Entry [u - 1, m - 1] holds the rating that user_id u gave to movie_id m.
#
# The matrix is sized by the LARGEST id rather than by the number of distinct
# ids: the cells are addressed by id - 1, so counting distinct ids would
# overflow the matrix as soon as an id is missing from the ratings. For
# contiguous 1..N ids both sizes are the same.

# If a (user, movie) pair occurs more than once, keep the last occurrence,
# which is what assigning row by row would end up with. This also keeps the
# indexed assignment below free of duplicate indices.
rating_rows = df_ratings.drop_duplicates(subset=["user_id", "movie_id"], keep="last")

has_ratings = len(rating_rows) > 0
n_users = int(rating_rows["user_id"].max()) if has_ratings else 0
n_movies = int(rating_rows["movie_id"].max()) if has_ratings else 0
ratings_tensor = torch.zeros(n_users, n_movies)

# One vectorized indexed assignment instead of a Python loop over every row.
user_idx = torch.as_tensor(rating_rows["user_id"].to_numpy(), dtype=torch.long) - 1
movie_idx = torch.as_tensor(rating_rows["movie_id"].to_numpy(), dtype=torch.long) - 1
rating_values = torch.as_tensor(rating_rows["rating"].to_numpy(), dtype=ratings_tensor.dtype)
ratings_tensor[user_idx, movie_idx] = rating_values

In [83]:
# Persist the user x movie rating matrix next to the other interim tensors.
torch.save(ratings_tensor, "./../data/interim/ratings.pt")