In [1]:
import pandas as pd
import numpy as np


In [2]:
def _read_dat(path, columns):
    """Read one ``::``-delimited ``.dat`` file into a DataFrame.

    Column names are supplied explicitly via ``names``, so the first line of
    the file is read as data rather than as a header. The two-character
    separator is passed together with ``engine="python"``, and the files are
    decoded as latin-1.
    """
    return pd.read_csv(
        path,
        sep="::",
        engine="python",
        encoding="latin-1",
        names=columns,
    )


# One row per (user, movie) rating event.
ratings = _read_dat(
    "../data/raw/ratings.dat",
    ["user_id", "movie_id", "rating", "timestamp"],
)

# Per-user attributes.
users = _read_dat(
    "../data/raw/users.dat",
    ["user_id", "gender", "age", "occupation", "zip"],
)

# Per-movie attributes.
movies = _read_dat(
    "../data/raw/movies.dat",
    ["movie_id", "title", "genres"],
)


In [3]:
# Binarise the explicit rating: 4 or higher becomes 1 (positive), anything
# lower becomes 0.
ratings["implicit_feedback"] = ratings["rating"].ge(4).astype(int)


In [4]:
# Class balance of the binarised label (1 = rating >= 4, 0 = rating < 4).
ratings["implicit_feedback"].value_counts()


implicit_feedback
1    575281
0    424928
Name: count, dtype: int64

In [5]:
# Keep only the positive rows and just the two ID columns, selected in a
# single .loc call instead of two chained bracket lookups.
is_positive = ratings["implicit_feedback"].eq(1)
interactions = ratings.loc[is_positive, ["user_id", "movie_id"]]


In [6]:
# (rows, columns) of the positive-only interaction table.
interactions.shape


(575281, 2)

In [7]:
# Number of positive interactions per user and per movie.
user_counts, movie_counts = (
    interactions[key].value_counts() for key in ("user_id", "movie_id")
)

# IDs that have at least 5 positive interactions.
active_users = user_counts.index[user_counts.to_numpy() >= 5]
popular_movies = movie_counts.index[movie_counts.to_numpy() >= 5]


In [8]:
# A row survives only if both its user and its movie passed the count check.
# NOTE: this is a single pass - the counts were taken before filtering, so a
# user is not re-checked after some of their movies have been dropped.
keeps_user = interactions["user_id"].isin(active_users)
keeps_movie = interactions["movie_id"].isin(popular_movies)
interactions = interactions.loc[keeps_user & keeps_movie]


In [9]:
# (rows, columns) after dropping users/movies below the interaction threshold.
interactions.shape


(574376, 2)

In [10]:
from sklearn.preprocessing import LabelEncoder

# One encoder per ID space. Each maps the raw IDs that survived filtering to
# contiguous integers 0..n-1, which are used as matrix coordinates later on.
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

# `interactions` comes from boolean filtering of another frame, so adding
# columns to it in place raises SettingWithCopyWarning on pandas versions
# without copy-on-write. `assign` returns a new frame instead, which also keeps
# this cell safe to re-run.
interactions = interactions.assign(
    user_idx=user_encoder.fit_transform(interactions["user_id"]),
    movie_idx=movie_encoder.fit_transform(interactions["movie_id"]),
)


In [11]:
# Preview the raw IDs next to their encoded user_idx / movie_idx values.
interactions.head()


Unnamed: 0,user_id,movie_id,user_idx,movie_idx
0,1,1193,0,924
3,1,3408,0,2685
4,1,2355,0,1835
6,1,1287,0,1015
7,1,2804,0,2218


In [12]:
import pickle

# Write each fitted encoder to its own pickle file (presumably so raw IDs can
# be mapped to the same indices outside this notebook).
for encoder, target_path in (
    (user_encoder, "../models/user_encoder.pkl"),
    (movie_encoder, "../models/movie_encoder.pkl"),
):
    with open(target_path, "wb") as f:
        pickle.dump(encoder, f)


In [13]:
from scipy.sparse import csr_matrix

# Matrix dimensions: the number of distinct encoded IDs on each axis.
n_movies = interactions["movie_idx"].nunique()
n_users = interactions["user_idx"].nunique()

# Coordinate triplets: one 1.0 per interaction row.
# Rows are movies and columns are users (an item x user layout).
# NOTE(review): csr_matrix sums values that share a (row, col) coordinate, so
# this assumes each (user, movie) pair occurs at most once - confirm.
row_idx = interactions["movie_idx"].to_numpy()
col_idx = interactions["user_idx"].to_numpy()
values = np.ones(len(interactions))

interaction_matrix = csr_matrix(
    (values, (row_idx, col_idx)),
    shape=(n_movies, n_users),
)


In [14]:
# Show the sparse matrix summary: shape, dtype and number of stored elements.
interaction_matrix


<3125x6034 sparse matrix of type '<class 'numpy.float64'>'
	with 574376 stored elements in Compressed Sparse Row format>

In [15]:
import joblib

# Persist the sparse matrix and the encoded interaction table.
# The to_csv call stays last so the cell renders no output.
joblib.dump(
    value=interaction_matrix,
    filename="../data/processed/interaction_matrix.pkl",
)
interactions.to_csv(
    path_or_buf="../data/processed/interactions.csv",
    index=False,
)


🔑 Feature Engineering Summary

Converted explicit ratings into implicit feedback suitable for real-world recommender systems.

Filtered out users and movies with fewer than 5 positive interactions to reduce noise and sparsity.

Encoded user and item IDs for model compatibility.

Constructed a sparse user–item interaction matrix, stored with movies as rows and users as columns (3125 × 6034).

Saved reusable artifacts for modeling and deployment.