In [1]:
%pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [240]:
%pip install tabulate

Note: you may need to restart the kernel to use updated packages.


In [251]:
import pandas as pd
import numpy as np

In [252]:
import pandas as pd

# Load CSVs
ratings_df = pd.read_csv("./data/ratings.csv")      # has userId, movieId, rating
movies_df = pd.read_csv("./data/movies.csv")        # has all movieId (and maybe title)

# Get all unique users and movies
all_users = ratings_df['userId'].unique()
all_movies = movies_df['movieId'].unique()

# Build full user–movie grid
full_grid = pd.MultiIndex.from_product(
    [all_users, all_movies],
    names=["userId", "movieId"]
).to_frame(index=False)


# Merge: left = full_grid, right = ratings(incl NA)
mergedY = full_grid.merge(ratings_df[['userId', 'movieId', 'rating']], 
                         on=['userId', 'movieId'], how='left')

# Fill missing ratings with 0
mergedY['rating'] = mergedY['rating'].fillna(0)

# Pivot to movie rows, user columns
Y = mergedY.pivot(index='movieId', columns='userId', values='rating')

# Remove row/column labels if you want raw matrix
Y = Y.reset_index(drop=True)
Y.columns.name = None

# Save
Y.to_csv("./data/small_movies_Y.csv", index=False, header=False)



# Add a column that flags actual interactions
ratings_df['interaction'] = 1  # flag real interactions

# Merge: left = full_grid, right = real interactions
merged = full_grid.merge(ratings_df[['userId', 'movieId', 'interaction']], 
                         on=['userId', 'movieId'], how='left')

# Fill missing interactions with 0
merged['interaction'] = merged['interaction'].fillna(0).astype(int)

# Pivot to movie rows, user columns
R = merged.pivot(index='movieId', columns='userId', values='interaction')

# Remove row/column labels if you want raw matrix
R = R.reset_index(drop=True)
R.columns.name = None

# Save
R.to_csv("./data/small_movies_R.csv", index=False, header=False)


In [253]:
num_movies = Y.shape[0]
num_users = Y.shape[1]
num_features = 50  # number of latent features (you choose this)

In [254]:
X = np.random.randn(num_movies, num_features)
df_X = pd.DataFrame(X)
df_X.to_csv("./data/small_movies_X.csv", index=False, header=False)
W = np.random.randn(num_users, num_features)
df_W = pd.DataFrame(W)
df_W.to_csv("./data/small_movies_W.csv", index=False, header=False)
b = np.zeros(num_users)  # or include item bias too
df_b = pd.DataFrame(b)
df_b.to_csv("./data/small_movies_b.csv", index=False, header=False)

In [255]:
print("Y", Y.shape, "R", R.shape)
print("X", X.shape)
print("W", W.shape)
print("b", b.shape)
print("X", X.shape)
print("W", W.shape)
print("b", b.shape)
print("num_features", num_features)
print("num_movies",   num_movies)
print("num_users",    num_users)
print("Y", Y.shape, "R", R.shape)
print("num_features", num_features)
print("num_movies",   num_movies)
print("num_users",    num_users)

Y (9742, 610) R (9742, 610)
X (9742, 50)
W (610, 50)
b (610,)
X (9742, 50)
W (610, 50)
b (610,)
num_features 50
num_movies 9742
num_users 610
Y (9742, 610) R (9742, 610)
num_features 50
num_movies 9742
num_users 610


In [256]:
def load_ratings_small():
    file = open('./data/small_movies_Y.csv', 'rb')
    Y = np.loadtxt(file,delimiter = ",")

    file = open('./data/small_movies_R.csv', 'rb')
    R = np.loadtxt(file,delimiter = ",")
    return(Y,R)

def load_precalc_para_small():

    file = open('./data/small_movies_X.csv', 'rb')
    X = np.loadtxt(file, delimiter = ",")

    file = open('./data/small_movies_W.csv', 'rb')
    W = np.loadtxt(file,delimiter = ",")

    file = open('./data/small_movies_b.csv', 'rb')
    b = np.loadtxt(file,delimiter = ",")
    b = b.reshape(1,-1)
    num_movies, num_features = X.shape
    num_users,_ = W.shape
    return(X, W, b, num_movies, num_features, num_users)

#Load data
X, W, b, num_movies, num_features, num_users = load_precalc_para_small()
Y, R = load_ratings_small()


In [257]:
import tensorflow as tf
from tensorflow import keras
"""
Returns the cost for the content-based filtering
Args:
  X (ndarray (num_movies,num_features)): matrix of item features
  W (ndarray (num_users,num_features)) : matrix of user parameters
  b (ndarray (1, num_users)            : vector of user parameters
  Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies
  R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user
  lambda_ (float): regularization parameter
Returns:
  J (float) : Cost
"""
#Vectorized cost function implementation
def cofi_cost_func(X, W, b, Y, R, lambda_):
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

J = cofi_cost_func(X, W, b, Y, R, 0)
print(f"Cost: {J:.2f}")

J = cofi_cost_func(X ,W ,b , Y ,R , 0);
print(f"Cost: {J:0.2f}")

Cost: 3146094.85
Cost: 3146094.85


In [258]:
#  From the matrix, we can compute statistics like average rating.
tsmean =  np.mean(Y[0, R[0, :].astype(bool)])
print(f"Average rating for movie 1 : {tsmean:0.3f} / 5" )

Average rating for movie 1 : 3.921 / 5


In [259]:
def compute_rmse_mae(Y_true, Y_pred, R):
    # Only consider actual ratings (where R == 1)
    error = (Y_pred - Y_true) * R
    squared_error = np.square(error)
    abs_error = np.abs(error)

    mse = np.sum(squared_error) / np.sum(R)
    mae = np.sum(abs_error) / np.sum(R)

    rmse = np.sqrt(mse)
    return rmse, mae

def normalizeRatings(Y, R):
    """
    Preprocess data by subtracting mean rating for every movie (every row).
    Only include real ratings R(i,j)=1.
    [Ynorm, Ymean] = normalizeRatings(Y, R) normalized Y so that each movie
    has a rating of 0 on average. Unrated moves then have a mean rating (0)
    Returns the mean rating in Ymean.
    """
    Ymean = (np.sum(Y*R,axis=1)/(np.sum(R, axis=1)+1e-12)).reshape(-1,1)
    Ynorm = Y - np.multiply(Ymean, R) 
    return(Ynorm, Ymean)


# Normalize the Dataset
Ynorm, Ymean = normalizeRatings(Y, R)

In [260]:
# Normalize the Dataset
Ynorm, Ymean = normalizeRatings(Y, R)

from sklearn.model_selection import KFold
import tensorflow as tf
import numpy as np

# Convert to numpy if not already
Y = np.array(Ynorm)
R = np.array(R)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
for train_index, test_index in kf.split(Y.T):  # Y.T because users are columns
    print(f"\nFold {fold}")

    # Build train/test masks
    R_train = np.zeros_like(R)
    R_test = np.zeros_like(R)
    for idx in train_index:
        R_train[:, idx] = R[:, idx]
    for idx in test_index:
        R_test[:, idx] = R[:, idx]

    # Initialize model parameters
    X = tf.Variable(np.random.randn(num_movies, num_features), dtype=tf.float32)
    W = tf.Variable(np.random.randn(num_users, num_features), dtype=tf.float32)
    b = tf.Variable(np.zeros((1, num_users)), dtype=tf.float32)
    c = tf.Variable(np.zeros((num_movies, 1)), dtype=tf.float32)


    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    lambda_ = 19.733122695917142

    check_every = 50  # Print every N epochs
    # Train on training set only
    for epoch in range(500):
        with tf.GradientTape() as tape:
            cost = cofi_cost_func(X, W, b, Y, R_train, lambda_)
        grads = tape.gradient(cost, [X, W, b])
        optimizer.apply_gradients(zip(grads, [X, W, b]))
        if epoch % check_every == 0:
        # Compute validation loss
            val_loss = cofi_cost_func(X, W, b, Y, R_test, lambda_)
            print(f"Epoch {epoch:3d} | Train Loss: {cost.numpy():.4f} | Val Loss: {val_loss.numpy():.4f}")

    # Final predictions in normalized space
    preds = tf.matmul(X, tf.transpose(W)) + b
    
    # Convert preds back to original rating scale
    preds_un = preds.numpy() + Ymean  # broadcasting (num_movies x 1)
    
    # Training metrics
    Y_train = Y + Ymean  # unnormalize ground truth
    train_diff = (preds_un - Y_train) * R_train
    train_rmse = np.sqrt(np.sum(train_diff**2) / np.sum(R_train))
    train_mae = np.sum(np.abs(train_diff)) / np.sum(R_train)
    
    # Validation metrics
    test_diff = (preds_un - Y_train) * R_test
    test_rmse = np.sqrt(np.sum(test_diff**2) / np.sum(R_test))
    test_mae = np.sum(np.abs(test_diff)) / np.sum(R_test)


    # Print both
    print(f"Train RMSE for Fold {fold}: {train_rmse:.4f}")
    print(f"Train MAE  for Fold {fold}: {train_mae:.4f}")
    print(f"Test  RMSE for Fold {fold}: {test_rmse:.4f}")
    print(f"Test  MAE  for Fold {fold}: {test_mae:.4f}")

    fold += 1




Fold 1
Epoch   0 | Train Loss: 6918362.0000 | Val Loss: 5694660.0000
Epoch  50 | Train Loss: 2647242.2500 | Val Loss: 2507632.0000
Epoch 100 | Train Loss: 1230522.8750 | Val Loss: 1172143.0000
Epoch 150 | Train Loss: 613444.6250 | Val Loss: 581259.5625
Epoch 200 | Train Loss: 322033.0312 | Val Loss: 302766.0938
Epoch 250 | Train Loss: 179146.8906 | Val Loss: 166340.5312
Epoch 300 | Train Loss: 107210.1484 | Val Loss: 97499.7500
Epoch 350 | Train Loss: 70125.9375 | Val Loss: 61789.9688
Epoch 400 | Train Loss: 50533.6523 | Val Loss: 42710.8984
Epoch 450 | Train Loss: 39894.1094 | Val Loss: 32165.3281
Train RMSE for Fold 1: 0.6800
Train MAE  for Fold 1: 0.5135
Test  RMSE for Fold 1: 0.8148
Test  MAE  for Fold 1: 0.6150

Fold 2
Epoch   0 | Train Loss: 7163435.0000 | Val Loss: 5479692.0000
Epoch  50 | Train Loss: 2702284.5000 | Val Loss: 2467635.5000
Epoch 100 | Train Loss: 1255449.3750 | Val Loss: 1169016.1250
Epoch 150 | Train Loss: 628748.0000 | Val Loss: 584921.6875
Epoch 200 | Train L

In [261]:
import pandas as pd
import re

movies_df = pd.read_csv('./data/movies.csv')

# --- Extract year from title ---
def extract_year(title):
    match = re.search(r"\((\d{4})\)", title)
    return int(match.group(1)) if match else None

movies_df['year'] = movies_df['title'].apply(extract_year)

# --- One-hot encode genres ---
genre_cols = list(set('|'.join(movies_df['genres']).split('|')))
for genre in genre_cols:
    movies_df[f'genre_{genre}'] = movies_df['genres'].apply(lambda x: int(genre in x))

# Drop unneeded columns
cb_features = movies_df.drop(columns=['title', 'genres'])
cb_features

Unnamed: 0,movieId,year,genre_Animation,genre_Comedy,genre_Adventure,genre_Musical,genre_Documentary,genre_Romance,genre_Crime,genre_(no genres listed),...,genre_Action,genre_Children,genre_Thriller,genre_IMAX,genre_Fantasy,genre_War,genre_Film-Noir,genre_Mystery,genre_Drama,genre_Western
0,1,1995.0,1,1,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,2,1995.0,0,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,3,1995.0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1995.0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,1995.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,2017.0,1,1,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
9738,193583,2017.0,1,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9739,193585,2017.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9740,193587,2018.0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [262]:
ratings_df = pd.read_csv('./data/ratings.csv')

cb_dataset = ratings_df.merge(cb_features, on='movieId')

# Features: genre & year
X_cb = cb_dataset.drop(columns=['userId', 'movieId', 'rating', 'timestamp'])
y_cb = cb_dataset['rating']
X_cb

Unnamed: 0,year,genre_Animation,genre_Comedy,genre_Adventure,genre_Musical,genre_Documentary,genre_Romance,genre_Crime,genre_(no genres listed),genre_Sci-Fi,...,genre_Action,genre_Children,genre_Thriller,genre_IMAX,genre_Fantasy,genre_War,genre_Film-Noir,genre_Mystery,genre_Drama,genre_Western
0,1995.0,1,1,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,1995.0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1995.0,0,0,0,0,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,0
3,1995.0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,1995.0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,2017.0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
100832,2017.0,0,0,0,0,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,0
100833,2017.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100834,2017.0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


In [263]:
cf_preds_full = preds_un  # shape: (num_movies, num_users)


In [264]:
import pandas as pd

# Load CSVs
ratings = pd.read_csv('./data/ratings.csv')
movies = pd.read_csv('./data/movies.csv')

In [265]:
import pandas as pd
import re

# Load data
ratings = pd.read_csv('./data/ratings.csv')
movies = pd.read_csv('./data/movies.csv')

# ---------------------------------------
# 🎬 Create content_movie.csv
# ---------------------------------------

# Extract year and encode genres (same as before)
movies['year'] = movies['title'].apply(extract_year)
movies['genres'] = movies['genres'].replace('(no genres listed)', '')
genre_ohe = movies['genres'].str.get_dummies(sep='|')

# Combine movie features
movie_features = pd.concat([movies[['movieId', 'year']], genre_ohe], axis=1)

# Add average rating per movie
movie_rating_ave = ratings.groupby('movieId')['rating'].mean().reset_index(name='ave_rating')
movie_features = pd.merge(movie_features, movie_rating_ave, on='movieId', how='left')

# 🔥 KEY STEP: Merge with ratings to get one row per rating
content_item_train = pd.merge(ratings[['userId', 'movieId', 'rating']], 
                             movie_features, 
                             on='movieId', 
                             how='left')

# Drop userId and rating columns before saving features
content_item_features = content_item_train.drop(['userId', 'rating'], axis=1)
content_item_features.to_csv('./data/content_item_train.csv', index=False, header=False)

# Save the ratings separately
ratings['rating'].to_csv('./data/content_y_train.csv', index=False, header=False)


# ---------------------------------------
# 👤 Create content_user_train.csv
# ---------------------------------------

# Merge ratings with movies to get genres
ratings_movies = pd.merge(ratings, movies[['movieId', 'genres']], on='movieId')

# Explode genres
ratings_movies['genres'] = ratings_movies['genres'].str.split('|')
ratings_genres = ratings_movies.explode('genres')

# Drop empty genre values (in case of '(no genres listed)')
ratings_genres = ratings_genres[ratings_genres['genres'] != '']

# Per-user rating count and average
user_stats = ratings.groupby('userId')['rating'].agg(
    rating_count='count',
    rating_ave='mean'
).reset_index()

# Per-user, per-genre average rating
genre_stats = ratings_genres.groupby(['userId', 'genres'])['rating'].mean().unstack(fill_value=0)
genre_stats.columns = [f"{genre}_ave" for genre in genre_stats.columns]
genre_stats = genre_stats.reset_index()

# Combine
content_user_train = pd.merge(user_stats, genre_stats, on='userId')

# Save to CSV
content_user_train.to_csv('./data/content_user_train.csv', index=False, header=False)

print("✅ Files created: content_user_train.csv and content_item_train.csv")


✅ Files created: content_user_train.csv and content_item_train.csv


In [266]:
ratings = pd.read_csv('./data/ratings.csv')
ratings['rating'].to_csv('./data/content_y_train.csv', index=False, header=False)


# # Keep only movies with ratings
rated_movie_ids = ratings['movieId'].unique()
rated_movies = movies[movies['movieId'].isin(rated_movie_ids)]

# Format output: movieId<TAB>title<TAB>genres
rated_movies[['movieId', 'title', 'genres']].to_csv("./data/content_movie_list.csv",index=False, header=False)# ---- Step 1: Extract year from movie title ----

def extract_year(title):
    match = re.search(r'\((\d{4})\)$', title)
    return int(match.group(1)) if match else None

movies['year'] = movies['title'].apply(extract_year)

# ---- Step 2: Compute average rating per movie ----
movie_avg_rating = ratings.groupby('movieId')['rating'].mean().reset_index(name='ave_rating')

# ---- Step 3: One-hot encode genres ----
movies['genres'] = movies['genres'].replace('(no genres listed)', '')
genre_ohe = movies['genres'].str.get_dummies(sep='|')

# ---- Step 4: Combine features ----
movies_with_genres = pd.concat([movies[['movieId', 'year']], genre_ohe], axis=1)

# ✅ Only include movies that have ratings
content_items_vecs = pd.merge(movie_avg_rating, movies_with_genres, on='movieId', how='inner')

# ---- Step 5: Save to CSV ----
content_items_vecs.to_csv('./data/content_items_vecs.csv', index=False, header=False)

print("✅ content_items_vecs.csv created with only rated movies.")

✅ content_items_vecs.csv created with only rated movies.


In [267]:
import pandas as pd
import numpy as np
from collections import defaultdict
import pickle

# Load MovieLens data
ratings = pd.read_csv('./data/ratings.csv')     # Contains: userId, movieId, rating, timestamp
movies = pd.read_csv('./data/movies.csv')       # Contains: movieId, title, genres

# Define the full genre list including the "(no genres listed)" category
genre_list = [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', '(no genres listed)'
]

# Create a genre vector from a genre string
def get_genre_vector(genres):
    genre_vector = np.zeros(len(genre_list))
    if genres == '(no genres listed)':
        genre_vector[-1] = 1  # Set the last index for "no genres"
    else:
        for i, genre in enumerate(genre_list):
            if genre in genres:
                genre_vector[i] = 1
    return genre_vector

# Create a map from movieId to genre vector
movie_genre_map = {
    row['movieId']: get_genre_vector(row['genres'].split('|'))
    for _, row in movies.iterrows()
}

# Initialize the result dictionary
user_to_genre = defaultdict(lambda: {
    'glist': np.zeros((1, len(genre_list))),
    'g_count': np.zeros((1, len(genre_list))),
    'rating_count': 0,
    'rating_sum': 0.0,
    'movies': {},
    'rating_ave': 0.0
})

# Process each rating
for _, row in ratings.iterrows():
    uid = row['userId']
    mid = row['movieId']
    rating = row['rating']

    if mid not in movie_genre_map:
        continue  # Skip if the movie is not in the genre map

    genre_vec = movie_genre_map[mid]

    user_data = user_to_genre[uid]
    user_data['glist'] += genre_vec * rating
    user_data['g_count'] += genre_vec
    user_data['rating_count'] += 1
    user_data['rating_sum'] += rating
    user_data['movies'][mid] = rating

# Finalize average rating
for uid, data in user_to_genre.items():
    if data['rating_count'] > 0:
        data['rating_ave'] = round(data['rating_sum'] / data['rating_count'], 2)

# Save the result as a pickle file
with open('./data/user_to_genre.pickle', 'wb') as f:
    pickle.dump(dict(user_to_genre), f)


In [268]:
def load_data():
    ''' called to load preprepared data for the lab '''
    df = pd.read_csv('./data/content_item_train.csv')
    print(df.isnull().sum())  # See which columns have NaNs
    item_train = df.select_dtypes(include=[np.number]).values

    # Load user features (without userId)
    user_features_array = pd.read_csv('./data/content_user_train.csv', header=None).values
    
    # Load ratings
    ratings = pd.read_csv('./data/ratings.csv')
    
    # Get unique users in the same order as your user features
    unique_users = sorted(ratings['userId'].unique())
    
    # Create user features DataFrame with userId
    user_features_df = pd.DataFrame(user_features_array)
    user_features_df['userId'] = unique_users
    
    # Merge to get user features for each rating
    ratings_with_user_features = pd.merge(
        ratings[['userId', 'movieId', 'rating']], 
        user_features_df, 
        on='userId', 
        how='left'
    )
    
    # Extract user features (excluding userId, movieId, rating)
    user_train = ratings_with_user_features.drop(['userId', 'movieId', 'rating'], axis=1).values

    
    y_train    = genfromtxt('./data/content_y_train.csv', delimiter=',')
    with open('./data/content_item_train_header.txt', newline='') as f:    #csv reader handles quoted strings better
        item_features = list(csv.reader(f))[0]
    with open('./data/content_user_train_header.txt', newline='') as f:
        user_features = list(csv.reader(f))[0]
    item_vecs = genfromtxt('./data/content_item_vecs.csv', delimiter=',')

    movie_dict = defaultdict(dict)
    count = 0
#    with open('./data/movies.csv', newline='') as csvfile:
    with open('./data/content_movie_list.csv', newline='',encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for line in reader:
            if count == 0:
                count += 1  #skip header
                #print(line) print
            else:
                count += 1
                movie_id = int(line[0])
                movie_dict[movie_id]["title"] = line[1]
                movie_dict[movie_id]["genres"] = line[2]

    with open('./data/content_user_to_genre.pickle', 'rb') as f:
        user_to_genre = pickle.load(f)

    return(item_train, user_train, y_train, item_features, user_features, item_vecs, movie_dict, user_to_genre)


In [269]:
from numpy import genfromtxt
import csv


# Load Data, set configuration variables
item_train, user_train, y_train, item_features, user_features, item_vecs, movie_dict, user_to_genre = load_data()
print("user_train.shape:", user_train.shape)

num_user_features = int(user_train.shape[1] - 3)  # remove userid, rating count and ave rating during training
num_item_features = item_train.shape[1] - 1  # remove movie id at train time
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items
print(f"Number of training vectors: {len(item_train)}")
print(pd.DataFrame(item_train))

1                      0
1995.0                18
0                      0
1.1                    0
1.2                    0
1.3                    0
1.4                    0
0.1                    0
0.2                    0
0.3                    0
1.5                    0
0.4                    0
0.5                    0
0.6                    0
0.7                    0
0.8                    0
0.9                    0
0.10                   0
0.11                   0
0.12                   0
0.13                   0
3.9209302325581397     0
dtype: int64
user_train.shape: (100836, 22)
Number of training vectors: 100835
              0       1    2    3    4    5    6    7    8    9   ...   12  \
0            3.0  1995.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  ...  0.0   
1            6.0  1995.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  ...  0.0   
2           47.0  1995.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   
3           50.0  1995.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0

In [270]:
print(f"y_train[:5]: {y_train[:5]}")

y_train[:5]: [4. 4. 4. 5. 5.]


In [271]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scale training data
item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled    = y_train

print(f"NaNs in item_train: {np.isnan(item_train).sum()}")
print(f"NaNs in user_train: {np.isnan(user_train).sum()}")

# Fix NaNs
item_train = np.nan_to_num(item_train, nan=0.0)
user_train = np.nan_to_num(user_train, nan=0.0)  # Just in case

# UPDATE: Also update the reference data
item_train_unscaled = np.nan_to_num(item_train_unscaled, nan=0.0)
user_train_unscaled = np.nan_to_num(user_train_unscaled, nan=0.0)

scalerItem = StandardScaler()
print(np.isnan(item_train).sum())

scalerItem.fit(item_train)
item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler()
scalerUser.fit(user_train)
user_train = scalerUser.transform(user_train)

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.reshape(-1, 1))
#ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))

print(np.allclose(item_train_unscaled, scalerItem.inverse_transform(item_train)))
print(np.allclose(user_train_unscaled, scalerUser.inverse_transform(user_train)))

NaNs in item_train: 18
NaNs in user_train: 0
0
True
True


In [272]:
item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (80668, 22)
movie/item test data shape: (20167, 22)


In [273]:
# GRADED_CELL
# UNQ_C1

num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs)
])

item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs) 
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
vu = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vu)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

In [274]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

In [275]:
# Split all arrays together
arrays_to_split = [item_train, user_train, y_train]
split_arrays = train_test_split(*arrays_to_split, train_size=0.80, shuffle=True, random_state=1)

item_train, item_test, user_train, user_test, y_train, y_test = split_arrays

In [280]:
import sklearn.metrics

tf.random.set_seed(1)
model.fit([user_train[:, u_s:], item_train[:, i_s:]], y_train, epochs=30)

from sklearn.metrics import mean_squared_error
y_pred = model.predict([user_test[:, u_s:], item_test[:, i_s:]])

rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
print(f"Test RMSE: {rmse:.4f}")

loss = model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], y_test)
print(f"Test MSE from model.evaluate: {loss:.4f}")
print(f"Test RMSE: {np.sqrt(loss):.4f}")  # Should match manual RMSE


Epoch 1/30
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1392
Epoch 2/30
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.1391
Epoch 3/30
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.1390
Epoch 4/30
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.1388
Epoch 5/30
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.1386
Epoch 6/30
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1385
Epoch 7/30
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.1384
Epoch 8/30
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.1383
Epoch 9/30
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.1381
Epoch 10/30
[1m2017/2017[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [281]:
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], y_test)

[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 958us/step - loss: 0.1977


0.19721217453479767

In [282]:
new_user_id = 5000
new_rating_ave = 0.0
new_action = 0.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_rating_count = 3
new_war = 4
new_western = 0
user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, 0.0,  # Film-Noir
                      new_horror, 0.0,  # Musical
                      new_mystery, new_romance, new_scifi, new_thriller, 
                      new_war, new_western, 0.0]])  # (no genres listed)
len(item_vecs)

9725

In [283]:
def gen_user_vecs(user_vec, num_items):
    """ given a user vector return:
        user predict maxtrix to match the size of item_vecs """
    user_vecs = np.tile(user_vec, (num_items, 1))
    return user_vecs

# generate and replicate the user vector to match the number movies in the data set.
user_vecs = gen_user_vecs(user_vec,len(item_vecs))

# scale our user and item vectors
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# make a prediction
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])

# unscale y prediction 
y_pu = scalerTarget.inverse_transform(y_p)

# sort the results, highest prediction first
sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  #negate to get largest rating first
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  #using unscaled vectors for display

[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [284]:
def print_pred_movies(y_p, item, movie_dict, maxcount=10):
    """ print results of prediction of a new user. inputs are expected to be in
        sorted order, unscaled. """
    count = 0
    disp = [["y_p", "movie id", "rating ave", "title", "genres"]]

    for i in range(0, y_p.shape[0]):
        if count == maxcount:
            break
        count += 1
        movie_id = item[i, 0].astype(int)
        disp.append([np.around(y_p[i, 0], 1), item[i, 0].astype(int), np.around(item[i, 2].astype(float), 1),
                     movie_dict[movie_id]['title'], movie_dict[movie_id]['genres']])

    table = tabulate(disp, tablefmt='html', headers="firstrow")
    return table

In [285]:
from tabulate import tabulate
print_pred_movies(sorted_ypu, sorted_items, movie_dict, maxcount = 10)

  movie_id = item[i, 0].astype(int)
  disp.append([np.around(y_p[i, 0], 1), item[i, 0].astype(int), np.around(item[i, 2].astype(float), 1),


KeyError: 'title'

In [286]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X_cb, y_cb, test_size=0.2, random_state=42
)

In [287]:
# Recover validation set (userId, movieId) pairs
val_user_ids = cb_dataset.loc[X_val.index, 'userId'].values
val_movie_ids = cb_dataset.loc[X_val.index, 'movieId'].values

# Map userId and movieId to matrix indices
unique_users = ratings_df['userId'].unique()
unique_movies = ratings_df['movieId'].unique()

user_id_to_idx = {uid: idx for idx, uid in enumerate(unique_users)}
movie_id_to_idx = {mid: idx for idx, mid in enumerate(unique_movies)}

# Build list of predicted values from CF model
cf_preds_aligned = np.array([
    cf_preds_full[movie_id_to_idx[mid], user_id_to_idx[uid]]
    for uid, mid in zip(val_user_ids, val_movie_ids)
])

In [288]:
from sklearn.metrics import mean_squared_error

# Extract the validation set user/movie IDs
val_user_ids = cb_dataset.loc[X_val.index, 'userId'].values
val_movie_ids = cb_dataset.loc[X_val.index, 'movieId'].values

# Map userId and movieId to array indices
val_user_vecs = np.array([user_train_unscaled[user_id_to_idx[uid]] for uid in val_user_ids])
val_item_vecs = np.array([item_train_unscaled[movie_id_to_idx[mid]] for mid in val_movie_ids])

# Scale the vectors
val_user_scaled = scalerUser.transform(val_user_vecs)
val_item_scaled = scalerItem.transform(val_item_vecs)

# Make predictions
cb_preds = model.predict([val_user_scaled[:, 0], val_user_scaled[:, 1:], val_item_scaled[:, :-1]])
cb_preds = scalerTarget.inverse_transform(cb_preds)

# Blend predictions and compute RMSE
for alpha in np.linspace(0, 1, 11):  # 0.0 to 1.0 in steps of 0.1
    final_preds = alpha * cf_preds_aligned + (1 - alpha) * cb_preds
    rmse = mean_squared_error(y_val, final_preds, squared=False)
    print(f"α = {alpha:.1f} | RMSE: {rmse:.4f}")


ValueError: Layer "functional_18" expects 2 input(s), but it received 3 input tensors. Inputs received: [<tf.Tensor 'data:0' shape=(32,) dtype=float32>, <tf.Tensor 'data_1:0' shape=(32, 21) dtype=float32>, <tf.Tensor 'data_2:0' shape=(32, 21) dtype=float32>]