In [1]:
import numpy as np
import pandas as pd
import datapac
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Generating The Matrix

In [2]:
df = datapac.load_movies()

In [3]:
def create_soup(x: pd.Series):
    def join_feat(feat):
        if feat is None:
            parsed = ""
        else:
            v = str(feat).split(",")
            parsed = ' '.join(v)
        return parsed

    out = join_feat(x['genres'])
    out = out + ' ' + join_feat(x['production_companies'])
    out = out + ' ' + join_feat(x['title'])
    return out

In [4]:
df = df[['production_companies', 'genres', 'title', 'id']]
df['soup'] = df.apply(create_soup, axis=1)
count = CountVectorizer(stop_words="english")
count_matrix = count.fit_transform(df['soup'])
cos_sim = cosine_similarity(count_matrix, count_matrix)
df = df.reset_index()

In [5]:
indices = pd.DataFrame(df[['id', 'title']]).drop_duplicates()

## Generating Recommendations

In [8]:
test_user_id = 142983

In [9]:
import datapac
from sklearn.model_selection import train_test_split

rat = datapac.load_ratings().drop_duplicates()
ratings = datapac.clean_ratings(rat)
trainset, test_ratings = train_test_split(ratings, test_size=0.2)

In [10]:
movies = pd.read_csv("data/movies_processed_cleaned.csv")
sim_mat = np.load("matrices/feature_sim_matrix.npy")
# test_ratings = pd.read_csv("data/test_ratings.csv")

In [14]:
def get_recommendations(movie_id: int, cosine_sim):
    sim_scores = list(enumerate(cosine_sim[movie_id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    # sim = [i[1] for i in sim_scores]
    return movie_indices, sim_scores

In [15]:
get_recommendations(1104, sim_mat)

([6845, 19048, 17010, 6177, 9412, 6794, 11039, 12379, 12498, 14908],
 [(6845, 0.46625240412015695),
  (19048, 0.4423258684646915),
  (17010, 0.440086229423352),
  (6177, 0.43070552164653236),
  (9412, 0.4305283859114459),
  (6794, 0.41702882811414954),
  (11039, 0.41702882811414954),
  (12379, 0.41702882811414954),
  (12498, 0.41702882811414954),
  (14908, 0.41702882811414954)])

# Predict and Measure

In [61]:
test_user_id = 142983
rated1 = test_ratings.loc[test_ratings['userId'] == test_user_id]

In [59]:
rated1.loc[rated1['movieId'] == 1831]

Unnamed: 0,userId,rating,movieId
13762019,142983,1.0,1831.0


In [None]:
def predict_rating(user_id:int, target_id:int, similarity_matrix, indices):
    rated_items = test_ratings.loc[test_ratings['userId'] == user_id][['movieId', 'rating']]
    
    weighted_ratings = 0
    similarity_sum = 0

    for rated in rated_items.iterrows():
        seen_mov_id = int(rated[1]['movieId'])
        seen_idx = indices.loc[indices['id']==seen_mov_id].index[0]
        target_idx = indices.loc[indices['id']==target_id].index[0]

        sim = similarity_matrix[seen_idx][target_idx]
        weighted_ratings += sim * rated[1]['rating']
        similarity_sum += sim

    if similarity_sum == 0:
        return np.nan  # No similar items with ratings
    return weighted_ratings / similarity_sum

In [None]:
# pred_rating = predict_rating(test_user_id, 1104, sim_mat)

def test(row):
    out = predict_rating(int(row['userId']), int(row['movieId']), sim_mat, indices)
    # print({'predicted': out, 'actual': row['rating']})
    return pd.Series({'userId': row['userId'], 'movieId': row['movieId'], 'predicted': out, 'actual': row['rating']})


predictions = test_ratings[:10000].apply(test, axis=1)
predictions


Unnamed: 0,userId,movieId,predicted,actual
19010435,197362.0,185.0,3.518450,3.0
21689960,225292.0,45431.0,3.279286,3.5
18790124,195101.0,590.0,3.694442,4.0
13048870,135598.0,2124.0,1.042823,0.5
449130,4627.0,16.0,3.229359,4.0
...,...,...,...,...
13868142,144121.0,2028.0,2.416548,4.0
3138823,32757.0,132.0,2.947032,2.0
1835856,19031.0,480.0,4.095996,4.0
6467545,66740.0,111.0,4.601041,4.5


In [None]:
predictions.to_csv("metrics/content_based_predictions.csv")

In [92]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import json

rmse = root_mean_squared_error(predictions['actual'].values, predictions['predicted'].values)
mae = mean_absolute_error(predictions['actual'].values, predictions['predicted'].values)

with open("metrics/content_based.txt", "w+") as file:
    file.write(json.dumps({'rmse': rmse, 'mae': mae}))