In [121]:
from src.utils import TrainTestSplitter
import pandas as pd
from sklearn.metrics import mean_squared_error, f1_score

from src.utils import read_pickles
from sklearn.preprocessing import LabelEncoder
import numpy as np
from tqdm import tqdm

# Alternative Least Squares
---

Alternative Least Squares (ALS) is a matrix factorization technique used in recommender systems to predict user preferences for items. It decomposes the user-item interaction matrix into two lower-dimensional matrices, representing users and items, capturing latent factors that influence user behavior and item appeal. ALS iteratively alternates between updating these matrices by solving a series of least squares problems, minimizing the difference between predicted and actual interactions. This process continues until the model converges, achieving accurate predictions. ALS is particularly effective for handling large, sparse datasets, making it a popular choice for recommendation tasks.

Let's talk about math, ALS can be represented in following way:
   $$
   \min_{U, V} \sum_{(i, j) \in \mathcal{K}} (R_{ij} - U_i^T V_j)^2 + \lambda (\|U_i\|^2 + \|V_j\|^2)
   $$
   where:
   - $R$ is the user-item interaction matrix with $R_{ij}$ representing the rating given by user $i$ to item $j$.
   - $U$ is the user matrix of size $m \times k$, where $m$ is the number of users and $k$ is the number of latent factors.
   - $V$ is the item matrix of size $n \times k$, where $n$ is the number of items and $k$ is the number of latent factors.
   - $\mathcal{K}$ is the set of observed (user, item) pairs.
   - $\lambda$ is the regularization parameter to prevent overfitting.
   - $U_i$ is the latent factor vector for user $i$.
   - $V_j$ is the latent factor vector for item $j$.


In [113]:
df_movies, df_users, df_ratings = read_pickles("../../data/ml-1m-after_eda/")
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

df_ratings['UserID'] = user_encoder.fit_transform(df_ratings['UserID'])
df_ratings['MovieID'] = movie_encoder.fit_transform(df_ratings['MovieID'])

rating_matrix = df_ratings.pivot_table(index='UserID', columns='MovieID', values='Rating', fill_value=0)

# Train Test Split

---
For train/test split I will drop 10 percent of data and will ask model to predict it using ALS

![alt text](https://i0.wp.com/neptune.ai/wp-content/uploads/2022/10/How-to-Test-a-Recommender-System27.png?ssl=1
)


In [114]:
train, test, indicies_of_zero = TrainTestSplitter.split_by_deleting_reviews(rating_matrix, 0.1)


In [115]:
class AlternatingLeastSquares:
    def __init__(self, num_factors=10, regularization=0.1, iterations=10):
        self.num_factors = num_factors
        self.regularization = regularization
        self.iterations = iterations

    def fit(self, interaction_matrix):
        self.num_users, self.num_items = interaction_matrix.shape
        self.user_factors = np.random.random((self.num_users, self.num_factors))
        self.item_factors = np.random.random((self.num_items, self.num_factors))

        for iteration in tqdm(range(self.iterations), desc="ALS Training Progress"):
            self.user_factors = self._als_step(interaction_matrix, self.user_factors, self.item_factors)
            self.item_factors = self._als_step(interaction_matrix.T, self.item_factors, self.user_factors)


    def _als_step(self, interaction_matrix, update_vecs, fixed_vecs):
        A = fixed_vecs.T.dot(fixed_vecs) + np.eye(self.num_factors) * self.regularization
        b = interaction_matrix.dot(fixed_vecs)
        A_inv = np.linalg.inv(A)
        update_vecs = b.dot(A_inv)
        return update_vecs


    def predict(self, user_id=None):
        if user_id is None:
            predictions = self.user_factors.dot(self.item_factors.T)
            return predictions
        else:
            predictions =  self.user_factors.dot(self.item_factors.T)
            print(predictions.shape)
            return predictions[user_id]    

    def calculate_mse(self, interaction_matrix):
        if isinstance(interaction_matrix, pd.DataFrame):
            interaction_matrix = interaction_matrix.values

        predictions = self.predict()
        mse = mean_squared_error(interaction_matrix, predictions)
        return mse

    def calculate_f1(self, interaction_matrix, threshold=0.5):
        if isinstance(interaction_matrix, pd.DataFrame):
            print("dataframe detected")
            interaction_matrix = interaction_matrix.values
    
        predictions = self.predict()
        binary_predictions = (predictions >= threshold).astype(int)
        binary_interactions = (interaction_matrix >= threshold).astype(int)
        f1 = f1_score(np.array(binary_interactions).flatten(), np.array(binary_predictions).flatten())
        return f1


In [116]:
als = AlternatingLeastSquares(num_factors=500, regularization=0.1, iterations=100)

als.fit(train)

ALS Training Progress: 100%|██████████| 100/100 [00:36<00:00,  2.72it/s]


In [117]:
def find_recommendations(user_id, number_of_recommendations=15):
    predictions = als.predict(user_id)
    high_score_movie_ids = np.argsort(predictions)[::-1][:number_of_recommendations]
    items_to_recommend = df_movies.where(df_movies['MovieID'].isin(high_score_movie_ids)).dropna()
    print(items_to_recommend[['Title', 'Genres']])

## We would recommend following items to user with id 20


In [118]:
find_recommendations(20, 15)

(6040, 3706)
                                         Title                 Genres
146                  Awfully Big Adventure, An                [Drama]
297                                  Quiz Show                [Drama]
545   Thirty-Two Short Films About Glenn Gould          [Documentary]
770                     Independence Day (ID4)  [Action, Sci-Fi, War]
1650                      Big Bang Theory, The                [Crime]
1677                              Postman, The                [Drama]
1689                              Men of Means        [Action, Drama]
1827                              Cousin Bette               [Comedy]
2026                          Shaggy D.A., The   [Children's, Comedy]
2108                               Family Plot     [Comedy, Thriller]
2661                              Barry Lyndon                [Drama]
3230                                Hanging Up        [Comedy, Drama]


## Calculate MSE 

In [119]:
mse = als.calculate_mse(rating_matrix)
mse

0.17148738668471872

## Calculate F1 

In [120]:
f1 = als.calculate_f1(rating_matrix)
f1

dataframe detected


0.6697590797360656

ALS performs much better if we increase number of factors. During experiments I tried different number of iterations and factors and the second property plays much significant role. Maybe after som e iteration matrix of predictions converges and there is no point of computations, however as number of factors stayed the same it's not surprise that results haven't changed after convergance.