In [None]:
import pandas as pd
import html
import unicodedata
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def clean_text(text: str) -> str:
    """
    Normalise a raw game description into a single tidy line of text.

    Missing values (anything `pd.isnull` reports as null) become ''.
    Otherwise the text goes through, in order:
      * HTML entity decoding,
      * NFKC Unicode normalisation,
      * replacement of every run of CR/LF/TAB characters by one space,
      * replacement of every run of two or more whitespace characters
        by one space,
    and finally has leading/trailing whitespace stripped.
    """
    if pd.isnull(text):
        return ''

    normalized = unicodedata.normalize('NFKC', html.unescape(text))

    # Order matters: line breaks/tabs are turned into spaces first so that
    # the second pass can fold the resulting runs together.
    for pattern in (r'[\r\n\t]+', r'\s{2,}'):
        normalized = re.sub(pattern, ' ', normalized)

    return normalized.strip()


class BGGRankPredictor:
    """
    Predict a game's rating from its description via TF-IDF neighbours.

    The constructor loads a CSV, cleans the description column with
    `clean_text` and fits a `TfidfVectorizer` (English stop words) on it.
    A query description is then compared to every row by cosine similarity
    and the rating is estimated from the most similar rows.
    """

    def __init__(self,
                 csv_path: str,
                 text_column: str = 'description',
                 rating_column: str = 'average_rating',
                 title_column: str = 'name'):
        """
        Load dataset, clean descriptions, and build the TF-IDF matrix.

        Args:
            csv_path: path passed to `pd.read_csv`.
            text_column: column holding the game descriptions.
            rating_column: column holding the ratings to predict.
            title_column: column holding the game titles.

        Raises:
            KeyError: if any of the three columns is absent from the CSV.
        """
        self.df = pd.read_csv(csv_path)

        # Fail fast: a missing rating/title column would otherwise only
        # surface later, inside find_similar_games().
        missing = [col for col in (text_column, rating_column, title_column)
                   if col not in self.df.columns]
        if missing:
            raise KeyError(f'Column(s) missing from {csv_path!r}: {missing}')

        # Store column names
        self.text_column = text_column
        self.rating_column = rating_column
        self.title_column = title_column

        # Clean descriptions in place
        self.df[text_column] = self.df[text_column].fillna('').apply(clean_text)

        # Fit TF-IDF on cleaned descriptions
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.vectorizer.fit_transform(
            self.df[self.text_column]
        )

    def find_similar_games(self,
                           description: str,
                           top_n: int = 3):
        """
        Return the `top_n` games most similar to `description`.

        Args:
            description: free-text description; cleaned with `clean_text`
                before vectorising, like the training descriptions.
            top_n: number of neighbours to return; must be >= 1.

        Returns:
            DataFrame with the title column, the rating column and a
            'similarity' column, ordered from most to least similar.

        Raises:
            ValueError: if `top_n` is smaller than 1.
        """
        # A zero/negative top_n would make the slice below return nothing or
        # "all but the last |n|" rows, so reject it explicitly.
        if top_n < 1:
            raise ValueError(f'top_n must be a positive integer, got {top_n!r}')

        desc_vec = self.vectorizer.transform([clean_text(description)])

        # Similarity of the query against every row of the corpus
        sims = cosine_similarity(desc_vec, self.tfidf_matrix).flatten()

        # Positions of the highest similarities, best first
        top_idx = sims.argsort()[::-1][:top_n]

        results = self.df.iloc[top_idx].copy()
        results['similarity'] = sims[top_idx]
        return results[[self.title_column, self.rating_column, 'similarity']]

    def predict_rating(self,
                       description: str,
                       top_n: int = 3):
        """
        Predict a rating as the similarity-weighted mean of the neighbours.

        Neighbours whose rating is missing are left out of the average
        instead of turning the whole prediction into NaN.

        Args:
            description: free-text description of the game.
            top_n: number of neighbours to use.

        Returns:
            Tuple `(prediction, neighbours)` where `neighbours` is the
            DataFrame returned by `find_similar_games`. The prediction is
            NaN when no neighbour has a rating.
        """
        sims_df = self.find_similar_games(description, top_n)

        # Only neighbours with a known rating can contribute.
        rated = sims_df.dropna(subset=[self.rating_column])
        if rated.empty:
            return float('nan'), sims_df

        weights = rated['similarity'].to_numpy()
        ratings = rated[self.rating_column].to_numpy()

        total_weight = weights.sum()
        if total_weight == 0:
            # No similarity signal at all: fall back to the plain mean of
            # the returned rows rather than dividing by zero.
            pred = ratings.mean()
        else:
            pred = (ratings * weights).sum() / total_weight

        return pred, sims_df



if __name__ == '__main__':
    # Demo: predict the rating of one hand-written description and show
    # the neighbours the prediction was derived from.
    sample = "tile placement game in which the players draw and place a tile with a piece of habitat."
    predictor = BGGRankPredictor('complete_dataset.csv')

    outcome = predictor.predict_rating(sample, top_n=3)
    avg_rating, similar_games = outcome

    print(f'Predicted rating: {avg_rating:.2f}')
    print(similar_games)


Predicted rating: 6.97
                name  average_rating  similarity
4951        Cascadia         7.91467    0.409115
4822  Mondriaan 2020         6.16765    0.407009
6172          NMBR 9         6.83969    0.385795


In [None]:
import html
import re
import unicodedata
from typing import Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


# ---------- text cleaning -------------------------------------------------- #
def clean_text(text: str) -> str:
    """
    Standardise a description string.

    Null input yields ''. Any other input is HTML-unescaped, NFKC
    normalised, has runs of CR/LF/TAB replaced by one space, has runs of
    two or more whitespace characters replaced by one space, and is
    stripped of surrounding whitespace.
    """
    if not pd.isnull(text):
        decoded = html.unescape(text)
        canonical = unicodedata.normalize('NFKC', decoded)
        # Line breaks and tabs first, then fold the leftover space runs.
        one_line = re.compile(r'[\r\n\t]+').sub(' ', canonical)
        squeezed = re.compile(r'\s{2,}').sub(' ', one_line)
        return squeezed.strip()
    return ''


# ---------- predictor ------------------------------------------------------- #
class BGGRankPredictor:
    """
    Predict a game's rating from its description via TF-IDF neighbours.

    The constructor fits a `TfidfVectorizer` on every row of the CSV for
    ad-hoc queries (`find_similar_games`, `predict_rating`), while
    `evaluate` performs its own train/test split and fits a fresh
    vectoriser on the training part only.
    """

    # Number of hold-out rows scored per cosine_similarity call in evaluate().
    # Bounds the size of the dense (batch x train) similarity block.
    _EVAL_BATCH_SIZE = 256

    def __init__(self,
                 csv_path: str,
                 text_column: str = 'description',
                 rating_column: str = 'average_rating',
                 title_column: str = 'name'):
        """
        Load the dataset, clean descriptions and fit TF-IDF on all rows.

        Args:
            csv_path: path passed to `pd.read_csv`.
            text_column: column holding the game descriptions.
            rating_column: column holding the ratings to predict.
            title_column: column holding the game titles.

        Raises:
            KeyError: if any of the three columns is absent from the CSV.
        """
        self.df = pd.read_csv(csv_path)

        # Fail fast: a missing rating/title column would otherwise only
        # surface later, on the first query or evaluation.
        missing = [col for col in (text_column, rating_column, title_column)
                   if col not in self.df.columns]
        if missing:
            raise KeyError(f'Column(s) missing from {csv_path!r}: {missing}')

        self.text_column = text_column
        self.rating_column = rating_column
        self.title_column = title_column

        # Clean descriptions in place
        self.df[self.text_column] = (
            self.df[self.text_column].fillna('').apply(clean_text)
        )

        # Vectorise entire corpus - handy for ad-hoc, non-evaluative calls
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.vectorizer.fit_transform(
            self.df[self.text_column]
        )

    # ----- core helpers ----------------------------------------------------- #
    @staticmethod
    def _check_top_n(top_n: int) -> None:
        """Raise ValueError unless `top_n` is at least 1."""
        # A zero/negative value would make `[:top_n]` return nothing or
        # "all but the last |n|" neighbours.
        if top_n < 1:
            raise ValueError(f'top_n must be a positive integer, got {top_n!r}')

    @staticmethod
    def _weighted_mean(ratings, weights) -> float:
        """
        Similarity-weighted mean of `ratings`, ignoring missing ratings.

        Falls back to the plain mean when all weights are zero and returns
        NaN when no rating is available.
        """
        ratings = np.asarray(ratings, dtype=float)
        weights = np.asarray(weights, dtype=float)

        known = ~np.isnan(ratings)
        if not known.any():
            return float('nan')
        ratings, weights = ratings[known], weights[known]

        if weights.sum() == 0:
            return float(ratings.mean())
        return float(np.average(ratings, weights=weights))

    def _get_top_similar(self,
                         desc_vec,
                         base_matrix,
                         base_df,
                         top_n: int = 3) -> pd.DataFrame:
        """
        Return the `top_n` rows of `base_df` most similar to `desc_vec`.

        `base_matrix` must hold one TF-IDF row per row of `base_df`, in the
        same order. The result carries the title column, the rating column
        and a 'similarity' column, best match first.
        """
        self._check_top_n(top_n)
        sims = cosine_similarity(desc_vec, base_matrix).flatten()
        idx = sims.argsort()[::-1][:top_n]
        out = base_df.iloc[idx].copy()
        out['similarity'] = sims[idx]
        return out[[self.title_column, self.rating_column, 'similarity']]

    # ----- public API ------------------------------------------------------- #
    def find_similar_games(self,
                           description: str,
                           top_n: int = 3) -> pd.DataFrame:
        """
        Return the `top_n` most similar titles within the full dataset.

        Raises:
            ValueError: if `top_n` is smaller than 1.
        """
        self._check_top_n(top_n)
        desc_vec = self.vectorizer.transform([clean_text(description)])
        return self._get_top_similar(desc_vec,
                                     self.tfidf_matrix,
                                     self.df,
                                     top_n=top_n)

    def predict_rating(self,
                       description: str,
                       top_n: int = 3):
        """
        Predict a rating as the similarity-weighted mean of the neighbours.

        Neighbours whose rating is missing are left out of the average.

        Returns:
            Tuple `(prediction, neighbours)` where `neighbours` is the
            DataFrame returned by `find_similar_games`. The prediction is
            NaN when no neighbour has a rating.
        """
        sims_df = self.find_similar_games(description, top_n)
        pred = self._weighted_mean(sims_df[self.rating_column].to_numpy(),
                                   sims_df['similarity'].to_numpy())
        return pred, sims_df

    # ----- evaluation ------------------------------------------------------- #
    def evaluate(self,
                 test_size: float = 0.2,
                 random_state: int = 42,
                 top_n: int = 3,
                 return_predictions: bool = False
                 ) -> Tuple[float, float, Optional[pd.DataFrame]]:
        """
        Hold-out evaluation of the nearest-neighbour rating predictor.

        The rows are split into train/test, TF-IDF is fitted on the train
        descriptions only, and each test row is predicted from its `top_n`
        most similar train rows.

        Args:
            test_size: forwarded to `train_test_split`.
            random_state: forwarded to `train_test_split`.
            top_n: number of neighbours per prediction; must be >= 1.
            return_predictions: when True, also return per-row predictions.

        Returns:
            `(rmse, r2, preds_df)`. `preds_df` has the title column plus
            'actual_rating' and 'predicted', or is None when
            `return_predictions` is False.

        Raises:
            ValueError: if `top_n` is smaller than 1.
        """
        self._check_top_n(top_n)

        # Rows without a rating can serve neither as target nor as
        # neighbour, and a NaN target makes the metric functions raise.
        rated_df = self.df.dropna(subset=[self.rating_column])

        # Split *indices* so the matching rows can be pulled afterwards
        train_idx, test_idx = train_test_split(
            rated_df.index,
            test_size=test_size,
            random_state=random_state,
            shuffle=True
        )
        train_df = rated_df.loc[train_idx]
        test_df = rated_df.loc[test_idx]

        # Fit TF-IDF only on train descriptions; transform the whole test
        # set in one call instead of once per row.
        vect = TfidfVectorizer(stop_words='english')
        train_tfidf = vect.fit_transform(train_df[self.text_column])
        test_tfidf = vect.transform(test_df[self.text_column])

        train_ratings = train_df[self.rating_column].to_numpy()
        y_true = test_df[self.rating_column].to_numpy()

        # Score the test rows in batches: far fewer similarity calls than
        # one per row, while keeping the dense block small.
        y_pred = []
        for start in range(0, test_tfidf.shape[0], self._EVAL_BATCH_SIZE):
            batch = test_tfidf[start:start + self._EVAL_BATCH_SIZE]
            for sims in cosine_similarity(batch, train_tfidf):
                neigh_idx = sims.argsort()[::-1][:top_n]
                y_pred.append(self._weighted_mean(train_ratings[neigh_idx],
                                                  sims[neigh_idx]))

        # Metrics
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)

        if not return_predictions:
            return rmse, r2, None

        preds_df = pd.DataFrame({
            self.title_column: test_df[self.title_column].to_numpy(),
            'actual_rating': y_true,
            'predicted': y_pred,
        })
        return rmse, r2, preds_df

if __name__ == '__main__':
    # Demo: run the hold-out evaluation with 3 neighbours and report the
    # two summary metrics (the per-row predictions are not requested).
    model = BGGRankPredictor('complete_dataset.csv')
    metrics = model.evaluate(top_n=3)
    rmse, r2 = metrics[0], metrics[1]
    print(f'Hold‑out RMSE = {rmse:.3f} | R² = {r2:.3f}')


Hold‑out RMSE = 0.789 | R² = 0.111
