In [3]:
from replay.utils import PYSPARK_AVAILABLE, PandasDataFrame, SparkDataFrame
from replay.data import Dataset

class _PandasPopRec:
    def __init__(
            self, 
            use_rating: bool = False,
            add_cold_items: bool = True,
            cold_weight: float = 0.5, sample=True, fill=0.0, seed=42, **kwargs):
        self.use_rating = use_rating
        self.sample = sample
        self.fill = fill
        self.seed = seed
        self.add_cold_items = add_cold_items
        self.cold_weight = cold_weight
        self.item_popularity = None
        self.fit_items = None
        self.fit_queries = None
        self.other_params = kwargs

    @staticmethod
    def _calc_fill(item_popularity: PandasDataFrame, weight: float, rating_column: str) -> float:
        """
        Calculating a fill value a the minimal rating
        calculated during model training multiplied by weight.
        """
        return item_popularity[rating_column].min() * weight
    
    def _get_selected_item_popularity(self, items: PandasDataFrame) -> PandasDataFrame:
        """
        Choose only required item from `item_popularity` dataframe
        for further recommendations generation.
        """
        df = self.item_popularity.merge(
            items, on=self.item_column, how='right' if self.add_cold_items else 'inner'
            )
        df = df.fillna(value=self.fill)
        return df

    def _get_fit_counts(self, entity: str) -> int:
        num_entities = "_num_queries" if entity == "query" else "_num_items"
        fit_entities = self.fit_queries if entity == "query" else self.fit_items
        if not hasattr(self, num_entities):
            setattr(
                self,
                num_entities,
                fit_entities.count(),
            )
        return getattr(self, num_entities)

    @property
    def queries_count(self) -> int:
        """
        :returns: number of queries the model was trained on
        """
        return self._get_fit_counts("query")

    def fit(self, dataset: PandasDataFrame):
        self.query_column = dataset.feature_schema.query_id_column
        self.item_column = dataset.feature_schema.item_id_column
        self.rating_column = dataset.feature_schema.interactions_rating_column
        self.timestamp_column = dataset.feature_schema.interactions_timestamp_column
        self.fit_items =  dataset.interactions[self.item_column].drop_duplicates()
        self.fit_queries =  dataset.interactions[self.query_column].drop_duplicates()
        self._num_queries = self.fit_queries.shape[0]
        self._num_items = self.fit_items.shape[0]
        self._query_dim_size = self.fit_queries.max() + 1
        self._item_dim_size = self.fit_items.max() + 1
        interactions_df = dataset.interactions

        if self.use_rating:
            item_popularity = interactions_df.groupby(self.item_column, as_index=False)[self.rating_column].sum()
        else:
            item_popularity = interactions_df.groupby(self.item_column, as_index=False)[self.query_column].nunique()
            item_popularity.rename(columns={self.query_column: self.rating_column}, inplace=True)

        item_popularity[self.rating_column] = item_popularity[self.rating_column] / self.queries_count

        self.item_popularity = item_popularity
        self.fill = self._calc_fill(self.item_popularity, self.cold_weight, self.rating_column)
        return self

    @staticmethod
    def _calc_max_hist_len(dataset: Dataset, queries: PandasDataFrame) -> int:
        query_column = dataset.feature_schema.query_id_column
        item_column = dataset.feature_schema.item_id_column
        merged_df = dataset.merge(queries, on='query_column', how='left')

        # Группировка по столбцу query_column и подсчет уникальных значений в столбце item_column
        grouped_df = merged_df.groupby('query_column').item_column.nunique()

        # Максимальное количество уникальных значений
        max_hist_len = grouped_df.max()
        # all queries have empty history
        if max_hist_len is None:
            max_hist_len = 0

        return max_hist_len

    def get_items_pd(self, items: PandasDataFrame) -> PandasDataFrame:
        """
        Function to calculate normalized popularities(in fact, probabilities)
        of given items. Returns pandas DataFrame.
        """
        selected_item_popularity = self._get_selected_item_popularity(items)
        selected_item_popularity[self.rating_column] = selected_item_popularity.apply(
            lambda row: 0.1 ** 6 if row['rating_column'] == 0.0 else row['rating_column'],
            axis=1
        )

        total_rating = selected_item_popularity[self.rating_column].sum()

        selected_item_popularity["probability"] = selected_item_popularity[self.rating_column] / total_rating
        return selected_item_popularity
    
    def left_wild_join(left_df, right_df, on):
        

    def predict_without_sampling(
        self,
        dataset: Dataset,
        k: int,
        queries: PandasDataFrame,
        items: PandasDataFrame,
        filter_seen_items: bool = True,
    ) -> PandasDataFrame:
        """
        Regular prediction for popularity-based models,
        top-k most relevant items from `items` are chosen for each query
        """
        selected_item_popularity = self._get_selected_item_popularity(items if items is not None else self.fit_items) 
        # TODO: не учел фильтры из _BaseRecommenderSparkImpl _filter_interactions_queries_items_dataframes
        sorted_df = selected_item_popularity.sort_values(by=[self.rating_column, self.item_column], ascending=False)
        selected_item_popularity["rank"] = sorted_df.index + 1

        if filter_seen_items and dataset is not None:
            queries = PandasDataFrame(queries if queries is not None else self.fit_queries)
            query_to_num_items = (
                dataset.interactions.merge(queries
                                           , on=self.query_column)
                .groupby(self.query_column, as_index=False)[self.item_column].nunique()
            ).rename(columns={self.item_column : "num_items"})
            print(query_to_num_items)
            queries = queries.merge(query_to_num_items, on=self.query_column, how="left")
            queries = queries.fillna(0)
            print(queries)
            max_seen = queries['num_items'].max()
            selected_item_popularity = selected_item_popularity.query("rank <= @k + @max_seen")
            joined = queries.merge(selected_item_popularity, how="cross")
            return joined[joined['rank'] <= (k+joined["num_items"])].drop("num_items",axis=1)  # TODO: на пандас и поларс нет left join с нечетким условием. Нужно реализовать через 2 join и union
        joined = queries.merge(selected_item_popularity, how="cross")
        return joined[joined['rank'] <= k].drop("rank")
    
    def predict(self, dataset: PandasDataFrame, k: int, queries, items: PandasDataFrame, filter_seen_items=True) -> PandasDataFrame:
        return self.predict_without_sampling(dataset, k, queries, items, filter_seen_items)

    def fit_predict(self, dataset: PandasDataFrame, k: int, queries: PandasDataFrame = None, items: PandasDataFrame = None, filter_seen_items=True) -> PandasDataFrame:
        self.fit(dataset)
        return self.predict(dataset, k, queries, items, filter_seen_items)
    
    def predict_pairs(self, pairs: PandasDataFrame, dataset: PandasDataFrame = None) ->PandasDataFrame:
        if self.item_popularity is None:
            raise ValueError("Model not fitted. Please call fit() first.")
        preds = pairs.merge(self.item_popularity, on=self.item_column, how="left" if self.add_cold_items else "inner")
        preds[self.rating_column].fillna(self._calc_fill(), inplace=True)
        return preds[[self.query_column, self.item_column, self.rating_column]]

    def predict_proba(self, dataset: PandasDataFrame, k: int, queries, items: PandasDataFrame, filter_seen_items=True) -> PandasDataFrame:
        pass # большая реализация с наследованием и сторонними функциями

    def to_spark(self):

        pass

    def to_pandas(self):
        _SparkPopRec()
        pass


    def save_model(self, path: str, additional_params = None):
        saved_params = {
            "query_column": self.query_column,
            "item_column": self.item_column,
            "rating_column": self.rating_column,
            "timestamp_column": self.timestamp_column,
        }
        if additional_params is not None:
            saved_params.update(additional_params)
        return saved_params
        #save_picklable_to_parquet(saved_params, join(path, "params.dump"))


IndentationError: expected an indented block after function definition on line 116 (740817293.py, line 119)

In [1]:
import pandas as pd
from replay.data.dataset import Dataset, FeatureSchema, FeatureInfo, FeatureHint, FeatureType
from replay.utils.spark_utils import convert2spark
#from replay.utils.common import convert2polars
from replay.models import PopRec

# Toy interaction log: four users, three items, one rating per interaction.
data_frame = pd.DataFrame(
    {
        "user_id": [1, 1, 2, 2, 3, 4],
        "item_id": [1, 2, 2, 3, 3, 3],
        "rating": [0.5, 1, 0.1, 0.8, 0.7, 1],
    }
)

# Schema marking which column is the query id, the item id and the rating.
feature_schema = FeatureSchema(
    [
        FeatureInfo(column=name, feature_type=kind, feature_hint=hint)
        for name, kind, hint in (
            ("user_id", FeatureType.CATEGORICAL, FeatureHint.QUERY_ID),
            ("item_id", FeatureType.CATEGORICAL, FeatureHint.ITEM_ID),
            ("rating", FeatureType.NUMERICAL, FeatureHint.RATING),
        )
    ]
)

In [2]:
# Run the library PopRec on the toy data converted with convert2spark.
# The second argument of fit_predict is presumably k (recommendations per query) - confirm against replay's API.
interactions = convert2spark(data_frame)
dataset = Dataset(feature_schema, interactions)
model = PopRec()
res = model.fit_predict(dataset, 1)


25/03/03 14:10:50 WARN Utils: Your hostname, ecs-alaleksepetrov resolves to a loopback address: 127.0.1.1; using 10.11.12.194 instead (on interface eth0)
25/03/03 14:10:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/03 14:10:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/03 14:10:51 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
25/03/03 14:10:52 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
25/03/03 14:10:52 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.

In [3]:
# Rebuild the dataset from convert2spark output and only fit a fresh PopRec;
# `model` is reused by the following cell.
interactions = convert2spark(data_frame)
dataset = Dataset(feature_schema, interactions)
model = PopRec()
res = model.fit(dataset)
#res.sort('user_id').toPandas()#.sort_values("user_id", ignore_index=True)

25/03/03 14:11:01 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/03/03 14:11:01 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/03/03 14:11:01 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/03/03 14:11:01 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/03/03 14:11:01 WARN CacheManager: Asked to cache already cached data.


In [4]:
from replay.utils.common import convert2polars
# Switch the already fitted model with to_pandas() and predict on a dataset
# built directly from the pandas frame (the polars conversion is commented out).
model.to_pandas()
interactions = data_frame#convert2polars(data_frame)
dataset = Dataset(feature_schema, interactions)
res = model.predict(dataset, 1)
# Show which dataframe type the prediction comes back as.
print(type(res))
res.sort_values("user_id")

25/03/03 14:11:01 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/03/03 14:11:02 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.


{'use_rating': False, 'add_cold_items': True, 'cold_weight': 0.5}
dataset=<replay.data.dataset.Dataset object at 0x7fbb2fa85bd0>, filter_seen_items=True
items
    item_id  rating
0        1    0.25
1        2    0.50
2        3    0.75
selected_item_popularity
    item_id  rating
0        1    0.25
1        2    0.50
2        3    0.75
selected_item_popularity2
    item_id  rating  rank
0        1    0.25     3
1        2    0.50     2
2        3    0.75     1
   user_id  num_items
0        1          2
1        2          2
2        3          1
3        4          1
   user_id  num_items
0        1          2
1        2          2
2        3          1
3        4          1
joined
     user_id  num_items  item_id  rating  rank
0         1          2        1    0.25     3
1         1          2        2    0.50     2
2         1          2        3    0.75     1
3         2          2        1    0.25     3
4         2          2        2    0.50     2
5         2          2        3

Unnamed: 0,user_id,item_id,rating
2,1,3,0.75
3,2,1,0.25
6,3,2,0.5
8,4,2,0.5


In [5]:
# Same experiment on the output of convert2polars (the duplicated
# `interactions = interactions = ...` assignment was collapsed into one).
interactions = convert2polars(data_frame)
dataset = Dataset(feature_schema, interactions)
model = PopRec()
res = model.fit_predict(dataset, 1)#, filter_seen_items=False)
res#.sort_values("user_id", ignore_index=True)

FIT PREDICT: dataset.is_polars=True, filter_seen_items=True
items
 shape: (3, 2)
┌─────────┬────────┐
│ item_id ┆ rating │
│ ---     ┆ ---    │
│ i64     ┆ f64    │
╞═════════╪════════╡
│ 1       ┆ 0.25   │
│ 2       ┆ 0.5    │
│ 3       ┆ 0.75   │
└─────────┴────────┘
selected_item_popularity
 shape: (3, 2)
┌─────────┬────────┐
│ item_id ┆ rating │
│ ---     ┆ ---    │
│ i64     ┆ f64    │
╞═════════╪════════╡
│ 1       ┆ 0.25   │
│ 2       ┆ 0.5    │
│ 3       ┆ 0.75   │
└─────────┴────────┘
selected_item_popularity2
 shape: (3, 3)
┌──────┬─────────┬────────┐
│ rank ┆ item_id ┆ rating │
│ ---  ┆ ---     ┆ ---    │
│ u32  ┆ i64     ┆ f64    │
╞══════╪═════════╪════════╡
│ 1    ┆ 3       ┆ 0.75   │
│ 2    ┆ 2       ┆ 0.5    │
│ 3    ┆ 1       ┆ 0.25   │
└──────┴─────────┴────────┘
joined
 shape: (12, 5)
┌─────────┬───────────┬──────┬─────────┬────────┐
│ user_id ┆ num_items ┆ rank ┆ item_id ┆ rating │
│ ---     ┆ ---       ┆ ---  ┆ ---     ┆ ---    │
│ i64     ┆ u32       ┆ u32  ┆ i64 

user_id,item_id,rating
i64,i64,f64
1,3,0.75
2,1,0.25
3,2,0.5
4,2,0.5


In [6]:
# PopRec with use_rating=True on the plain pandas frame.
interactions = data_frame
dataset = Dataset(feature_schema, interactions)
model = PopRec(use_rating=True)
res = model.fit_predict(dataset, 1)
res#.sort_values("user_id", ignore_index=True)

dataset=<replay.data.dataset.Dataset object at 0x7fa73ad60c10>, filter_seen_items=True
items
    item_id  rating
0        1   0.125
1        2   0.275
2        3   0.625
selected_item_popularity
    item_id  rating
0        1   0.125
1        2   0.275
2        3   0.625
selected_item_popularity2
    item_id  rating  rank
0        1   0.125     3
1        2   0.275     2
2        3   0.625     1
   user_id  num_items
0        1          2
1        2          2
2        3          1
3        4          1
   user_id  num_items
0        1          2
1        2          2
2        3          1
3        4          1
joined
     user_id  num_items  item_id  rating  rank
0         1          2        1   0.125     3
1         1          2        2   0.275     2
2         1          2        3   0.625     1
3         2          2        1   0.125     3
4         2          2        2   0.275     2
5         2          2        3   0.625     1
6         3          1        1   0.125     3
7    

Unnamed: 0,user_id,item_id,rating
2,1,3,0.625
3,2,1,0.125
6,3,2,0.275
8,4,2,0.275


In [6]:
# PopRec with use_rating=True on the output of convert2polars.
interactions = convert2polars(data_frame)
dataset = Dataset(feature_schema, interactions)
model = PopRec(use_rating=True)
res = model.fit_predict(dataset, 1)
res#.sort_values("user_id", ignore_index=True)

FIT PREDICT: dataset.is_polars=True, filter_seen_items=True
items
 shape: (3, 2)
┌─────────┬────────┐
│ item_id ┆ rating │
│ ---     ┆ ---    │
│ i64     ┆ f64    │
╞═════════╪════════╡
│ 2       ┆ 0.275  │
│ 3       ┆ 0.625  │
│ 1       ┆ 0.125  │
└─────────┴────────┘
selected_item_popularity
 shape: (3, 2)
┌─────────┬────────┐
│ item_id ┆ rating │
│ ---     ┆ ---    │
│ i64     ┆ f64    │
╞═════════╪════════╡
│ 2       ┆ 0.275  │
│ 3       ┆ 0.625  │
│ 1       ┆ 0.125  │
└─────────┴────────┘
selected_item_popularity2
 shape: (3, 3)
┌──────┬─────────┬────────┐
│ rank ┆ item_id ┆ rating │
│ ---  ┆ ---     ┆ ---    │
│ u32  ┆ i64     ┆ f64    │
╞══════╪═════════╪════════╡
│ 1    ┆ 3       ┆ 0.625  │
│ 2    ┆ 2       ┆ 0.275  │
│ 3    ┆ 1       ┆ 0.125  │
└──────┴─────────┴────────┘
joined
 shape: (12, 5)
┌─────────┬───────────┬──────┬─────────┬────────┐
│ user_id ┆ num_items ┆ rank ┆ item_id ┆ rating │
│ ---     ┆ ---       ┆ ---  ┆ ---     ┆ ---    │
│ i64     ┆ u32       ┆ u32  ┆ i64 

user_id,item_id,rating
i64,i64,f64
1,3,0.625
2,1,0.125
3,2,0.275
4,2,0.275


In [1]:
from typing import Iterable

# Sanity check that a plain list is recognised as an Iterable.
isinstance([1], Iterable)

True

# Base Class testing

In [7]:
from abc import ABC, abstractmethod
class Base(ABC):
    """
    Facade that forwards its public API to a backend object.

    Subclasses must provide the backend through the ``_implementation``
    property; ``cached_dfs`` and ``fit`` are looked up on that object.
    """

    @property
    @abstractmethod
    def _implementation(self):
        """Backend object that the public members delegate to."""

    @property
    def cached_dfs(self):
        """Value of ``cached_dfs`` read from the backend object."""
        backend = self._implementation
        return backend.cached_dfs

    def fit(self):
        """Call ``fit()`` on the backend object and return its result."""
        backend = self._implementation
        return backend.fit()

class PopRec(Base):
    """
    Experimental facade whose backend is assigned at runtime
    through the ``_implementation`` property.
    """

    def __init__(self):
        self.a = 3
        self.__implementation = None

    @property
    def _implementation(self):
        """Currently installed backend object (``None`` until one is set)."""
        return self.__implementation

    @_implementation.setter
    def _implementation(self, value):
        self.__implementation = value

    def _all_attributes_or_functions(self):  # TODO: where is a better place for this helper?
        """
        List the unique names reachable on the backend: everything reported
        by ``dir`` for the instance and its class, the instance ``__dict__``
        keys and the names annotated on the class.
        """
        backend = self._implementation
        backend_cls = backend.__class__
        names = [
            *dir(backend),
            *backend.__dict__.keys(),
            *getattr(backend_cls, "__annotations__", {}),
            *dir(backend_cls),
        ]
        return list(set(names))

    def f(self):
        """
        Install a ``PopRecSpark`` backend, print its names that contain no
        double underscore and whether ``cached_dfs`` is among them, then
        return the result of the delegated ``fit()``.
        """
        self._implementation = PopRecSpark()
        available = self._all_attributes_or_functions()
        print([name for name in available if '__' not in name])
        print('cached_dfs' in available)
        return self.fit()
    
class PopRecSpark:
    """Minimal backend stand-in used to probe attribute discovery."""

    # Class attribute with a value.
    _A: int = 3
    # Annotation only: no class attribute named ``B`` is created.
    B: str

    def __init__(self, use_rating=True):
        self.use_rating = use_rating

    def fit(self):
        """Return a fixed marker value."""
        return 4
    
# Drive the experiment: f() installs a PopRecSpark backend and delegates fit() to it.
a = PopRec()
a.f()
# PopRecSpark defines no `cached_dfs`, so this delegated property access raises AttributeError.
a.cached_dfs

['use_rating', 'fit', '_A', 'B']
False


AttributeError: 'PopRecSpark' object has no attribute 'cached_dfs'

# LabelEncoder test

In [None]:
import pandas as pd
from replay.preprocessing.label_encoder import LabelEncoder, LabelEncodingRule
# Initial interactions used for the first fit of the encoder.
user_interactions = pd.DataFrame(
    {
        "user_id": ["u1", "u2", "u3"],
        "item_1": ["item_1", "item_2", "item_3"],
        "item_2": ["item_1", "item_2", "item_3"],
    }
)

# Second batch: same items, user ids that do not occur in the first batch.
add_user_interactions = pd.DataFrame(
    {
        "user_id": ["u5", "u6", "u7"],
        "item_1": ["item_1", "item_2", "item_3"],
        "item_2": ["item_1", "item_2", "item_3"],
    }
)

# One encoding rule per column.
encoder = LabelEncoder(
    [LabelEncodingRule(column) for column in ("user_id", "item_1", "item_2")]
)

In [None]:
# Fit the encoder on the initial interactions.
encoder.fit(user_interactions)

<replay.preprocessing.label_encoder.LabelEncoder at 0x7f269036a190>

In [None]:
# Feed the second batch, whose user ids (u5-u7) are absent from the first one;
# partial_fit presumably extends the existing mappings - confirm against the LabelEncoder docs.
encoder.partial_fit(add_user_interactions)



<replay.preprocessing.label_encoder.LabelEncoder at 0x7f269036a190>