In [None]:
class FitPredictStrategy():
    """Chooses the backends used for fitting and for predicting.

    ``fit_on`` and ``predict_on`` must both be listed in ``strategies``.
    When either of them is listed in ``not_priority_strategies``, or when
    ``check_spark_is_local()`` is truthy, the constructor verifies that the
    supplied dataframes fit into the memory budget and raises otherwise.
    """

    strategies = ["spark", "pandas", "polars"]
    # Backends whose use triggers the local-memory check in ``__init__``.
    not_priority_strategies = ["pandas", "polars"]
    # Share of total memory used when ``available_memory_part`` is None.
    DEFAULT_AVAILABLE_PART_OF_MEMORY = 0.3

    def __init__(self, *dfs, fit_on="spark", predict_on="spark", available_memory_part=0.5):
        """
        :param dfs: dataframes whose combined weight is compared with the
            memory budget when a local check is required.
        :param fit_on: backend name used for fitting.
        :param predict_on: backend name used for predicting.
        :param available_memory_part: share of total memory that may be used;
            ``None`` falls back to ``DEFAULT_AVAILABLE_PART_OF_MEMORY``.
        :raises ValueError: if a backend name is not supported, or if the
            dataframes do not fit into the memory budget.
        """
        # NOTE(review): this flag is never switched to True inside this class.
        self.is_local_calculating_enabled = False
        self.fit_on = fit_on
        self.predict_on = predict_on
        # Stays None when no local-memory check is required.
        self.verdict = None

        if self.fit_on not in self.strategies or self.predict_on not in self.strategies:
            raise ValueError("Not supported strategies for fit or predict")

        # The budget must be set for every value of ``available_memory_part``;
        # previously it was only assigned when the argument was None, so the
        # default call failed later with AttributeError.
        if available_memory_part is None:
            available_memory_part = self.DEFAULT_AVAILABLE_PART_OF_MEMORY
        self.available_memory = psutil.virtual_memory().total * available_memory_part

        needs_local_check = (
            self.fit_on in self.not_priority_strategies
            or self.predict_on in self.not_priority_strategies
            or check_spark_is_local()
        )
        if needs_local_check:
            self.verdict = self.check_df_can_calculated_locally(*dfs)
            if not self.verdict:
                raise ValueError("Dataset cant fit inside local computer")

    def check_df_can_calculated_locally(self, *dfs):
        """Tell whether the combined weight of ``dfs`` fits the memory budget.

        :param dfs: dataframes measured with ``weight_of_df``.
        :return: True when ``self.available_memory`` is at least the summed
            weight of all dataframes, False otherwise.
        """
        sum_of_memory = sum(weight_of_df(df) for df in dfs)
        return self.available_memory >= sum_of_memory


from abc import abstractmethod


class PopRec(NonPersonolized_RecommenderSparkImpl):
    """Draft PopRec that delegates to a backend-specific implementation.

    The implementation class is looked up in ``class_map`` by
    ``strategy.fit_on`` and instantiated in ``__init__``.
    """

    # NOTE(review): these names are evaluated while the class body runs, so
    # the three implementation classes must already exist at that point —
    # confirm the definition order in the final module.
    class_map = {
            "spark": _PopRecSpark,
            "pandas": _PopRecPandas,
            "polars": _PopRecPolars
        }

    def __init__(
            self, 
            strategy: FitPredictStrategy, 
            use_rating: bool = False,
            add_cold_items: bool = True,
            cold_weight: float = 0.5,):
        """
        :param strategy: its ``fit_on`` attribute selects the implementation
            class from ``class_map``.
        :param use_rating: stored on the instance and forwarded to the
            implementation.
        :param add_cold_items: stored on the instance and forwarded to the
            implementation.
        :param cold_weight: stored on the instance and forwarded to the
            implementation.
        """
        # NOTE(review): a subclass that inherits this constructor would build
        # another implementation from inside its own construction — verify
        # that implementation classes override ``__init__``.
        self.implementation = self.class_map[strategy.fit_on](strategy, use_rating, add_cold_items, cold_weight)
        self.use_rating = use_rating
        self.add_cold_items = add_cold_items
        self.cold_weight = cold_weight
    
    @abstractmethod
    def _fit(self, dataset=None):
        """Fit hook that the backend-specific classes must override.

        :param dataset: data to fit on; optional here so that calls without
            arguments keep working.
        :raises NotImplementedError: always, in this base class.
        """
        # The exception has to be raised: returning the exception object made
        # an unimplemented fit look like a successful call.
        raise NotImplementedError("Реализуйте в классе PopRecSpark,PopRecPolars, PopRecPandas _fit")

class _PopRecSpark(PopRec):
    """Spark-backed PopRec implementation (unfinished draft)."""

    def _fit(self, dataset):
        """Fit on ``dataset``; not implemented yet.

        :param dataset: data to fit on.
        :raises NotImplementedError: always, until the Spark fit is written.
        """
        # TODO: the draft stopped at ``self.implementation =`` with no
        # right-hand side, which is a SyntaxError and kept the whole cell from
        # parsing. Fail explicitly until the real logic is written.
        raise NotImplementedError("_PopRecSpark._fit is not implemented yet")

In [None]:

class PopRec:
    """Facade that picks a backend-specific PopRec implementation at fit time.

    The backend is chosen from the ``is_spark`` / ``is_pandas`` /
    ``is_polars`` flags of the dataset passed to ``_fit``.
    """

    # Backend name -> name of the implementation class. The classes are
    # defined after this one, so they are referenced by name and resolved
    # when ``_fit`` runs instead of while this class body is evaluated.
    class_map = {
            "spark": "_PopRecSpark",
            "pandas": "_PopRecPandas",
            "polars": "_PopRecPolars"
        }

    def __init__(
            self, 
            use_rating: bool = False,
            add_cold_items: bool = True,
            cold_weight: float = 0.5,):
        """
        :param use_rating: forwarded to the implementation created in ``_fit``.
        :param add_cold_items: forwarded to the implementation created in ``_fit``.
        :param cold_weight: forwarded to the implementation created in ``_fit``.
        """
        self.implementation = None
        self.use_rating = use_rating
        self.add_cold_items = add_cold_items
        self.cold_weight = cold_weight

        self.is_fitted = False
        # This class has no explicit base, so keyword arguments cannot be
        # forwarded: ``object.__init__`` rejects them with TypeError.
        super().__init__()

    @property
    def item_popularity(self):
        """``_item_popularity`` of the current implementation."""
        # NOTE(review): ``_fit`` reads ``implementation.item_popularity``
        # (no underscore) — confirm which attribute the implementations expose.
        return self.implementation._item_popularity
    
    @property
    def fill(self):
        """``_fill`` of the current implementation."""
        return self.implementation._fill
    
    def _fit(self, dataset):
        """Create the implementation matching ``dataset`` and fit it.

        :param dataset: object exposing ``is_spark``, ``is_pandas`` and
            ``is_polars`` flags; the first truthy one selects the backend.
        :raises ValueError: if none of the three flags is truthy.
        """
        for backend, class_name in self.class_map.items():
            if getattr(dataset, f"is_{backend}"):
                implementation_cls = globals()[class_name]
                break
        else:
            raise ValueError("Dataset must be backed by spark, pandas or polars")

        self.implementation = implementation_cls(self.use_rating, self.add_cold_items, self.cold_weight)
        self.implementation._fit(dataset)
        self.is_fitted = True
        self._item_popularity = self.implementation.item_popularity
        self._fill = self.implementation._fill

    def to_pandas(self):
        """Switch the underlying implementation to its pandas variant.

        Does nothing before the first fit, because no implementation has
        been created yet.
        """
        if self.implementation is None:
            return
        converted = self.implementation.to_pandas()
        # Accept both converter styles: one that returns a new implementation
        # and one that converts in place and returns None.
        if converted is not None:
            self.implementation = converted

def _copy_simple_parameters_after_fit(self, impl, copy_implementation):
    """Copy column names and size counters from ``impl`` onto ``copy_implementation``.

    Draft note from the author: intended to live in ``_BaseRecommenderSparkImpl``.

    :param impl: fitted implementation exposing ``feature_schema`` and the
        ``_num_*`` / ``_*_dim_size`` counters.
    :param copy_implementation: object that receives the copied attributes.
    :return: the same ``copy_implementation`` object.
    """
    schema = impl.feature_schema
    schema_fields = (
        ("query_column", "query_id_column"),
        ("item_column", "item_id_column"),
        ("rating_column", "interactions_rating_column"),
        ("timestamp_column", "interactions_timestamp_column"),
    )
    for target_name, schema_name in schema_fields:
        setattr(copy_implementation, target_name, getattr(schema, schema_name))

    for counter_name in ("_num_queries", "_num_items", "_query_dim_size", "_item_dim_size"):
        setattr(copy_implementation, counter_name, getattr(impl, counter_name))

    return copy_implementation

class _PopRecSpark(NonPersonolizedRecommender):
    """Spark-backed PopRec implementation (draft)."""

    def _fit(self, dataset):
        """Fit on ``dataset``; placeholder in this draft."""
        # Standard processing pipeline, same as the current one, on Spark.
        pass

    def _copy_to(self, implementation_cls, convert_frame):
        """Build an ``implementation_cls`` instance mirroring this model.

        :param implementation_cls: target implementation class, called with
            ``(use_rating, add_cold_items, cold_weight)``.
        :param convert_frame: callable applied to ``fit_items``,
            ``fit_queries`` and ``item_popularity``.
        :return: the new implementation; fitted state is copied only when
            ``self.is_fitted`` is truthy.
        """
        copy_implementation = implementation_cls(self.use_rating, self.add_cold_items, self.cold_weight)
        if self.is_fitted:
            # NOTE(review): ``copy_simple_parameters_after_fit`` is not
            # defined in this class — presumably inherited; confirm.
            copy_implementation = self.copy_simple_parameters_after_fit(copy_implementation)
            copy_implementation.fit_items = convert_frame(self.fit_items)
            copy_implementation.fit_queries = convert_frame(self.fit_queries)
            copy_implementation.item_popularity = convert_frame(self.item_popularity)
            copy_implementation.fill = self.fill
            # NOTE(review): ``is_fitted`` is not copied onto the new object —
            # confirm whether the target class needs it set.
        return copy_implementation

    def to_pandas(self):
        """Return a ``_PopRecPandas`` copy of this model.

        The copy used to be built and then dropped because the method had no
        ``return``.
        """
        return self._copy_to(_PopRecPandas, convert2pandas)

    def to_polars(self):
        """Return a ``_PopRecPolars`` copy of this model.

        The copy used to be built and then dropped because the method had no
        ``return``.
        """
        return self._copy_to(_PopRecPolars, convert2polars)

    
class _PopRecPandas(PopRec):
    """Pandas-backed PopRec implementation (draft)."""

    def _fit(self, dataset):
        """Fit on ``dataset``; placeholder in this draft."""
        # Standard processing pipeline, same as the current one, on pandas.
        pass
    
    def copy_simple_parameters_after_fit(self, copy_implementation):
        """Copy column names and size counters from ``self`` onto ``copy_implementation``.

        Draft note from the author: intended to live in ``_BaseRecommenderSparkImpl``.

        :param copy_implementation: object that receives the copied attributes.
        :return: the same ``copy_implementation`` object.
        """
        copy_implementation.query_column = self.feature_schema.query_id_column
        copy_implementation.item_column = self.feature_schema.item_id_column
        copy_implementation.rating_column = self.feature_schema.interactions_rating_column
        copy_implementation.timestamp_column = self.feature_schema.interactions_timestamp_column
        copy_implementation._num_queries = self._num_queries
        copy_implementation._num_items = self._num_items
        copy_implementation._query_dim_size = self._query_dim_size
        copy_implementation._item_dim_size = self._item_dim_size
        return copy_implementation

    def _copy_to(self, implementation_cls, convert_frame):
        """Build an ``implementation_cls`` instance mirroring this model.

        :param implementation_cls: target implementation class, called with
            ``(use_rating, add_cold_items, cold_weight)``.
        :param convert_frame: callable applied to ``fit_items``,
            ``fit_queries`` and ``item_popularity``.
        :return: the new implementation; fitted state is copied only when
            ``self.is_fitted`` is truthy.
        """
        copy_implementation = implementation_cls(self.use_rating, self.add_cold_items, self.cold_weight)
        if self.is_fitted:
            copy_implementation = self.copy_simple_parameters_after_fit(copy_implementation)
            copy_implementation.fit_items = convert_frame(self.fit_items)
            copy_implementation.fit_queries = convert_frame(self.fit_queries)
            copy_implementation.item_popularity = convert_frame(self.item_popularity)
            copy_implementation.fill = self.fill
        return copy_implementation

    def to_spark(self):
        """Return a ``_PopRecSpark`` copy of this model.

        The draft body was a copy of the pandas conversion: it built another
        ``_PopRecPandas`` and called ``convert2pandas``. It now targets Spark.
        """
        return self._copy_to(_PopRecSpark, convert2spark)

    def to_polars(self):
        """Return a ``_PopRecPolars`` copy of this model.

        The copy used to be built and then dropped because the method had no
        ``return``.
        """
        return self._copy_to(_PopRecPolars, convert2polars)



In [1]:
import pandas as pd
from replay.data.dataset import Dataset, FeatureSchema, FeatureInfo, FeatureHint, FeatureType
from replay.utils.spark_utils import convert2spark
from replay.models.implementations.spark.pop_rec import PopRec

In [2]:
# Toy interaction log: six (user, item, rating) rows.
data_frame = pd.DataFrame(
    {
        "user_id": [1, 1, 2, 2, 3, 4],
        "item_id": [1, 2, 2, 3, 3, 3],
        "rating": [0.5, 1, 0.1, 0.8, 0.7, 1],
    }
)

# One FeatureInfo per column, listed in the same order as the frame's columns.
feature_schema = FeatureSchema(
    [
        FeatureInfo(column=column, feature_type=feature_type, feature_hint=feature_hint)
        for column, feature_type, feature_hint in (
            ("user_id", FeatureType.CATEGORICAL, FeatureHint.QUERY_ID),
            ("item_id", FeatureType.CATEGORICAL, FeatureHint.ITEM_ID),
            ("rating", FeatureType.NUMERICAL, FeatureHint.RATING),
        )
    ]
)


In [3]:
# Spark-backed run: the toy pandas frame is converted with convert2spark
# before it is wrapped in a Dataset together with the feature schema.
interactions = convert2spark(data_frame)
dataset = Dataset(feature_schema, interactions)
# PopRec is created with its default arguments and fitted on that dataset.
model = PopRec()
model.fit(dataset)

25/02/11 16:21:17 WARN Utils: Your hostname, ecs-alaleksepetrov resolves to a loopback address: 127.0.1.1; using 10.11.12.194 instead (on interface eth0)
25/02/11 16:21:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/11 16:21:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/11 16:21:18 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
25/02/11 16:21:19 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
25/02/11 16:21:19 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.

In [None]:
# Inspect the model on both sides of model.to_pandas():
#   - implementation.is_fitted shows whether the fitted flag is still set,
#   - type(implementation.fit_items) shows which dataframe library holds the fitted items,
#   - id(model) shows whether `model` is still the same Python object.
print(model.implementation.is_fitted)
print(type(model.implementation.fit_items))
print(id(model))
model.to_pandas()
print(id(model))
print(model.implementation.is_fitted)
print(type(model.implementation.fit_items))

True
<class 'pyspark.sql.dataframe.DataFrame'>
140017715916432


25/02/11 16:15:20 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/02/11 16:15:20 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.


140017715916432
True
<class 'pandas.core.frame.DataFrame'>


In [4]:
# Predict with the fitted model on the same dataset.
# The second positional argument is 1 — presumably the number of
# recommendations per user; TODO confirm against the predict signature.
model.predict(dataset, 1)

25/02/11 16:21:25 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/02/11 16:21:26 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
25/02/11 16:21:26 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
25/02/11 16:21:27 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
25/02/11 16:21:27 WARN HintErrorLogger: Hint (strategy=broadcast) is not supported in the query: build right for right outer join.
25/02/11 16:21:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a sin

DataFrame[user_id: bigint, item_id: bigint, rating: double]

In [5]:
# Fit and predict in a single call on the same dataset.
# The second positional argument is 1 — presumably the number of
# recommendations per user; TODO confirm against the fit_predict signature.
model.fit_predict(dataset, 1)

25/02/11 16:21:28 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/02/11 16:21:28 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/02/11 16:21:28 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/02/11 16:21:28 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/02/11 16:21:29 WARN CacheManager: Asked to cache already cached data.
25/02/11 16:21:29 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/02/11 16:21:29 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.
25/02/11 16:21:30 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has b

DataFrame[user_id: bigint, item_id: bigint, rating: double]

In [6]:
# Convert the dataset and the fitted model to pandas, then predict.
# The converted dataset gets its own name so that `dataset` is not
# overwritten by its own conversion and the earlier cells that use it
# still see the object they created.
# NOTE(review): the recorded run of this cell ended in
# "ValueError: The truth value of a DataFrame is ambiguous" — presumably a
# truthiness check on a pandas frame inside predict; confirm in the model code.
dataset_pandas = dataset.to_pandas(inplace=False)
model.to_pandas()
model.predict(dataset_pandas, 1)

25/02/11 16:21:33 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.
25/02/11 16:21:33 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [3]:
# Pandas-backed run: the raw pandas frame is passed to Dataset without conversion.
# NOTE(review): the recorded run of this cell ended in
# "AttributeError: 'DataFrame' object has no attribute 'select'" — presumably
# fit() still followed a Spark code path; confirm against the model implementation.
# NOTE(review): this cell carries the same execution count (3) as the
# Spark-backed fit cell, so the two were not run in one linear pass.
interactions = data_frame
dataset = Dataset(feature_schema, interactions)
model = PopRec()
model.fit(dataset)

AttributeError: 'DataFrame' object has no attribute 'select'