In [1]:
from functools import reduce
import re
import string

from pandas import read_csv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


In [2]:
class StemmerWrapper:
    """ Class wrapper to some language stemmer; Via wrapping, I think,
        it is comfortable to operate with stemmer and functions,
        that formats text for systems.
    """
    def __init__(self, lang="russian"):
        """
        :param lang: Initializing stemmer with setting `lang` language
        """
        self.stemmer = SnowballStemmer(lang)
    
    def stem(self, *args, **kwargs):
        return self.stemmer.stem(*args, **kwargs)

    @staticmethod
    def clean_string(sample_s: str) -> str:
        """
        :param sample_s: string to be formatted
        :return: formatted string
        formats given string by removing unnecessary components for
        building recommendation systems
        """
        # string to lowercase
        sample_s = sample_s.strip().lower()
        # removing one-symbol words
        sample_s = re.sub(r'\b[ЁёА-я]{1}\b', '', sample_s)
        # removing punctuation
        sample_s = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', sample_s)
        # removing one-digit numbers
        sample_s = re.sub(r'\b[0-9]{1}\b', '', sample_s)
        # replacing several-in-a-row space symbols with only one space
        sample_s = re.sub(r'\s+', ' ', sample_s)
        return sample_s.strip()


In [3]:
class Loader:
    def __init__(self, stemmer):
        """
        :param stemmer: language stemmer to be used
        """
        self.stemmer = stemmer

    def merge_contents(self,
                       table: str,
                       main_id: str,
                       content_cname: str,
                       columns: list):
        """
        merges content of selected `columns` from `table`;
        check overwritten function for more info.
        """
        pass

    @staticmethod
    def split_series(series):
        """
        :return: (pd.core.series.Series) updated `series`-copy
        :param series: (pd.core.series.Series)
        Splitting values in cell's of column `series` (in-place)
        """
        # handling None values separately
        return series.apply(lambda x: x.split() if type(x) == str else [])

    def format_columns(self,
                       dataframe,
                       main_id_cname: str,
                       content_cname: str,
                       columns: list):
        """
        :param dataframe: contains data to be formatted and used
        :param main_id_cname: item-representing-column's name
        :param content_cname: content-representing-column's name
        :param columns: names of columns in dataframe
        :return: 2-column dataframe, named as main_id_cname and
            content_cname; second columns contains formatted
            and merged `columns` content
        """
        # initializing new column in dataframe with empty strings
        dataframe[content_cname] = ''

        # remember items-id-representing column
        id_series = dataframe[main_id_cname]

        # set dataframe to a `columns`-containing table,
        # where all string infos was split into lists
        dataframe = dataframe[[content_cname] + columns].apply(self.split_series)

        # formatting all rows
        dataframe[content_cname] = reduce(  # firstly, we put add all lists to content containing column
                lambda prev, el: prev + dataframe[el],
                columns,
                dataframe[content_cname]
        ).apply(  # then we would stem all words in this column
                lambda iterable: [self.stemmer.stem(w) for w in iterable]
        ).apply(  # lastly, we join lists to string
                lambda iterable: ' '.join(iterable)
        ).apply(
                StemmerWrapper.clean_string
        )

        # set item-representing-column's data
        dataframe[main_id_cname] = id_series

        # return table representing relationship item - formatted_item_content
        return dataframe[[main_id_cname, content_cname]]

    def parse(self, table: str, columns: list):
        """
        parses selected `columns` from `table`;
        check overwritten function for more info.
        """
        pass


In [4]:
class CsvLoader(Loader):
    def __init__(self, stemmer):
        """
        :param stemmer: language stemmer to be used
        """
        super().__init__(stemmer)

    def merge_contents(self,
                       path: str,
                       main_id_cname: str,
                       content_cname: str,
                       columns: list):
        """
        :param path: (str) path to table, where from data will be read
        :param main_id_cname: (str) item-representing-column's name;
            it is explicit for `path` to have such column
        :param content_cname: (str) content-representing-column's name
        :param columns: (list) columns containing main content,
            that will be used to build a content-based model
        :return: (pd.core.frame.DataFrame) 2-column dataframe
            representing relationship of item and it's content
            (one-to-one relationship)
        """
        return self.format_columns(
                self.parse(path, columns + [main_id_cname]),
                main_id_cname,
                content_cname,
                columns
        )

    def parse(self, table: str, columns: list):
        """
        :param table: database table name
        :param columns: columns to be parsed
        :return: dataframe with parsed columns
        """
        return read_csv(
                table,
                skipinitialspace=True,
                usecols=columns
        )


In [5]:
class RecommendationSystem:
    """
    Root-class representing recommendation system
    """
    def __init__(self):
        pass

    def load_and_build(self, *args):
        pass

    def load(self, *args):
        pass

    def build(self, *args):
        pass


In [6]:
class ContentBasedFiltering(RecommendationSystem):
    """
    Represents recommendation system based on content-based-filtering
    """
    def __init__(self,
                 stemmer):
        super(ContentBasedFiltering, self).__init__()
        # using stemmer
        self.stemmer = stemmer
        # similarity matrix (items_count x items_count)
        self.df_data = None
        # item-representing and content columns names
        self.item_id_cname = None
        self.content_cname = None

    def load(self,
             table: str,
             item_id_cname: str,
             content_cname: str,
             content_columns: list,
             loader_type: str = "csv",
             connection=None):
        """
        :param table: table name where from data will be read (this
            either table of database or path to csv file, depending
            on loader_typ)
        :param item_id_cname: item-representing-column's name
        :param content_cname: content-representing-column's name
        :param content_columns: columns, which contents will be used
            to build model
        :param loader_type: equals either to "csv" or "db", depending
            on `table`
        :param connection: if loader_type equals to db, it must be
            connection to using database (otherwise - whatever)

        Loads data necessary to build model
        """
        self.item_id_cname = item_id_cname
        self.content_cname = content_cname
        if loader_type == "csv":
            loader = CsvLoader(self.stemmer)
        elif loader_type == "db":
            if connection is None:
                raise RuntimeError("SearchEngine::load: received connection equals "
                                   "to None with a loader_type equals db")
            loader = DataBaseLoader(self.stemmer, connection)
        else:
            raise RuntimeError("SearchEngine::load: no loader available for given loader_type")
        self.df_data = loader.merge_contents(
                table,
                self.item_id_cname,
                content_cname,
                content_columns
        )


In [7]:
class SubstituteItemRS(ContentBasedFiltering):
    """
    Class represents recommendation system, that recommends items,
    which are most similar to exact item with their contents.
    Actually, it is substitute items recommendation system.
    """
    def __init__(self,
                 stemmer,
                 max_features=int(1e5),
                 stop_words=stopwords.words("russian")):
        """
        :param stemmer: stemmer to stem words from text data
        :param max_features: argument for CountVectorizer transformer
            (for saving `max_features` most common words from data)
        :param stop_words: argument for CountVectorizer transformer
            (list of meaningless words to be ignored)
        """
        super(SubstituteItemRS, self).__init__(stemmer)
        self.transformer = CountVectorizer(max_features=max_features,
                                           stop_words=stop_words)
        # similarity matrix
        self.similarity = None

    def build(self):
        """
        building model
        """
        raw_data = self.transformer.fit_transform(self.df_data[self.content_cname])
        
        # we will not drop value in ipynb, cuz want to see content
        #  self.df_data.drop(self.content_cname, axis=1, inplace=True)
        
        self.similarity = cosine_similarity(raw_data.toarray())

    def find_closest(self,
                     item_id: str,
                     res_n: int = 20):
        """
            :param item_id: item's id (as it's saved in data source),
                which substitutes will be found
            :param res_n: substitutes count to be find
            :return: list (of size no more res_n elements) of most
                competent items' indices (as they saved in data source)
        """
        # get index of item as it is saved in our dataframe
        item_index = self.df_data[self.df_data[self.item_id_cname] == item_id].index[0]

        # get it's similarities to all other items (as vector)
        cos_distances = self.similarity[item_index]
        # choose top `res_n` items with most similarities scores
        items_list = sorted(list(enumerate(cos_distances)), reverse=True, key=lambda x: x[1])[1:res_n + 1]
        indices = [item[0] for item in items_list]

        # return found items ids
        return self.df_data.iloc[indices][self.item_id_cname].values

In [8]:
ws = StemmerWrapper()
substitutes_rs = SubstituteItemRS(ws)

In [9]:
item_id_cname, item_content_cname = "product_id", "content_info"

print("loading started...")
substitutes_rs.load(
    "data/product.csv",
    item_id_cname,
    item_content_cname,
    ["product_name", "description", "seller"]
)
print("loading finished...")

print("building started...")
substitutes_rs.build()
print("building finished...")


loading started...
loading finished...
building started...
building finished...


In [10]:
m_item_id = substitutes_rs.df_data[item_id_cname].iloc[2440]
print("item id:\t", m_item_id)

item id:	 11211760


In [11]:
res_indices = substitutes_rs.find_closest(m_item_id)
print("top closest items_ids:\t", res_indices)

top closest items_ids:	 [11211761 10993176 11663605 11663604 11100029  9888271 12118763  9888278
 12043925  9888268  9888270  9888276  9888269  9685986  9888274  9888275
 11990949 24372858 11990952 11990950]


In [12]:
def retranslate(df, indices, to_print = False):
    content_info = df.loc[df[item_id_cname].isin(indices)][item_content_cname].tolist()
    if to_print:
        print(*content_info, sep='\n\n')
    else:
        return content_info


In [13]:
retranslate(substitutes_rs.df_data, [m_item_id], True)

кольц спас сохран миниатюрн колечк ярк камн огранк кабошон внутр кольц начерта слов молитв спас сохрани цвет камешк красив просвечива сквоз крест спрята закрепк камня церковн искусств абсолютн кажд элемент имеет значение поэт даж такие на перв взгляд утончен изыска издел служат напоминан христианин божествен свете свет излуча камнями символизир сиян божествен славы фаворск света высота камешка мм ширина камешка мм материалы серебр 925 пробы гранат техник изготовления литье чернение закрепка ручн работ примечания издел соответств православн канон освящ на оборотн стороне внутри спас сохрани соф


In [14]:
retranslate(substitutes_rs.df_data, res_indices, True)

православн кольц серебрян спас сохрани издел выполн из серебр 925 пробы покрыт чернение ширин верхн част кольца 10 мм ширин шинки мм православн кольц спас сохрани ширин изделия мм освящено серебрян дом

православн кольц серебрян спас сохрани издел выполн из серебр 925 пробы покрыт чернение ширин кольца мм вставки 14 фиан бесцвет кольц надпис спас сохрани освящено серебрян дом

православн кольц серебрян спас сохрани издел выполн из серебр 925 пробы покрыт чернение православн кольц спас сохрани освящ серебрян дом

православн кольц спас сохрани ширин издел мм кольц из серебр алмазн огранк молитв внутри господи спас сохран мя artis

кольц православн спас сохрани ювелирн украшен выполн из серебр 925 пробы покрыт оксидирован чернение ширин верхн част кольц мм ширин шинки мм вставки фиан бесцв круг мм кольц спас сохрани освящено серебрян дом

православн кольц серебрян спас сохрани ювелирн издел выполн из серебр 925 пробы покрыт оксидирован чернение диаметр верх кольца мм ширин шинк мм вставк 

In [15]:
print("items count:\t", substitutes_rs.df_data.shape[0])
%timeit res_indices = substitutes_rs.find_closest(m_item_id)

items count:	 3475
1.88 ms ± 27 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
