In [1]:
from functools import reduce
import re
import string

import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer


In [2]:
class StemmerWrapper:
    """ Class wrapper to some language stemmer; Via wrapping, I think,
        it is comfortable to operate with stemmer and functions,
        that formats text for systems.
    """
    def __init__(self, lang="russian"):
        """
        :param lang: Initializing stemmer with setting `lang` language
        """
        self.stemmer = SnowballStemmer(lang)
    
    def stem(self, *args, **kwargs):
        return self.stemmer.stem(*args, **kwargs)

    @staticmethod
    def clean_string(sample_s: str) -> str:
        """
        :param sample_s: string to be formatted
        :return: formatted string
        formats given string by removing unnecessary components for
        building recommendation systems
        """
        # string to lowercase
        sample_s = sample_s.strip().lower()
        # removing one-symbol words
        sample_s = re.sub(r'\b[ЁёА-я]{1}\b', '', sample_s)
        # removing punctuation
        sample_s = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', sample_s)
        # removing one-digit numbers
        sample_s = re.sub(r'\b[0-9]{1}\b', '', sample_s)
        # replacing several-in-a-row space symbols with only one space
        sample_s = re.sub(r'\s+', ' ', sample_s)
        return sample_s.strip()


In [3]:
class Loader:
    def __init__(self, stemmer):
        """
        :param stemmer: language stemmer to be used
        """
        self.stemmer = stemmer

    def merge_contents(self,
                       table: str,
                       main_id: str,
                       content_cname: str,
                       columns: list):
        """
        merges content of selected `columns` from `table`;
        check overwritten function for more info.
        """
        pass

    @staticmethod
    def split_series(series):
        """
        :return: (pd.core.series.Series) updated `series`-copy
        :param series: (pd.core.series.Series)
        Splitting values in cell's of column `series` (in-place)
        """
        # handling None values separately
        return series.apply(lambda x: x.split() if type(x) == str else [])

    def format_columns(self,
                       dataframe,
                       main_id_cname: str,
                       content_cname: str,
                       columns: list):
        """
        :param dataframe: contains data to be formatted and used
        :param main_id_cname: item-representing-column's name
        :param content_cname: content-representing-column's name
        :param columns: names of columns in dataframe
        :return: 2-column dataframe, named as main_id_cname and
            content_cname; second columns contains formatted
            and merged `columns` content
        """
        # initializing new column in dataframe with empty strings
        dataframe[content_cname] = ''

        # remember items-id-representing column
        id_series = dataframe[main_id_cname]

        # set dataframe to a `columns`-containing table,
        # where all string infos was split into lists
        dataframe = dataframe[[content_cname] + columns].apply(self.split_series)

        # formatting all rows
        dataframe[content_cname] = reduce(  # firstly, we put add all lists to content containing column
                lambda prev, el: prev + dataframe[el],
                columns,
                dataframe[content_cname]
        ).apply(  # then we would stem all words in this column
                lambda iterable: [self.stemmer.stem(w) for w in iterable]
        ).apply(  # lastly, we join lists to string
                lambda iterable: ' '.join(iterable)
        ).apply(
                StemmerWrapper.clean_string
        )

        # set item-representing-column's data
        dataframe[main_id_cname] = id_series

        # return table representing relationship item - formatted_item_content
        return dataframe[[main_id_cname, content_cname]]

    def parse(self, table: str, columns: list):
        """
        parses selected `columns` from `table`;
        check overwritten function for more info.
        """
        pass


In [4]:
class CsvLoader(Loader):
    def __init__(self, stemmer):
        """
        :param stemmer: language stemmer to be used
        """
        super().__init__(stemmer)

    def merge_contents(self,
                       path: str,
                       main_id_cname: str,
                       content_cname: str,
                       columns: list):
        """
        :param path: (str) path to table, where from data will be read
        :param main_id_cname: (str) item-representing-column's name;
            it is explicit for `path` to have such column
        :param content_cname: (str) content-representing-column's name
        :param columns: (list) columns containing main content,
            that will be used to build a content-based model
        :return: (pd.core.frame.DataFrame) 2-column dataframe
            representing relationship of item and it's content
            (one-to-one relationship)
        """
        return self.format_columns(
                self.parse(path, columns + [main_id_cname]),
                main_id_cname,
                content_cname,
                columns
        )

    def parse(self, table: str, columns: list):
        """
        :param table: database table name
        :param columns: columns to be parsed
        :return: dataframe with parsed columns
        """
        return read_csv(
                table,
                skipinitialspace=True,
                usecols=columns
        )


In [5]:
class RecommendationSystem:
    """
    Root-class representing recommendation system
    """
    def __init__(self):
        pass

    def load_and_build(self, *args):
        pass

    def load(self, *args):
        pass

    def build(self, *args):
        pass


In [6]:
class ContentBasedFiltering(RecommendationSystem):
    """
    Represents recommendation system based on content-based-filtering
    """
    def __init__(self,
                 stemmer):
        super(ContentBasedFiltering, self).__init__()
        # using stemmer
        self.stemmer = stemmer
        # similarity matrix (items_count x items_count)
        self.df_data = None
        # item-representing and content columns names
        self.item_id_cname = None
        self.content_cname = None

    def load(self,
             table: str,
             item_id_cname: str,
             content_cname: str,
             content_columns: list,
             loader_type: str = "csv",
             connection=None):
        """
        :param table: table name where from data will be read (this
            either table of database or path to csv file, depending
            on loader_typ)
        :param item_id_cname: item-representing-column's name
        :param content_cname: content-representing-column's name
        :param content_columns: columns, which contents will be used
            to build model
        :param loader_type: equals either to "csv" or "db", depending
            on `table`
        :param connection: if loader_type equals to db, it must be
            connection to using database (otherwise - whatever)

        Loads data necessary to build model
        """
        self.item_id_cname = item_id_cname
        self.content_cname = content_cname
        if loader_type == "csv":
            loader = CsvLoader(self.stemmer)
        elif loader_type == "db":
            if connection is None:
                raise RuntimeError("SearchEngine::load: received connection equals "
                                   "to None with a loader_type equals db")
            loader = DataBaseLoader(self.stemmer, connection)
        else:
            raise RuntimeError("SearchEngine::load: no loader available for given loader_type")
        self.df_data = loader.merge_contents(
                table,
                self.item_id_cname,
                content_cname,
                content_columns
        )


In [7]:
class SearchEngine(ContentBasedFiltering):
    """
    Class represents search engine. After loading data and building
    model, object of this type will be able to find items, which
    contents are the most similar to given query.
    """
    def __init__(self, stemmer):
        """
        :param stemmer: stemmer to stem words from text data
        """
        super(SearchEngine, self).__init__(stemmer)
        # transformer to vectorize contents
        self.transformer = TfidfVectorizer()
        # tf/idf matrix of items' contents
        self.tfidf_matrix = None

    def build(self):
        """
        building model
        """
        # applying transformer (tf/idf)
        raw_data = self.transformer.fit_transform(self.df_data[self.content_cname])
        # casting to pd.DataFrame type
        self.tfidf_matrix = pd.DataFrame(raw_data.T.toarray())

    @staticmethod
    def cosine_sim(a, b):
        """
        :param a: some vector of size n
        :param b: other vector of size n
        :return: cosine similarity of vectors a and b
        """
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    def search(self,
               query: str,
               res_n: int = 20):
        """
        :param query: search query
        :param res_n: items count to be find
        :return: list (of size no more res_n elements) of most
            competent items' indices (as they saved in data source)
        """
        rows_count, columns_count = self.tfidf_matrix.shape
        # applying transformer's transform to query and getting it's vector
        query_raw = self.transformer.transform([StemmerWrapper.clean_string(query)])
        query_vec = query_raw.toarray().reshape(rows_count)

        # counting cosine similarity of query and each item-content (in tfidf_matrix)
        """
            I have tried this code, but it was slower for 10% compared to raw iterations,
            `sim = self.tfidf_matrix.apply(lambda x: self.cosine_sim(x, query_vec))`
        """

        # NOTE: able to be parallelized
        similarities = list()
        for i in range(columns_count):
            similarities.append(
                    self.cosine_sim(
                            self.tfidf_matrix.loc[:, i].values,
                            query_vec
                    )
            )

        # sort from the best matching to the worst
        similarities = sorted(
                enumerate(similarities),
                key=lambda x: x[1],
                reverse=True
        )

        # getting top `res_count` results' indices
        indices = [pair[0] for pair in similarities[:res_n]]

        # return found items ids
        return self.df_data.iloc[indices][self.item_id_cname].values


In [8]:
ws = StemmerWrapper()
search_eng = SearchEngine(ws)


In [9]:
item_id_cname, item_content_cname = "product_id", "content_info"

print("loading started...")
search_eng.load(
    "data/product.csv",
    item_id_cname,
    item_content_cname,
    ["product_name", "description", "seller"]
)
print("loading finished...")

print("building started...")
search_eng.build()
print("building finished...")


loading started...
loading finished...
building started...
building finished...


In [10]:
query = "золотое с бриллиантами"
res_indices = search_eng.search(query)
print(res_indices)


In [12]:
def retranslate(df, indices, to_print = False):
    content_info = df.loc[df[item_id_cname].isin(indices)][item_content_cname].tolist()
    if to_print:
        print(*content_info, sep='\n\n')
    else:
        return content_info


In [13]:
retranslate(search_eng.df_data, res_indices, True)

кольц золотое кольц из каучук золотом кольц каучук золот 585 стильн кольц кольц из каучука кольц золотое хит сезона каучуков кольц золот 585 пробы кольц выполн из экологическ безопасн материала не вызыва аллергическ реакцию мягк легк материа очен прият телу украшен дополн образ люб стиле классике спортивном повседневн этническ стилях ширин шинк кольц мм размер золот накладк 9х15 мм соблюда некотор рекомендации чтоб сохран красот эластичн украшения берег украшен от механическ повреждений необходим снима аксессуар перед принят ванны поход бассейн спа ил баню резк перепад температур высок уровен влажност плох влия на украшения хран каучуков украшен подальш от воды он может привест аксессуар непригодн состоян ил повредиться для чистк использ мягк ткань например микрофибру каучук предпочита сух чистку аргентерик

кольц кольц из бел золот бриллиантами издел выполн из бел золот 585 пробы вставка бриллиант характеристики 20 бр круг 17 80 90 бр круг 57 70 80 sokolov

ювелирн кольц кольцо обруча

In [14]:
print("items count:\t", search_eng.df_data.shape[0])
%timeit res_indices = search_eng.search(query)

items count:	 3475
138 ms ± 2.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
