In [1]:
from functools import reduce
import re
import string

import numpy as np
import scipy.sparse as sparse   
from pandas import read_csv
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer


In [2]:
class StemmerWrapper:
    """Thin convenience layer around a Snowball stemmer.

    Keeps the stemmer and the text-normalisation helper used by the
    recommendation systems in one place.
    """

    def __init__(self, lang="russian"):
        """
        :param lang: language name passed to `SnowballStemmer`
        """
        self.stemmer = SnowballStemmer(lang)

    def stem(self, *args, **kwargs):
        """Forward every argument unchanged to the wrapped stemmer's `stem`."""
        return self.stemmer.stem(*args, **kwargs)

    @staticmethod
    def clean_string(sample_s: str) -> str:
        """
        Normalise a piece of text before it is fed to a recommender.

        :param sample_s: string to be formatted
        :return: lower-cased string with single Cyrillic letters,
            punctuation and stand-alone single digits removed and all
            whitespace runs collapsed to one space
        """
        # (pattern, replacement) pairs; the order matters and is applied
        # top to bottom.
        rewrite_rules = (
            # one-symbol (Cyrillic) words
            (r'\b[ЁёА-я]{1}\b', ''),
            # punctuation becomes a separator
            (r'[%s]' % re.escape(string.punctuation), ' '),
            # one-digit numbers
            (r'\b[0-9]{1}\b', ''),
            # several whitespace symbols in a row -> a single space
            (r'\s+', ' '),
        )

        cleaned = sample_s.strip().lower()
        for pattern, replacement in rewrite_rules:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()


In [3]:
class Loader:
    """
    Base class of the data loaders used by the recommendation systems.

    Storage-specific subclasses must override `parse` and
    `merge_contents`; the shared text formatting (`split_series`,
    `format_columns`) is implemented here.
    """
    def __init__(self, stemmer):
        """
        :param stemmer: language stemmer to be used; only its `stem`
            method is called (by `format_columns`)
        """
        self.stemmer = stemmer

    def merge_contents(self,
                       table: str,
                       main_id: str,
                       content_cname: str,
                       columns: list):
        """
        Merge the content of the selected `columns` from `table`.

        Abstract: see the overriding method of a subclass for details.
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError(
                "Loader::merge_contents must be overridden by a subclass")

    @staticmethod
    def split_series(series):
        """
        :param series: (pd.core.series.Series)
        :return: (pd.core.series.Series) new series of the same length;
            every string cell is replaced by the list of its
            whitespace-separated words, every other cell (None, NaN,
            numbers, ...) by an empty list. `series` is not modified.
        """
        return series.apply(
                lambda cell: cell.split() if isinstance(cell, str) else [])

    def format_columns(self,
                       dataframe,
                       main_id_cname: str,
                       content_cname: str,
                       columns: list):
        """
        :param dataframe: contains data to be formatted and used;
            it is left untouched
        :param main_id_cname: item-representing-column's name
        :param content_cname: content-representing-column's name
        :param columns: names of columns in dataframe
        :return: 2-column dataframe, named as main_id_cname and
            content_cname; second column contains formatted
            and merged `columns` content

        NOTE(review): words are stemmed before punctuation is stripped,
        so a token with punctuation attached reaches the stemmer as-is;
        confirm this order is intended.
        """
        # remember items-id-representing column
        id_series = dataframe[main_id_cname]

        # work on a copy so that the caller's dataframe does not silently
        # gain a `content_cname` column
        working = dataframe.copy()
        # the content column starts as empty strings, i.e. empty word lists
        working[content_cname] = ''

        # every cell of the selected columns becomes a list of words
        tokens = working[[content_cname] + columns].apply(self.split_series)

        # concatenate the word lists of all `columns`, row by row
        merged_words = reduce(
                lambda acc, name: acc + tokens[name],
                columns,
                tokens[content_cname]
        )

        # stem every word, join the words back to one string and clean it
        tokens[content_cname] = merged_words.apply(
                lambda words: ' '.join(self.stemmer.stem(w) for w in words)
        ).apply(
                StemmerWrapper.clean_string
        )

        # set item-representing-column's data
        tokens[main_id_cname] = id_series

        # table representing the item -> content relationship
        return tokens[[main_id_cname, content_cname]]

    def parse(self, table: str, columns: list):
        """
        Parse the selected `columns` from `table`.

        Abstract: see the overriding method of a subclass for details.
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError(
                "Loader::parse must be overridden by a subclass")


In [4]:
class CsvLoader(Loader):
    """
    Loader implementation that reads its data from a csv-file
    """
    def __init__(self, stemmer):
        """
        :param stemmer: language stemmer to be used
        """
        super().__init__(stemmer)

    def merge_contents(self,
                       path: str,
                       main_id_cname: str,
                       content_cname: str,
                       columns: list):
        """
        Read `columns` (plus the id column) from the csv-file and merge
        them into one formatted content column.

        :param path: (str) path to the csv-file the data is read from
        :param main_id_cname: (str) item-representing-column's name;
            the file must contain such a column
        :param content_cname: (str) content-representing-column's name
        :param columns: (list) columns containing main content,
            that will be used to build a content-based model
        :return: (pd.core.frame.DataFrame) 2-column dataframe
            representing the one-to-one relationship of an item and
            its content
        """
        # the id column has to be read too, although it is not content
        wanted = columns + [main_id_cname]
        raw_frame = self.parse(path, wanted)
        return self.format_columns(raw_frame,
                                   main_id_cname,
                                   content_cname,
                                   columns)

    def parse(self, table: str, columns: list):
        """
        :param table: path of the csv-file (passed to `read_csv`)
        :param columns: columns to be parsed
        :return: dataframe with parsed columns
        """
        return read_csv(table, usecols=columns, skipinitialspace=True)


In [5]:
class RecommendationSystem:
    """
    Root-class representing recommendation system.

    Every method here is a do-nothing placeholder returning None;
    concrete systems override the ones they implement.
    """
    def __init__(self):
        """Nothing to initialise at this level."""

    def load_and_build(self, *args):
        """Placeholder; accepts any positional arguments, returns None."""
        return None

    def load(self, *args):
        """Placeholder; accepts any positional arguments, returns None."""
        return None

    def build(self, *args):
        """Placeholder; accepts any positional arguments, returns None."""
        return None


In [45]:
class CollaborativeFiltering(RecommendationSystem):
    """
    Represents recommendation system based on collaborative-filtering.

    `load` reads user-item interactions, replaces users and items by
    dense integer indices and stores the interactions as a sparse
    (users x items) matrix; the `*2index` / `index2*` helpers translate
    between stored values and those indices.
    """
    def __init__(self):
        super(CollaborativeFiltering, self).__init__()
        # sparse matrix of implicit user-item interactions
        self.sparse_matrix = None
        # users matrix in lower rank (not filled in by `load`)
        self.users_matrix = None
        # items matrix in lower rank (not filled in by `load`)
        self.items_matrix = None

        # dataframes, so we could map indices used in class
        # methods with indices used in database
        self.user_indices_decode = None
        self.item_indices_decode = None

        # column names in database corresponding to user and item
        self.user_cname = None
        self.item_cname = None

    def load(self,
             table: str,
             columns: list,
             loader_type: str = "csv",
             connection=None):
        """
        Read the interactions and build `self.sparse_matrix`.

        :param table: data source handed to the loader's `parse`
            (a csv path for the "csv" loader)
        :param columns: `[user column, item column]` or
            `[user column, item column, value column]`; without a value
            column every listed user-item pair counts as 1
        :param loader_type: "csv" or "db"
        :param connection: required (not None) when loader_type is "db"
        :raises RuntimeError: on a wrong number of columns, an unknown
            loader_type, or loader_type "db" without a connection
        """
        if len(columns) not in (2, 3):
            # the message used to claim "exactly 3" although 2 is accepted
            raise RuntimeError("CollaborativeFiltering::load: columns "
                               "argument must contain 2 or 3 "
                               "string-values")

        # initialize loader; no stemmer is needed, only `parse` is used
        if loader_type == "csv":
            loader = CsvLoader(None)
        elif loader_type == "db":
            if connection is None:
                raise RuntimeError("CollaborativeFiltering::load: received "
                                   "connection equals to None with a "
                                   "loader_type equals db")
            # NOTE(review): DataBaseLoader is not defined in this part of
            # the notebook; presumably it lives elsewhere - confirm.
            loader = DataBaseLoader(None, connection)
        else:
            raise RuntimeError("CollaborativeFiltering::load: no loader "
                               "available for given loader_type")

        # load implicit data in 2- or 3-column dataframe
        dataframe = loader.parse(table, columns)
        # the frame is local and freshly parsed, so in-place is harmless
        dataframe.dropna(inplace=True)

        relations_count = dataframe.shape[0]
        # strings are immutable, plain assignment is enough
        self.user_cname, self.item_cname = columns[0], columns[1]
        user_id_cname = self.user_cname + "_id"
        item_id_cname = self.item_cname + "_id"

        # encode users and items as category codes (0..n-1) and keep the
        # code <-> original value tables, so recommendations can be
        # returned to the caller as they are stored in database
        dataframe[user_id_cname] = dataframe[self.user_cname].astype(
                "category").cat.codes
        dataframe[item_id_cname] = dataframe[self.item_cname].astype(
                "category").cat.codes
        self.user_indices_decode = dataframe[
            [user_id_cname, self.user_cname]].drop_duplicates()
        self.item_indices_decode = dataframe[
            [item_id_cname, self.item_cname]].drop_duplicates()
        dataframe.drop(columns[:2], axis=1, inplace=True)

        # initializing sparse matrix
        users = dataframe[user_id_cname].astype(int)
        items = dataframe[item_id_cname].astype(int)
        users_count = len(dataframe[user_id_cname].unique())
        items_count = len(dataframe[item_id_cname].unique())
        if len(columns) == 3:
            values = dataframe[columns[2]]
        else:
            # implicit feedback: every relation weighs 1
            values = np.ones(relations_count, dtype=int)
        self.sparse_matrix = sparse.csr_matrix(
                (values, (users, items)),
                shape=(users_count, items_count)
        )

    @staticmethod
    def _lookup(decode_table, key_cname, value_cname, key):
        """ first `value_cname` cell of the rows whose `key_cname` == key;
            raises IndexError when no row matches
        """
        return decode_table[value_cname].loc[
            decode_table[key_cname] == key].iloc[0]

    def item2index(self,
                   item):
        """
        :param item: item as it is stored in database
        :return: index used in this object for `item`
        :raises IndexError: if `item` was not present in the loaded data
        """
        return self._lookup(self.item_indices_decode,
                            self.item_cname,
                            self.item_cname + "_id",
                            item)

    def user2index(self,
                   user):
        """
        :param user: user as it is stored in database
        :return: index used in this object for `user`
        :raises IndexError: if `user` was not present in the loaded data
        """
        return self._lookup(self.user_indices_decode,
                            self.user_cname,
                            self.user_cname + "_id",
                            user)

    def index2item(self,
                   items_ids: list):
        """
        :param items_ids: iterable of indices as they are stored in this
            object
        :return: list of the item values corresponding to `items_ids`,
            in the same order
        :raises IndexError: if an index is unknown
        """
        return [
            self._lookup(self.item_indices_decode,
                         self.item_cname + "_id",
                         self.item_cname,
                         item_id)
            for item_id in items_ids
        ]


In [132]:
class ComplementItemRS(CollaborativeFiltering):
    """
    COMPLEMENTARY ITEM
    collaborative user-based filtering recommendation system
    to find item that is frequently being consumed with some item

    It is exactly a complementary-item recommender when the matrix is
    built from order-item relations, where the order-item entry is 1 if
    the item was in the order and 0 otherwise.
    """

    def __init__(self):
        # the base initializer already resets the sparse matrix, the
        # index decode tables and the column names to None
        super(ComplementItemRS, self).__init__()

        self.df_data = None
        # dense item x item cosine-similarity matrix, filled by `build`
        self.similarities = None

    def build(self, via_normalize: bool = False):
        """
        Compute the item-item cosine similarities from `sparse_matrix`.

        :param via_normalize: if True, `sparse_matrix` is first replaced
            by a float32 copy whose rows (users) are scaled to unit
            L2-norm; rows with zero norm are left unchanged
        """
        if via_normalize:
            matrix = self.sparse_matrix.astype(np.float32)
            row_norms = np.sqrt(
                    np.asarray(matrix.power(2).sum(axis=1)).ravel())
            # avoid 0/0 -> NaN for rows holding only explicit zeros
            row_norms[row_norms == 0] = 1
            # CSR layout: row i owns data[indptr[i]:indptr[i + 1]], so
            # repeating each norm by its row length aligns it with `data`
            matrix.data /= np.repeat(row_norms, np.diff(matrix.indptr))
            self.sparse_matrix = matrix
        self.similarities = cosine_similarity(self.sparse_matrix.transpose())

    def find_similar_item(self,
                          item,
                          res_n: int = 10):
        """
        :param item: item as it is stored in database
        :param res_n: maximal number of items to return
        :return: list of up to `res_n` items most similar to `item`,
            most similar first; `item` itself is never included
        """
        item_id = self.item2index(item)
        item_vec = np.asarray(self.similarities[item_id]).ravel()
        ranked = np.argsort(item_vec)[::-1]
        # drop the query item explicitly: it is not guaranteed to sort
        # first when another item ties with it at similarity 1.0
        ranked = ranked[ranked != item_id]
        return self.index2item(ranked[:res_n])

    def recommend_to_user(self,
                          user,
                          res_n: int = 10):
        """ If we want to recommend for this order, the user (here it is
            the order) must be empty; to work better an `update` method
            or something similar is needed.

        :param user: user as it is stored in database
        :param res_n: maximal number of items to return
        :return: list of up to `res_n` items the user has no stored
            interaction with, best score first
        """
        user_id = int(self.user2index(user))
        user_row = self.sparse_matrix[user_id, :]

        # similarity-weighted sum of the user's interactions, per item
        raw_scores = np.asarray(self.similarities @ user_row.T,
                                dtype=float).ravel()
        weights = np.asarray(np.abs(self.similarities).sum(axis=1),
                             dtype=float).ravel()
        # items whose similarity row sums to zero get score 0, not NaN
        scores = np.divide(raw_scores, weights,
                           out=np.zeros_like(raw_scores),
                           where=weights != 0)

        # items stored in the user's CSR row are already consumed
        row_start = self.sparse_matrix.indptr[user_id]
        row_stop = self.sparse_matrix.indptr[user_id + 1]
        consumed = self.sparse_matrix.indices[row_start:row_stop]

        ranked = np.argsort(scores)[::-1]
        # exclude consumed items instead of zeroing their score, so they
        # cannot resurface when the remaining scores are <= 0
        ranked = ranked[~np.isin(ranked, consumed)]
        return self.index2item(ranked[:res_n])


In [133]:
# Build the item-item recommender from the interactions file (presumably
# last.fm listening data, judging by the file name). Only a user and an
# item column are given, so every listed pair counts as an implicit 1.
complement_items = ComplementItemRS()
complement_items.load("formatted_data/lastfm2collab.csv", ["user_id", "item_id"])
# True -> scale every user's row to unit L2-norm before the similarities
complement_items.build(True)


In [131]:
# Sanity check: the similarity matrix is square, one row/column per item.
complement_items.similarities.shape

(285, 285)

In [130]:
# Debug check: shape of the per-row sum of squares that build() takes the
# square root of when normalising.
# NOTE(review): the recorded (1, 285) looks like a column-wise (axis=0)
# result from an earlier run of this cell - re-run to confirm.
complement_items.sparse_matrix.power(2).sum(axis=1).shape

(1, 285)

In [124]:
# Items ranked closest to "linkin park" by item-item cosine similarity
# (default res_n: 10 results).
complement_items.find_similar_item("linkin park")

['limp bizkit',
 'three days grace',
 'simple plan',
 'bullet for my valentine',
 'billy talent',
 'breaking benjamin',
 'papa roach',
 'evanescence',
 'rise against',
 'foo fighters']

In [127]:
# Recommendations for the user stored as 5985 (default res_n: 10 results).
complement_items.recommend_to_user(5985)

['joy division',
 'the smiths',
 'david bowie',
 'yann tiersen',
 'the rolling stones',
 'tom waits',
 'eric clapton',
 'misfits',
 'led zeppelin',
 'belle and sebastian']

In [134]:
# Same query repeated; judging by the execution counts this run came after
# the rebuild in cell 133, and the output is unchanged.
complement_items.recommend_to_user(5985)

['joy division',
 'the smiths',
 'david bowie',
 'yann tiersen',
 'the rolling stones',
 'tom waits',
 'eric clapton',
 'misfits',
 'led zeppelin',
 'belle and sebastian']