# Import necessary library

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import urllib.request
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

Getting information about the events on the site from the api

In [5]:
class Recommender:
    def __init__(self):
        self.url = None
        self.df_title = None
        self.df_location = None
        self.df_date = None
        self.buy_history = None
        self.iter_history = None
        self.interaction = None
        self.merged_df = None
        self.dfs = None
        self.recommender = None

    def data_scrap(self, url: str):
        """use site url to scrap necessary data

        Args:
            url (str): site address

        """
        url = url
        response = requests.get(url)

        if response.status_code == 200:

            soup = BeautifulSoup(response.text, "html.parser")

            h3_tags_title = soup.find_all("h3", class_="blog_post_title my-2")
            h3_tags_location = soup.find_all("div", class_="blog_post_title my-2")
            h3_tags_date = soup.find_all("div", class_="theater-date my-2")

            titles = []
            location = []
            date = []

            for title in h3_tags_title:
                if title.text.strip():
                    titles.append(title.text.strip())

            self.df_title = pd.DataFrame({"Titles": titles})
            df_title = self.df_title

            for loc in h3_tags_location:
                if loc.text.strip():
                    location.append(loc.text.strip())

            self.df_location = pd.DataFrame({"Titles": location})
            df_location = self.df_location

            for dt in h3_tags_date:
                if dt.text.strip():
                    date.append(dt.text.strip())

            self.df_date = pd.DataFrame({"Titles": date})
            df_date = self.df_date

            return df_title, df_location, df_date

    def user_buy_interaction_from_api(self, buy_api: str, iter_api: str):
        """use api to scrap buy and interaction users

        Args:
            buy_api (str): api to scrape user buy history
            iter_api (str): api to scrape user iter history
        """
        buy_link = buy_api
        iter_link = iter_api

        urllib.request.urlretrieve(iter_link, "log.xlsx")
        self.iter_history = pd.read_excel("log.xlsx")
        iter_history = pd.read_excel("log.xlsx")

        urllib.request.urlretrieve(buy_link, "visitor.xlsx")
        self.buy_history = pd.read_excel("visitor.xlsx")
        buy_history = pd.read_excel("visitor.xlsx")

        return iter_history, buy_history

    def preprocessing_interaction(self, interaction_df: pd.DataFrame):
        """preprocessing interaction data for use in model

        Args:
            interaction_df (pd.DataFrame): interaction pd from user_buy_interaction_from_api func

        Returns:
            interaction_df (pd.DataFrame): interaction_df
        """
        interaction_df["بازدید"] = interaction_df["بازدید"].fillna("ffill")
        interaction_df["نام و نام خانوادگی"] = interaction_df[
            "نام و نام خانوادگی"
        ].fillna("none")
        interaction_df["شماره موبایل"] = interaction_df["شماره موبایل"].fillna("none")
        interaction_df = interaction_df[interaction_df["بازدید"] != "صفحه اصلی"]

        le = LabelEncoder()
        interaction_df["userId"] = le.fit_transform(
            interaction_df["نام و نام خانوادگی"]
        )

        return interaction_df

    def merged_all_df(
        self,
        df_title: pd.DataFrame,
        df_location: pd.DataFrame,
        df_date: pd.DataFrame,
        df_interaction: pd.DataFrame,
        df_buy_history: pd.DataFrame,
    ):
        """merged all df to concat all titles under each other

        Args:
            df_title (pd.DataFrame): df_title scrape from data_scrap func output
            df_location (pd.DataFrame): df_location scrape from data_scrap func output
            df_date (pd.DataFrame): df_date scrape from data_scrap func output
            df_interaction (pd.DataFrame): df_interaction scrape from user_buy_interaction_from_api func output
            df_buy_history (pd.DataFrame): df_buy_history scrape from user_buy_interaction_from_api func output

        Returns:
            merged df: Pandas DataFrame
        """
        merge_df = pd.DataFrame(
            {
                "Titles": df_title["Titles"],
                "Location": df_location["Titles"],
                "Date": df_date["Titles"],
            }
        )

        merge_df = pd.concat(
            [
                merge_df["Titles"],
                df_interaction["بازدید"],
                df_buy_history["رویداد"],
            ]
        ).reset_index()

        merge_df.columns = ["index", "Titles"]

        return merge_df

    def list_to_string(self, row):
        return " ".join(row)

    def preprocessing_merged_df(self, merged_df: pd.DataFrame):
        """preprocessing merged_df data for use in model

        Args:
            merged_df (pd.DataFrame): merged_df pd from merged_all_df func output

        Returns:
            merged_df: Pandas DataFrame
        """

        df_ohe = merged_df["Titles"].str.split(" ").reset_index().astype("str")
        df_ohe["Titles"] = df_ohe["Titles"].apply(self.list_to_string)

        le = LabelEncoder()
        merged_df["ohe"] = le.fit_transform(df_ohe["Titles"])

        self.merged_df = merged_df

        return merged_df

    def vectorized_text(self, df_title: pd.DataFrame):
        """vectorized_text for merged Convert a collection of text documents to a matrix of token counts

        Args:
            df_title (pd.DataFrame): use df_title from data_scrap func output

        Returns:
            X : array of shape (n_samples, n_features)
        """

        vectorized = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
        X = vectorized.fit_transform(self.merged_df["Titles"])

        feature_names = vectorized.get_feature_names_out()
        one_hot_df = pd.DataFrame(X.toarray(), columns=feature_names)

        dfs = pd.concat([df_title, one_hot_df], axis=1)
        dfs.drop(columns=["Titles"], inplace=True)

        self.dfs = dfs
        return dfs

    def creat_X(self, interaction_df):
        """Compressed Sparse Row matrix.

        Args:
            iteraction_df (_type_): use preprocessing_interaction func output

        Returns:
            sparse matrix of type '<class 'numpy.float64'>
        """

        M = interaction_df["userId"].nunique()
        N = interaction_df["بازدید"].nunique()

        user_mapper = dict(zip(np.unique(interaction_df["userId"]), list(range(M))))
        item_mapper = dict(zip(np.unique(interaction_df["بازدید"]), list(range(N))))

        user_inv_mapper = dict(zip(list(range(M)), np.unique(interaction_df["userId"])))
        item_inv_mapper = dict(zip(list(range(N)), np.unique(interaction_df["بازدید"])))

        user_index = [user_mapper[i] for i in interaction_df["userId"]]
        item_indx = [item_mapper[i] for i in interaction_df["بازدید"]]

        X = csr_matrix(
            (interaction_df["زمان تعامل(تانیه)"], (user_index, item_indx)), shape=(M, N)
        )

        return X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper

    def cosine_similioraty(
        dfs: pd.DataFrame,
        merged_df: pd.DataFrame,
        interaction_df: pd.DataFrame,
        idx: str,
        n_recommendations: int = 1,
    ):
        """Compute cosine similarity between samples in X and Y.

        Cosine similarity, or the cosine kernel, computes similarity as the normalized dot product of X and Y:

                Args:
                    dfs (pd.DataFrame): use vectorized_text func outputs
                    merged_df (pd.DataFrame): use preprocessing_merged_df func outputs
                    interaction_df (pd.DataFrame): use preprocessing_interaction func output
                    idx (str): idx of user interation and buy
                    n_recommendations (int, optional): Number of outgoing recommenders. Defaults to 1.

                Returns:
                    list: user best recommenders
        """
        cosine_sim = cosine_similarity(dfs, dfs)
        iter_idx = dict(zip(merged_df["Titles"].unique(), list(interaction_df.index)))
        idx = iter_idx[idx]
        n_recommendations = n_recommendations
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1 : (n_recommendations + 1)]
        similar_item = [i[0] for i in sim_scores]
        recomended = merged_df["Titles"].iloc[similar_item]
        recomended = recomended.to_list()

        return recomended

    def recomender_users(
        self, interaction_df: pd.DataFrame, dfs: pd.DataFrame, merged_df: pd.DataFrame
    ):
        """use interaction_df ,dfs,merged_df to recommend best for each user

        Args:
            interaction_df (pd.DataFrame): use preprocessing_interaction func output
            merged_df (pd.DataFrame): use preprocessing_merged_df func outputs
            dfs (pd.DataFrame): use vectorized_text func outputs

        Returns:
            dict: user(phone number) recommender
        """
        users_phone = interaction_df["شماره موبایل"].unique()
        user_iter = {}
        for i in users_phone:
            user_it = (
                interaction_df[interaction_df["شماره موبایل"] == i][
                    ["زمان تعامل(تانیه)", "بازدید"]
                ]
                .max()
                .reset_index()
                .T
            )
            user_it.columns = ["زمان تعامل(تانیه)", "بازدید"]
            user_it.drop(index="index", inplace=True)
            user_it["بازدید"]
            idx = user_it["بازدید"].to_list()[0]
            names = i
            iters = self.cosine_similioraty(dfs, merged_df, interaction_df, idx=idx)
            user_dict = {names: iters}
            user_iter.update(user_dict)
        return user_iter

In [None]:
recomender = Recommender()

In [10]:
df_title, df_location, df_date = recomender.data_scrap("https://www.6234.ir/")

In [11]:
interaction, buy_history = recomender.user_buy_interaction_from_api(
    buy_api="https://6234.ir/api/ticket?token=apiqazxcvbnm&ofDate=1402/08/20&toDate=1402/12/29",
    iter_api="https://6234.ir/api/log?token=apiqazxcvbnm&ofDate=1402/08/20&toDate=1402/12/29",
)

In [15]:
interaction = recomender.preprocessing_interaction(interaction)

In [13]:
merged_df = recomender.merged_all_df(
    df_title, df_location, df_date, interaction, buy_history
)

In [None]:
merged_df = recomender.preprocessing_merged_df(merged_df)
dfs = recomender.vectorized_text(df_title)

In [14]:
X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper = recomender.creat_X(
    interaction,
)

In [18]:
users_phone = interaction["شماره موبایل"].unique()
user_iter = {}
for i in users_phone:
    user_it = (
        interaction[interaction["شماره موبایل"] == i][["زمان تعامل(تانیه)", "بازدید"]]
        .max()
        .reset_index()
        .T
    )
    user_it.columns = ["زمان تعامل(تانیه)", "بازدید"]
    user_it.drop(index="index", inplace=True)
    user_it["بازدید"]
    idx = user_it["بازدید"].to_list()[0]
    names = i
    iters = recomender.cosine_similioraty(dfs, merged_df, interaction, idx=idx)
    user_dict = {names: iters}
    user_iter.update(user_dict)
user_iter

TypeError: Recommender.cosine_similioraty() got multiple values for argument 'idx'