In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
from datetime import timedelta
import jdatetime
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np


class Recommender:
    def __init__(self):
        self.url = None
        self.df_title = None
        self.df_location = None
        self.df_date = None
        self.buy_history = None
        self.iter_history = None
        self.interaction = None
        self.merged_df = None
        self.dfs = None
        self.recommender = None
        self.event_df = None

    def data_scrap(self, url: str):
        """Use site url to scrap necessary data"""
        self.url = url
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            titles = [
                title.text.strip()
                for title in soup.find_all("h3", class_="blog_post_title my-2")
            ]
            locations = [
                loc.text.strip()
                for loc in soup.find_all("div", class_="blog_post_location my-2")
            ]
            dates = [
                dt.text.strip()
                for dt in soup.find_all("div", class_="theater-date my-2")
            ]

            self.df_title = pd.DataFrame({"Titles": titles})
            self.df_location = pd.DataFrame({"Location": locations})
            self.df_date = pd.DataFrame({"Date": dates})

            return self.df_title, self.df_location, self.df_date
        else:
            raise Exception(f"Failed to retrieve data from {url}")

    def user_buy_interaction_from_api(self, buy_api: str, iter_api: str):
        """Use API to scrap buy and interaction users"""
        urllib.request.urlretrieve(iter_api, "log.xlsx")
        self.iter_history = pd.read_excel("log.xlsx")

        urllib.request.urlretrieve(buy_api, "visitor.xlsx")
        self.buy_history = pd.read_excel("visitor.xlsx")

        return self.iter_history, self.buy_history

    def generate_date_ranges(self, start_date, end_date):
        date_ranges = []
        current_start_date = start_date
        while current_start_date < end_date:
            current_end_date = current_start_date + timedelta(days=60)
            if current_end_date > end_date:
                current_end_date = end_date
            date_ranges.append((current_start_date, current_end_date))
            current_start_date = current_end_date
        return date_ranges

    def fetch_data_from_api_log(self, start_date, end_date):
        start_date = start_date.strftime("%Y/%m/%d")
        end_date = end_date.strftime("%Y/%m/%d")
        url = f"https://6234.ir/api/log?token=aiapiqazxcvbnm1403&ofDate={start_date}&toDate={end_date}"
        return url

    def interaction_auto(self):
        start_date_jalali = jdatetime.datetime.strptime(
            jdatetime.date(1402, 1, 1).strftime("%Y/%m/%d"), "%Y/%m/%d"
        ).date()
        end_date_jalali = jdatetime.datetime.strptime(
            jdatetime.datetime.now().strftime("%Y/%m/%d"), "%Y/%m/%d"
        ).date()
        date_ranges = self.generate_date_ranges(start_date_jalali, end_date_jalali)
        for start, end in date_ranges:
            urllib.request.urlretrieve(
                self.fetch_data_from_api_log(start, end), f"log{start.month}.xlsx"
            )

        df_api = {}
        for start, end in date_ranges:
            month = start.month
            df = pd.read_excel(f"log{month}.xlsx")

            if month in df_api:
                df_api[month] = pd.concat([df_api[month], df], ignore_index=True)
            else:
                df_api[month] = df

        combined_df = pd.concat(df_api.values(), ignore_index=True)
        return combined_df

    def fetch_data_from_api_buy(self, start_date, end_date):
        start_date = start_date.strftime("%Y/%m/%d")
        end_date = end_date.strftime("%Y/%m/%d")
        url = f"https://6234.ir/api/ticket?token=aiapiqazxcvbnm1403&ofDate={start_date}&toDate={end_date}"
        return url

    def buy_auto(self):
        start_date_jalali = jdatetime.datetime.strptime(
            jdatetime.date(1402, 1, 1).strftime("%Y/%m/%d"), "%Y/%m/%d"
        ).date()
        end_date_jalali = jdatetime.datetime.strptime(
            jdatetime.datetime.now().strftime("%Y/%m/%d"), "%Y/%m/%d"
        ).date()
        date_ranges = self.generate_date_ranges(start_date_jalali, end_date_jalali)
        for start, end in date_ranges:
            urllib.request.urlretrieve(
                self.fetch_data_from_api_buy(start, end), f"log{start.month}.xlsx"
            )

        df_api = {}
        for start, end in date_ranges:
            month = start.month
            df = pd.read_excel(f"log{month}.xlsx")

            if month in df_api:
                df_api[month] = pd.concat([df_api[month], df], ignore_index=True)
            else:
                df_api[month] = df

        combined_df = pd.concat(df_api.values(), ignore_index=True)
        return combined_df

    def preprocessing_interaction(self, interaction_df: pd.DataFrame):
        """Preprocessing interaction data for use in model"""
        interaction_df["بازدید"] = interaction_df["بازدید"].fillna(method="ffill")
        interaction_df["نام و نام خانوادگی"] = interaction_df[
            "نام و نام خانوادگی"
        ].fillna("none")
        interaction_df["شماره موبایل"] = interaction_df["شماره موبایل"].fillna("none")
        interaction_df = interaction_df[interaction_df["بازدید"] != "صفحه اصلی"]

        le = LabelEncoder()
        interaction_df["userId"] = le.fit_transform(
            interaction_df["نام و نام خانوادگی"]
        )

        return interaction_df

    def event_api(self, api: str):
        event_link = api
        urllib.request.urlretrieve(event_link, "event.xlsx")
        self.event_df = pd.read_excel("event.xlsx")
        self.event_df["Titles"] = self.event_df["عنوان"]
        return self.event_df

    def merged_all_df(
        self,
        df_title: pd.DataFrame,
        df_location: pd.DataFrame,
        df_date: pd.DataFrame,
        df_interaction: pd.DataFrame,
        df_buy_history: pd.DataFrame,
        event_df: pd.DataFrame,
    ):
        """Merged all df to concat all titles under each other"""
        merged_df = pd.DataFrame(
            {
                "Titles": pd.concat(
                    [
                        df_title["Titles"],
                        df_interaction["بازدید"],
                        df_buy_history["رویداد"],
                        event_df["Titles"],
                    ]
                )
            }
        ).reset_index(drop=True)

        return merged_df

    def list_to_string(self, row):
        return " ".join(row)

    def remove_excel(self, excel_list: list):
        for i in excel_list:
            os.remove(i)

    def preprocessing_merged_df(self, merged_df: pd.DataFrame):
        """Preprocessing merged_df data for use in model"""
        df_ohe = merged_df["Titles"].str.split(" ").reset_index().astype("str")
        df_ohe["Titles"] = df_ohe["Titles"].apply(self.list_to_string)

        le = LabelEncoder()
        merged_df["ohe"] = le.fit_transform(df_ohe["Titles"])

        self.merged_df = merged_df
        return merged_df

    def vectorized_text(self, df_title: pd.DataFrame):
        """Vectorized_text for merged Convert a collection of text documents to a matrix of token counts"""
        vectorized = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
        X = vectorized.fit_transform(self.merged_df["Titles"])

        feature_names = vectorized.get_feature_names_out()
        one_hot_df = pd.DataFrame(X.toarray(), columns=feature_names)

        dfs = pd.concat([df_title, one_hot_df], axis=1)
        dfs.drop(columns=["Titles"], inplace=True)

        self.dfs = dfs
        return dfs

    def creat_X(self, interaction_df):
        """Compressed Sparse Row matrix."""
        M = interaction_df["userId"].nunique()
        N = interaction_df["بازدید"].nunique()

        user_mapper = dict(zip(np.unique(interaction_df["userId"]), list(range(M))))
        item_mapper = dict(zip(np.unique(interaction_df["بازدید"]), list(range(N))))

        user_inv_mapper = dict(zip(list(range(M)), np.unique(interaction_df["userId"])))
        item_inv_mapper = dict(zip(list(range(N)), np.unique(interaction_df["بازدید"])))

        user_index = [user_mapper[i] for i in interaction_df["userId"]]
        item_indx = [item_mapper[i] for i in interaction_df["بازدید"]]

        X = csr_matrix(
            (interaction_df["زمان تعامل(تانیه)"], (user_index, item_indx)), shape=(M, N)
        )

        return X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper

    def cosine_similioraty(
        self,
        dfs: pd.DataFrame,
        event_df: pd.DataFrame,
        interaction_df: pd.DataFrame,
        idx: str,
        n_recommendations: int = 1,
    ):
        """Compute cosine similarity between samples in X and Y."""
        cosine_sim = cosine_similarity(dfs, dfs)
        iter_idx = dict(zip(event_df["Titles"].unique(), list(event_df.index)))
        if idx not in iter_idx:
            raise ValueError(f"Event with title '{idx}' not found in event data")
        idx = iter_idx[idx]
        n_recommendations = n_recommendations
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1 : (n_recommendations + 1)]
        similar_item = [
            i[0] for i in sim_scores if i[0] < len(event_df)
        ]  # اضافه کردن شرط
        recomended = event_df["Titles"].iloc[similar_item]
        recomended = recomended.to_list()

        return recomended

    def recomender_users(
        self,
        interaction_df: pd.DataFrame,
        dfs: pd.DataFrame,
        n_recommendations=1,
    ):
        """use interaction_df, dfs, and event_df to recommend best for each user

        Args:
            interaction_df (pd.DataFrame): use preprocessing_interaction func output
            dfs (pd.DataFrame): use vectorized_text func outputs
            n_recommendations (int, optional): Number of recommendations per user. Defaults to 1.

        Returns:
            dict: user (phone number) to recommended events mapping
        """
        users_phone = interaction_df["شماره موبایل"].unique()
        user_iter = {}
        for i in users_phone:
            user_it = (
                interaction_df[interaction_df["شماره موبایل"] == i][
                    ["زمان تعامل(تانیه)", "بازدید"]
                ]
                .max()
                .reset_index()
                .T
            )
            user_it.columns = ["زمان تعامل(تانیه)", "بازدید"]
            user_it.drop(index="index", inplace=True)
            user_it["بازدید"]
            idx = user_it["بازدید"].to_list()[0]
            phone = str(i)
            phone = phone[:-2]
            iters = self.cosine_similioraty(
                dfs,
                self.event_df,  # Use self.event_df here
                interaction_df,
                idx=idx,
                n_recommendations=n_recommendations,
            )
            user_dict = {phone: iters}
            user_iter.update(user_dict)

        temp = []
        res = dict()

        for key, val in user_iter.items():
            if val not in res.values():
                res[key] = val

        return res

In [2]:
# Create the recommender; all of its cached attributes start as None.
recomender = Recommender()

In [3]:
# Download the interaction log in 60-day windows from Jalali 1402/01/01 to today (needs network access).
interaction = recomender.interaction_auto()

  combined_df = pd.concat(df_api.values(), ignore_index=True)


In [4]:
# Download the purchase history in 60-day windows from Jalali 1402/01/01 to today (needs network access).
buy_history = recomender.buy_auto()

  combined_df = pd.concat(df_api.values(), ignore_index=True)


In [5]:
# Scrape titles, locations and dates from the site's landing page into three single-column frames.
df_title, df_location, df_date = recomender.data_scrap("https://www.6234.ir/")

In [6]:
# Download the event list; event_api() also adds a "Titles" column copied from "عنوان".
# NOTE(review): the API token is embedded in this URL - consider loading it from an environment variable.
event_df = recomender.event_api("https://6234.ir/api/event?token=aiapiqazxcvbnm1403")

In [7]:
# Fill gaps, drop home-page visits and add the userId column.
# The name `interaction` is rebound to the cleaned frame, so re-running this cell processes already-cleaned data.
interaction = recomender.preprocessing_interaction(interaction)

  interaction_df["بازدید"] = interaction_df["بازدید"].fillna(method="ffill")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interaction_df["userId"] = le.fit_transform(


In [8]:
# Stack the title-like columns of every source into a single "Titles" column.
# df_location and df_date are passed through but merged_all_df() does not use them.
merged_df = recomender.merged_all_df(
    df_title, df_location, df_date, interaction, buy_history, event_df
)

In [9]:
# Keep only the event titles and discard rows without one.
event_df_ = event_df[["Titles"]].dropna()

In [10]:
# Add the integer "ohe" code to merged_df and cache it on the recommender.
merged_df = recomender.preprocessing_merged_df(merged_df)
# Token counts are computed from the cached merged_df; event_df_ is only joined by index.
dfs = recomender.vectorized_text(event_df_)

In [11]:
# Build the user-by-page sparse matrix of interaction time plus the id/position lookup tables.
X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper = recomender.creat_X(
    interaction,
)

In [13]:
# The third parameter of recomender_users() is n_recommendations (an int);
# merged_df was being passed in its place.
recomender.recomender_users(interaction, dfs, n_recommendations=1)

ValueError: Event with title 'گروه سون - جذاب' not found in event data

In [None]:
# Inspect the event titles that remain after dropping rows with missing values.
event_df_.dropna()

Unnamed: 0,Titles,ohe
0,نمایشگاه ترکیه,14
1,پارک امیرگان ( Test ),20
2,دهمین همایش سالانه بانکداری الکترونیک,8
3,کنسرت نمایش کلنل,24
4,نمایشگاه گردشگری در فضای مجازی,17
5,نمایشگاه تبلیغات در فضای مجازی,13
6,تور پوکت,5
7,تور چابهار مکران,6
8,پارکینگ VIP,21
9,هتل کاروانسرای کوهاب,19


In [None]:
# List every distinct title present in the merged frame.
merged_df["Titles"].unique()

array(['نمایشگاه ترکیه', 'پارک امیرگان ( Test )',
       'دهمین همایش سالانه بانکداری الکترونیک', 'کنسرت نمایش کلنل',
       'نمایشگاه گردشگری در فضای مجازی', 'نمایشگاه تبلیغات در فضای مجازی',
       'تور پوکت', 'تور چابهار مکران', 'پارکینگ VIP',
       'هتل کاروانسرای کوهاب', 'نمونه رستوران و کافه',
       'سالن اجرای صحنه ای 360', 'مسابقه آنلاین',
       'نمایشگاه جشنواره هوش مصنوعی در تبلیغات و اطلاع رسانی',
       'نمایشگاه و جشنواره هوش مصنوعی در گردشگری و تفریحات',
       'بازدید برج آزادی', 'کنسرت تست ( با انتخاب صندلی )',
       'کنسرت تست ( بدون انتخاب صندلی )', 'مجموعه تفریحی اپارک',
       'کنگره بین\u200cالمللی جامعه دندانپزشکی ایران', 'نمایش کمدی ژیلت',
       'تور قشم', 'تور آنتالیا', 'تور دبی', 'تور کوش آداسی', 'تور باتومی'],
      dtype=object)