# Import necessary library

In [1]:
import os
import jdatetime
import pandas as pd


import numpy as np


import requests


import json


import urllib.request
from datetime import timedelta


from bs4 import BeautifulSoup


from sklearn.metrics.pairwise import cosine_similarity


from sklearn.preprocessing import LabelEncoder


from sklearn.feature_extraction.text import CountVectorizer


from scipy.sparse import csr_matrix

In [9]:
class Recommender:
    def __init__(self):
        self.url = None
        self.df_title = None
        self.df_location = None
        self.df_date = None
        self.buy_history = None
        self.iter_history = None
        self.interaction = None
        self.merged_df = None
        self.dfs = None
        self.recommender = None
        self.event_df = None

    def data_scrap(self, url: str):
        """use site url to scrap necessary data

        Args:
            url (str): site address

        """
        url = url
        response = requests.get(url)

        if response.status_code == 200:

            soup = BeautifulSoup(response.text, "html.parser")

            h3_tags_title = soup.find_all("h3", class_="blog_post_title my-2")
            h3_tags_location = soup.find_all("div", class_="blog_post_title my-2")
            h3_tags_date = soup.find_all("div", class_="theater-date my-2")

            titles = []
            location = []
            date = []

            for title in h3_tags_title:
                if title.text.strip():
                    titles.append(title.text.strip())

            self.df_title = pd.DataFrame({"Titles": titles})
            df_title = self.df_title

            for loc in h3_tags_location:
                if loc.text.strip():
                    location.append(loc.text.strip())

            self.df_location = pd.DataFrame({"Titles": location})
            df_location = self.df_location

            for dt in h3_tags_date:
                if dt.text.strip():
                    date.append(dt.text.strip())

            self.df_date = pd.DataFrame({"Titles": date})
            df_date = self.df_date

            return df_title, df_location, df_date

    def user_buy_interaction_from_api(self, buy_api: str, iter_api: str):
        """use api to scrap buy and interaction users

        Args:
            buy_api (str): api to scrape user buy history
            iter_api (str): api to scrape user iter history
        """
        pd.options.mode.copy_on_write = True
        buy_link = buy_api
        iter_link = iter_api

        urllib.request.urlretrieve(iter_link, "log.xlsx")
        self.iter_history = pd.read_excel("log.xlsx")
        iter_history = pd.read_excel("log.xlsx")

        urllib.request.urlretrieve(buy_link, "visitor.xlsx")
        self.buy_history = pd.read_excel("visitor.xlsx")
        buy_history = pd.read_excel("visitor.xlsx")

        return iter_history, buy_history

    def generate_date_ranges(self, start_date, end_date):
        date_ranges = []
        current_start_date = start_date
        while current_start_date < end_date:
            current_end_date = current_start_date + timedelta(days=60)
            if current_end_date > end_date:
                current_end_date = end_date
            date_ranges.append((current_start_date, current_end_date))
            current_start_date = current_end_date
        return date_ranges

    def fetch_data_from_api(self, url, start_date, end_date):
        start_date = start_date.strftime("%Y/%m/%d")
        end_date = end_date.strftime("%Y/%m/%d")
        full_url = (
            f"{url}?token=aiapiqazxcvbnm1403&ofDate={start_date}&toDate={end_date}"
        )
        response = requests.get(full_url)
        try:
            response.raise_for_status()  # Check for HTTP errors
            if response.text:  # Check if the response is not empty
                data = response.json()
                return pd.DataFrame(data)
            else:
                print(f"Empty response for dates {start_date} to {end_date}")
                return pd.DataFrame()  # Return an empty DataFrame if no data
        except requests.exceptions.HTTPError as e:
            print(f"HTTP error occurred: {e}")
            return pd.DataFrame()
        except ValueError as e:
            print(f"Error decoding JSON: {e}")
            return pd.DataFrame()

    def interaction_auto(self):
        start_date_jalali = jdatetime.date(1402, 1, 1)
        end_date_jalali = jdatetime.date.today()

        date_ranges = self.generate_date_ranges(start_date_jalali, end_date_jalali)
        all_data = []

        for start, end in date_ranges:
            df_log = self.fetch_data_from_api("https://6234.ir/api/log", start, end)
            all_data.append(df_log)

        if all_data:
            df_log_combined = pd.concat(all_data, ignore_index=True)
            return df_log_combined
        else:
            return pd.DataFrame()

    def fetch_data_from_api_buy(self, start_date, end_date):
        start_date = start_date.strftime("%Y/%m/%d")
        end_date = end_date.strftime("%Y/%m/%d")
        url = f"https://6234.ir/api/ticket?token=aiapiqazxcvbnm1403&ofDate={start_date}&toDate={end_date}"
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        return pd.DataFrame(data)

    def buy_auto(self):
        start_date_jalali = jdatetime.date(1402, 1, 1)
        end_date_jalali = jdatetime.date.today()

        date_ranges = self.generate_date_ranges(start_date_jalali, end_date_jalali)
        all_data = []

        for start, end in date_ranges:
            df_buy = self.fetch_data_from_api("https://6234.ir/api/ticket", start, end)
            all_data.append(df_buy)

        if all_data:
            df_buy_combined = pd.concat(all_data, ignore_index=True)
            return df_buy_combined
        else:
            return pd.DataFrame()


    def preprocessing_interaction(self, interaction_df: pd.DataFrame):
        """preprocessing interaction data for use in model

        Args:
            interaction_df (pd.DataFrame): interaction pd from user_buy_interaction_from_api func

        Returns:
            interaction_df (pd.DataFrame): interaction_df
        """
        interaction_df["بازدید"] = interaction_df["بازدید"].fillna("ffill")
        interaction_df["نام و نام خانوادگی"] = interaction_df[
            "نام و نام خانوادگی"
        ].fillna("none")
        interaction_df["شماره موبایل"] = interaction_df["شماره موبایل"].fillna("none")
        interaction_df = interaction_df[interaction_df["بازدید"] != "صفحه اصلی"]

        le = LabelEncoder()
        interaction_df.loc[:, "userId"] = le.fit_transform(
            interaction_df["نام و نام خانوادگی"]
        )

        return interaction_df

    def event_api(self, api: str):
        event_link = api

        urllib.request.urlretrieve(event_link, "event.xlsx")
        self.iter_history = pd.read_excel("event.xlsx")
        event_df = pd.read_excel("event.xlsx")
        event_df["Titles"] = event_df["عنوان"]

        return event_df

    def merged_all_df(
        self,
        df_title: pd.DataFrame,
        df_location: pd.DataFrame,
        df_date: pd.DataFrame,
        df_interaction: pd.DataFrame,
        df_buy_history: pd.DataFrame,
        event_df: pd.DataFrame,
    ):
        """merged all df to concat all titles under each other

        Args:
            df_title (pd.DataFrame): df_title scrape from data_scrap func output
            df_location (pd.DataFrame): df_location scrape from data_scrap func output
            df_date (pd.DataFrame): df_date scrape from data_scrap func output
            df_interaction (pd.DataFrame): df_interaction scrape from user_buy_interaction_from_api func output
            df_buy_history (pd.DataFrame): df_buy_history scrape from user_buy_interaction_from_api func output

        Returns:
            merged df: Pandas DataFrame
        """
        merge_df = pd.DataFrame(
            {
                "Titles": df_title["Titles"],
                "Location": df_location["Titles"],
                "Date": df_date["Titles"],
            }
        )

        merge_df = pd.concat(
            [
                merge_df["Titles"],
                df_interaction["بازدید"],
                df_buy_history["رویداد"],
                event_df["Titles"],
            ]
        ).reset_index()

        merge_df.columns = ["index", "Titles"]

        return merge_df

    def list_to_string(self, row):
        return " ".join(row)

    def remove_excel(self, excel_list: list):
        for i in excel_list:
            os.remove(i)

    def preprocessing_merged_df(self, merged_df: pd.DataFrame):
        """preprocessing merged_df data for use in model

        Args:
            merged_df (pd.DataFrame): merged_df pd from merged_all_df func output

        Returns:
            merged_df: Pandas DataFrame
        """

        df_ohe = merged_df["Titles"].str.split(" ").reset_index().astype("str")
        df_ohe["Titles"] = df_ohe["Titles"].apply(self.list_to_string)

        le = LabelEncoder()
        merged_df["ohe"] = le.fit_transform(df_ohe["Titles"])

        self.merged_df = merged_df

        return merged_df

    def vectorized_text(self, df_title: pd.DataFrame):
        """vectorized_text for merged Convert a collection of text documents to a matrix of token counts

        Args:
            df_title (pd.DataFrame): use df_title from data_scrap func output

        Returns:
            X : array of shape (n_samples, n_features)
        """

        vectorized = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
        X = vectorized.fit_transform(self.merged_df["Titles"])

        feature_names = vectorized.get_feature_names_out()
        one_hot_df = pd.DataFrame(X.toarray(), columns=feature_names)

        dfs = pd.concat([df_title, one_hot_df], axis=1)
        dfs.drop(columns=["Titles"], inplace=True)

        self.dfs = dfs
        return dfs

    def creat_X(self, interaction_df):
        """Compressed Sparse Row matrix.

        Args:
            iteraction_df (_type_): use preprocessing_interaction func output

        Returns:
            sparse matrix of type '<class 'numpy.float64'>
        """

        M = interaction_df["userId"].nunique()
        N = interaction_df["بازدید"].nunique()

        user_mapper = dict(zip(np.unique(interaction_df["userId"]), list(range(M))))
        item_mapper = dict(zip(np.unique(interaction_df["بازدید"]), list(range(N))))

        user_inv_mapper = dict(zip(list(range(M)), np.unique(interaction_df["userId"])))
        item_inv_mapper = dict(zip(list(range(N)), np.unique(interaction_df["بازدید"])))

        user_index = [user_mapper[i] for i in interaction_df["userId"]]
        item_indx = [item_mapper[i] for i in interaction_df["بازدید"]]

        X = csr_matrix(
            (interaction_df["زمان تعامل(تانیه)"], (user_index, item_indx)), shape=(M, N)
        )

        return X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper

    def cosine_similioraty(
        self,
        dfs: pd.DataFrame,
        event_df: pd.DataFrame,
        interaction_df: pd.DataFrame,
        idx: str,
        n_recommendations: int = 1,
    ):
        """Compute cosine similarity between samples in X and Y.

        Cosine similarity, or the cosine kernel, computes similarity as the normalized dot product of X and Y:

                Args:
                    dfs (pd.DataFrame): use vectorized_text func outputs
                    merged_df (pd.DataFrame): use preprocessing_merged_df func outputs
                    interaction_df (pd.DataFrame): use preprocessing_interaction func output
                    idx (str): idx of user interation and buy
                    n_recommendations (int, optional): Number of outgoing recommenders. Defaults to 1.

                Returns:
                    list: user best recommenders
        """
        cosine_sim = cosine_similarity(dfs, dfs)
        iter_idx = dict(zip(event_df["Titles"].unique(), list(event_df.index)))
        idx = iter_idx[idx]
        n_recommendations = n_recommendations
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1 : (n_recommendations + 1)]
        similar_item = [i[0] for i in sim_scores]
        recomended = event_df["Titles"].iloc[similar_item]
        recomended = recomended.to_list()

        return recomended

    def recomender_users(
        self,
        interaction_df: pd.DataFrame,
        dfs: pd.DataFrame,
        event_df: pd.DataFrame,
        n_recommendations=1,
    ):
        """use interaction_df ,dfs,merged_df to recommend best for each user

        Args:
            interaction_df (pd.DataFrame): use preprocessing_interaction func output
            merged_df (pd.DataFrame): use preprocessing_merged_df func outputs
            dfs (pd.DataFrame): use vectorized_text func outputs

        Returns:
            dict: user(phone number) recommender
        """
        users_phone = interaction_df["شماره موبایل"].unique()
        user_iter = {}
        for i in users_phone:
            user_it = (
                interaction_df[interaction_df["شماره موبایل"] == i][
                    ["زمان تعامل(تانیه)", "بازدید"]
                ]
                .max()
                .reset_index()
                .T
            )
            user_it.columns = ["زمان تعامل(تانیه)", "بازدید"]
            user_it.drop(index="index", inplace=True)
            user_it["بازدید"]
            idx = user_it["بازدید"].to_list()[0]
            phone = str(i)
            phone = phone[:-2]
            iters = self.cosine_similioraty(
                dfs,
                event_df,
                interaction_df,
                idx=idx,
                n_recommendations=n_recommendations,
            )
            user_dict = {phone: iters}
            user_iter.update(user_dict)

        temp = []
        res = dict()

        for key, val in user_iter.items():
            if val not in res.values():
                res[key] = val

        return user_iter

In [10]:
recomender = Recommender()

In [11]:
interaction = recomender.interaction_auto()

Error decoding JSON: Expecting value: line 1 column 1 (char 0)
Error decoding JSON: Expecting value: line 1 column 1 (char 0)
Error decoding JSON: Expecting value: line 1 column 1 (char 0)
Error decoding JSON: Expecting value: line 1 column 1 (char 0)
Error decoding JSON: Expecting value: line 1 column 1 (char 0)
Error decoding JSON: Expecting value: line 1 column 1 (char 0)
Error decoding JSON: Expecting value: line 1 column 1 (char 0)
Error decoding JSON: Expecting value: line 1 column 1 (char 0)


In [5]:
buy_history = recomender.buy_auto()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [67]:
df_title, df_location, df_date = recomender.data_scrap("https://www.6234.ir/")

In [5]:
# interaction, buy_history = recomender.user_buy_interaction_from_api(
#     buy_api="https://6234.ir/api/ticket?token=aiapiqazxcvbnm1403&ofDate=1402/08/20&toDate=1403/12/29",
#     iter_api="https://6234.ir/api/log?token=aiapiqazxcvbnm1403&ofDate=1403/02/01&toDate=1403/03/29",
# )

In [68]:
event_df = recomender.event_api("https://6234.ir/api/event?token=aiapiqazxcvbnm1403")

In [69]:
interaction = recomender.preprocessing_interaction(interaction)

In [70]:
merged_df = recomender.merged_all_df(
    df_title, df_location, df_date, interaction, buy_history, event_df
)

In [71]:
merged_df = recomender.preprocessing_merged_df(merged_df)
dfs = recomender.vectorized_text(merged_df)

In [72]:
X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper = recomender.creat_X(
    interaction,
)

In [74]:
recomender.recomender_users(interaction, dfs, merged_df, n_recommendations=4)

{'no': ['نمایش کمدی ژیلت',
  'کنسرت هانیبال صلح و دوستی',
  'کنسرت امید حاجیلی',
  'قوانین و مقررات'],
 '9195920275': ['کنسرت نمایش کلنل',
  'پارک امیرگان ( Test )',
  'کنسرت تست ( با انتخاب صندلی )',
  'نمایشگاه ترکیه']}

In [12]:
recomender.remove_excel(["event.xlsx", "log.xlsx", "visitor.xlsx"])

In [13]:
def generate_date_ranges(start_date, end_date):
    date_ranges = []
    current_start_date = start_date
    while current_start_date < end_date:
        current_end_date = current_start_date + timedelta(days=60)
        if current_end_date > end_date:
            current_end_date = end_date
        date_ranges.append((current_start_date, current_end_date))
        current_start_date = current_end_date
    return date_ranges

In [14]:
def fetch_data_from_api(start_date, end_date):
    start_date = start_date.strftime("%Y/%m/%d")
    end_date = end_date.strftime("%Y/%m/%d")
    url = f"https://6234.ir/api/log?token=aiapiqazxcvbnm1403&ofDate={start_date}&toDate={end_date}"
    urllib.request.urlretrieve(
        url,
        "log.xlsx",
    )
    iter_history = pd.read_excel("log.xlsx")

    return iter_history

In [85]:
start_date_jalali = jdatetime.datetime.strptime(
    jdatetime.date(1402, 1, 1).strftime("%Y/%m/%d"), "%Y/%m/%d"
).date()
end_date_jalali = jdatetime.datetime.strptime(
    jdatetime.datetime.now().strftime("%Y/%m/%d"), "%Y/%m/%d"
).date()

In [86]:
date_ranges = generate_date_ranges(start_date_jalali, end_date_jalali)
date_ranges

[(jdatetime.date(1402, 1, 1), jdatetime.date(1402, 2, 30)),
 (jdatetime.date(1402, 2, 30), jdatetime.date(1402, 4, 28)),
 (jdatetime.date(1402, 4, 28), jdatetime.date(1402, 6, 26)),
 (jdatetime.date(1402, 6, 26), jdatetime.date(1402, 8, 25)),
 (jdatetime.date(1402, 8, 25), jdatetime.date(1402, 10, 25)),
 (jdatetime.date(1402, 10, 25), jdatetime.date(1402, 12, 25)),
 (jdatetime.date(1402, 12, 25), jdatetime.date(1403, 2, 25)),
 (jdatetime.date(1403, 2, 25), jdatetime.date(1403, 3, 31))]

In [96]:
all_data = {}

for start, end in date_ranges:
    data = fetch_data_from_api(start, end)

ValueError: Excel file format cannot be determined, you must specify an engine manually.

In [97]:
data["تاریخ"].sort_values(ascending=False)

0       1402/08/24 23:22:52
1       1402/08/24 23:11:35
2       1402/08/24 21:51:22
3       1402/08/24 21:45:31
4       1402/08/24 21:45:30
               ...         
2126    1402/07/20 19:46:31
2127    1402/07/20 19:46:30
2128    1402/07/20 19:46:28
2129    1402/07/20 19:43:19
2130    1402/07/20 19:32:59
Name: تاریخ, Length: 2131, dtype: object

In [98]:
for start, end in date_ranges:
    print(start, end)

1402-01-01 1402-02-30
1402-02-30 1402-04-28
1402-04-28 1402-06-26
1402-06-26 1402-08-25
1402-08-25 1402-10-25
1402-10-25 1402-12-25
1402-12-25 1403-02-25
1403-02-25 1403-03-31


In [91]:
start

jdatetime.date(1403, 2, 25)

In [89]:
end

jdatetime.date(1403, 3, 31)

In [82]:
interaction["شماره موبایل"].unique()

array(['none', 9195920275.0], dtype=object)