In [57]:
import os
import urllib.request

import keras
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
import tensorflow_recommenders as tfrs
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder

In [6]:
def _non_empty_texts(tags) -> list:
    """Return the stripped text of every tag whose text is not blank."""
    stripped = (tag.text.strip() for tag in tags)
    return [text for text in stripped if text]


def data_scrap(url: str, timeout: float = 30.0):
    """Scrape event titles, locations and dates from the site's HTML.

    Args:
        url (str): site address
        timeout (float): seconds to wait for the server before giving up;
            without it a stalled connection would block the notebook forever.

    Returns:
        tuple: ``(df_title, df_location, df_date)``. Each is a single-column
        DataFrame whose column is named "Titles" (all three share that name).

    Raises:
        requests.HTTPError: if the response status is not 200. Previously the
            function fell through and returned ``None``, which only surfaced
            later as an unpacking error at the call site.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise requests.HTTPError(
            f"GET {url} returned HTTP {response.status_code}", response=response
        )

    soup = BeautifulSoup(response.text, "html.parser")

    title_tags = soup.find_all("h3", class_="blog_post_title my-2")
    # NOTE(review): the location selector reuses the title class on a <div>;
    # confirm this is the intended class for locations on the page.
    location_tags = soup.find_all("div", class_="blog_post_title my-2")
    date_tags = soup.find_all("div", class_="theater-date my-2")

    df_title = pd.DataFrame({"Titles": _non_empty_texts(title_tags)})
    df_location = pd.DataFrame({"Titles": _non_empty_texts(location_tags)})
    df_date = pd.DataFrame({"Titles": _non_empty_texts(date_tags)})

    return df_title, df_location, df_date

In [8]:
def user_buy_interaction_from_api(buy_api: str, iter_api: str):
    """Download the interaction and purchase exports and load them as DataFrames.

    Each endpoint is saved to a fixed file in the working directory
    ("log.xlsx" for interactions, "visitor.xlsx" for purchases) and then
    read back with pandas.

    Args:
        buy_api (str): URL of the purchase-history export
        iter_api (str): URL of the interaction-history export

    Returns:
        tuple: ``(iter_history, buy_history)`` DataFrames, in that order.
    """
    # (source URL, local file) pairs; interactions are fetched first.
    downloads = (
        (iter_api, "log.xlsx"),
        (buy_api, "visitor.xlsx"),
    )

    frames = []
    for link, local_path in downloads:
        urllib.request.urlretrieve(link, local_path)
        frames.append(pd.read_excel(local_path))

    iter_history, buy_history = frames
    return iter_history, buy_history

In [9]:
def merged_all_df(
    df_title: pd.DataFrame,
    df_location: pd.DataFrame,
    df_date: pd.DataFrame,
    df_interaction: pd.DataFrame,
    df_buy_history: pd.DataFrame,
):
    """Stack every title-like column into a single long "Titles" column.

    Args:
        df_title (pd.DataFrame): scraped titles, column "Titles"
        df_location (pd.DataFrame): scraped locations, column "Titles"
        df_date (pd.DataFrame): scraped dates, column "Titles"
        df_interaction (pd.DataFrame): interaction log; its "بازدید" column is stacked
        df_buy_history (pd.DataFrame): purchase log; its "رویداد" column is stacked

    Returns:
        pd.DataFrame: two columns, "index" (each row's index in its source
        frame) and "Titles" (the stacked values).
    """
    # Location and Date only take part in index alignment here: if either is
    # longer than df_title, the extra rows appear as NaN entries in "Titles".
    scraped = pd.DataFrame(
        {
            "Titles": df_title["Titles"],
            "Location": df_location["Titles"],
            "Date": df_date["Titles"],
        }
    )

    pieces = [scraped["Titles"], df_interaction["بازدید"], df_buy_history["رویداد"]]
    stacked = pd.concat(pieces).reset_index()

    return stacked.set_axis(["index", "Titles"], axis=1)

In [10]:
# Scrape the site's landing page into three single-column frames
# (titles, locations, dates).
df_title, df_location, df_date = data_scrap("https://www.6234.ir/")

In [12]:
# The API token is a credential, so it is read from the environment instead of
# being committed in the notebook. The token that used to be hard-coded here
# should be treated as leaked and rotated.
API_TOKEN = os.environ.get("API_6234_TOKEN")
if not API_TOKEN:
    raise RuntimeError(
        "Set the API_6234_TOKEN environment variable to the 6234.ir API token "
        "before running this cell."
    )

API_BASE = "https://6234.ir/api"
# Export window shared by both endpoints.
DATE_RANGE = "ofDate=1402/08/20&toDate=1402/12/29"

interaction, buy_history = user_buy_interaction_from_api(
    buy_api=f"{API_BASE}/ticket?token={API_TOKEN}&{DATE_RANGE}",
    iter_api=f"{API_BASE}/log?token={API_TOKEN}&{DATE_RANGE}",
)

In [13]:
# Stack the scraped titles with the visit and purchase columns into one long frame.
merged_df = merged_all_df(df_title, df_location, df_date, interaction, buy_history)

In [20]:
# One encoder per column: reusing a single LabelEncoder refits it on the second
# call, after which the user-id mapping can no longer be inverted.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
# Keep the original name bound to the encoder it previously ended up holding.
le = item_encoder

interaction["userId"] = user_encoder.fit_transform(interaction["نام و نام خانوادگی"])
interaction["itemId"] = item_encoder.fit_transform(interaction["بازدید"])
interaction["item_title"] = interaction["بازدید"]
# NOTE(review): the interaction-time column is used as the rating signal;
# presumably seconds spent on the page - confirm against the export's schema.
interaction["user_rating"] = interaction["زمان تعامل(تانیه)"]

In [21]:
# Keep only the four columns derived in the previous cell.
rating = interaction[["userId", "itemId", "item_title", "user_rating"]]

In [22]:
# Preview the rating frame.
rating

Unnamed: 0,userId,itemId,item_title,user_rating
0,2,38,صفحه اصلی,0.0
1,2,38,صفحه اصلی,0.0
2,2,38,صفحه اصلی,0.0
3,2,38,صفحه اصلی,0.0
4,2,38,صفحه اصلی,0.0
...,...,...,...,...
30199,2,38,صفحه اصلی,0.0
30200,2,38,صفحه اصلی,0.0
30201,2,37,سالن اجرای صحنه ای 360,0.0
30202,2,38,صفحه اصلی,0.0


In [23]:
# Whole-number split point. 0.8 * len(rating) is usually fractional, and slicing
# labels with a fractional bound (":size - 1" / "size:") leaves the row that
# falls between the two bounds out of both the train and the test set.
training_size = int(0.8 * len(rating))

In [27]:
# Split by position so every row lands in exactly one set. int() guards against
# a fractional training_size, which label-based slicing would turn into a
# silently dropped row. .copy() gives each split its own data so later column
# edits do not write into a slice of `rating`.
split_at = int(training_size)
rating_train = rating.iloc[:split_at].copy()
rating_test = rating.iloc[split_at:].copy()

In [43]:
user_id_lookup_layer = keras.layers.StringLookup(mask_token=None)
# StringLookup is adapted on string ids. Rebind through assign() rather than
# writing strings into the integer column with .loc: that write goes through a
# slice of `rating` and changes the column dtype in place, which pandas warns
# about (chained assignment / incompatible dtype).
rating_train = rating_train.assign(userId=rating_train["userId"].astype(str))
user_id_lookup_layer.adapt(rating_train["userId"])

In [44]:
# Embedding table sized from the lookup layer's vocabulary, 32 dimensions per user id.
user_id_embedding_dim = 32
user_id_embedding_layer = keras.layers.Embedding(
    input_dim=user_id_lookup_layer.vocabulary_size(), output_dim=user_id_embedding_dim
)

In [45]:
# User tower: user-id string -> integer index -> embedding vector.
user_id_model = keras.Sequential([user_id_lookup_layer, user_id_embedding_layer])

In [47]:
item_id_lookup_layer = keras.layers.StringLookup(mask_token=None)
# StringLookup is adapted on string ids. Rebind through assign() rather than
# writing strings into the integer column with .loc: that write goes through a
# slice of `rating` and changes the column dtype in place, which pandas warns
# about (chained assignment / incompatible dtype).
rating_train = rating_train.assign(itemId=rating_train["itemId"].astype(str))
item_id_lookup_layer.adapt(rating_train["itemId"])

In [48]:
# Embedding table sized from the lookup layer's vocabulary, 32 dimensions per item id.
item_id_embedding_dim = 32
item_id_embedding_layer = keras.layers.Embedding(
    input_dim=item_id_lookup_layer.vocabulary_size(), output_dim=item_id_embedding_dim
)

In [50]:
# Item-id tower: item-id string -> integer index -> embedding vector.
item_id_model = keras.Sequential([item_id_lookup_layer, item_id_embedding_layer])

In [52]:
item_title_vectorization_layer = keras.layers.TextVectorization()
# Normalise titles to str on a rebound frame instead of writing through .loc,
# which would assign into a slice of `rating` and trigger pandas'
# chained-assignment warning.
rating_train = rating_train.assign(item_title=rating_train["item_title"].astype(str))
item_title_vectorization_layer.adapt(rating_train["item_title"])

In [53]:
# Token embedding table sized from the vectorizer's vocabulary, 32 dimensions
# per token. mask_zero=True marks index 0 as padding so downstream layers that
# honour masks can ignore it.
item_title_embedding_dim = 32
item_title_embedding_layer = keras.layers.Embedding(
    input_dim=item_title_vectorization_layer.vocabulary_size(),
    output_dim=item_title_embedding_dim,
    mask_zero=True,
)

In [54]:
# Title tower: title text -> token ids -> token embeddings -> one vector per title.
item_title_model = keras.Sequential(
    [
        item_title_vectorization_layer,
        item_title_embedding_layer,
        # Without pooling the tower returns one embedding per token
        # (batch, tokens, dim); a retrieval candidate needs a single
        # (batch, dim) vector. The embedding's mask lets the average skip padding.
        keras.layers.GlobalAveragePooling1D(),
    ]
)

In [55]:
# Two-tower setup: user ids are the queries, item titles are the candidates.
# (item_id_model is built earlier but is not selected as a tower here.)
query_model = user_id_model
candidate_model = item_title_model

In [58]:
# FactorizedTopK scores queries against one embedding per candidate item, so the
# candidate tower is fed each distinct title once. Previously the stringified
# itemId column (with one entry per interaction) was passed to a model whose
# vocabulary was adapted on titles.
candidate_titles = rating_train["item_title"].astype(str).unique()
dataset = tf.data.Dataset.from_tensor_slices(candidate_titles)

batched_dataset = dataset.batch(128)

# NOTE(review): candidate_model must return a single (batch, dim) vector per title.
# NOTE(review): the ValueError recorded for this cell ('counter' parsed as a
# shape) looks like TFRS 0.7.x running against Keras 3. The usual workaround is
# to force legacy Keras (TF_USE_LEGACY_KERAS=1 with tf_keras installed, set
# before TensorFlow is imported) and build the layers from tf.keras - confirm
# against the installed versions.
factorized_top_k_metrics = tfrs.metrics.FactorizedTopK(
    candidates=batched_dataset.map(candidate_model)
)

ValueError: Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape. Found invalid entry 'c' of type '<class 'str'>'. 

In [61]:
# Installed TensorFlow Recommenders version, recorded as context for the error above.
tfrs.__version__

'v0.7.3'