In [26]:
# %pip install pandas google-cloud-bigquery db-dtypes scikit-learn

In [2]:
import joblib
from query.customers_items_matrix import QUERY 

from google.cloud.bigquery import Client
from pandas import DataFrame

In [5]:
bq_client = Client()

TRAIN_DATA_QUERY = """
    WITH exploded_orders AS (
            SELECT 
                o.customer_id,
                i.item_id
            FROM `ing-datos-avanzado.main_data.orders` AS o,
            UNNEST(o.order_items) AS i
    ),

    all_combinations AS (
        SELECT
            c.customer_id,
            i.item_id
        FROM
            (SELECT customer_id FROM `ing-datos-avanzado.main_data.customer`) AS c
        CROSS JOIN
            (SELECT item_id FROM `ing-datos-avanzado.main_data.item`) AS i
    ),
    
    customer_product_interactions AS (
        SELECT
        ac.customer_id,
        ac.item_id,
        CASE
            WHEN eo.item_id IS NOT NULL THEN 1
            ELSE 0
        END AS interaction
        FROM
        all_combinations AS ac
        LEFT JOIN
        exploded_orders AS eo
        ON
        ac.customer_id = eo.customer_id
        AND ac.item_id = eo.item_id
    )
    
    SELECT
        customer_id,
        item_id,
        COUNT(interaction) AS interaction
    FROM
        customer_product_interactions
    GROUP BY
      customer_id,
      item_id
"""

In [30]:
def get_train_data(bq_client: Client, query: str) -> DataFrame:
    data: DataFrame = (
        bq_client
        .query(query)
        .to_dataframe()
    )
    user_item_matrix = data.pivot_table(
        index="customer_id", 
        columns="item_id", 
        values="interaction", 
        fill_value = 0
    )
    return user_item_matrix

In [31]:
user_item_matrix: DataFrame = get_train_data(bq_client, TRAIN_DATA_QUERY)

In [32]:
user_item_matrix

ModuleNotFoundError: No module named 'numpy.rec'

ModuleNotFoundError: No module named 'numpy.rec'

### KNN model

In [33]:
from sklearn.neighbors import NearestNeighbors
from numpy import array

ModuleNotFoundError: No module named 'numpy.rec'

In [8]:
knn = NearestNeighbors(metric="cosine", algorithm="brute")  # let's use cosine distance

In [9]:
user_item_array: array = user_item_matrix.to_numpy()
knn.fit(user_item_matrix)

In [10]:
user_id = 7
user = user_item_matrix.index.get_loc(user_id)  # user to make a recommendation to

In [16]:
user

6

In [17]:
user_item_array[user].reshape(1, -1)

array([[15.0, 10.0, 13.0, 9.0, 4.0, 7.0, 9.0, 7.0, 5.0, 9.0, 10.0, 13.0,
        7.0, 6.0, 8.0, 10.0, 9.0, 15.0, 9.0, 9.0, 12.0, 10.0, 8.0, 9.0,
        9.0, 5.0, 9.0, 10.0, 14.0, 15.0, 6.0, 9.0]], dtype=object)

In [11]:
distances, indexes = knn.kneighbors(user_item_array[user].reshape(1, -1), n_neighbors=2)  # let's find the 3 nearest neighboors

In [12]:
findings = f"""
Users similar to customer_id {user_id}: {user_item_matrix.index[indexes.flatten()]}
Cosine ditances: {distances.flatten()}
"""
print(findings)


Users similar to customer_id 7: Index([7, 5], dtype='Int64', name='customer_id')
Cosine ditances: [0.         0.05661491]



Let's get the products bought by the similar users

In [13]:
similar_customers_ids = indexes.flatten()

In [14]:
[
    user_item_matrix.columns[user_item_matrix.iloc[user_id] > 0]
    for user_id in similar_customers_ids
]

[Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],
       dtype='Int64', name='item_id'),
 Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],
       dtype='Int64', name='item_id')]

### Export the model

Let's save the model

In [15]:
joblib.dump(knn, r"C:\Users\Admin\Desktop\Repos\CFIGDA\CFIGDA-recommenderSystem\src\knn\knn_model.pkl")

['C:\\Users\\Admin\\Desktop\\Repos\\CFIGDA\\CFIGDA-recommenderSystem\\src\\knn\\knn_model.pkl']