In [None]:
!pip install datasketch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime




### Import dataset

In [None]:
# Load the three raw tables; file names are relative to the notebook's working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly rather than wrapped in print() (which would also emit a stray "None").
for frame in (customers, products, transactions):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------

### Merging the datasets

In [None]:
# Attach customer attributes to every transaction.
transactions_customers = pd.merge(transactions, customers, on='CustomerID', how='inner')

# Transactions and Products share more than the join key (both carry `Price`).
# Merging them as-is produces `Price_x` / `Price_y`, while the grouping cell
# further down aggregates a single `Price` column. Keep the value recorded on
# the transaction and take only the non-overlapping columns from the catalogue.
overlapping_cols = [
    col for col in products.columns
    if col != 'ProductID' and col in transactions_customers.columns
]
product_attributes = products.drop(columns=overlapping_cols)

data = pd.merge(transactions_customers, product_attributes, on='ProductID', how='inner')

# info() prints the summary itself and returns None, so no print() wrapper.
data.info()
# index=False avoids a spurious "Unnamed: 0" column when the file is read back.
data.to_csv("final_data.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price_x          1000 non-null   float64
 7   CustomerName     1000 non-null   object 
 8   Region           1000 non-null   object 
 9   SignupDate       1000 non-null   object 
 10  ProductName      1000 non-null   object 
 11  Category         1000 non-null   object 
 12  Price_y          1000 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB
None


In [None]:
# Read the merged table from the same relative location it is written to
# (`data.to_csv("final_data.csv")`), so the notebook does not depend on the
# working directory being /content.
final_data = pd.read_csv("final_data.csv")
# Drop index columns ("Unnamed: 0") left behind when a CSV was written with its
# index; do it before summarising so info() describes the frame used below.
final_data = final_data.loc[:, ~final_data.columns.str.contains('^Unnamed')]
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1000 non-null   int64  
 1   CustomerID       1000 non-null   object 
 2   CustomerName     1000 non-null   object 
 3   Region           1000 non-null   object 
 4   SignupDate       1000 non-null   object 
 5   TransactionID    1000 non-null   object 
 6   ProductID        1000 non-null   object 
 7   TransactionDate  1000 non-null   object 
 8   Quantity         1000 non-null   int64  
 9   TotalValue       1000 non-null   float64
 10  Price            1000 non-null   float64
 11  ProductName      1000 non-null   object 
 12  Category         1000 non-null   object 
dtypes: float64(2), int64(2), object(9)
memory usage: 101.7+ KB


### Grouping the data based on unique customers

In [None]:
# Collapse the transaction-level rows into one row per customer: the customer
# attributes form the group key and every per-transaction column becomes a list.
customer_keys = ['CustomerID', 'CustomerName', 'Region', 'SignupDate']
transaction_cols = [
    'TransactionID', 'ProductID', 'TransactionDate', 'Quantity',
    'TotalValue', 'Price', 'ProductName', 'Category',
]
merged_data = (
    final_data
    .groupby(customer_keys, as_index=False)
    .agg({col: list for col in transaction_cols})
)
merged_data.to_csv("merged_data.csv")

In [None]:
import pandas as pd
from pathlib import Path

# Use the first 20 customers (in merged_data order) as the seed set.
sampled_customer_ids = merged_data['CustomerID'].iloc[:20]

# Keep only the rows that belong to those customers.
sampled_data = merged_data[merged_data['CustomerID'].isin(sampled_customer_ids)]

# The seed file holds nothing but the customer IDs.
customer_ids_df = sampled_data[['CustomerID']]

# to_csv() does not create missing directories, so make sure the target
# folder exists before writing the seed file.
seed_file = Path('/content/data/seed.csv')
seed_file.parent.mkdir(parents=True, exist_ok=True)
customer_ids_df.to_csv(seed_file, index=False)


### Using LSH to find similar users

Locality-Sensitive Hashing (LSH) is an efficient technique for finding similar items in large datasets: items are hashed into buckets in such a way that similar items have a high probability of landing in the same bucket. Combined with MinHash, which approximates the Jaccard similarity between sets (e.g., user behavior or preferences), LSH retrieves likely neighbors without comparing every pair of users.

Metric:
Jaccard similarity between the sets of values that represent each user's attributes and behaviors.

In [None]:
import numpy as np
import pandas as pd
from itertools import chain
from collections import Counter
from yaml import CLoader as Loader, load





# Function for reading a YAML configuration file
def read_config(path):
    """
    Load a YAML configuration file.

    :param path: Location of the YAML file to read
    :return: The parsed document as returned by ``yaml.load`` with ``CLoader``
             (a dictionary when the file holds a top-level mapping)
    """
    with open(path) as config_file:
        return load(config_file, Loader=Loader)

# Calculate feature importance (i.e. ranking the users) based on probabilities
def feat_imp(p, q, eps=1e-12):
    """
    Calculate feature importance given seed set probability of a feature and global
    probability for the same feature.

    The score is ``(p - q) * log(odds(p) / odds(q))``. It is zero when the two
    probabilities agree and grows as they diverge.

    :param p: Probability in the seed set (scalar or array-like)
    :param q: Global probability (scalar or array-like)
    :param eps: Probabilities are clipped into ``[eps, 1 - eps]`` before the odds
                ratio is taken, so inputs of exactly 0 or 1 give a large finite
                score instead of a division by zero or ``log(0)``
    :return: Feature importance score (NaN inputs propagate as NaN)
    """
    # Clipping leaves every value strictly inside (eps, 1 - eps) untouched.
    p = np.clip(p, eps, 1 - eps)
    q = np.clip(q, eps, 1 - eps)
    log_odds_ratio = np.log((p * (1 - q)) / ((1 - p) * q))
    return (p - q) * log_odds_ratio

# Calculate the counts of features in the dataset
def count_fn(data, features, list_cols):
    """
    Calculate the count of features in the dataset and their probabilities.

    For every column in ``features`` the occurrences of each distinct value are
    counted and divided by the total number of occurrences within that column,
    so the probabilities of one feature sum to 1.

    :param data: DataFrame containing user records
    :param features: List of columns in the dataset
    :param list_cols: Columns that can have multiple values per user (each cell is
                      an iterable of values; every element is counted separately)
    :return: DataFrame with columns ``feature``, ``value`` and ``prob``
    """
    per_feature = []
    for col in features:
        if col in list_cols:
            # Flatten the per-user lists and count every element.
            counts = pd.Series(Counter(chain.from_iterable(data[col])), dtype="int64")
        else:
            counts = data[col].value_counts()
        counts = counts.reset_index()
        counts.columns = ["value", "count"]
        counts["feature"] = col
        per_feature.append(counts)

    # pd.concat() refuses an empty list, so handle "no features" explicitly.
    if not per_feature:
        return pd.DataFrame(columns=["feature", "value", "prob"])

    # Concatenate once instead of growing a frame inside the loop; this also
    # keeps "count" numeric rather than inheriting object dtype from an empty frame.
    count_df = pd.concat(per_feature)
    # The string alias selects pandas' own sum; passing the builtin triggers a FutureWarning.
    totals = count_df.groupby("feature")["count"].transform("sum")
    count_df["prob"] = count_df["count"] / totals
    return count_df[["feature", "value", "prob"]]

# Convert column values into strings with column name as a prefix
def conv_values(v, c, list_c):
    """
    Prefix feature values with their column name.

    :param v: Feature value (an iterable of values when ``list_c`` is True)
    :param c: Column name used as the prefix
    :param list_c: Boolean, True if c is a list column, False otherwise
    :return: ``"<c>_<value>"`` for a scalar column, or a list of such strings
             (one per element of ``v``) for a list column
    """
    if not list_c:
        return f"{c}_{str(v)}"
    return [f"{c}_{str(item)}" for item in v]

def flatten_list(f):
    """
    Flatten one level of nesting.

    Elements that are lists are spliced into the result; every other element
    (including tuples and strings) is kept as a single item.

    :param f: Iterable whose elements may be lists
    :return: A flat list
    """
    return [
        item
        for entry in f
        for item in (entry if isinstance(entry, list) else [entry])
    ]


In [None]:
import pandas as pd

def score_fn(data, count_path, features, list_cols, seed_ids, neighbors, label, id_col="CustomerID"):
    """
    Calculate a score for each user in the extended set based on feature importance.

    Feature importance is computed per (feature, value) pair by comparing its
    probability inside the seed set with its global probability from the count
    file. A neighbor's score is the sum of the importances of all its feature values.

    :param data: DataFrame containing user features
    :param count_path: Path to the feature count file (as written from ``count_fn`` output)
    :param features: Features in the dataset
    :param list_cols: Columns that can have multiple values per user
    :param seed_ids: Customer IDs part of the seed set
    :param neighbors: Customer IDs extracted from the LSH graph; used with
                      ``Series.isin``, so a flat collection of IDs is expected
    :param label: Label column; it is dropped before the feature strings are built
    :param id_col: Name of the customer ID column. One column is used throughout,
                   instead of mixing ``"CustomerID"`` and ``"id"``
    :return: ``data`` with an additional ``score`` column (NaN for users that are
             not in ``neighbors``)
    """
    # Read values as text so that they compare equal to the stringified in-memory
    # values below, regardless of how read_csv would otherwise infer the dtype.
    global_count = pd.read_csv(count_path, dtype={"value": str})

    # Probabilities of every feature value inside the seed set.
    seed_df = data[data[id_col].isin(seed_ids)]
    seed_count = count_fn(seed_df, features, list_cols).rename(columns={"prob": "s_prob"})
    seed_count["value"] = seed_count["value"].astype(str)
    seed_count = seed_count.merge(global_count, on=["feature", "value"], how="left")

    # feat_imp only uses arithmetic and numpy ufuncs, so it can be applied to whole columns.
    seed_count["imp"] = feat_imp(
        seed_count["s_prob"].astype(float), seed_count["prob"].astype(float)
    )
    # Same "<column>_<value>" encoding that conv_values produces for the neighbors.
    seed_count["feat"] = seed_count["feature"] + "_" + seed_count["value"]
    seed_count = seed_count[["feat", "imp"]]

    # Encode every neighbor's feature values as prefixed strings.
    df = data.drop(label, axis=1)
    df = df[df[id_col].isin(neighbors)].copy()
    for f in features:
        list_c = f in list_cols
        df[f] = df[f].apply(lambda x: conv_values(x, f, list_c))

    # Collect the encoded values of the feature columns only (not "every column
    # after the first"), then flatten the list columns into one list per user.
    df["feat"] = [
        flatten_list(list(row))
        for row in df[features].itertuples(index=False, name=None)
    ]

    # One row per (user, feature value), joined with its importance and summed per user.
    df = df[[id_col, "feat"]].explode("feat").reset_index(drop=True)
    df = df.merge(seed_count, on="feat", how="left")
    scores = df.groupby(id_col)["imp"].sum().reset_index()
    scores.columns = [id_col, "score"]

    return data.merge(scores, on=id_col, how="left")

def get_extn(data, seed_ids, label, extn_path, x=2):
    """
    Pick the best-scoring users, write their IDs to a file and report their label rate.

    :param data: DataFrame containing user data along with the scores
    :param seed_ids: List of customer IDs that are in the seed set
    :param label: Label column whose mean is reported for the selected users
    :param extn_path: Path to store the extended user set
    :param x: Scale of extension needed; ``x * len(seed_ids)`` users are selected
    :return: Mean of ``label`` over the selected users, as a percentage rounded
             to two decimals
    """
    # Users without a score are not neighbors; rank the rest from best to worst.
    ranked = data.dropna(subset=["score"]).sort_values(by="score", ascending=False)

    # The extension is the top x * |seed| users of that ranking.
    top_n = x * len(seed_ids)
    top_users = ranked.iloc[:top_n, :]

    # Persist only the IDs of the selected users.
    top_users[["CustomerID"]].to_csv(extn_path, index=False)

    return round(top_users[label].mean() * 100, 2)


### LSH graph model

In [None]:
from datasketch import MinHash, MinHashLSHForest

class LSHGraph:
    """
    Locality-Sensitive Hashing (LSH) Graph Model

    Wraps an LSH index (``model``) together with the DataFrame it was built
    from, so that users can be indexed by the MinHash of their feature values
    and the nearest neighbors of a seed set can be looked up afterwards.
    """

    def __init__(self, df, model, features, id_col="id", n_perm=10):
        """
        Initialize the LSHGraph.

        :param df: DataFrame containing user features
        :param model: MinHashLSHForest model for LSH operations
        :param features: List of features in the dataset
        :param id_col: Column name for user IDs
        :param n_perm: Number of permutations for the LSH model
        """
        self.df = df
        self.model = model
        self.features = features
        self.id_col = id_col
        self.n_perm = n_perm

    def update_graph(self):
        """
        Update the LSH graph by adding MinHash values for each user in the DataFrame.

        Every row of ``self.df`` is hashed over ``self.features`` and added to the
        model under its ``id_col`` value; the model is indexed once at the end.
        """
        total = self.df.shape[0]
        # Pair IDs and feature rows by position so the loop does not depend on
        # the DataFrame index being integer-valued or unique.
        user_ids = self.df[self.id_col]
        feature_rows = self.df[self.features].iterrows()
        for pos, (user_id, (_, row)) in enumerate(zip(user_ids, feature_rows)):
            if pos % 5000 == 0:
                print(f"Processing {pos} of {total}")
            m = self.get_hash(MinHash(num_perm=self.n_perm), row)
            self.model.add(user_id, m)
        self.model.index()

    def extract_neighbors(self, seed, k=3):
        """
        Retrieve neighbors of seed set users from the LSH graph along with their similarity scores.

        The result for a seed user can contain that user itself, because seed
        users are taken from the same DataFrame that ``update_graph`` indexes.

        :param seed: List of customer IDs from the seed set
        :param k: Number of neighbors to retrieve for each seed set user
        :return: Dictionary where keys are seed_ids and values are lists of tuples
                 (neighbor_id, similarity_score), sorted by descending score
        """
        neighbors_with_scores_dict = {}
        # The model is only queried for IDs, so neighbor MinHashes are rebuilt
        # here. Each one is built at most once per call, since the same
        # neighbor is often returned for several seed users.
        neighbor_hashes = {}

        # Filter the seed set from the dataframe
        seed_df = self.df[self.df[self.id_col].isin(seed)]
        seed_rows = seed_df[self.features].iterrows()

        for seed_id, (_, row) in zip(seed_df[self.id_col], seed_rows):
            m = self.get_hash(MinHash(num_perm=self.n_perm), row)

            neighbor_scores = []
            for neighbor_id in self.model.query(m, k):
                if neighbor_id not in neighbor_hashes:
                    neighbor_row = self.df[self.df[self.id_col] == neighbor_id].iloc[0]
                    neighbor_hashes[neighbor_id] = self.get_hash(
                        MinHash(num_perm=self.n_perm), neighbor_row[self.features]
                    )
                # MinHash estimate of the Jaccard similarity between seed and neighbor.
                similarity_score = m.jaccard(neighbor_hashes[neighbor_id])
                neighbor_scores.append((neighbor_id, similarity_score))

            # Most similar first; keep at most k entries.
            neighbor_scores.sort(key=lambda pair: pair[1], reverse=True)
            neighbors_with_scores_dict[seed_id] = neighbor_scores[:k]

        return neighbors_with_scores_dict

    def get_hash(self, m, row):
        """
        Encode a user's features using MinHash.

        List-valued features contribute one token per element; every other
        value contributes a single token. Tokens are the UTF-8 encoded string
        form of the value.

        :param m: MinHash object to update
        :param row: User's feature list
        :return: Updated MinHash object
        """
        for d in row:
            if isinstance(d, list):
                for e in d:
                    m.update(str(e).encode('utf-8'))
            else:
                m.update(str(d).encode('utf-8'))
        return m


### Processing LSH graph

In [None]:


import ast
import os
import pickle
import random
import numpy as np
import pandas as pd

from yaml import CLoader as Loader, load
from datasketch import MinHash, MinHashLSHForest

with open("config.yaml") as stream:
    config = load(stream, Loader=Loader)

# Columns whose cells hold one value per transaction (i.e. lists).
columns_list = [

    'ProductID',
    'TransactionDate',
    'Quantity',
    'TotalValue',
    'Price',
    'ProductName',
    'Category'
]
label = "TotalValue"
n_perm = 100
id_col = "CustomerID"
features = [ 'CustomerName', 'Region', 'SignupDate'
     , 'TransactionDate', 'Quantity', 'Price',
       'ProductName', 'Category']

# NOTE(review): the grouping cell writes "merged_data.csv" while this cell reads
# "/content/merged_dataset.csv" — confirm both refer to the same table.
data_path = "/content/merged_dataset.csv"
seed_path = "/content/data/seed.csv"
# Leading slash added so the path sits next to the other files under /content/data.
extn_path = "/content/data/extn.csv"
list_cols = columns_list
thresh = 16
model_path = "/content/data/lshgraph"
count_path = "/content/data/count_df.csv"
clean_data_path = "/content/data/processed.json"


def _parse_list_cell(cell):
    """Turn the text form of a list (as stored in a CSV cell) back into a list.

    Anything that is not a string, cannot be parsed as a Python literal, or
    does not parse to a list is returned unchanged.
    """
    if not isinstance(cell, str):
        return cell
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return cell
    return parsed if isinstance(parsed, list) else cell


# None of the writers below create missing directories.
for out_path in (clean_data_path, count_path, model_path, extn_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

data = pd.read_csv(data_path)

# A CSV stores list cells as text ("['P001', 'P002']"). Without converting them
# back, count_fn would count single characters and get_hash would hash the
# whole text as one token.
for col in list_cols:
    if col in data.columns:
        data[col] = data[col].apply(_parse_list_cell)

data.to_json(clean_data_path)

# Global probability of every feature value, used later for scoring.
count_df = count_fn(data, features, list_cols)
count_df.to_csv(count_path, index=False)

data = pd.read_json(clean_data_path)

# The label must not leak into the similarity features.
df = data.drop(label, axis=1)

lsh = MinHashLSHForest(num_perm=n_perm)
lsh_graph = LSHGraph(df, lsh, features, id_col=id_col, n_perm=n_perm)
lsh_graph.update_graph()

with open(model_path, "wb") as f:
    pickle.dump(lsh_graph, f)

seed = pd.read_csv(seed_path)
seed_ids = list(seed["CustomerID"])

# pickle.load can execute arbitrary code: only load files written by this notebook.
with open(model_path, "rb") as f:
    lsh_graph = pickle.load(f)







  count_df["sum"] = count_df.groupby("feature")["count"].transform(sum)


Processing 0 of 199


In [None]:

# Request 4 candidates per seed user — presumably one more than the 3 kept by
# the filtering step, because a seed user can come back as its own neighbor
# (TODO confirm).
neighbors = lsh_graph.extract_neighbors(seed=seed_ids, k=4)


In [None]:
def filter_and_sort_neighbors(neighbors_dict, top_k=3):
    """
    Remove self-matches and keep the most similar neighbors of every customer.

    :param neighbors_dict: Dictionary mapping a customer ID to a list of
                           (neighbor_id, similarity_score) tuples
    :param top_k: Maximum number of neighbors to keep per customer
    :return: Dictionary with the same keys; each value holds at most ``top_k``
             tuples sorted by descending similarity. Customers with fewer
             (or no) remaining neighbors are kept with a shorter list.
    """
    filtered_dict = {}

    for customer_id, neighbors in neighbors_dict.items():
        # Remove self-referencing neighbors
        others = [
            (neighbor_id, score)
            for neighbor_id, score in neighbors
            if neighbor_id != customer_id
        ]

        # Most similar first; sorted() is stable, so ties keep their input order.
        others = sorted(others, key=lambda pair: pair[1], reverse=True)

        # Keep at most top_k neighbors (every customer stays in the result).
        filtered_dict[customer_id] = others[:top_k]

    return filtered_dict






{'C0001': [], 'C0002': [('C0054', 0.09), ('C0143', 0.08), ('C0022', 0.06)], 'C0003': [], 'C0004': [], 'C0005': [('C0054', 0.09), ('C0143', 0.08), ('C0022', 0.07)], 'C0006': [('C0191', 0.06), ('C0190', 0.05), ('C0126', 0.05)], 'C0007': [('C0053', 0.1), ('C0186', 0.03), ('C0146', 0.03)], 'C0008': [('C0116', 0.07), ('C0157', 0.06), ('C0020', 0.05)], 'C0009': [('C0057', 0.08), ('C0019', 0.05), ('C0081', 0.05)], 'C0010': [('C0057', 0.05), ('C0081', 0.05), ('C0019', 0.03)], 'C0011': [('C0031', 0.06), ('C0052', 0.05), ('C0181', 0.05)], 'C0012': [], 'C0013': [], 'C0014': [('C0060', 0.25), ('C0151', 0.11), ('C0057', 0.08)], 'C0015': [('C0123', 0.11)], 'C0016': [], 'C0017': [('C0075', 0.06), ('C0164', 0.06), ('C0019', 0.05)], 'C0018': [('C0110', 0.08), ('C0098', 0.05), ('C0033', 0.05)], 'C0019': [('C0075', 0.07), ('C0164', 0.07), ('C0017', 0.05)], 'C0020': [('C0157', 0.06), ('C0116', 0.05), ('C0189', 0.04)]}


In [None]:
# `filtered_neighbors_dict` is not assigned anywhere in the visible notebook, so
# derive it here from the raw LSH neighbors before inspecting it.
filtered_neighbors_dict = filter_and_sort_neighbors(neighbors)
len(filtered_neighbors_dict)

20