In [5]:
#Collabarotive with priority frequency

import pandas as pd
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
df = pd.read_csv("./DataSources/M1_OptimizeFullReport.csv", encoding='latin1')

# Drop the original 'Order Value' column
df.drop(columns=['Order Value'], inplace=True)

# Rename 'Original Order Value' column to 'Order Value'
df.rename(columns={'Original Order Value': 'Order Value'}, inplace=True)

# Select only the desired columns
df = df.loc[:, ['Conversion ID', 'Advertiser', 'Order ID', 'Conversion Time', 'Timezone', 'Currency', 'Order Value', 'Country Code', 'Device Type', 'ex3/category', 'Cust Type', 'UID']]

# Clean the 'Advertiser' column to ensure consistency
df['Advertiser'] = df['Advertiser'].replace('Shopee SG', 'Shopee Singapore')

# Find distinct values and their counts in 'Advertiser'
distinct_advertisers_counts = df['Advertiser'].value_counts()

# Clean the 'Cust Type' column to ensure consistency
df['Cust Type'] = df['Cust Type'].replace('new', 'NEW').replace('existing', 'EXISTING')

# Find distinct values and their counts in 'Cust Type'
distinct_cust_types_counts = df['Cust Type'].value_counts()

# Convert 'Conversion Time' column to datetime
#df['Conversion Time'] = pd.to_datetime(df['Conversion Time'])
df['Conversion Time'] = pd.to_datetime(df['Conversion Time'], format='%d/%m/%Y %H:%M')


# Define reference date as the current date
reference_date = datetime.now()

# Recency Calculation
# Group by UID and find the most recent purchase date
recency_df = df.groupby('UID')['Conversion Time'].max().reset_index()
recency_df['Recency'] = (reference_date - recency_df['Conversion Time']).dt.days

# Frequency Calculation
# Group by UID and count the number of transactions
frequency_df = df.groupby('UID').size().reset_index(name='Frequency')

# AOV Calculation
# Group by UID and calculate the average order value
# Remove outliers based on your criteria (e.g., high-value purchases)
# For simplicity, let's remove any orders above a certain threshold
threshold = 1000  # You can adjust this threshold as per your requirement
filtered_df = df[df['Order Value'] <= threshold]
aov_df = filtered_df.groupby('UID')['Order Value'].mean().reset_index(name='AOV')

# Unique category count calculation
unique_category_count_df = df.groupby('UID')['ex3/category'].nunique().reset_index(name='Unique Category Counts')

# Total spending calculation
total_spending_df = filtered_df.groupby('UID')['Order Value'].sum().reset_index(name='Total Spending')

# Merge all calculated values into a single dataframe
result_df_2 = (
    recency_df.merge(frequency_df, on='UID').merge(aov_df, on='UID').merge(total_spending_df, on='UID').merge(unique_category_count_df, on='UID')
)

# Group by UID and category, and count the number of transactions for each category
category_transactions = df.groupby(['UID', 'ex3/category']).size().reset_index(name='Transaction Count')

# Pivot the dataframe to have UID as rows and categories as columns
transaction_count_by_category = category_transactions.pivot_table(index='UID', columns='ex3/category', values='Transaction Count', aggfunc='sum', fill_value=0)

# Reset index to make UID a column again
transaction_count_by_category.reset_index(inplace=True)

# If UID is not set as the index, you can use 'on' parameter instead
merged_df = pd.merge(result_df_2, transaction_count_by_category, on='UID')

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Calculate user-user similarity based on RFAT and wallet share by category
def calculate_user_similarity(merged_df):
    user_features = merged_df.drop(columns=['UID', 'Conversion Time'])  # Exclude UID and Conversion Time
    user_similarity_matrix = cosine_similarity(user_features)
    return user_similarity_matrix

# Example: Get top N similar users for a given user
def get_top_similar_users(merged_df, user_similarity_matrix, user_id, n=5):
    if user_id not in merged_df['UID'].values:
        print(f"User ID {user_id} does not exist. Using default recommendations.")
        return []
    user_index = merged_df.index[merged_df['UID'] == user_id][0]
    similar_users = user_similarity_matrix[user_index].argsort()[::-1][1:n+1]
    return similar_users

# Example: Aggregate top categories from similar users
def aggregate_top_categories_from_similar_users(merged_df, similar_users, n=10):
    category_columns = [col for col in merged_df.columns if col not in ['UID', 'Conversion Time', 'Recency', 'Frequency', 'AOV', 'Total Spending','Unique Category Counts']]
    top_categories = merged_df.iloc[similar_users][category_columns].sum().nlargest(n).index.tolist()
    return top_categories

# Example: Recommend top categories for a given user
def recommend_top_categories(merged_df, user_similarity_matrix, user_id, n=10):
    similar_users = get_top_similar_users(merged_df, user_similarity_matrix, user_id)
    if len(similar_users) == 0:  # Check if no similar users were found
        return default_recommendations(merged_df), []  # Fallback to default recommendations and empty offer categories
    top_categories = aggregate_top_categories_from_similar_users(merged_df, similar_users, n)
    return top_categories

# Example: Default recommendations for new users based on frequency
def default_recommendations(df, n=10):
    # Provide default recommendations based on popular or trending categories using frequency
    # Group by category and count the occurrences of each UID within each category
    top_categories = df.groupby('ex3/category').size().nlargest(n).index.tolist()
    return top_categories

# Example: Format output nicely
def format_output(user_id, top_categories):
    output = f"Top {len(top_categories)} categories for user: {user_id}"
    for i, category in enumerate(top_categories, start=1):
        output += f"\n{i}. {category}"
    return output

# Load the offer dataset from the CSV file
offer_df = pd.read_csv("./DataSources/M1_offers.csv")

# Define the mapping between transaction sub-categories and offer main categories
category_mapping = {
    'Beauty': ['Beauty'],
    'Electronics': ['Audio', 'Mobile & Gadgets', 'Cameras & Drones', 'Computers & Accessories', 'Electronics Accessories', 'Monitors & Printers', 'Smart Devices', 'Digital Utilities', 'Televisions & Videos', 'Digital Goods', 'Data Storage', 'Mobiles & Tablets'],
    'Entertainment': ['Gaming & Consoles', 'Media, Music & Books'],
    'Fashion & Retail': ['Fashion Accessories', 'Women Shoes', 'Women Clothes', 'Men Clothes', 'Baby & Kids Fashion', 'Women Bags', 'Muslim Fashion', 'Men Bags', 'Men Shoes', "Women's Shoes and Clothing", "Men's Shoes and Clothing", "Kids' Shoes and Clothing"],
    'Health and Wellness': ['Health'],
    'Hobbies': ['Hobbies & Collections', 'Toys & Games'],
    'Home Improvement/Furniture': ['Home & Living', 'Home Appliances', 'Kitchen & Dining', 'Lighting & Décor', 'Small Appliances', 'Household Supplies', 'Bedding & Bath', 'Outdoor & Garden', 'Furniture & Organization', 'Tools & Home Improvement', 'Laundry & Cleaning Equipment', 'Large Appliances'],
    'Jewelry & Timepieces': ['Watches', 'Watches Sunglasses Jewellery'],
    'Miscellaneous': ['Automobiles', 'Pets', 'Mom & Baby', 'Books & Magazines', 'Stationery', 'Tickets, Vouchers & Services', 'Motorcycles', 'Free Sample (Flexi Combo)', 'Pet Supplies', 'Mother & Baby', 'Stationery, Craft & Gift Cards', 'Motors', 'Surprise Box', 'Telco', 'Stationery & Craft', 'Local Service', 'Services'],
    'Sports and Fitness': ['Sports & Outdoors', 'Sports Shoes and Clothing'],
    'Supermarkets': ['Food & Beverage', 'Groceries'],
    'Travel': ['Travel & Luggage', 'Bags and Travel'],
    'Retailers': [],
    'Marketplace': [],
    'Cafe': [],
    'Finance and Banking': [],
    'Department Stores': [],
    'Visa Generic': [],
    'Real Estate': [],
    'Restaurants': []
}

# Map transaction sub-categories to offer main categories
def map_to_offer_category(sub_categories):
    offer_categories = set()
    for sub_category in sub_categories:
        for offer_category, mapped_sub_categories in category_mapping.items():
            if sub_category in mapped_sub_categories:
                offer_categories.add(offer_category)
    return list(offer_categories)

# Map top categories to offer categories
def map_top_categories_to_offers(top_categories):
    offer_categories = []
    for category in top_categories:
        mapped_category = map_to_offer_category([category])[0]
        if mapped_category not in offer_categories:
            offer_categories.append(mapped_category)
    return offer_categories

# Example: Get top category recommendations for a user using collaborative filtering
def get_top_and_offer_categories(merged_df, user_similarity_matrix, user_id, offer_df):
    if user_id not in merged_df['UID'].values:
        print(f"User ID {user_id} does not exist. Using default recommendations.")
        default_categories = default_recommendations(df)
        default_offer_categories = map_top_categories_to_offers(default_categories)
        return default_categories, default_offer_categories
    top_categories_collaborative = recommend_top_categories(merged_df, user_similarity_matrix, user_id)
    offer_categories_collaborative = map_top_categories_to_offers(top_categories_collaborative)
    return top_categories_collaborative, offer_categories_collaborative

# Define function to recommend offers for each offer category
def recommend_offers_for_category(offer_df, offer_category):
    # Filter offers based on the "Merchant_type" column to match the offer category
    matching_offers = offer_df[offer_df['Merchant_type'] == offer_category]
    return matching_offers

# Define function to recommend offers for each offer category
def recommend_offers_for_categories(offer_df, offer_categories):
    recommended_offers = {}
    for category in offer_categories:
        recommended_offers[category] = recommend_offers_for_category(offer_df, category)
    return recommended_offers

# Example: Print top and offer categories for a user
def print_top_and_offer_categories(merged_df, user_similarity_matrix, user_id, offer_df):
    top_categories, offer_categories = get_top_and_offer_categories(merged_df, user_similarity_matrix, user_id, offer_df)
    
    print(f"Top categories for user {user_id}:")
    for i, category in enumerate(top_categories, start=1):
        print(f"{i}. {category}")

    print("\nOffer categories:")
    for i, category in enumerate(offer_categories, start=1):
        print(f"{i}. {category}")

    recommended_offers = recommend_offers_for_categories(offer_df, offer_categories)

    print(f"\nOffers we recommend for user {user_id}:")
    for i, category in enumerate(offer_categories, start=1):
        print(f"\n{i}. {category}")
        if not recommended_offers[category].empty:
            print(recommended_offers[category][['offer_id', 'merchant_name', 'offer_title', 'Offer_amount(%)']].to_string(index=False))
        else:
            print("No offers available for this category.")



# Example: Get top category recommendations for a user
#003d363e-5e74-4bd9-9b7f-77d1ddf0ad62 Unique Category Counts 1
#05e3cacf-ce64-488f-9f42-f8c1c0a78fd5 Unique Category Counts 22
#009caacf-c094-4b84-adde-424c584b0b33 Unique Category Counts 3
#01 New User
user_id = '05e3cacf-ce64-488f-9f42-f8c1c0a78fd5'
user_similarity_matrix = calculate_user_similarity(merged_df)
print_top_and_offer_categories(merged_df, user_similarity_matrix, user_id, offer_df)


Top categories for user 05e3cacf-ce64-488f-9f42-f8c1c0a78fd5:
1. Home & Living
2. Food & Beverage
3. Groceries
4. Pets
5. Beauty
6. Electronics Accessories
7. Automobiles
8. Computers & Accessories
9. Household Supplies
10. Health

Offer categories:
1. Home Improvement/Furniture
2. Supermarkets
3. Miscellaneous
4. Beauty
5. Electronics
6. Health and Wellness

Offers we recommend for user 05e3cacf-ce64-488f-9f42-f8c1c0a78fd5:

1. Home Improvement/Furniture
No offers available for this category.

2. Supermarkets
 offer_id merchant_name       offer_title Offer_amount(%)
      699  Fairprice ON       5% Cashback               5
      699  Fairprice ON       5% Cashback               5
      699  Fairprice ON       5% Cashback               5
      699  Fairprice ON       5% Cashback               5
      699  Fairprice ON       5% Cashback               5
      699  Fairprice ON       5% Cashback               5
      699  Fairprice ON       5% Cashback               5
      699  Fairprice

## JSON RESPONSE UPDATE

In [1]:
#Collabarotive with priority frequency

import pandas as pd
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
import json
df = pd.read_csv("./DataSources/M1_OptimizeFullReport.csv", encoding='latin1')

# Drop the original 'Order Value' column
df.drop(columns=['Order Value'], inplace=True)

# Rename 'Original Order Value' column to 'Order Value'
df.rename(columns={'Original Order Value': 'Order Value'}, inplace=True)

# Select only the desired columns
df = df.loc[:, ['Conversion ID', 'Advertiser', 'Order ID', 'Conversion Time', 'Timezone', 'Currency', 'Order Value', 'Country Code', 'Device Type', 'ex3/category', 'Cust Type', 'UID']]

# Clean the 'Advertiser' column to ensure consistency
df['Advertiser'] = df['Advertiser'].replace('Shopee SG', 'Shopee Singapore')

# Find distinct values and their counts in 'Advertiser'
distinct_advertisers_counts = df['Advertiser'].value_counts()

# Clean the 'Cust Type' column to ensure consistency
df['Cust Type'] = df['Cust Type'].replace('new', 'NEW').replace('existing', 'EXISTING')

# Find distinct values and their counts in 'Cust Type'
distinct_cust_types_counts = df['Cust Type'].value_counts()

# Convert 'Conversion Time' column to datetime
#df['Conversion Time'] = pd.to_datetime(df['Conversion Time'])
df['Conversion Time'] = pd.to_datetime(df['Conversion Time'], format='%d/%m/%Y %H:%M')


# Define reference date as the current date
reference_date = datetime.now()

# Recency Calculation
# Group by UID and find the most recent purchase date
recency_df = df.groupby('UID')['Conversion Time'].max().reset_index()
recency_df['Recency'] = (reference_date - recency_df['Conversion Time']).dt.days

# Frequency Calculation
# Group by UID and count the number of transactions
frequency_df = df.groupby('UID').size().reset_index(name='Frequency')

# AOV Calculation
# Group by UID and calculate the average order value
# Remove outliers based on your criteria (e.g., high-value purchases)
# For simplicity, let's remove any orders above a certain threshold
threshold = 1000  # You can adjust this threshold as per your requirement
filtered_df = df[df['Order Value'] <= threshold]
aov_df = filtered_df.groupby('UID')['Order Value'].mean().reset_index(name='AOV')

# Unique category count calculation
unique_category_count_df = df.groupby('UID')['ex3/category'].nunique().reset_index(name='Unique Category Counts')

# Total spending calculation
total_spending_df = filtered_df.groupby('UID')['Order Value'].sum().reset_index(name='Total Spending')

# Merge all calculated values into a single dataframe
result_df_2 = (
    recency_df.merge(frequency_df, on='UID').merge(aov_df, on='UID').merge(total_spending_df, on='UID').merge(unique_category_count_df, on='UID')
)

# Group by UID and category, and count the number of transactions for each category
category_transactions = df.groupby(['UID', 'ex3/category']).size().reset_index(name='Transaction Count')

# Pivot the dataframe to have UID as rows and categories as columns
transaction_count_by_category = category_transactions.pivot_table(index='UID', columns='ex3/category', values='Transaction Count', aggfunc='sum', fill_value=0)

# Reset index to make UID a column again
transaction_count_by_category.reset_index(inplace=True)

# If UID is not set as the index, you can use 'on' parameter instead
merged_df = pd.merge(result_df_2, transaction_count_by_category, on='UID')

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Calculate user-user similarity based on RFAT and wallet share by category
def calculate_user_similarity(merged_df):
    user_features = merged_df.drop(columns=['UID', 'Conversion Time'])  # Exclude UID and Conversion Time
    user_similarity_matrix = cosine_similarity(user_features)
    return user_similarity_matrix

# Example: Get top N similar users for a given user
def get_top_similar_users(merged_df, user_similarity_matrix, user_id, n=5):
    if user_id not in merged_df['UID'].values:
        print(f"User ID {user_id} does not exist. Using default recommendations.")
        return []
    user_index = merged_df.index[merged_df['UID'] == user_id][0]
    similar_users = user_similarity_matrix[user_index].argsort()[::-1][1:n+1]
    return similar_users

# Example: Aggregate top categories from similar users
def aggregate_top_categories_from_similar_users(merged_df, similar_users, n=10):
    category_columns = [col for col in merged_df.columns if col not in ['UID', 'Conversion Time', 'Recency', 'Frequency', 'AOV', 'Total Spending','Unique Category Counts']]
    top_categories = merged_df.iloc[similar_users][category_columns].sum().nlargest(n).index.tolist()
    return top_categories

# Example: Recommend top categories for a given user
def recommend_top_categories(merged_df, user_similarity_matrix, user_id, n=10):
    similar_users = get_top_similar_users(merged_df, user_similarity_matrix, user_id)
    if len(similar_users) == 0:  # Check if no similar users were found
        return default_recommendations(merged_df), []  # Fallback to default recommendations and empty offer categories
    top_categories = aggregate_top_categories_from_similar_users(merged_df, similar_users, n)
    return top_categories

# Example: Default recommendations for new users based on frequency
def default_recommendations(df, n=10):
    # Provide default recommendations based on popular or trending categories using frequency
    # Group by category and count the occurrences of each UID within each category
    top_categories = df.groupby('ex3/category').size().nlargest(n).index.tolist()
    return top_categories

# Example: Format output nicely
def format_output(user_id, top_categories):
    output = f"Top {len(top_categories)} categories for user: {user_id}"
    for i, category in enumerate(top_categories, start=1):
        output += f"\n{i}. {category}"
    return output

# Load the offer dataset from the CSV file
offer_df = pd.read_csv("./DataSources/M1_offers.csv")

# Define the mapping between transaction sub-categories and offer main categories
category_mapping = {
    'Beauty': ['Beauty'],
    'Electronics': ['Audio', 'Mobile & Gadgets', 'Cameras & Drones', 'Computers & Accessories', 'Electronics Accessories', 'Monitors & Printers', 'Smart Devices', 'Digital Utilities', 'Televisions & Videos', 'Digital Goods', 'Data Storage', 'Mobiles & Tablets'],
    'Entertainment': ['Gaming & Consoles', 'Media, Music & Books'],
    'Fashion & Retail': ['Fashion Accessories', 'Women Shoes', 'Women Clothes', 'Men Clothes', 'Baby & Kids Fashion', 'Women Bags', 'Muslim Fashion', 'Men Bags', 'Men Shoes', "Women's Shoes and Clothing", "Men's Shoes and Clothing", "Kids' Shoes and Clothing"],
    'Health and Wellness': ['Health'],
    'Hobbies': ['Hobbies & Collections', 'Toys & Games'],
    'Home Improvement/Furniture': ['Home & Living', 'Home Appliances', 'Kitchen & Dining', 'Lighting & Décor', 'Small Appliances', 'Household Supplies', 'Bedding & Bath', 'Outdoor & Garden', 'Furniture & Organization', 'Tools & Home Improvement', 'Laundry & Cleaning Equipment', 'Large Appliances'],
    'Jewelry & Timepieces': ['Watches', 'Watches Sunglasses Jewellery'],
    'Miscellaneous': ['Automobiles', 'Pets', 'Mom & Baby', 'Books & Magazines', 'Stationery', 'Tickets, Vouchers & Services', 'Motorcycles', 'Free Sample (Flexi Combo)', 'Pet Supplies', 'Mother & Baby', 'Stationery, Craft & Gift Cards', 'Motors', 'Surprise Box', 'Telco', 'Stationery & Craft', 'Local Service', 'Services'],
    'Sports and Fitness': ['Sports & Outdoors', 'Sports Shoes and Clothing'],
    'Supermarkets': ['Food & Beverage', 'Groceries'],
    'Travel': ['Travel & Luggage', 'Bags and Travel'],
    'Retailers': [],
    'Marketplace': [],
    'Cafe': [],
    'Finance and Banking': [],
    'Department Stores': [],
    'Visa Generic': [],
    'Real Estate': [],
    'Restaurants': []
}

# Map transaction sub-categories to offer main categories
def map_to_offer_category(sub_categories):
    offer_categories = set()
    for sub_category in sub_categories:
        for offer_category, mapped_sub_categories in category_mapping.items():
            if sub_category in mapped_sub_categories:
                offer_categories.add(offer_category)
    return list(offer_categories)

# Map top categories to offer categories
def map_top_categories_to_offers(top_categories):
    offer_categories = []
    for category in top_categories:
        mapped_category = map_to_offer_category([category])[0]
        if mapped_category not in offer_categories:
            offer_categories.append(mapped_category)
    return offer_categories

# Example: Get top category recommendations for a user using collaborative filtering
def get_top_and_offer_categories(merged_df, user_similarity_matrix, user_id, offer_df):
    if user_id not in merged_df['UID'].values:
        print(f"User ID {user_id} does not exist. Using default recommendations.")
        default_categories = default_recommendations(df)
        default_offer_categories = map_top_categories_to_offers(default_categories)
        return default_categories, default_offer_categories
    top_categories_collaborative = recommend_top_categories(merged_df, user_similarity_matrix, user_id)
    offer_categories_collaborative = map_top_categories_to_offers(top_categories_collaborative)
    return top_categories_collaborative, offer_categories_collaborative

# Define function to recommend offers for each offer category
def recommend_offers_for_category(offer_df, offer_category):
    # Filter offers based on the "Merchant_type" column to match the offer category
    matching_offers = offer_df[offer_df['Merchant_type'] == offer_category]
    return matching_offers

# Define function to recommend offers for each offer category
def recommend_offers_for_categories(offer_df, offer_categories):
    recommended_offers = {}
    for category in offer_categories:
        recommended_offers[category] = recommend_offers_for_category(offer_df, category)
    return recommended_offers

# Example: Print top and offer categories for a user
def get_top_and_offer_categories_json(merged_df, user_similarity_matrix, user_id, offer_df):
    top_categories, offer_categories = get_top_and_offer_categories(merged_df, user_similarity_matrix, user_id, offer_df)
    
    if user_id not in merged_df['UID'].values:
        print(f"User ID {user_id} does not exist. Using default recommendations.")
        default_categories = default_recommendations(merged_df)
        default_offer_categories = map_top_categories_to_offers(default_categories)
        top_categories = default_categories
        offer_categories = default_offer_categories
    
    recommended_offers = recommend_offers_for_categories(offer_df, offer_categories)
    
    response = {
        "user_id": user_id,
        "top_categories": top_categories,
        "offer_categories": offer_categories,
        "recommended_offers": {category: recommended_offers[category][['offer_id', 'merchant_name', 'offer_title', 'Offer_amount(%)']]
                                           .to_dict('records')
                                for category in offer_categories
                                if not recommended_offers[category].empty}
    }
    
    return json.dumps(response, indent=2)



# Example: Get top category recommendations for a user
#003d363e-5e74-4bd9-9b7f-77d1ddf0ad62 Unique Category Counts 1
#05e3cacf-ce64-488f-9f42-f8c1c0a78fd5 Unique Category Counts 22
#009caacf-c094-4b84-adde-424c584b0b33 Unique Category Counts 3
#01 New User
user_id = '05e3cacf-ce64-488f-9f42-f8c1c0a78fd5'
user_similarity_matrix = calculate_user_similarity(merged_df)
v = get_top_and_offer_categories_json(merged_df, user_similarity_matrix, user_id, offer_df)
with open('recommendations.json', 'w') as f:
    json.dump(json.loads(v), f, indent=4)
df

Unnamed: 0,Conversion ID,Advertiser,Order ID,Conversion Time,Timezone,Currency,Order Value,Country Code,Device Type,ex3/category,Cust Type,UID
0,800ae09b745c4a878e749ef6037e9fc7-1,Shopee Singapore,240225GF5Y0MSP_22621880753_1,2024-02-25 21:14:00,Singapore Standard Time,SGD,11.92,SG,Smartphone,Health,EXISTING,54a36b64-716b-45df-819d-7ce8c50426dd
1,e4f553b2fcd346119d35388066557ccf-1,Shopee Singapore,24030353GBVUV9_4242488971_1,2024-03-03 22:32:00,Singapore Standard Time,SGD,25.74,SG,Smartphone,Home & Living,EXISTING,54a36b64-716b-45df-819d-7ce8c50426dd
2,e1e37cffadcd40129e30cc84e4c45b70-1,Shopee Singapore,2403034WQE9DE0_12623040963_1,2024-03-03 20:49:00,Singapore Standard Time,SGD,5.42,SG,Smartphone,Home & Living,EXISTING,f8c9636e-169c-4585-a257-b581fe918905
3,4cecac4dbd344176b5b4db3a0bb90fd3-1,Shopee Singapore,2403034VPEM67N_22658187118_1,2024-03-03 20:30:00,Singapore Standard Time,SGD,72.02,SG,Smartphone,Health,EXISTING,67fb3754-3796-4bdf-99c1-4e15d7d56a8c
4,38fd946a59494200af8d333720650d5c-1,Shopee Singapore,2403034QE3AW2N_23721151323_1,2024-03-03 18:56:00,Singapore Standard Time,SGD,3.54,SG,Smartphone,Home & Living,EXISTING,f679c241-b7af-46ac-8c53-8c10c3fde516
...,...,...,...,...,...,...,...,...,...,...,...,...
6628,0357e24ac0cb4b0d943575c0584163cd-1,Lazada SG,74991601763195x892100046_SGAMZ-3092106584,2022-03-27 10:56:00,Singapore Standard Time,SGD,9.09,0.15,Smartphone,Groceries,EXISTING,e95263ae-9a50-4a6a-8bda-62f7bb13927e
6629,9d6edfaaa1204a1e8697d97b704efa18-1,Lazada SG,74991601563195x1244234191_SGAMZ-5093156479,2022-03-27 10:56:00,Singapore Standard Time,SGD,29.80,0.72,Smartphone,Health,EXISTING,e95263ae-9a50-4a6a-8bda-62f7bb13927e
6630,7884219f45b649d9adb33f9ac5062635-1,Lazada SG,74991601863195x892112718_SGAMZ-3092240245,2022-03-27 10:56:00,Singapore Standard Time,SGD,11.22,0.18,Smartphone,Groceries,EXISTING,e95263ae-9a50-4a6a-8bda-62f7bb13927e
6631,94425b058e5143e78cab613fd7f13d24-1,Lazada SG,74991601663195x607530721_SGAMZ-1776898922,2022-03-27 10:56:00,Singapore Standard Time,SGD,13.44,0.32,Smartphone,Health,EXISTING,e95263ae-9a50-4a6a-8bda-62f7bb13927e


In [6]:
offer_df.isna().sum()

offer_id                    0
merchant_name               0
Merchant_type              98
offer_title                 0
Offer_amount(%)           457
Offer                     397
valid_from                499
valid_to                  502
created_at                  0
affiliate_partner_name    499
tracking_url              500
affiliate_offer_id        500
Unnamed: 12               605
Unnamed: 13               605
Unnamed: 14               605
Unnamed: 15               605
Unnamed: 16               605
Unnamed: 17               605
dtype: int64

In [7]:
offer_df

Unnamed: 0,offer_id,merchant_name,Merchant_type,offer_title,Offer_amount(%),Offer,valid_from,valid_to,created_at,affiliate_partner_name,tracking_url,affiliate_offer_id,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,15,Watsons,Health and Wellness,1.5% Cashback,1.5,1.5,2021-11-23 16:01:00,2024-06-30 15:59:59,2021-11-18 8:38:28,Optimise,http://clk.omgt3.com/?PID=38514&AID=2202076,38514,,,,,,
1,17,Adidas,,3% Cashback,3,3.0,2021-11-23 16:01:00,2024-06-30 15:59:59,2021-11-22 5:51:30,Optimise,https://clk.omgt3.com/?PID=36073&AID=2202076,36073,,,,,,
2,18,Nike,,3.5% Cashback,3.5,3.5,2022-08-22 16:00:00,2023-12-31 15:59:00,2021-11-22 6:03:54,Optimise,https://clk.omgt3.com/?PID=38870&AID=2202076,38870,,,,,,
3,19,Puma,,3% Cashback,3,3.0,2021-11-23 16:01:00,2024-06-30 15:59:59,2021-11-22 6:08:46,Optimise,http://clk.omgt3.com/?PID=51181&AID=2202076,51181,,,,,,
4,21,Pat Pat,Fashion & Retail,7% Cashback at Pat Pat,7,7.0,2021-11-23 16:01:00,2023-12-31 15:59:00,2021-11-22 6:39:56,Optimise,https://clk.omgt3.com/?PID=39954&AID=2202076,39954,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,743,Lazada SG,Marketplace,Up to 4% Cashback,,,,,2023-10-09 16:45:27,,,,,,,,,
602,220,Shopee SG,Marketplace,Up to 4.4% Cashback,,,,,2023-10-09 16:54:12,,,,,,,,,
603,220,Shopee SG,Marketplace,Up to 4.4% Cashback,,,,,2023-10-09 16:56:08,,,,,,,,,
604,220,Shopee SG,Marketplace,Up to 4.4% Cashback,,,,,2023-10-09 17:16:58,,,,,,,,,


In [8]:
offer_df = pd.read_csv("./DataSources/M1_offers.csv")

In [14]:
offer_df.fillna("", inplace=True)

  offer_df.fillna("", inplace=True)


In [15]:
offer_df

Unnamed: 0,offer_id,merchant_name,Merchant_type,offer_title,Offer_amount(%),Offer,valid_from,valid_to,created_at,affiliate_partner_name,tracking_url,affiliate_offer_id,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,15,Watsons,Health and Wellness,1.5% Cashback,1.5,1.5,2021-11-23 16:01:00,2024-06-30 15:59:59,2021-11-18 8:38:28,Optimise,http://clk.omgt3.com/?PID=38514&AID=2202076,38514,,,,,,
1,17,Adidas,,3% Cashback,3,3.0,2021-11-23 16:01:00,2024-06-30 15:59:59,2021-11-22 5:51:30,Optimise,https://clk.omgt3.com/?PID=36073&AID=2202076,36073,,,,,,
2,18,Nike,,3.5% Cashback,3.5,3.5,2022-08-22 16:00:00,2023-12-31 15:59:00,2021-11-22 6:03:54,Optimise,https://clk.omgt3.com/?PID=38870&AID=2202076,38870,,,,,,
3,19,Puma,,3% Cashback,3,3.0,2021-11-23 16:01:00,2024-06-30 15:59:59,2021-11-22 6:08:46,Optimise,http://clk.omgt3.com/?PID=51181&AID=2202076,51181,,,,,,
4,21,Pat Pat,Fashion & Retail,7% Cashback at Pat Pat,7,7.0,2021-11-23 16:01:00,2023-12-31 15:59:00,2021-11-22 6:39:56,Optimise,https://clk.omgt3.com/?PID=39954&AID=2202076,39954,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,743,Lazada SG,Marketplace,Up to 4% Cashback,,,,,2023-10-09 16:45:27,,,,,,,,,
602,220,Shopee SG,Marketplace,Up to 4.4% Cashback,,,,,2023-10-09 16:54:12,,,,,,,,,
603,220,Shopee SG,Marketplace,Up to 4.4% Cashback,,,,,2023-10-09 16:56:08,,,,,,,,,
604,220,Shopee SG,Marketplace,Up to 4.4% Cashback,,,,,2023-10-09 17:16:58,,,,,,,,,


In [17]:
df = pd.read_csv("./DataSources/merged_data_with_cluster_and_label.csv")
df.columns

Index(['Conversion ID', 'Advertiser', 'Campaign', 'Conversion Status',
       'Order ID', 'Conversion Time', 'Click Time', 'Timezone', 'Currency',
       'Original Order Value', 'Country Code', 'Device Type', 'ex3/category',
       'Cust Type', 'UID', 'Year', 'Month', 'Recency', 'Frequency',
       'Total Spend by Customer', 'Total Spend by Customer (All Advertisers)',
       'Wallet Share', 'Wallet Share by Merchant%', 'Total Spend in Category',
       'Total Spend by Customer in Category', 'Category Wallet Share',
       'Wallet Share by category %', 'Cluster', 'Recency_Score',
       'Frequency_Score', 'Wallet_Share_Merchant_Score',
       'Wallet_Share_Category_Score', 'Value_Score', 'ABC_Category'],
      dtype='object')

In [20]:
df.Cluster.value_counts()

Cluster
0    2496
3     816
2     437
1     290
Name: count, dtype: int64

In [24]:
# Assuming df is your DataFrame with 'Cluster' and 'uid' columns
clusters_dict = {}

for cluster_num in df['Cluster'].unique():
    uids_in_cluster = df[df['Cluster'] == cluster_num]['UID'].tolist()
    clusters_dict[cluster_num] = uids_in_cluster

print(clusters_dict)


{0: ['dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b546', 'dd59455b-4e32-4488-bfae-31a885c7b5

In [26]:
clusters_dict.keys()

dict_keys([0, 1, 2, 3])