In [1]:
import pandas as pd

# Read the three raw tables (customers, products, transactions) from disk
customers = pd.read_csv("C:\\Users\\saran\\OneDrive\\Desktop\\Customers.csv")
products = pd.read_csv("C:\\Users\\saran\\OneDrive\\Desktop\\Products.csv")
transactions = pd.read_csv("C:\\Users\\saran\\OneDrive\\Desktop\\Transactions.csv")

# Step 1: attach customer attributes to every transaction.
# A left join keeps each transaction row even if its CustomerID has no match.
customer_transactions = transactions.merge(customers, how="left", on="CustomerID")

# Step 2: attach product attributes, again keeping every transaction row.
# Both sides of this join carry a Price column, so the result exposes
# Price_x (left side) and Price_y (from Products).
full_data = customer_transactions.merge(products, how="left", on="ProductID")

# Preview the first rows of the combined table
print(full_data.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [3]:
# Aggregate customer data to create a profile (total spent per category, etc.)
#
# Category-specific spend is computed by first building, once for the whole
# frame, a per-row value column that equals TotalValue for rows of the target
# category and 0 for every other row. Summing those columns per customer gives
# the same totals as filtering inside each group, but avoids re-evaluating a
# full-frame boolean mask for every customer and avoids relying on pandas
# re-aligning a frame-length mask against each group's subset index.
customer_profile = (
    full_data
    .assign(
        _electronics_value=lambda d: d['TotalValue'].where(d['Category'] == 'Electronics', 0),
        _home_decor_value=lambda d: d['TotalValue'].where(d['Category'] == 'Home Decor', 0),
    )
    .groupby("CustomerID")
    .agg(
        total_spent=('TotalValue', 'sum'),
        electronics_spent=('_electronics_value', 'sum'),
        home_decor_spent=('_home_decor_value', 'sum'),
        num_purchases=('TransactionID', 'nunique'),
        region=('Region', 'first'),  # Assuming all transactions are from the same region per customer
    )
    .reset_index()
)

# Check the customer profile
print(customer_profile.head())

  CustomerID  total_spent  electronics_spent  home_decor_spent  num_purchases  \
0      C0001      3354.52            2827.30            412.62              5   
1      C0002      1862.74               0.00            837.28              4   
2      C0003      2725.38            1385.20           1217.82              4   
3      C0004      5354.88            1355.74           2110.66              8   
4      C0005      2034.24            1180.38            853.86              3   

          region  
0  South America  
1           Asia  
2  South America  
3  South America  
4           Asia  


In [31]:
import pandas as pd

# Function to get top N lookalikes along with similarity scores
def get_top_lookalikes(customer_id, num_lookalikes=3, similarity_df=None):
    """Return the highest-scoring lookalikes for one customer.

    Parameters
    ----------
    customer_id : hashable
        Label of the column to read from the similarity table.
    num_lookalikes : int, default 3
        Maximum number of (id, score) pairs to return.
    similarity_df : optional
        Table of pairwise similarity scores. When omitted, the notebook-level
        ``cosine_sim_df`` is used, which must already exist in the kernel.
        Presumably a square DataFrame whose index and columns are both
        customer IDs -- TODO confirm against the cell that builds it.

    Returns
    -------
    list of tuple
        ``[(lookalike_id, similarity_score), ...]`` ordered from the highest
        score downwards. Entries whose index label equals ``customer_id`` and
        entries with a NaN score are excluded, so fewer than
        ``num_lookalikes`` pairs may be returned.
    """
    if similarity_df is None:
        # Fall back to the notebook global so existing one-argument calls keep working.
        similarity_df = cosine_sim_df

    # Scores for the given customer, best match first
    ranked = similarity_df[customer_id].sort_values(ascending=False)

    # A customer is not their own lookalike
    ranked = ranked[ranked.index != customer_id]

    # Missing scores cannot be ranked
    ranked = ranked.dropna()

    # Slice once and derive both ids and scores from the same slice
    best = ranked.head(num_lookalikes)

    return list(zip(best.index.tolist(), best.tolist()))

# Map each of the first 20 customers to their top-3 lookalikes:
# {customer_id: [(lookalike_id, similarity_score), ...]}
lookalike_map = {
    cid: get_top_lookalikes(cid)
    for cid in customer_profile['CustomerID'][:20]
}

# One record per customer; the list of (id, score) tuples is stored as its
# string form, e.g. "[('C0069', 0.99), ...]", so it fits in a single CSV cell
lookalike_list = [
    {"cust_id": cid, "lookalikes": str(matches)}
    for cid, matches in lookalike_map.items()
]

# Tabulate the records
lookalike_df = pd.DataFrame(lookalike_list)

# Persist to CSV without the positional index column
lookalike_df.to_csv("C:\\Users\\saran\\OneDrive\\Desktop\\Lookalike.csv", index=False, quotechar='"')

# Preview the saved table
print(lookalike_df.head())


  cust_id                                         lookalikes
0   C0001  [('C0069', 0.9926792355564207), ('C0018', 0.99...
1   C0002  [('C0189', 0.9994738527939935), ('C0103', 0.99...
2   C0003  [('C0005', 0.9949259154318374), ('C0166', 0.99...
3   C0004  [('C0086', 0.9982683706787342), ('C0075', 0.99...
4   C0005  [('C0085', 0.997272860876509), ('C0163', 0.996...
