# TASK 2 : Lookalike Model

### import libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

### Load data

In [None]:
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")
products = pd.read_csv("Products.csv")

### Convert date columns to datetime

In [None]:
customers["SignupDate"] = pd.to_datetime(customers["SignupDate"])
transactions["TransactionDate"] = pd.to_datetime(transactions["TransactionDate"])

### Merge transactions with products

In [None]:
transactions = transactions.merge(products, on="ProductID", how="left")

### Aggregate customer transaction data

In [None]:
customer_spending = transactions.groupby("CustomerID").agg({
    "TotalValue": "sum",
    "TransactionID": "count",
    "Category": lambda x: x.mode()[0]  # Most frequently bought category
}).reset_index()

In [None]:
customer_spending.columns = ["CustomerID", "TotalSpend", "TransactionCount", "TopCategory"]

In [None]:
customer_spending.head()

Unnamed: 0,CustomerID,TotalSpend,TransactionCount,TopCategory
0,C0001,3354.52,5,Electronics
1,C0002,1862.74,4,Clothing
2,C0003,2725.38,4,Home Decor
3,C0004,5354.88,8,Books
4,C0005,2034.24,3,Electronics


### Merge customer profiles

In [None]:
customer_profiles = customers.merge(customer_spending, on="CustomerID", how="left").fillna(0)

In [None]:
customer_profiles.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TotalSpend,TransactionCount,TopCategory
0,C0001,Lawrence Carroll,South America,2022-07-10,3354.52,5.0,Electronics
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1862.74,4.0,Clothing
2,C0003,Michael Rivera,South America,2024-03-07,2725.38,4.0,Home Decor
3,C0004,Kathleen Rodriguez,South America,2022-10-09,5354.88,8.0,Books
4,C0005,Laura Weber,Asia,2022-08-15,2034.24,3.0,Electronics


### Encode categorical variables (Region & TopCategory)

In [None]:
#Convert Region & TopCategory to String

customer_profiles["Region"] = customer_profiles["Region"].astype(str)
customer_profiles["TopCategory"] = customer_profiles["TopCategory"].astype(str)

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(customer_profiles[["Region", "TopCategory"]])

encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(["Region", "TopCategory"]))

customer_profiles = pd.concat([customer_profiles, encoded_df], axis=1).drop(["Region", "TopCategory", "CustomerName"], axis=1)


In [None]:
customer_profiles.head()

Unnamed: 0,CustomerID,SignupDate,TotalSpend,TransactionCount,Region_Asia,Region_Europe,Region_North America,Region_South America,TopCategory_0,TopCategory_Books,TopCategory_Clothing,TopCategory_Electronics,TopCategory_Home Decor
0,C0001,2022-07-10,3354.52,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,C0002,2022-02-13,1862.74,4.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,C0003,2024-03-07,2725.38,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,C0004,2022-10-09,5354.88,8.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,C0005,2022-08-15,2034.24,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
print(customer_profiles["TopCategory_0"].sum())  # If this prints 0, the column is useless

1.0


In [None]:
print(customer_profiles['TopCategory_0'].value_counts())

TopCategory_0
0.0    199
1.0      1
Name: count, dtype: int64


In [None]:
customer_profiles = customer_profiles.drop(columns=["TopCategory_0"])
print("Dropped TopCategory_0 since it had only one non-zero value.")

Dropped TopCategory_0 since it had only one non-zero value.


In [None]:
customer_profiles.head()

Unnamed: 0,CustomerID,SignupDate,TotalSpend,TransactionCount,Region_Asia,Region_Europe,Region_North America,Region_South America,TopCategory_Books,TopCategory_Clothing,TopCategory_Electronics,TopCategory_Home Decor
0,C0001,2022-07-10,3354.52,5.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,C0002,2022-02-13,1862.74,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,C0003,2024-03-07,2725.38,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,C0004,2022-10-09,5354.88,8.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,C0005,2022-08-15,2034.24,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Compute cosine similarity

In [None]:
numeric_features = customer_profiles.drop(columns=["CustomerID", "SignupDate"], errors="ignore")
similarity_matrix = cosine_similarity(numeric_features)

In [None]:
similarity_matrix

array([[1.        , 0.99999941, 0.99999989, ..., 0.99999854, 0.99999967,
        0.99999977],
       [0.99999941, 1.        , 0.99999935, ..., 0.99999914, 0.99999945,
        0.99999929],
       [0.99999989, 0.99999935, 1.        , ..., 0.99999848, 0.99999946,
        0.99999973],
       ...,
       [0.99999854, 0.99999914, 0.99999848, ..., 1.        , 0.99999913,
        0.99999843],
       [0.99999967, 0.99999945, 0.99999946, ..., 0.99999913, 1.        ,
        0.99999923],
       [0.99999977, 0.99999929, 0.99999973, ..., 0.99999843, 0.99999923,
        1.        ]])

In [None]:
similarity_df = pd.DataFrame(similarity_matrix,
                             index=customer_profiles["CustomerID"],
                             columns=customer_profiles["CustomerID"])

In [None]:
similarity_df.head()

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.999999,1.0,1.0,1.0,1.0,1.0,0.999999,0.999997,0.999999,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999999,1.0,1.0
C0002,0.999999,1.0,0.999999,0.999999,1.0,0.999999,0.999999,1.0,0.999998,1.0,...,0.999999,0.999999,0.999999,1.0,0.999999,0.999999,0.999999,0.999999,0.999999,0.999999
C0003,1.0,0.999999,1.0,1.0,1.0,1.0,1.0,1.0,0.999997,0.999999,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999998,0.999999,1.0
C0004,1.0,0.999999,1.0,1.0,1.0,1.0,1.0,1.0,0.999997,0.999999,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999999,1.0,1.0
C0005,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999999,0.999997,0.999999,...,1.0,1.0,1.0,1.0,1.0,0.999999,1.0,0.999998,1.0,1.0


### Find top 3 lookalikes for each customer

In [None]:
target_customers = customer_profiles["CustomerID"].head(20)

In [None]:
lookalike_results = {}
for cust in target_customers:
    similar_customers = similarity_df[cust].drop(cust).nlargest(3)
    lookalike_results[cust] = list(zip(similar_customers.index, similar_customers.values))

### Convert recommendations into DataFrame

In [None]:
lookalike_df = pd.DataFrame(list(lookalike_results.items()), columns=["CustomerID", "Lookalikes"])

In [None]:
lookalike_df.head(20)

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0039, 0.999999993292041), (C0190, 0.9999999..."
1,C0002,"[(C0088, 0.9999999764589789), (C0134, 0.999999..."
2,C0003,"[(C0152, 0.999999994836078), (C0052, 0.9999999..."
3,C0004,"[(C0137, 0.9999999871370793), (C0165, 0.999999..."
4,C0005,"[(C0146, 0.999999986177863), (C0186, 0.9999999..."
5,C0006,"[(C0171, 0.9999999978926167), (C0187, 0.999999..."
6,C0007,"[(C0140, 0.9999999744672506), (C0115, 0.999999..."
7,C0008,"[(C0139, 0.9999999191219219), (C0047, 0.999999..."
8,C0009,"[(C0111, 0.9999992998021452), (C0198, 0.999999..."
9,C0010,"[(C0111, 0.9999999732455258), (C0103, 0.999999..."


### Save to CSV

In [None]:
lookalike_df.to_csv("Lookalike.csv", index=False)
print("Top 3 Lookalikes for each customer saved to Lookalike.csv.")

Top 3 Lookalikes for each customer saved to Lookalike.csv.


In [None]:
from tabulate import tabulate
import re

lookalike_df = pd.read_csv("Lookalike.csv")

def round_similarity_scores(lookalikes_str):
    clean_str = re.sub(r"[\[\]()]", "", lookalikes_str)

    pairs = clean_str.split(",")

    formatted_pairs = []
    for i in range(0, len(pairs) - 1, 2):
        cust_id = pairs[i].strip()
        try:
            score = round(float(pairs[i + 1].strip()), 2)
            formatted_pairs.append((cust_id, score))
        except ValueError:
            continue

    return ", ".join([f"({cust_id}, {score})" for cust_id, score in formatted_pairs])

lookalike_df["Lookalikes"] = lookalike_df["Lookalikes"].apply(round_similarity_scores)

print(tabulate(lookalike_df.head(10), headers='keys', tablefmt='grid'))


+----+--------------+------------------------------------------------+
|    | CustomerID   | Lookalikes                                     |
|  0 | C0001        | ('C0039', 1.0), ('C0190', 1.0), ('C0048', 1.0) |
+----+--------------+------------------------------------------------+
|  1 | C0002        | ('C0088', 1.0), ('C0134', 1.0), ('C0106', 1.0) |
+----+--------------+------------------------------------------------+
|  2 | C0003        | ('C0152', 1.0), ('C0052', 1.0), ('C0195', 1.0) |
+----+--------------+------------------------------------------------+
|  3 | C0004        | ('C0137', 1.0), ('C0165', 1.0), ('C0169', 1.0) |
+----+--------------+------------------------------------------------+
|  4 | C0005        | ('C0146', 1.0), ('C0186', 1.0), ('C0007', 1.0) |
+----+--------------+------------------------------------------------+
|  5 | C0006        | ('C0171', 1.0), ('C0187', 1.0), ('C0082', 1.0) |
+----+--------------+------------------------------------------------+
|  6 |