In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the three source tables (customers, products, transactions) from the
# CSV files in the current working directory, in that order.
customers, products, transactions = map(
    pd.read_csv,
    ('Customers.csv', 'Products.csv', 'Transactions.csv'),
)

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so
# wrapping it in print() only adds a stray "None" line after each summary.
# Call it directly, once per table.
for frame in (customers, products, transactions):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------

In [None]:
# Attach customer attributes, then product attributes, to every transaction.
# Both joins are inner joins, so only transactions whose CustomerID and
# ProductID have a match are kept. Non-key columns that exist on both sides
# receive pandas' default _x / _y suffixes.
merge_data = (
    transactions
    .merge(customers, on="CustomerID", how="inner")
    .merge(products, on="ProductID", how="inner")
)
print(merge_data)

    TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0          T00001      C0199      P067  2024-08-25 12:38:23         1   
1          T00112      C0146      P067  2024-05-27 22:23:54         1   
2          T00166      C0127      P067  2024-04-25 07:38:55         1   
3          T00272      C0087      P067  2024-03-26 22:55:37         2   
4          T00363      C0070      P067  2024-03-21 15:10:10         3   
..            ...        ...       ...                  ...       ...   
995        T00496      C0118      P037  2024-10-24 08:30:27         1   
996        T00759      C0059      P037  2024-06-04 02:15:24         3   
997        T00922      C0018      P037  2024-04-05 13:05:32         4   
998        T00959      C0115      P037  2024-09-29 10:16:02         2   
999        T00992      C0024      P037  2024-04-21 10:52:24         1   

     TotalValue  Price_x          CustomerName         Region  SignupDate  \
0        300.68   300.68        Andrea Jenkins

In [None]:
# Per-customer profile built from the merged transaction table.
# NOTE: a later cell reassigns `customer_profiles` from `customers` and
# `transactions`, so the frame built here is superseded at that point.
customer_profiles = (
    merge_data
    .groupby("CustomerID")
    .agg(
        TotalSpending=("TotalValue", "sum"),            # summed TotalValue
        PurchaseFrequency=("TransactionID", "count"),   # number of transactions
        ProductID=("ProductID", lambda ids: list(ids)),   # purchased product IDs
        Category=("Category", lambda cats: list(cats)),   # categories of those products
    )
    .reset_index()
)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Per-customer totals: summed TotalValue and number of transactions
transaction_summary = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),
        PurchaseFrequency=('TransactionID', 'count'),
    )
    .reset_index()
)

# Left join onto customers so customers without any transaction are kept
customer_profiles = customers.merge(transaction_summary, on='CustomerID', how='left')

# Customers without transactions get 0 for both aggregates instead of NaN
customer_profiles = customer_profiles.fillna({'TotalSpending': 0, 'PurchaseFrequency': 0})


In [None]:
# Merge transactions with product data to learn the category of every purchase
transactions_products = pd.merge(transactions, products, on='ProductID')

# Collapse each category name into a single token: surrounding whitespace is
# stripped and every run of non-word characters becomes "_". The resulting
# string is fed to TfidfVectorizer in a later cell, and its default tokenizer
# treats whitespace/punctuation as separators -- without this step a
# multi-word category would be counted as several unrelated terms.
category_tokens = (
    transactions_products['Category']
    .str.strip()
    .str.replace(r'\W+', '_', regex=True)
)

# Create a space-separated string of category tokens for each customer
category_preferences = (
    transactions_products
    .assign(Category=category_tokens)
    .groupby('CustomerID')['Category']
    .agg(' '.join)
    .reset_index()
    .rename(columns={'Category': 'CategoryPreferences'})
)

# Merge with customer profiles. Any CategoryPreferences column left over from
# a previous run of this cell is dropped first, so re-running does not create
# CategoryPreferences_x / CategoryPreferences_y and break the lookup below.
customer_profiles = pd.merge(
    customer_profiles.drop(columns='CategoryPreferences', errors='ignore'),
    category_preferences,
    on='CustomerID',
    how='left',
)

# Fill missing category preferences (customers with no purchases) with an empty string
customer_profiles['CategoryPreferences'] = customer_profiles['CategoryPreferences'].fillna('')


In [None]:
# Apply TF-IDF vectorization to the per-customer category strings
tfidf = TfidfVectorizer()
category_vectors = tfidf.fit_transform(customer_profiles['CategoryPreferences'])

# vocabulary_ maps each term to its column index in the TF-IDF matrix; ordering
# the terms by that index gives the column order of `category_vectors`.
tfidf_terms = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)

# Convert the TF-IDF matrix to a DataFrame with descriptive string labels
# ("tfidf_<term>") instead of anonymous integers, and share the index of
# customer_profiles so the concat below is guaranteed to align row by row.
category_vectors_df = pd.DataFrame(
    category_vectors.toarray(),
    columns=[f'tfidf_{term}' for term in tfidf_terms],
    index=customer_profiles.index,
)

# Drop TF-IDF columns left by a previous run of this cell; otherwise every
# re-run would append duplicates and silently double the features selected
# through `category_vectors_df.columns` further down.
stale_tfidf_columns = [
    column for column in customer_profiles.columns
    if isinstance(column, str) and column.startswith('tfidf_')
]
customer_profiles = pd.concat(
    [customer_profiles.drop(columns=stale_tfidf_columns), category_vectors_df],
    axis=1,
)


In [None]:
# Select features for similarity calculation: the two numeric aggregates plus
# every TF-IDF category column.
numeric_columns = ['TotalSpending', 'PurchaseFrequency']
features = customer_profiles[numeric_columns + list(category_vectors_df.columns)].copy()

# Standardize the numeric aggregates (zero mean, unit variance) using the
# StandardScaler imported earlier in the notebook. On their raw scale they are
# much larger than the TF-IDF weights, so they would dominate the cosine
# similarity and the category features would barely influence the ranking.
features[numeric_columns] = StandardScaler().fit_transform(features[numeric_columns])

# Compute the customer-by-customer cosine similarity matrix. A plain array is
# passed so the column labels play no part in the computation.
similarity_matrix = cosine_similarity(features.to_numpy())


In [None]:
# Build, for every customer, the list of its most similar other customers.
lookalikes = {}
customer_ids = customer_profiles['CustomerID'].tolist()
top_n = 3  # number of lookalikes kept per customer

for i, customer_id in enumerate(customer_ids):
    # Similarity of the current customer to every customer (row i of the matrix)
    scores = similarity_matrix[i]

    # Rank every OTHER customer by descending similarity. The current customer
    # is excluded by position (j != i) rather than by assuming it sorts first:
    # with tied scores (or an all-zero row) it is not guaranteed to be at
    # rank 0, and slicing [1:4] could then list a customer as its own
    # lookalike while dropping a genuine match.
    ranked = sorted(
        (j for j in range(len(customer_ids)) if j != i),
        key=lambda j: scores[j],
        reverse=True,
    )[:top_n]

    # Save the top matches with their scores. float() turns the NumPy scalar
    # into a plain Python float so that the list, once stringified for the CSV
    # export, reads 0.98 rather than np.float64(0.98) under NumPy 2.
    lookalikes[customer_id] = [
        (customer_ids[j], round(float(scores[j]), 2)) for j in ranked
    ]


In [None]:
# One record per customer: the customer's ID plus its list of lookalikes
lookalike_data = [
    {'CustomerID': customer_id, 'Lookalikes': similar_list}
    for customer_id, similar_list in lookalikes.items()
]

# Tabulate the records and write them to Lookalike.csv without the index column
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Lookalike.csv', index=False)
