## Import necessary libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.preprocessing import OneHotEncoder


D:\Anaconda\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
D:\Anaconda\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
D:\Anaconda\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
D:\Anaconda\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


## Load datasets

In [2]:

customers_df = pd.read_csv("Customers.csv")  # Customer data
transactions_df = pd.read_csv("Transactions.csv")  # Transaction data
products_df = pd.read_csv("Products.csv")  # Product data

###  Preview the data

In [3]:
print("\nCustomers Data:\n")
customers_df.head()  # First 5 rows of Customers data


Customers Data:



Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
print("\nTransactions Data:\n")
transactions_df.head()  # First 5 rows of Transactions data



Transactions Data:



Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [5]:
print("\nProducts Data:\n")
products_df.head()  # First 5 rows of Products data


Products Data:



Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


### Data Overview

In [6]:

customers_info = customers_df.info()
print("\n")
products_info = products_df.info()
print("\n")
transactions_info = transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.2+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  ----

### Check for missing values

In [7]:

missing_values = {
    "Customers": customers_df.isnull().sum(),
    "Products": products_df.isnull().sum(),
    "Transactions": transactions_df.isnull().sum()
}


missing_values

{'Customers': CustomerID      0
 CustomerName    0
 Region          0
 SignupDate      0
 dtype: int64,
 'Products': ProductID      0
 ProductName    0
 Category       0
 Price          0
 dtype: int64,
 'Transactions': TransactionID      0
 CustomerID         0
 ProductID          0
 TransactionDate    0
 Quantity           0
 TotalValue         0
 Price              0
 dtype: int64}

### Check for Duplicates values

In [8]:
duplicated_value = {
    "Customers": customers_df.duplicated().sum(),
    "Products": products_df.duplicated().sum(),
    "Transactions": transactions_df.duplicated().sum()
}

duplicated_value

{'Customers': 0, 'Products': 0, 'Transactions': 0}

In [9]:
# Merge transactions with products to get product details
transactions_products = transactions_df.merge(products_df, on="ProductID", how="left")

# Merge with customers to get customer details
customer_data = transactions_products.merge(customers_df, on="CustomerID", how="left")

# Display the first few rows of the unified dataset
customer_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


## Task : Lookalike Model

In [10]:
# Aggregate features for each customer
customer_features = customer_data.groupby("CustomerID").agg({
    "ProductName": lambda x: list(x),  # List of products purchased
    "Category": lambda x: list(x),    # List of product categories
    "Quantity": "sum",                # Total quantity purchased
    "TotalValue": "sum",              # Total spending
    "Region": "first",                # Customer's region
}).reset_index()


In [11]:
# One-hot encode product categories per customer

mlb = MultiLabelBinarizer()
categories_encoded = pd.DataFrame(
    mlb.fit_transform(customer_features["Category"]),
    columns=mlb.classes_,
    index=customer_features.index
)

# Combine encoded categories with other features
customer_features = pd.concat([customer_features, categories_encoded], axis=1).drop(columns=["Category"])

# Display the transformed customer features
customer_features.head()


Unnamed: 0,CustomerID,ProductName,Quantity,TotalValue,Region,Books,Clothing,Electronics,Home Decor
0,C0001,"[SoundWave Cookbook, HomeSense Wall Art, Sound...",12,3354.52,South America,1,0,1,1
1,C0002,"[BookWorld Cookware Set, BookWorld Rug, Comfor...",10,1862.74,Asia,0,1,0,1
2,C0003,"[ActiveWear Cookware Set, ActiveWear Rug, Acti...",14,2725.38,South America,0,1,1,1
3,C0004,"[TechPro Textbook, TechPro Rug, TechPro Vase, ...",23,5354.88,South America,1,0,1,1
4,C0005,"[ActiveWear Cookware Set, TechPro Smartwatch, ...",7,2034.24,Asia,0,0,1,1


**Feature Extraction: Aggregated features like total spending, categories purchased (one-hot encoded), and quantity are used to represent customers.**

In [18]:

# Extract features for similarity computation (numerical columns)
feature_columns = ['Quantity', 'TotalValue', 'Books', 'Clothing', 'Electronics', 'Home Decor']
feature_matrix = customer_features[feature_columns].values

# Compute cosine similarity
similarity_matrix = cosine_similarity(feature_matrix)

similarity_matrix

array([[1.        , 0.99999813, 0.99999867, ..., 0.99999897, 0.99999944,
        0.99999994],
       [0.99999813, 1.        , 0.99999988, ..., 0.99999683, 0.99999939,
        0.99999784],
       [0.99999867, 0.99999988, 1.        , ..., 0.9999976 , 0.99999974,
        0.99999837],
       ...,
       [0.99999897, 0.99999683, 0.9999976 , ..., 1.        , 0.99999825,
        0.9999992 ],
       [0.99999944, 0.99999939, 0.99999974, ..., 0.99999825, 1.        ,
        0.99999917],
       [0.99999994, 0.99999784, 0.99999837, ..., 0.9999992 , 0.99999917,
        1.        ]])

**Similarity Measure: Cosine similarity computes how close customers are in this feature space.**

In [25]:
# Create a DataFrame to store results
customer_ids = customer_features["CustomerID"]
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)
similarity_df

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,0.999998,0.999999,1.000000,1.000000,1.000000,1.000000,0.999999,0.999999,0.999994,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.999999,0.999999,0.999999,0.999999,1.000000
C0002,0.999998,1.000000,1.000000,0.999999,0.999998,0.999997,0.999997,1.000000,0.999997,0.999998,...,0.999998,0.999998,0.999998,0.999998,0.999999,0.999995,0.999999,0.999997,0.999999,0.999998
C0003,0.999999,1.000000,1.000000,1.000000,0.999998,0.999997,0.999998,1.000000,0.999998,0.999998,...,0.999999,0.999999,0.999999,0.999999,1.000000,0.999996,1.000000,0.999998,1.000000,0.999998
C0004,1.000000,0.999999,1.000000,1.000000,1.000000,0.999999,0.999999,1.000000,0.999998,0.999996,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.999998,1.000000,0.999998,1.000000,1.000000
C0005,1.000000,0.999998,0.999998,1.000000,1.000000,1.000000,1.000000,0.999999,0.999999,0.999993,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.999999,0.999999,0.999999,0.999999,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,0.999999,0.999995,0.999996,0.999998,0.999999,1.000000,1.000000,0.999997,0.999998,0.999989,...,0.999999,0.999999,0.999999,0.999999,0.999998,1.000000,0.999997,0.999999,0.999997,1.000000
C0197,0.999999,0.999999,1.000000,1.000000,0.999999,0.999998,0.999999,1.000000,0.999998,0.999997,...,0.999999,0.999999,0.999999,0.999999,1.000000,0.999997,1.000000,0.999998,1.000000,0.999999
C0198,0.999999,0.999997,0.999998,0.999998,0.999999,0.999999,0.999999,0.999998,1.000000,0.999992,...,0.999999,0.999999,0.999999,0.999999,0.999999,0.999999,0.999998,1.000000,0.999998,0.999999
C0199,0.999999,0.999999,1.000000,1.000000,0.999999,0.999998,0.999999,1.000000,0.999998,0.999996,...,0.999999,0.999999,0.999999,0.999999,1.000000,0.999997,1.000000,0.999998,1.000000,0.999999


In [26]:
# Get top 3 similar customers for the first 20 customers (C0001 - C0020)
top_customers = customer_ids[:20]
lookalikes = {}

for cust_id in top_customers:
    # Get similarity scores for the current customer, excluding itself
    similar_scores = similarity_df[cust_id].drop(index=cust_id).sort_values(ascending=False).head(3)
    lookalikes[cust_id] = similar_scores.index.tolist(), similar_scores.values.tolist()

# Prepare the Lookalike.csv output
lookalike_output = pd.DataFrame([{
    "CustomerID": cust_id,
    "Lookalikes": list(zip(lookalikes[cust_id][0], lookalikes[cust_id][1]))
} for cust_id in lookalikes])


lookalike_output

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0177, 0.9999999848631079), (C0127, 0.999999..."
1,C0002,"[(C0003, 0.9999998770103186), (C0062, 0.999999..."
2,C0003,"[(C0031, 0.999999962672191), (C0002, 0.9999998..."
3,C0004,"[(C0175, 0.9999999801354358), (C0017, 0.999999..."
4,C0005,"[(C0069, 0.9999999786704178), (C0007, 0.999999..."
5,C0006,"[(C0079, 0.999999999869063), (C0051, 0.9999999..."
6,C0007,"[(C0064, 0.999999942892985), (C0152, 0.9999999..."
7,C0008,"[(C0113, 0.9999999925381394), (C0075, 0.999999..."
8,C0009,"[(C0198, 0.9999999901629575), (C0092, 0.999999..."
9,C0010,"[(C0049, 0.9999998210678243), (C0091, 0.999999..."


**Cosine Similarity: Since cosine similarity ranges from 0 to 1, higher values indicate a strong resemblance between two customers. The similarity scores in the output (e.g., ~0.9999) show high closeness in behavior and preferences.**

**High Similarity Scores: The High similarity scores (~0.9999) for most pairs suggest possible overfitting due to the dominance of specific features like TotalValue.**

In [13]:
#  Validate recommendations

for cust_id, (similar_ids, scores) in lookalikes.items():
    print(f"\nCustomer: {cust_id}")
    print(f"Top 3 Lookalikes: {similar_ids}")
    print(f"Similarity Scores: {scores}")

    # Compare features for validation
    current_features = customer_features[customer_features["CustomerID"] == cust_id]
    similar_features = customer_features[customer_features["CustomerID"].isin(similar_ids)]
    print(f"Region: {current_features['Region'].values[0]}")
    print(f"TotalValue: {current_features['TotalValue'].values[0]}")
    print(similar_features[["CustomerID", "Region", "TotalValue"]])



Customer: C0001
Top 3 Lookalikes: ['C0177', 'C0127', 'C0012']
Similarity Scores: [0.9999999848631079, 0.9999999845474193, 0.9999999813451149]
Region: South America
TotalValue: 3354.5200000000004
    CustomerID         Region  TotalValue
11       C0012  South America     5231.26
126      C0127         Europe     3232.88
176      C0177           Asia     2509.81

Customer: C0002
Top 3 Lookalikes: ['C0003', 'C0062', 'C0030']
Similarity Scores: [0.9999998770103186, 0.9999998750956173, 0.9999998657876807]
Region: Asia
TotalValue: 1862.74
   CustomerID         Region  TotalValue
2       C0003  South America     2725.38
29      C0030  North America     2549.68
61      C0062         Europe     1639.54

Customer: C0003
Top 3 Lookalikes: ['C0031', 'C0002', 'C0075']
Similarity Scores: [0.999999962672191, 0.9999998770103186, 0.9999998666386807]
Region: South America
TotalValue: 2725.38
   CustomerID         Region  TotalValue
1       C0002           Asia     1862.74
30      C0031  South America  

**Feature Correlation: Validation steps comparing regions and spending (e.g., TotalValue) confirm the logical consistency of recommendations. For instance:**

   For Customer C0001:
   - Region Check: The top similar customer, C0012, shares the same region (South America) and similar TotalValue (3354.52 vs. 5231.26).
   - Other Matches: C0177 and C0127 differ in regions but align well on other features like spending and product category patterns.

In [28]:
# Save the lookalikes to a CSV file
lookalike_output_path = "C:/Users/shrey/Desktop/Internship task/Shreyas_Kumbhar_Lookalike.csv"
lookalike_output.to_csv(lookalike_output_path, index=False)


In [29]:
lookalike_output_path

'C:/Users/shrey/Desktop/Internship task/Shreyas_Kumbhar_Lookalike.csv'