**1: Data Preparation**

**Step 1.1: Load the Data**

Load the Customers.csv, Products.csv, and Transactions.csv datasets and checking for null values.

In [22]:
import pandas as pd

# Load the datasets
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

# Check for missing values and handle them
customers.isnull().sum()
products.isnull().sum()
transactions.isnull().sum()




Unnamed: 0,0
TransactionID,0
CustomerID,0
ProductID,0
TransactionDate,0
Quantity,0
TotalValue,0
Price,0


**Step 1.2: Merge the Datasets**

In [23]:
# Convert date columns to datetime
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Merge datasets
data = pd.merge(transactions, customers, on='CustomerID')
data = pd.merge(data, products, on='ProductID')


**2. Feature Engineering**

**Step 2.1: Create a Pivot Table**

In [24]:
# Create a pivot table for customer-product interactions
customer_product_matrix = data.pivot_table(index='CustomerID', columns='ProductID', values='Quantity', aggfunc='sum', fill_value=0)


**Step 2.2: Normalize the Data**

In [25]:
# Normalize the matrix for similarity calculation
normalized_matrix = customer_product_matrix.apply(lambda x: (x - x.mean()) / (x.std()), axis=1)


**3. Similarity Calculation**

**Step 3.1: Calculate Cosine Similarity**



In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity
similarity_matrix = cosine_similarity(normalized_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_product_matrix.index, columns=customer_product_matrix.index)


**4. Identifying Lookalikes**

**Step 4.1: Identify Top 3 Lookalikes**

In [27]:
lookalikes = {}

for customer_id in customers['CustomerID'][:20]:
    scores = similarity_df.loc[customer_id].sort_values(ascending=False)
    top_3_lookalikes = scores.index[1:4]
    top_3_scores = scores.iloc[1:4]
    lookalikes[customer_id] = list(zip(top_3_lookalikes, top_3_scores))

# Convert to DataFrame
lookalikes_list = []
for customer_id, similar_customers in lookalikes.items():
    for lookalike_id, score in similar_customers:
        lookalikes_list.append([customer_id, lookalike_id, score])

lookalikes_df = pd.DataFrame(lookalikes_list, columns=['CustomerID', 'LookalikeID', 'Score'])
print(lookalikes_df)

# Save results to CSV
lookalikes_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)


   CustomerID LookalikeID     Score
0       C0001       C0097  0.541621
1       C0001       C0194  0.440331
2       C0001       C0199  0.415457
3       C0002       C0030  0.346540
4       C0002       C0071  0.310212
5       C0002       C0091  0.305070
6       C0003       C0134  0.499152
7       C0003       C0181  0.494970
8       C0003       C0144  0.383366
9       C0004       C0070  0.474671
10      C0004       C0132  0.423053
11      C0004       C0063  0.317523
12      C0005       C0096  0.635617
13      C0005       C0055  0.496837
14      C0005       C0064  0.309776
15      C0006       C0058  0.644928
16      C0006       C0040  0.622315
17      C0006       C0178  0.352900
18      C0007       C0020  0.582761
19      C0007       C0079  0.480955
20      C0007       C0026  0.344227
21      C0008       C0144  0.371154
22      C0008       C0088  0.300965
23      C0008       C0003  0.274885
24      C0009       C0140  0.551451
25      C0009       C0162  0.492628
26      C0009       C0062  0