In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Locations of the three input CSV files.
customers_path, products_path, transactions_path = (
    '/content/Customers (1).csv',
    '/content/Products (1).csv',
    '/content/Transactions.csv',
)

# Parse each file into its own DataFrame (customers, then products, then transactions).
customers_df, products_df, transactions_df = map(
    pd.read_csv,
    (customers_path, products_path, transactions_path),
)

# Enrich every transaction row with the columns of its customer and its product.
# Left joins keep a transaction even when its customer or product has no match.
transactions_df = (
    transactions_df
    .merge(customers_df, on='CustomerID', how='left')
    .merge(products_df, on='ProductID', how='left')
)


def _most_frequent(values):
    """Return the first entry of ``values.mode()`` for one customer's rows."""
    return values.mode()[0]


# Collapse the transactions to one row per customer:
#   - total spend and total quantity,
#   - most frequent product, region and category.
# ProductID is aggregated (rather than Price directly) and used below as the
# key for looking up a price in products_df.
customer_features = (
    transactions_df
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        ProductID=('ProductID', _most_frequent),
        Region=('Region', _most_frequent),
        Category=('Category', _most_frequent),
    )
    .reset_index()
)

# Attach the price of each customer's most frequent product, then discard the
# ProductID key, which is not itself used as a feature.
price_lookup = products_df[['ProductID', 'Price']]
customer_features = (
    customer_features
    .merge(price_lookup, on='ProductID', how='left')
    .drop(columns='ProductID')
)


# One-hot encode the two categorical columns; drop_first=True omits the first
# level of each so that level is represented by all-zero indicators.
categorical_columns = ['Region', 'Category']
customer_features = pd.get_dummies(
    customer_features,
    columns=categorical_columns,
    drop_first=True,
)

# Standardize every feature column (the one-hot indicators included).
# CustomerID is an identifier, not a feature, so it is left out.
feature_matrix = customer_features.drop(columns='CustomerID')
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_matrix)

# Pairwise cosine similarity between customers; row/column i corresponds to
# row i of customer_features.
similarity_matrix = cosine_similarity(scaled_features)

# Find top 3 similar customers for each customer
def find_similar_customers(customer_id, similarity_matrix, customer_ids, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customer_id: ID of the customer to find look-alikes for.
        similarity_matrix: Square, row-indexable matrix of pairwise similarity
            scores; row/column ``i`` corresponds to ``customer_ids[i]``.
        customer_ids: List of customer IDs in the same order as the matrix rows.
        top_n: Maximum number of look-alikes to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of ``(other_customer_id, score)`` tuples sorted by descending
        score. The queried customer is never part of the result. Scores are
        plain Python floats.

    Raises:
        ValueError: If ``customer_id`` is not present in ``customer_ids``.
    """
    customer_idx = customer_ids.index(customer_id)

    # Exclude the customer's own entry by position instead of assuming it sorts
    # first: another customer can tie with (or, through float rounding, exceed)
    # the self-similarity, in which case skipping "the first result" would drop
    # a genuine look-alike and return the customer as their own match.
    candidates = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[customer_idx])
        if idx != customer_idx
    ]
    # list.sort is stable, so equal scores keep their original matrix order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Clamp so a negative top_n gives an empty result rather than a
    # "everything but the last k" slice.
    limit = max(top_n, 0)

    # float() keeps the stringified output independent of NumPy's scalar repr.
    return [(customer_ids[idx], float(score)) for idx, score in candidates[:limit]]

# Customer IDs in the row order of customer_features, from which the
# similarity matrix rows were derived.
customer_ids = customer_features['CustomerID'].tolist()

# Look-alikes (default top_n) for the first 20 customers only.
lookalike_recommendations = {
    cid: find_similar_customers(cid, similarity_matrix, customer_ids)
    for cid in customer_ids[:20]
}

# One output row per customer; the list of (id, score) pairs is stored as its
# str() representation in a single column.
recommendations_df = pd.DataFrame({
    'CustomerID': list(lookalike_recommendations),
    'SimilarCustomers': [str(matches) for matches in lookalike_recommendations.values()],
})
recommendations_df.to_csv('Lookalike_Recommendations.csv', index=False)

# Show the first few rows as a quick check.
print(recommendations_df.head())

  CustomerID                                   SimilarCustomers
0      C0001  [('C0184', 0.9179566297070914), ('C0091', 0.89...
1      C0002  [('C0106', 0.9239472508196769), ('C0134', 0.91...
2      C0003  [('C0076', 0.9790270885835207), ('C0031', 0.96...
3      C0004  [('C0165', 0.9621825203696691), ('C0169', 0.93...
4      C0005  [('C0140', 0.9934685775641008), ('C0186', 0.87...
