# **Lookalike Model** 

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load data
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')

In [3]:
# Convert SignupDate to datetime
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'], format='%Y-%m-%d')

In [4]:
# Calculate tenure (days since signup)
customers['Tenure'] = (datetime.now() - customers['SignupDate']).dt.days

In [5]:
# One-hot encode categorical variables (Region)
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_region = encoder.fit_transform(customers[['Region']])
encoded_region_df = pd.DataFrame(encoded_region, columns=encoder.get_feature_names_out(['Region']))

In [6]:
# Combine encoded features with original data
customers_encoded = pd.concat([customers, encoded_region_df], axis=1)

In [7]:
# Normalize numerical features (Tenure)
scaler = StandardScaler()
customers_encoded['Tenure'] = scaler.fit_transform(customers_encoded[['Tenure']])

In [8]:
# Drop unnecessary columns
customers_encoded = customers_encoded.drop(['CustomerName', 'SignupDate', 'Region'], axis=1)

# **Feature Engineering**

In [9]:
# Feature Engineering: Add TotalSpent and FavoriteCategory (placeholders for now)
# In a real-world scenario, these would be calculated from transaction data.
customers_encoded['TotalSpent'] = 1000  # Placeholder
customers_encoded['FavoriteCategory'] = 'Books'  # Placeholder

In [10]:
# One-hot encode FavoriteCategory
encoded_category = encoder.fit_transform(customers_encoded[['FavoriteCategory']])
encoded_category_df = pd.DataFrame(encoded_category, columns=encoder.get_feature_names_out(['FavoriteCategory']))

In [11]:
# Combine encoded features with original data
customers_encoded = pd.concat([customers_encoded, encoded_category_df], axis=1)

In [12]:
# Drop unnecessary columns
customers_encoded = customers_encoded.drop(['FavoriteCategory'], axis=1)

# **Similarity Calculation**

In [13]:
# Drop non-numeric columns (CustomerID)
customers_numeric = customers_encoded.drop(['CustomerID'], axis=1)

# Check for missing values and fill them if necessary
if customers_numeric.isnull().any().any():
    customers_numeric = customers_numeric.fillna(0)

# Calculate cosine similarity between customers
similarity_matrix = cosine_similarity(customers_numeric)

# Convert similarity matrix to DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=customers['CustomerID'], columns=customers['CustomerID'])

# **Recommendation**

In [14]:
# Create a dictionary to store recommendations
recommendations = {}

# For each customer, find the top 3 most similar customers
for customer in customers['CustomerID'][:20]:  # Only first 20 customers as per instructions
    similar_customers = similarity_df[customer].sort_values(ascending=False)[1:4]  # Exclude self
    recommendations[customer] = list(zip(similar_customers.index, similar_customers.values))

# Convert recommendations to DataFrame
recommendations_df = pd.DataFrame.from_dict(recommendations, orient='index')
recommendations_df = recommendations_df.stack().reset_index()
recommendations_df.columns = ['CustomerID', 'Index', 'Recommendation']

# Split Recommendation into CustomerID and Similarity Score
recommendations_df[['RecommendedCustomerID', 'SimilarityScore']] = pd.DataFrame(recommendations_df['Recommendation'].tolist(), index=recommendations_df.index)
recommendations_df = recommendations_df.drop(['Index', 'Recommendation'], axis=1)


# Save to CSV
recommendations_df.to_csv('Subhash_Kumar_Lookalike.csv', index=False)

# Print the first 5 rows of the recommendations
print(recommendations_df.head())

  CustomerID RecommendedCustomerID  SimilarityScore
0      C0001                 C0112              1.0
1      C0001                 C0025              1.0
2      C0001                 C0071              1.0
3      C0002                 C0134              1.0
4      C0002                 C0045              1.0
