In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Parameters for the synthetic dataset
num_customers = 100       # Number of unique customers
num_products = 50         # Number of unique products
num_ratings = 1000        # Total number of ratings

# Generate random customer IDs
customer_ids = [f"CUST_{i+1}" for i in range(num_customers)]

# Generate random product IDs
product_ids = [f"PROD_{i+1}" for i in range(num_products)]

# Generate random ratings and timestamps
data = []
for _ in range(num_ratings):
    customer_id = random.choice(customer_ids)
    product_id = random.choice(product_ids)
    rating = random.randint(1, 5)  # Ratings from 1 to 5
    # Generate random timestamp within the past year
    timestamp = datetime.now() - timedelta(days=random.randint(1, 365))
    data.append([customer_id, product_id, rating, timestamp])

# Create DataFrame
df = pd.DataFrame(data, columns=['customer_id', 'product_id', 'rating', 'timestamp'])

# Save the dataset as a CSV file
file_path = '/mnt/data/customer_data.csv'
df.to_csv(file_path, index=False)

file_path


'/mnt/data/customer_data.csv'

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

# Step 1: Data Collection and Preprocessing
# Sample data loading (Replace with actual dataset path)
# Assume we have `customer_id`, `product_id`, `purchase_count`, `rating`, `timestamp`

data = pd.read_csv('customer_data.csv')
# Display the first few rows
data.head()

# Preprocessing: Handle missing values, duplicates, etc.
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

# Step 2: Customer Segmentation
# Using RFM (Recency, Frequency, Monetary) Analysis
# Here, we will use a simplified version for demonstration.

# Assuming the dataset has columns 'customer_id', 'product_id', 'purchase_count', and 'timestamp'
# Frequency: Total purchases per customer
frequency = data.groupby('customer_id')['purchase_count'].sum()

# Recency: Days since the last purchase per customer
data['timestamp'] = pd.to_datetime(data['timestamp'])
latest_date = data['timestamp'].max()
recency = data.groupby('customer_id')['timestamp'].apply(lambda x: (latest_date - x.max()).days)

# Monetary: Assuming the purchase count represents spending
monetary = data.groupby('customer_id')['purchase_count'].sum()

# Creating RFM DataFrame
rfm = pd.DataFrame({'Recency': recency, 'Frequency': frequency, 'Monetary': monetary})

# Applying KMeans clustering
kmeans = KMeans(n_clusters=4, random_state=0)
rfm['Segment'] = kmeans.fit_predict(rfm[['Recency', 'Frequency', 'Monetary']])

# Display segmented customers
rfm.head()

# Step 3: Preference Modeling using Collaborative Filtering

# Creating a customer-item matrix
customer_item_matrix = data.pivot_table(index='customer_id', columns='product_id', values='rating', fill_value=0)
customer_item_sparse = csr_matrix(customer_item_matrix.values)

# Compute similarity matrix based on cosine similarity
similarity_matrix = cosine_similarity(customer_item_sparse)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_item_matrix.index, columns=customer_item_matrix.index)

# Sample customer preference predictions
def recommend_products(customer_id, num_recommendations=5):
    # Find similar customers
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).index[1:num_recommendations+1]
    
    # Aggregate product ratings from similar customers
    similar_customer_data = customer_item_matrix.loc[similar_customers]
    recommended_products = similar_customer_data.sum().sort_values(ascending=False).head(num_recommendations)
    return recommended_products.index.tolist()

# Test the recommendation function
sample_customer_id = customer_item_matrix.index[0]
recommendations = recommend_products(sample_customer_id)
print("Recommended Products for Customer", sample_customer_id, ":", recommendations)

# Step 4: Validation and Testing
# Split data into train and test sets for validation
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Building a simple recommendation model
# (In practice, we would use a more complex model or framework for collaborative filtering)
train_matrix = train_data.pivot_table(index='customer_id', columns='product_id', values='rating', fill_value=0)
test_matrix = test_data.pivot_table(index='customer_id', columns='product_id', values='rating', fill_value=0)

# Evaluating with Mean Squared Error (MSE) as a metric
train_pred = train_matrix.dot(similarity_df) / np.array([np.abs(similarity_df).sum(axis=1)])
test_pred = test_matrix.dot(similarity_df) / np.array([np.abs(similarity_df).sum(axis=1)])

# Calculate MSE
train_mse = mean_squared_error(train_matrix.values.flatten(), train_pred.flatten())
test_mse = mean_squared_error(test_matrix.values.flatten(), test_pred.flatten())

print("Train MSE:", train_mse)
print("Test MSE:", test_mse)

# Step 5: Report Compilation
report = {
    'RFM Segments': rfm['Segment'].value_counts(),
    'Train MSE': train_mse,
    'Test MSE': test_mse,
    'Sample Recommendations': {sample_customer_id: recommendations}
}

report


In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

In [3]:

data = pd.read_csv('/mnt/data/customer_data.csv')
# Display the first few rows
data.head(10)

Unnamed: 0,customer_id,product_id,rating,timestamp
0,CUST_15,PROD_34,5,2024-02-09 15:03:17.375250
1,CUST_23,PROD_44,3,2023-12-03 15:03:17.375250
2,CUST_8,PROD_50,4,2024-09-08 15:03:17.375250
3,CUST_87,PROD_26,3,2024-02-27 15:03:17.375250
4,CUST_44,PROD_6,3,2024-06-06 15:03:17.375250
5,CUST_10,PROD_34,4,2024-07-22 15:03:17.375250
6,CUST_89,PROD_14,4,2024-01-27 15:03:17.375250
7,CUST_48,PROD_38,5,2024-08-15 15:03:17.375250
8,CUST_35,PROD_38,3,2024-08-20 15:03:17.375250
9,CUST_3,PROD_38,3,2024-04-12 15:03:17.375250


In [4]:

# Preprocessing: Handle missing values, duplicates, etc.
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [7]:
# Step 2: Customer Segmentation
# Using RFM (Recency, Frequency, Monetary) Analysis
# Adjusting to use 'rating' instead of 'purchase_count'

# Frequency: Total ratings per customer (as a proxy for frequency of interaction)
frequency = data.groupby('customer_id')['rating'].count()

# Recency: Days since the last rating for each customer
data['timestamp'] = pd.to_datetime(data['timestamp'])
latest_date = data['timestamp'].max()
recency = data.groupby('customer_id')['timestamp'].apply(lambda x: (latest_date - x.max()).days)

# Monetary: Sum of ratings for each customer (can be considered a measure of "value")
monetary = data.groupby('customer_id')['rating'].sum()

# Creating the RFM DataFrame
rfm = pd.DataFrame({'Recency': recency, 'Frequency': frequency, 'Monetary': monetary})

# Display the RFM DataFrame to verify
rfm.head()


Unnamed: 0_level_0,Recency,Frequency,Monetary
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CUST_1,19,12,41
CUST_10,14,6,23
CUST_100,54,7,18
CUST_11,17,13,42
CUST_12,0,13,49


In [8]:
# Step 3: Preference Modeling using Collaborative Filtering

# Creating a customer-item matrix
customer_item_matrix = data.pivot_table(index='customer_id', columns='product_id', values='rating', fill_value=0)
customer_item_sparse = csr_matrix(customer_item_matrix.values)

# Compute similarity matrix based on cosine similarity
similarity_matrix = cosine_similarity(customer_item_sparse)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_item_matrix.index, columns=customer_item_matrix.index)

# Sample customer preference predictions
def recommend_products(customer_id, num_recommendations=5):
    # Find similar customers
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).index[1:num_recommendations+1]
    
    # Aggregate product ratings from similar customers
    similar_customer_data = customer_item_matrix.loc[similar_customers]
    recommended_products = similar_customer_data.sum().sort_values(ascending=False).head(num_recommendations)
    return recommended_products.index.tolist()

# Test the recommendation function
sample_customer_id = customer_item_matrix.index[0]
recommendations = recommend_products(sample_customer_id)
print("Recommended Products for Customer", sample_customer_id, ":", recommendations)


Recommended Products for Customer CUST_1 : ['PROD_31', 'PROD_34', 'PROD_10', 'PROD_17', 'PROD_5']


In [11]:
# Reindex the similarity matrix to match the train matrix
similarity_df = similarity_df.reindex(index=train_matrix.index, columns=train_matrix.index).fillna(0)


In [13]:
# Step 1: Create a product-product similarity matrix based on product interactions
# For example, assuming we use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity between products (not customers)
product_similarity = cosine_similarity(train_matrix.T)
product_similarity_df = pd.DataFrame(product_similarity, index=train_matrix.columns, columns=train_matrix.columns)

# Step 2: Ensure train_matrix and product_similarity_df are aligned
train_matrix = train_matrix.reindex(columns=product_similarity_df.index)
test_matrix = test_matrix.reindex(columns=product_similarity_df.index)

# Step 3: Use the product similarity matrix for predictions
train_pred = train_matrix.dot(product_similarity_df) / np.array([np.abs(product_similarity_df).sum(axis=1)])
test_pred = test_matrix.dot(product_similarity_df) / np.array([np.abs(product_similarity_df).sum(axis=1)])

# Now `train_pred` and `test_pred` should have the correct shape
train_pred, test_pred


(product_id     PROD_1   PROD_10   PROD_11   PROD_12   PROD_13   PROD_14  \
 customer_id                                                               
 CUST_1       0.567940  1.126073  0.867200  0.560552  0.500488  0.504799   
 CUST_10      0.414135  0.338580  0.502627  0.206416  0.562611  0.529937   
 CUST_100     0.287536  0.303013  0.219783  0.089178  0.268533  0.595388   
 CUST_11      1.130192  0.584405  0.443129  0.385826  0.569283  0.336771   
 CUST_12      0.417889  0.954387  0.669061  0.545862  0.523644  0.682892   
 ...               ...       ...       ...       ...       ...       ...   
 CUST_95      0.516499  0.238563  0.400877  0.357123  0.556710  0.260274   
 CUST_96      0.178267  0.223350  0.251339  0.206412  0.432465  0.239934   
 CUST_97      0.281419  0.328076  0.237741  0.264771  0.485433  0.320916   
 CUST_98      0.284431  0.510209  0.424079  0.300570  0.510492  0.257200   
 CUST_99      0.606269  0.696538  0.698273  0.753368  0.556738  0.465943   
 
 product_i

In [17]:


# Evaluating with Mean Squared Error (MSE) as a metric
train_pred = train_matrix.dot(similarity_df) / np.array([np.abs(similarity_df).sum(axis=1)])
test_pred = test_matrix.dot(similarity_df) / np.array([np.abs(similarity_df).sum(axis=1)])


In [20]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Replace NaN values in predictions with 0 (or you can use another value depending on the context)
train_pred = np.nan_to_num(train_pred.values.flatten())
test_pred = np.nan_to_num(test_pred.values.flatten())

# Similarly, convert train_matrix and test_matrix to arrays and handle NaNs if needed
train_true = np.nan_to_num(train_matrix.values.flatten())
test_true = np.nan_to_num(test_matrix.values.flatten())

# Calculate MSE
train_mse = mean_squared_error(train_true, train_pred)
test_mse = mean_squared_error(test_true, test_pred)

# Print the results
print("Train MSE:", train_mse)
print("Test MSE:", test_mse)


Train MSE: 0.0
Test MSE: 0.0
