In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, mean_squared_error, accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import xgboost as xgb
from sklearn.svm import SVC

In [1]:
df = pd.read_csv("Data/Processed_data/regular_transactions.csv")

def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r'[^a-z\s]', ' ', text) 
    tokens = text.split()  
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens] 
    return ' '.join(tokens)

df['ProductName_clean'] = df['ProductName'].apply(preprocess_text)

vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(df['ProductName_clean'])

# Fit LDA Model
n_topics = 5
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_model.fit(dtm)

# Assign topics to products
topic_assignments = lda_model.transform(dtm)  # Get topic distribution for each product
df['Topic'] = topic_assignments.argmax(axis=1)  # Assign most probable topic

# Function to display topics
def display_topics(model, feature_names, no_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        top_features = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
        topics[topic_idx] = top_features
    return topics

# Get top words for each topic
no_top_words = 10
feature_names = vectorizer.get_feature_names_out()
topics = display_topics(lda_model, feature_names, no_top_words)

# Print topics
for topic_num, top_words in topics.items():
    print(f'Topic {topic_num}: {", ".join(top_words)}')

# Save output CSV
df.to_csv("Data/Processed_data/regular_transactions_with_topics.csv", index=False)

print("Topics assigned and saved to CSV.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Topic 0: heart, pink, light, hanging, holder, decoration, rose, box, cream, green
Topic 1: metal, sign, card, cake, blue, pack, wooden, case, pink, candle
Topic 2: bag, retrospot, red, design, lunch, bottle, jumbo, hot, water, home
Topic 3: set, tin, christmas, cake, design, light, pantry, tea, wall, tree
Topic 4: vintage, set, bag, paper, christmas, girl, doily, ribbon, dolly, feltcraft
Topics assigned and saved to CSV.


In [2]:
df = pd.read_csv("Data/Processed_data/regular_transactions.csv")

def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r'[^a-z\s]', ' ', text) 
    tokens = text.split()  
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens] 
    return ' '.join(tokens)

df['ProductName_clean'] = df['ProductName'].apply(preprocess_text)

vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(df['ProductName_clean'])

# Fit LDA Model
n_topics = 5
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_model.fit(dtm)

# Assign topics to products
topic_assignments = lda_model.transform(dtm)  # Get topic distribution for each product
df['Topic'] = topic_assignments.argmax(axis=1)  # Assign most probable topic

# Function to display topics
def display_topics(model, feature_names, no_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        top_features = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
        topics[topic_idx] = top_features
    return topics

# Get top words for each topic
no_top_words = 10
feature_names = vectorizer.get_feature_names_out()
topics = display_topics(lda_model, feature_names, no_top_words)

# Print topics
for topic_num, top_words in topics.items():
    print(f'Topic {topic_num}: {", ".join(top_words)}')

# Save output CSV
df.to_csv("Data/Processed_data/regular_transactions_with_topics.csv", index=False)

print("Topics assigned and saved to CSV.")

 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Topic 0: heart, pink, light, hanging, holder, decoration, rose, box, cream, green
Topic 1: metal, sign, card, cake, blue, pack, wooden, case, pink, candle
Topic 2: bag, retrospot, red, design, lunch, bottle, jumbo, hot, water, home
Topic 3: set, tin, christmas, cake, design, light, pantry, tea, wall, tree
Topic 4: vintage, set, bag, paper, christmas, girl, doily, ribbon, dolly, feltcraft
Topics assigned and saved to CSV.


In [30]:
# 1. Load and prepare the data
def load_data():
    df = pd.read_csv('Data/Processed_data/regular_transactions_with_topics.csv')

In [31]:
# 2. Customer Segmentation Using K-means Clustering
def customer_segmentation(df, n_clusters=4):
    """Segment customers based on their purchasing behavior"""

    customer_features = df.groupby('CustomerNo').agg({
        'TransactionNo': 'nunique',  
        'Revenue': 'sum',            
        'Quantity': 'sum',           
        'ProductNo': 'nunique',     
        'Price': ['mean', 'max']     
    })
    
    # Flatten multi-level column names
    customer_features.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in customer_features.columns]
    
    # Additional derived metrics
    customer_features['AvgBasketSize'] = customer_features['Quantity_sum'] / customer_features['TransactionNo_nunique']
    customer_features['AvgTransactionValue'] = customer_features['Revenue_sum'] / customer_features['TransactionNo_nunique']
    
    # For clustering, use only numeric features
    cluster_features = [
        'TransactionNo_nunique', 'Revenue_sum', 'Quantity_sum', 
        'ProductNo_nunique', 'Price_mean', 'Price_max',
        'AvgBasketSize', 'AvgTransactionValue'
    ]
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(customer_features[cluster_features])
    
    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=min(len(cluster_features), 3))
    X_pca = pca.fit_transform(X_scaled)
    
    # Perform K-means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    customer_features['Cluster'] = kmeans.fit_predict(X_pca)
    
    # Analyze clusters
    cluster_analysis = customer_features.groupby('Cluster').agg({
        'TransactionNo_nunique': 'mean',
        'Revenue_sum': 'mean',
        'Quantity_sum': 'mean',
        'ProductNo_nunique': 'mean',
        'Price_mean': 'mean',
        'AvgBasketSize': 'mean',
        'AvgTransactionValue': 'mean'
    }).sort_values('Revenue_sum', ascending=False)
    
    # Create descriptive cluster names
    if len(cluster_analysis) >= 4:
        cluster_names = {
            cluster_analysis.index[0]: "High-Value Customers",
            cluster_analysis.index[1]: "Regular Customers",
            cluster_analysis.index[2]: "Occasional Shoppers",
            cluster_analysis.index[3]: "Low-Value Customers"
        }
    else:
        # If fewer clusters, assign basic names
        cluster_names = {i: f"Segment {i+1}" for i in range(len(cluster_analysis))}
    
    # Add cluster names to the customer features
    customer_features['SegmentName'] = customer_features['Cluster'].map(cluster_names)
    
    return customer_features, cluster_analysis, cluster_names, pca.explained_variance_ratio_

In [32]:

# 3. Product Association Analysis (Market Basket Analysis)
def product_association_analysis(df):
    """Analyze which products are frequently purchased together"""
    # Create transaction-product matrix
    transaction_product_matrix = pd.crosstab(df['TransactionNo'], df['ProductNo'])
    
    # Convert to binary (purchased or not)
    transaction_product_binary = (transaction_product_matrix > 0).astype(int)
    
    # Calculate product support (percentage of transactions containing the product)
    product_support = transaction_product_binary.sum() / len(transaction_product_binary)
    
    # Calculate confidence and lift for product pairs
    associations = []
    
    for prod1 in transaction_product_binary.columns:
        for prod2 in transaction_product_binary.columns:
            if prod1 != prod2:
                # Transactions containing prod1
                support_prod1 = product_support[prod1]
                
                # Transactions containing prod2
                support_prod2 = product_support[prod2]
                
                # Transactions containing both
                both = transaction_product_binary[transaction_product_binary[prod1] == 1][prod2].sum()
                support_both = both / len(transaction_product_binary)
                
                # Calculate confidence
                confidence = support_both / support_prod1 if support_prod1 > 0 else 0
                
                # Calculate lift
                lift = confidence / support_prod2 if support_prod2 > 0 else 0
                
                associations.append({
                    'product1': prod1,
                    'product2': prod2,
                    'support': support_both,
                    'confidence': confidence,
                    'lift': lift
                })
    
    # Convert to DataFrame
    association_df = pd.DataFrame(associations)
    
    # Get product names
    product_mapping = df[['ProductNo', 'ProductName']].drop_duplicates().set_index('ProductNo')['ProductName'].to_dict()
    
    # Add product names
    association_df['product1_name'] = association_df['product1'].map(product_mapping)
    association_df['product2_name'] = association_df['product2'].map(product_mapping)
    
    # Sort by lift
    top_associations = association_df.sort_values('lift', ascending=False)
    
    return top_associations, product_support, product_mapping

In [33]:
# 4. Collaborative Filtering Recommendation System
def build_recommendation_system(df):
    """Build a product recommendation system based on collaborative filtering"""
    # Create customer-product matrix with purchase quantities
    customer_product_matrix = df.pivot_table(
        index='CustomerNo', 
        columns='ProductNo', 
        values='Quantity', 
        aggfunc='sum',
        fill_value=0
    )
    
    # Calculate cosine similarity between products
    product_similarity = cosine_similarity(customer_product_matrix.T)
    
    # Create a DataFrame for easier lookups
    product_similarity_df = pd.DataFrame(
        product_similarity, 
        index=customer_product_matrix.columns,
        columns=customer_product_matrix.columns
    )
    
    # Create product name mapping
    product_names = df[['ProductNo', 'ProductName']].drop_duplicates().set_index('ProductNo')
    
    # Function to get recommendations for a product
    def get_recommendations(product_id, n=5):
        if product_id not in product_similarity_df.index:
            return []
        
        # Get similarity scores
        sim_scores = product_similarity_df[product_id]
        
        # Get top similar products (excluding itself)
        similar_products = sim_scores.sort_values(ascending=False).index[1:n+1]
        
        # Get product names and similarity scores
        recommendations = []
        for prod_id in similar_products:
            if prod_id in product_names.index:
                prod_name = product_names.loc[prod_id, 'ProductName']
                recommendations.append((prod_id, prod_name, sim_scores[prod_id]))
        
        return recommendations
    
    return get_recommendations, product_similarity_df, customer_product_matrix

In [56]:
# 5. Purchase Behavior Prediction
def build_purchase_prediction_model(df):
    """Build a model to predict high-value purchases"""
    # Feature engineering
    model_df = df.copy()
    
    # Encode cyclical features
    model_df['DayOfWeek_sin'] = np.sin(2 * np.pi * model_df['DayOfWeek'] / 7)
    model_df['DayOfWeek_cos'] = np.cos(2 * np.pi * model_df['DayOfWeek'] / 7)
    model_df['Month_sin'] = np.sin(2 * np.pi * model_df['Month'] / 12)
    model_df['Month_cos'] = np.cos(2 * np.pi * model_df['Month'] / 12)
    
    # Create target variable: high revenue purchases
    median_revenue = model_df['Revenue'].median()
    model_df['HighRevenue'] = (model_df['Revenue'] > median_revenue).astype(int)
    
    # Select features
    features = [
        'Price', 'Quantity', 'DayOfWeek_sin', 'DayOfWeek_cos', 'Month_sin', 'Month_cos',
        'IsBlackFridayPeriod', 'IsChristmasPeriod', 'Total_Spent_Before', 'Topic'
    ]
    
    # Handle missing values
    X = model_df[features].fillna(-1)
    y = model_df['HighRevenue']
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Train multiple models
    models = {
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
        'GradientBoosting': GradientBoostingClassifier(random_state=42),
        'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
        'XGBoost': xgb.XGBClassifier(random_state=42)
    }
    
    # Evaluate models
    model_performance = {}
    best_accuracy = 0
    best_model_name = None
    best_model = None
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
        
        model_performance[name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model_name = name
            best_model = model
    
    # Get feature importance for the best model
    if hasattr(best_model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'Feature': features,
            'Importance': best_model.feature_importances_
        }).sort_values('Importance', ascending=False)
    else:
        feature_importance = None
    
    return best_model, best_model_name, model_performance, feature_importance, features

In [35]:
# 6. Customer Lifetime Value Prediction
def predict_customer_lifetime_value(df):
    """Build a regression model to predict customer lifetime value"""
    # Aggregate data at customer level
    customer_df = df.groupby('CustomerNo').agg({
        'Revenue': 'sum',
        'Quantity': 'sum',
        'TransactionNo': 'nunique',
        'ProductNo': 'nunique',
        'Price': ['mean', 'max', 'min'],
        'Days_Since_Last_Purchase': 'min',
        'Avg_Purchase_Frequency': 'mean',
        'Total_Spent_Before': 'mean'
    })
    
    # Flatten multi-level column names
    customer_df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in customer_df.columns]
    
    # Create additional features
    customer_df['AvgTransactionValue'] = customer_df['Revenue_sum'] / customer_df['TransactionNo_nunique']
    customer_df['ItemsPerTransaction'] = customer_df['Quantity_sum'] / customer_df['TransactionNo_nunique']
    customer_df['PriceRange'] = customer_df['Price_max'] - customer_df['Price_min']
    
    # Handle missing values
    customer_df = customer_df.fillna({
        'Price_min': customer_df['Price_mean'],
        'Price_max': customer_df['Price_mean'],
        'Days_Since_Last_Purchase_min': 999,
        'Avg_Purchase_Frequency_mean': 0,
        'PriceRange': 0
    })
    
    # Prepare data for modeling
    X = customer_df.drop('Revenue_sum', axis=1)
    y = customer_df['Revenue_sum']
    
    # Since our dataset is small, we'll use all data to train
    # In a real scenario, we would split into train/test sets
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    
    # Predict CLV values
    customer_df['PredictedCLV'] = model.predict(X)
    
    # Calculate feature importance
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    return model, customer_df, feature_importance

In [36]:
# 7. Topic-Based Product Analysis
def analyze_product_topics(df):
    """Analyze the performance of products by topic categories"""
    if 'Topic' not in df.columns:
        return None, None
    
    # Group by topic
    topic_analysis = df.groupby('Topic').agg({
        'Revenue': 'sum',
        'Quantity': 'sum',
        'TransactionNo': 'nunique',
        'ProductNo': 'nunique',
        'Price': 'mean'
    }).sort_values('Revenue', ascending=False)
    
    # Calculate additional metrics
    topic_analysis['AvgRevenuePerTransaction'] = topic_analysis['Revenue'] / topic_analysis['TransactionNo']
    topic_analysis['AvgQuantityPerTransaction'] = topic_analysis['Quantity'] / topic_analysis['TransactionNo']
    topic_analysis['MarketShare'] = topic_analysis['Revenue'] / topic_analysis['Revenue'].sum() * 100
    
    # Find top products in each topic
    top_products_by_topic = {}
    for topic in df['Topic'].dropna().unique():
        topic_products = df[df['Topic'] == topic].groupby('ProductNo').agg({
            'Revenue': 'sum',
            'Quantity': 'sum',
            'ProductName': 'first'
        }).sort_values('Revenue', ascending=False).head(5)
        
        top_products_by_topic[topic] = topic_products
    
    return topic_analysis, top_products_by_topic


In [37]:
# 8. Price Sensitivity Analysis
def analyze_price_sensitivity(df):
    """Analyze how price affects purchase quantity"""
    # Group by product
    product_analysis = df.groupby('ProductNo').agg({
        'ProductName': 'first',
        'Price': 'mean',
        'Quantity': 'sum',
        'Revenue': 'sum',
        'TransactionNo': 'nunique'
    })
    
    # Calculate price per unit
    product_analysis['PricePerUnit'] = product_analysis['Price']
    
    # Calculate average quantity per transaction
    product_analysis['AvgQuantityPerTransaction'] = product_analysis['Quantity'] / product_analysis['TransactionNo']
    
    # Calculate price rank (percentile)
    product_analysis['PricePercentile'] = product_analysis['Price'].rank(pct=True)
    
    # Simulate price elasticity
    # Note: Real elasticity calculation requires time series data with price changes
    # This is a simplified simulation for illustration
    product_analysis['SimulatedElasticity'] = -1 * (0.5 + product_analysis['PricePercentile'])
    
    # Calculate revenue impact with price changes
    product_analysis['Revenue_10pct_PriceIncrease'] = product_analysis['Revenue'] * (1 + 0.1 * (1 + product_analysis['SimulatedElasticity']))
    product_analysis['Revenue_10pct_PriceDecrease'] = product_analysis['Revenue'] * (1 - 0.1 * (1 + product_analysis['SimulatedElasticity']))
    
    # Price vs. Quantity correlation
    price_quantity_corr = df[['Price', 'Quantity']].corr().iloc[0, 1]
    
    return product_analysis, price_quantity_corr


In [38]:
# 9. Customer Churn Risk Prediction
def predict_churn_risk(df):
    """Predict which customers are at risk of churning"""
    # Create customer-level features
    customer_features = df.groupby('CustomerNo').agg({
        'TransactionNo': 'nunique',
        'Revenue': 'sum',
        'Quantity': 'sum',
        'Date': 'max',  # Latest purchase date
        'Days_Since_Last_Purchase': 'min',
        'Avg_Purchase_Frequency': 'mean',
        'Total_Spent_Before': 'mean'
    })
    
    # Add recency feature
    most_recent_date = df['Date'].max()
    customer_features['DaysSinceLastPurchase'] = (most_recent_date - customer_features['Date']).dt.days
    
    # Define churn based on recency
    # In a real scenario, this should be based on customer purchase patterns
    customer_features['ChurnRisk'] = customer_features['DaysSinceLastPurchase'] > 30
    
    # Prepare features for the model
    X = customer_features[[
        'TransactionNo', 'Revenue', 'Quantity', 'DaysSinceLastPurchase',
        'Days_Since_Last_Purchase', 'Avg_Purchase_Frequency', 'Total_Spent_Before'
    ]].fillna(0)
    
    y = customer_features['ChurnRisk']
    
    # Build the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)
    
    # Predict churn probability
    customer_features['ChurnProbability'] = model.predict_proba(X)[:, 1]
    
    # Categorize churn risk
    customer_features['ChurnRiskCategory'] = pd.cut(
        customer_features['ChurnProbability'], 
        bins=[0, 0.3, 0.7, 1], 
        labels=['Low', 'Medium', 'High']
    )
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    return customer_features, model, feature_importance

In [39]:
# 10. Product Performance and Seasonality Analysis
def analyze_product_performance(df):
    """Analyze product performance and identify seasonal patterns"""
    # Group by product
    product_performance = df.groupby('ProductNo').agg({
        'ProductName': 'first',
        'Revenue': 'sum',
        'Quantity': 'sum',
        'TransactionNo': 'nunique',
        'Price': 'mean'
    }).sort_values('Revenue', ascending=False)
    
    # Calculate performance metrics
    product_performance['AvgQuantityPerTransaction'] = product_performance['Quantity'] / product_performance['TransactionNo']
    product_performance['AvgRevenuePerTransaction'] = product_performance['Revenue'] / product_performance['TransactionNo']
    product_performance['MarketShare'] = product_performance['Revenue'] / product_performance['Revenue'].sum() * 100
    
    # Identify seasonal patterns
    # Since our dataset has limited timespan, we'll create dummy seasonality data
    # In a real scenario, we would analyze sales patterns across different time periods
    
    # Create a date-product pivot table (if we have enough date range)
    if len(df['Date'].dt.date.unique()) > 1:
        date_product_sales = df.pivot_table(
            index='Date', 
            columns='ProductNo', 
            values='Quantity', 
            aggfunc='sum',
            fill_value=0
        )
        
        # Analyze day-of-week patterns
        dow_sales = df.groupby(['DayOfWeek', 'DayName']).agg({
            'Revenue': 'sum',
            'Quantity': 'sum'
        }).sort_values('Revenue', ascending=False)
        
        # Analyze month patterns
        month_sales = df.groupby('Month').agg({
            'Revenue': 'sum',
            'Quantity': 'sum'
        }).sort_values('Revenue', ascending=False)
        
        seasonality_analysis = {
            'dow_sales': dow_sales,
            'month_sales': month_sales
        }
    else:
        seasonality_analysis = None
    
    return product_performance, seasonality_analysis


In [40]:
# 11. Customer Segmentation Based on RFM Analysis
def rfm_segmentation(df):
    """Segment customers based on Recency, Frequency, and Monetary value"""
    # Create customer-level features
    rfm_df = df.groupby('CustomerNo').agg({
        'Date': 'max',          # Recency: date of last purchase
        'TransactionNo': 'nunique',  # Frequency: number of transactions
        'Revenue': 'sum'        # Monetary: total spend
    })
    
    # Calculate recency in days
    most_recent_date = df['Date'].max()
    rfm_df['Recency'] = (most_recent_date - rfm_df['Date']).dt.days
    
    # Rename columns
    rfm_df = rfm_df.rename(columns={
        'TransactionNo': 'Frequency',
        'Revenue': 'Monetary'
    })
    
    # Create RFM scores (1-5 scale, with 5 being the best)
    # For Recency, lower is better, so we need to invert the quantiles
    rfm_df['R_Score'] = pd.qcut(rfm_df['Recency'], 5, labels=False, duplicates='drop')
    rfm_df['R_Score'] = 5 - rfm_df['R_Score']
    
    rfm_df['F_Score'] = pd.qcut(rfm_df['Frequency'].rank(method='first'), 5, labels=False, duplicates='drop') + 1
    rfm_df['M_Score'] = pd.qcut(rfm_df['Monetary'].rank(method='first'), 5, labels=False, duplicates='drop') + 1
    
    # Calculate RFM score
    rfm_df['RFM_Score'] = rfm_df['R_Score'] * 100 + rfm_df['F_Score'] * 10 + rfm_df['M_Score']
    
    # Segment customers based on RFM score
    rfm_df['RFM_Segment'] = pd.cut(
        rfm_df['RFM_Score'],
        bins=[0, 150, 250, 350, 450, 550],
        labels=['Lost Customers', 'At Risk', 'Average Customers', 'Loyal Customers', 'Champions']
    )
    
    return rfm_df

In [None]:
df = load_data()


In [22]:
customer_segments, cluster_analysis, cluster_names, explained_variance = customer_segmentation(df)
print(f"Created {len(cluster_analysis)} customer segments")
print("Cluster analysis:")
print(cluster_analysis)
print(f"Explained variance: {explained_variance}")
print("\nCustomer segment sample:")
print(customer_segments.head(3))


Created 4 customer segments
Cluster analysis:
         TransactionNo_nunique    Revenue_sum  Quantity_sum  \
Cluster                                                       
3                   131.200000  314071.184000  27738.200000   
2                     7.673392   22172.706166   1933.514502   
1                     6.000000   16419.112857   1301.904762   
0                     3.087243    4346.947567    374.321033   

         ProductNo_nunique  Price_mean  AvgBasketSize  AvgTransactionValue  
Cluster                                                                     
3              1225.000000   11.943418     270.310479          3066.154674  
2               234.194199   12.719684     412.477053          4852.560854  
1               123.761905   69.088692     192.432868          2678.995105  
0                48.977069   13.049606     125.706889          1467.377260  
Explained variance: [0.44605916 0.2308269  0.1672437 ]

Customer segment sample:
            TransactionNo_nuniqu

In [None]:
top_associations, product_support, product_mapping = product_association_analysis(df)
print("Top 5 product associations by lift:")
print(top_associations[['product1_name', 'product2_name', 'support', 'confidence', 'lift']].head(5))
print("\nTop 5 products by support (popularity):")
top_5_products = product_support.sort_values(ascending=False).head(5)
for prod_id, support in top_5_products.items():
    prod_name = product_mapping.get(prod_id, f"Product {prod_id}")
    print(f"{prod_name}: {support:.2%}")


In [24]:
get_recommendations, product_similarity_df, customer_product_matrix = build_recommendation_system(df)
# Get a sample product to recommend for
sample_product = df['ProductNo'].iloc[0]
sample_product_name = df[df['ProductNo'] == sample_product]['ProductName'].iloc[0]
print(f"Recommendations for {sample_product_name}:")
recommendations = get_recommendations(sample_product)
for prod_id, prod_name, similarity in recommendations:
    print(f"- {prod_name} (similarity: {similarity:.2f})")
print("\nCustomer-product matrix shape:", customer_product_matrix.shape)

Recommendations for Set Of 2 Wooden Market Crates:
- Georgian Trinket Box (similarity: 0.51)
- Pink Pot Plant Candle (similarity: 0.48)
- Tall Rococo Candle Holder (similarity: 0.48)
- Bathroom Scales Footprints In Sand (similarity: 0.47)
- Empire Union Jack Tv Dinner Tray (similarity: 0.47)

Customer-product matrix shape: (4634, 3746)


In [57]:
purchase_model, model_name, model_performance, feature_importance, features = build_purchase_prediction_model(df)
print(f"Best model: {model_name}")
print("\nModel performance:")
for model, metrics in model_performance.items():
    print(f"{model}: Accuracy={metrics['accuracy']:.2f}, F1={metrics['f1']:.2f}")
print("\nFeature importance:")
if feature_importance is not None:
    print(feature_importance.head(10))


Best model: RandomForest

Model performance:
RandomForest: Accuracy=1.00, F1=1.00
GradientBoosting: Accuracy=1.00, F1=1.00
LogisticRegression: Accuracy=0.97, F1=0.97
XGBoost: Accuracy=1.00, F1=1.00

Feature importance:
               Feature  Importance
1             Quantity    0.881918
0                Price    0.092050
8   Total_Spent_Before    0.010058
5            Month_cos    0.005207
6  IsBlackFridayPeriod    0.004407
4            Month_sin    0.002879
9                Topic    0.001329
2        DayOfWeek_sin    0.001276
3        DayOfWeek_cos    0.000788
7    IsChristmasPeriod    0.000088


In [41]:
# 6. Customer Lifetime Value Prediction
# This should be in a cell by itself
clv_model, customer_clv, clv_feature_importance = predict_customer_lifetime_value(df)
print("CLV prediction feature importance:")
print(clv_feature_importance.head(10))
print("\nCustomer CLV sample:")
print(customer_clv[['Revenue_sum', 'PredictedCLV']].head(5))

CLV prediction feature importance:
                        Feature  Importance
0                  Quantity_sum    0.923989
8       Total_Spent_Before_mean    0.017414
11                   PriceRange    0.013192
2             ProductNo_nunique    0.012599
3                    Price_mean    0.010875
4                     Price_max    0.007544
1         TransactionNo_nunique    0.006872
5                     Price_min    0.003534
10          ItemsPerTransaction    0.001943
7   Avg_Purchase_Frequency_mean    0.000944

Customer CLV sample:
            Revenue_sum  PredictedCLV
CustomerNo                           
12004.0         1509.60     1502.2996
12006.0           24.76       24.7802
12008.0         4606.61     4540.7769
12013.0           69.96       70.1822
12024.0          149.52      149.3336


In [27]:
topic_analysis, top_products_by_topic = analyze_product_topics(df)
if topic_analysis is not None:
    print("Topic analysis:")
    print(topic_analysis[['Revenue', 'Quantity', 'MarketShare']])
    print("\nTop products by topic:")
    for topic, products in top_products_by_topic.items():
        print(f"\nTopic {topic}:")
        for idx, row in products.iterrows():
            print(f"- {row['ProductName']}: Revenue=${row['Revenue']:.2f}, Quantity={row['Quantity']}")
else:
    print("Topic analysis not available (Topic column may be missing)")


Topic analysis:
          Revenue  Quantity  MarketShare
Topic                                   
0      9997736.99    850406    27.515268
2      7400662.93    664140    20.367732
1      7261038.17    627135    19.983464
4      5925700.81    537874    16.308415
3      5750094.82    467267    15.825121

Top products by topic:

Topic 1:
- Pack Of 72 Retrospot Cake Cases: Revenue=$180931.96, Quantity=16855
- Party Bunting: Revenue=$139201.99, Quantity=8775
- Pack Of 60 Pink Paisley Cake Cases: Revenue=$97366.40, Quantity=9148
- 60 Teatime Fairy Cake Cases: Revenue=$95029.59, Quantity=8925
- Spotty Bunting: Revenue=$85871.22, Quantity=5554

Topic 4:
- Mini Paint Set Vintage: Revenue=$97266.87, Quantity=9134
- Jumbo Shopper Vintage Red Paisley: Revenue=$95069.03, Quantity=7683
- 6 Ribbons Rustic Charm: Revenue=$87428.85, Quantity=7384
- Paper Chain Kit 50'S Christmas: Revenue=$87006.41, Quantity=7084
- Vintage Snap Cards: Revenue=$80606.83, Quantity=7635

Topic 3:
- Jam Making Set Printed: 

In [28]:
#higher price ->lower demand
# This should be in a cell by itself
price_analysis, price_quantity_corr = analyze_price_sensitivity(df)
print(f"Price-Quantity correlation: {price_quantity_corr:.2f}")
print("\nPrice analysis sample (most elastic products):")
elastic_products = price_analysis.sort_values('SimulatedElasticity').head(5)
print(elastic_products[['ProductName', 'Price', 'Quantity', 'SimulatedElasticity']])

Price-Quantity correlation: -0.13

Price analysis sample (most elastic products):
                                 ProductName       Price  Quantity  \
ProductNo                                                            
22656           Vintage Blue Kitchen Cabinet  637.478000        26   
22655            Vintage Red Kitchen Cabinet  624.080000        60   
22827      Rustic Seventeen Drawer Sideboard  172.277308        35   
22828           Regency Mirror With Shutters  170.588571        10   
22823          Chest Natural Wood 20 Drawers  131.281538        24   

           SimulatedElasticity  
ProductNo                       
22656                -1.500000  
22655                -1.499733  
22827                -1.499466  
22828                -1.499199  
22823                -1.498932  


In [42]:
churn_data, churn_model, churn_feature_importance = predict_churn_risk(df)
print("Churn risk by category:")
print(churn_data['ChurnRiskCategory'].value_counts())
print("\nChurn feature importance:")
print(churn_feature_importance.head(5))
print("\nHigh-risk customer sample:")
high_risk = churn_data[churn_data['ChurnRiskCategory'] == 'High'].head(5)
print(high_risk[['ChurnProbability', 'Revenue', 'TransactionNo']])


Churn risk by category:
ChurnRiskCategory
High      2922
Low        508
Medium       0
Name: count, dtype: int64

Churn feature importance:
                    Feature  Importance
3     DaysSinceLastPurchase    0.810737
0             TransactionNo    0.071939
1                   Revenue    0.036494
2                  Quantity    0.034756
4  Days_Since_Last_Purchase    0.023596

High-risk customer sample:
            ChurnProbability  Revenue  TransactionNo
CustomerNo                                          
12004.0                  1.0  1509.60              1
12006.0                  1.0    24.76              1
12008.0                  1.0  4606.61              1
12013.0                  1.0    69.96              1
12024.0                  1.0   149.52              1


In [43]:
product_performance, seasonality_analysis = analyze_product_performance(df)
print("Top 5 products by revenue:")
print(product_performance.head(5)[['ProductName', 'Revenue', 'Quantity', 'MarketShare']])
if seasonality_analysis is not None:
    print("\nDay of week sales pattern:")
    print(seasonality_analysis['dow_sales'])
    print("\nMonth sales pattern:")
    print(seasonality_analysis['month_sales'])
else:
    print("\nSeasonality analysis not available (limited date range)")


Top 5 products by revenue:
                                  ProductName    Revenue  Quantity  \
ProductNo                                                            
85123A     Cream Hanging Heart T-Light Holder  253202.20     19688   
22423                Regency Cakestand 3 Tier  229616.36     10208   
84879           Assorted Colour Bird Ornament  205222.32     17594   
21212         Pack Of 72 Retrospot Cake Cases  180931.96     16855   
22197                          Popcorn Holder  144024.29     13279   

           MarketShare  
ProductNo               
85123A        0.696850  
22423         0.631939  
84879         0.564803  
21212         0.497952  
22197         0.396376  

Day of week sales pattern:
                      Revenue  Quantity
DayOfWeek DayName                      
6         Sun      7563117.02    659554
4         Fri      6711706.82    575292
5         Sat      6447200.50    565109
3         Thu      6072805.83    525126
0         Mon      5702889.26    490201

In [44]:
# 11. RFM Segmentation
# This should be in a cell by itself
rfm_segments = rfm_segmentation(df)
print("RFM Segment distribution:")
print(rfm_segments['RFM_Segment'].value_counts())
print("\nCustomers by RFM score (sample):")
print(rfm_segments[['Recency', 'Frequency', 'Monetary', 'RFM_Score', 'RFM_Segment']].head(5))

RFM Segment distribution:
RFM_Segment
Lost Customers       909
At Risk              881
Average Customers    842
Loyal Customers      787
Champions            756
Name: count, dtype: int64

Customers by RFM score (sample):
            Recency  Frequency  Monetary  RFM_Score     RFM_Segment
CustomerNo                                                         
12004.0         227          1   1509.60        112  Lost Customers
12006.0         218          1     24.76        111  Lost Customers
12008.0         276          1   4606.61        113  Lost Customers
12013.0         359          1     69.96        111  Lost Customers
12024.0         176          1    149.52        211         At Risk
