In [12]:
! py -m pip install mlxtend --upgrade



ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scikeras 0.4.1 requires packaging<22.0,>=0.21, but you have packaging 23.2 which is incompatible.



Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Collecting scikit-learn>=1.3.1 (from mlxtend)
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting joblib>=0.13.2 (from mlxtend)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 1.4/1.4 MB 23.2 MB/s eta 0:00:00
Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
   ---------------------------------------- 11.1/11.1 MB 18.8 MB/s eta 0:00:00
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, scikit-learn, mlxtend
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully unin

In [None]:
import numpy as np
import pandas as pd

# For reproducibility
np.random.seed(42)

n = 10000

# CustomerID: 1, 2, 3, ..., 10000
customer_id = np.arange(1, n + 1)

# Age: random integer values between 18 and 74
age = np.random.randint(18, 75, n)

# Income: Correlated with age.
# Base income scales linearly from $50,000 (age 18) to $500,000 (age 74).
income_base = 50000 + ((age - 18) / (74 - 18)) * (500000 - 50000)
# Add some noise to vary the income (mean 0, standard deviation $30,000)
noise_income = np.random.normal(0, 30000, n)
income = income_base + noise_income
# Ensure income stays within [$50,000, $500,000]
income = np.clip(income, 50000, 500000)
# Round income to the nearest 1000 so it always ends with '000'
income = np.round(income / 1000) * 1000

# Credit Score: Simulate a score between 300 and 850.
# We make it increase with age: baseline score is 300 at age 18 and 850 at age 74.
credit_score_base = 300 + (age - 18) * ((850 - 300) / (74 - 18))
noise_credit = np.random.normal(0, 30, n)
credit_score = credit_score_base + noise_credit
credit_score = np.clip(credit_score, 300, 850)
credit_score = np.round(credit_score).astype(int)

# Tenure: Number of years with the company, random integer between 1 and 20.
tenure = np.random.randint(1, 21, n)

# Segment: Determine customer segment.
# Default is "Mass Market". High income customers (income > $300,000) become "Affluent".
# Customers aged 60 or above have a 70% chance to be "Retiree", otherwise assign based on income.
segment = []
for a, inc in zip(age, income):
    if a >= 60:
        # 70% chance for older customers to be "Retiree"
        if np.random.rand() < 0.7:
            segment.append("Retiree")
        else:
            segment.append("Affluent" if inc > 300000 else "Mass Market")
    else:
        segment.append("Affluent" if inc > 300000 else "Mass Market")

# Create the DataFrame
df_customers = pd.DataFrame({
    'CustomerID': customer_id,
    'Age': age,
    'Income': income.astype(int),
    'Credit Score': credit_score,
    'Tenure': tenure,
    'Segment': segment
})

# Save the DataFrame to CSV
df_customers.to_csv("../data/Customers.csv", index=False)

print("Mock Customers data generated and saved as 'Customers.csv'.")


Mock Customers data generated and saved as 'Customers.csv'.


In [4]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import csv
# For reproducibility
np.random.seed(42)

total_customers = 10000

# Ensure each customer has at least one transaction.
base_customer_ids = np.arange(1, total_customers + 1)

# Decide on extra transactions (e.g., 5000 additional transactions)
extra_transactions = 10000
extra_customer_ids = np.random.randint(1, total_customers + 1, extra_transactions)

# Combine to get a list where every customer appears at least once.
customer_ids = np.concatenate([base_customer_ids, extra_customer_ids])
num_transactions = len(customer_ids)

# TransactionID: Sequential IDs starting from 5001
transaction_ids = np.arange(5001, 5001 + num_transactions)

# Extensive list of products and their variations that large banks typically offer
products = {
    'Mortgage': ['Fixed Rate', 'Adjustable Rate', 'Interest Only', 'Reverse Mortgage'],
    'Credit Card': ['Rewards', 'Cashback', 'Travel', 'Business', 'Secured'],
    'Checking Account': ['Premium', 'Basic', 'Senior', 'Interest Bearing'],
    'Savings Account': ['High Yield', 'Regular', 'Money Market', "Children's Savings"],
    'Certificate of Deposit': ['1-Year', '2-Year', '5-Year', '10-Year'],
    'Personal Loan': ['Secured', 'Unsecured', 'Debt Consolidation', 'Home Improvement'],
    'Auto Loan': ['New Car', 'Used Car', 'Refinance', 'Lease'],
    'Home Equity Loan': ['Fixed Rate', 'Variable Rate', 'HELOC'],
    'Investment Account': ['Managed', 'Self-Directed', 'Robo Advisor', 'Brokerage'],
    'Business Loan': ['Term Loan', 'Line of Credit', 'SBA Loan', 'Equipment Financing'],
    'Retirement Account': ['IRA', '401K', 'Roth IRA', 'Simple IRA'],
    'Student Loan': ['Federal', 'Private', 'Refinance', 'Consolidation']
}

# List of product names for random selection
product_list = list(products.keys())
chosen_products = np.random.choice(product_list, num_transactions)
variations = [np.random.choice(products[prod]) for prod in chosen_products]

# Generate random dates between 2020-01-01 and 2025-03-31
start_date = datetime(2020, 1, 1)
end_date = datetime(2025, 3, 31)
days_range = (end_date - start_date).days
random_days = np.random.randint(0, days_range + 1, num_transactions)
dates = [start_date + timedelta(days=int(day)) for day in random_days]
# Format dates as MM/DD/YYYY (e.g., "08/04/2020")
formatted_dates = [d.strftime("%m/%d/%Y") for d in dates]

# Create the DataFrame
df_transactions = pd.DataFrame({
    'TransactionID': transaction_ids,
    'CustomerID': customer_ids,
    'Product': chosen_products,
    'Variation': variations,
    'Date': formatted_dates
})

# Ensure Date column is explicitly string
df_transactions['Date'] = df_transactions['Date'].astype(str)

# Save to CSV with all fields quoted so that formatting is preserved
df_transactions.to_csv("../data/Transactions.csv", index=False, quoting=csv.QUOTE_ALL)
print("Mock Transactions data generated and saved as 'Transactions.csv'.")


Mock Transactions data generated and saved as 'Transactions.csv'.


In [17]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import csv

# For reproducibility
np.random.seed(42)

total_customers = 10000

# Ensure each customer has at least one transaction.
base_customer_ids = np.arange(1, total_customers + 1)
extra_transactions = 10000  # Increase extra transactions if needed
extra_customer_ids = np.random.randint(1, total_customers + 1, extra_transactions)
customer_ids = np.concatenate([base_customer_ids, extra_customer_ids])
num_transactions = len(customer_ids)

# TransactionID: Sequential IDs starting from 5001
transaction_ids = np.arange(5001, 5001 + num_transactions)

# Original products dictionary with variations
products = {
    'Mortgage': ['Fixed Rate', 'Adjustable Rate', 'Hybrid Rate', 'Interest Only', 'Balloon Payment', 'Reverse Mortgage'],
    'Credit Card': ['Rewards', 'Cashback', 'Travel', 'Business', 'Student', 'Secured', 'Balance Transfer'],
    'Checking Account': ['Premium', 'Basic', 'Student', 'Senior', 'Interest Bearing'],
    'Savings Account': ['High Yield', 'Regular', 'Money Market', "Children's Savings"],
    'Certificate of Deposit': ['3-Month', '6-Month', '1-Year', '2-Year', '5-Year', '10-Year'],
    'Personal Loan': ['Secured', 'Unsecured', 'Debt Consolidation', 'Home Improvement'],
    'Auto Loan': ['New Car', 'Used Car', 'Refinance', 'Lease'],
    'Home Equity Loan': ['Fixed Rate', 'Variable Rate', 'HELOC'],
    'Investment Account': ['Managed', 'Self-Directed', 'Robo Advisor', 'Brokerage'],
    'Business Loan': ['Term Loan', 'Line of Credit', 'SBA Loan', 'Equipment Financing'],
    'Retirement Account': ['IRA', '401K', 'Roth IRA', 'SEP IRA', 'Simple IRA'],
    'Student Loan': ['Federal', 'Private', 'Refinance', 'Consolidation']
}

# Define three product groups (fewer combinations per customer)
group1 = ['Mortgage', 'Credit Card', 'Business Loan']
group2 = ['Checking Account', 'Savings Account', 'Certificate of Deposit', 'Personal Loan']
group3 = ['Auto Loan', 'Home Equity Loan', 'Investment Account', 'Retirement Account', 'Student Loan']

groups = {1: group1, 2: group2, 3: group3}
all_products = group1 + group2 + group3  # complete list of products

# Assign each customer to one group at random
customer_group = {}
for cid in range(1, total_customers + 1):
    customer_group[cid] = np.random.choice([1, 2, 3])

# Generate chosen_products for each transaction:
# With 80% probability, choose from the customer's group.
chosen_products = []
expanded_customer_ids = []

for cid in customer_ids:
    if np.random.rand() < 0.9:
        n_products = np.random.choice([2, 3], p=[0.7, 0.3])
        prods = np.random.choice(groups[customer_group[cid]], size=n_products, replace=False)
        chosen_products.extend(prods)
        expanded_customer_ids.extend([cid] * n_products)
    else:
        prod = np.random.choice(all_products)
        chosen_products.append(prod)
        expanded_customer_ids.append(cid)

# Now they match
assert len(expanded_customer_ids) == len(chosen_products)

# TransactionID
transaction_ids = np.arange(5001, 5001 + len(expanded_customer_ids))

# Variations
chosen_variations = [np.random.choice(products[prod]) for prod in chosen_products]

# Dates
random_days = np.random.randint(0, days_range + 1, len(expanded_customer_ids))
dates = [start_date + timedelta(days=int(day)) for day in random_days]
formatted_dates = [d.strftime("%m/%d/%Y") for d in dates]

# Final DataFrame
df_transactions = pd.DataFrame({
    'TransactionID': transaction_ids,
    'CustomerID': expanded_customer_ids,
    'Product': chosen_products,
    'Variation': chosen_variations,
    'Date': formatted_dates
})

# Save
df_transactions['Date'] = df_transactions['Date'].astype(str)
df_transactions.to_csv("../data/Transactions.csv", index=False, quoting=csv.QUOTE_ALL)
print("Modified mock Transactions data generated and saved as 'Transactions.csv'.")

Modified mock Transactions data generated and saved as 'Transactions.csv'.


In [124]:
import pandas as pd
import numpy as np

# Load the Transactions data
df_transactions = pd.read_csv("../data/Transactions.csv")

# Generate a rating between 1.0 to 5.0 with slight skew toward higher values
# We'll use a clipped normal distribution
ratings = np.random.normal(loc=4.0, scale=0.7, size=len(df_transactions))
ratings = np.clip(ratings, 1.0, 5.0)
ratings = np.round(ratings, 1)

# Create the new DataFrame
df_ratings = df_transactions.copy()
df_ratings = df_ratings.drop(columns=["Date"])  # Remove Date
df_ratings["Rating"] = ratings  # Add Rating column

# Save to CSV
df_ratings.to_csv("../data/ProductVariationRatings.csv", index=False, quoting=1)  # quoting=1 = csv.QUOTE_ALL
print("ProductVariationRatings.csv generated successfully.")


ProductVariationRatings.csv generated successfully.


In [125]:
import pandas as pd

# Load the Transactions data
df_transactions = pd.read_csv("../data/Transactions.csv")

# Take a quick look at the data
print("Transactions Data Sample:")
print(df_transactions.head())


Transactions Data Sample:
   TransactionID  CustomerID             Product         Variation        Date
0           5001           1         Credit Card  Balance Transfer  04/30/2020
1           5002           1            Mortgage  Reverse Mortgage  11/15/2021
2           5003           2  Retirement Account           SEP IRA  04/12/2023
3           5004           2  Investment Account     Self-Directed  09/09/2023
4           5005           3         Credit Card           Rewards  03/28/2025


## Apriori rules

In [126]:
# Group transactions by CustomerID and aggregate unique products into a list (basket)
basket = df_transactions.groupby('CustomerID')['Product'] \
                          .apply(lambda x: list(set(x))) \
                          .reset_index()

print("\nBasket Sample:")
print(basket.head)



Basket Sample:
<bound method NDFrame.head of       CustomerID                                            Product
0              1                            [Mortgage, Credit Card]
1              2  [Investment Account, Auto Loan, Home Equity Lo...
2              3             [Business Loan, Mortgage, Credit Card]
3              4             [Business Loan, Mortgage, Credit Card]
4              5             [Business Loan, Mortgage, Credit Card]
...          ...                                                ...
9995        9996             [Investment Account, Home Equity Loan]
9996        9997  [Mortgage, Savings Account, Home Equity Loan, ...
9997        9998                          [Business Loan, Mortgage]
9998        9999  [Business Loan, Mortgage, Credit Card, Home Eq...
9999       10000             [Business Loan, Mortgage, Credit Card]

[10000 rows x 2 columns]>


In [127]:
from mlxtend.preprocessing import TransactionEncoder

# Transform the basket list into one-hot encoded format
te = TransactionEncoder()
te_array = te.fit(basket['Product']).transform(basket['Product'])
df_basket = pd.DataFrame(te_array, columns=te.columns_, index=basket['CustomerID'])
df_basket.reset_index(inplace=True)  # Optionally keep CustomerID as a column

print("\nOne-Hot Encoded Basket Sample:")
print(df_basket.head())



One-Hot Encoded Basket Sample:
   CustomerID  Auto Loan  Business Loan  Certificate of Deposit  \
0           1      False          False                   False   
1           2       True          False                   False   
2           3      False           True                   False   
3           4      False           True                   False   
4           5      False           True                   False   

   Checking Account  Credit Card  Home Equity Loan  Investment Account  \
0             False         True             False               False   
1             False        False              True                True   
2             False         True             False               False   
3             False         True             False               False   
4             False         True             False               False   

   Mortgage  Personal Loan  Retirement Account  Savings Account  Student Loan  
0      True          False              

In [128]:
from mlxtend.frequent_patterns import apriori, association_rules

# Drop CustomerID from the one-hot encoded DataFrame since it's not needed for Apriori
df_onehot = df_basket.drop(columns=['CustomerID'])

# Run the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df_onehot, min_support=0.005, use_colnames=True)

print("\nFrequent Itemsets Sample:")
print(frequent_itemsets)



Frequent Itemsets Sample:
    support                                           itemsets
0    0.2214                                        (Auto Loan)
1    0.2901                                    (Business Loan)
2    0.2590                           (Certificate of Deposit)
3    0.2481                                 (Checking Account)
4    0.2869                                      (Credit Card)
..      ...                                                ...
92   0.0591  (Auto Loan, Student Loan, Retirement Account, ...
93   0.0578  (Investment Account, Auto Loan, Student Loan, ...
94   0.0964  (Certificate of Deposit, Checking Account, Per...
95   0.0574  (Investment Account, Home Equity Loan, Student...
96   0.0406  (Investment Account, Student Loan, Retirement ...

[97 rows x 2 columns]


In [None]:
# Generate association rules from the frequent itemsets
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# print("\nAssociation Rules Sample:")
# print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

In [204]:
# Convert sets to lists so they can be JSON-serialized
rules_clean = rules.copy()
rules_clean['antecedents'] = rules_clean['antecedents'].apply(list)
rules_clean['consequents'] = rules_clean['consequents'].apply(list)

# Save as JSON Lines
rules_clean.to_json("../data/rules.json", orient="records", lines=True)


## Find Similar Customers

In [205]:
import pandas as pd

# Load JSON lines
rules = pd.read_json("../data/rules.json", lines=True)

# Convert antecedents/consequents back to sets
rules['antecedents'] = rules['antecedents'].apply(set)
rules['consequents'] = rules['consequents'].apply(set)


In [166]:
def recommend_next_products(customer_products, rules_df, min_conf, min_lift):
    recommendations = set()
    for _, row in rules_df.iterrows():
        antecedent = list(row['antecedents'])
        consequent = list(row['consequents'])
        if set(antecedent).issubset(set(customer_products)) and row['confidence'] >= min_conf and row['lift'] >= min_lift:
            recommendations.update(consequent)
    return list(recommendations)


In [131]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

def compute_customer_similarity(customers_df):
    # Select numeric and categorical features
    numeric_cols = ['Age', 'Income', 'Credit Score', 'Tenure']
    categorical_col = ['Segment']
    
    # Normalize numeric columns
    scaler = StandardScaler()
    numeric_scaled = scaler.fit_transform(customers_df[numeric_cols])
    
    # One-hot encode Segment
    encoder = OneHotEncoder(sparse_output=False)
    segment_encoded = encoder.fit_transform(customers_df[categorical_col])
    
    # Combine features into single vector
    combined_features = np.hstack((numeric_scaled, segment_encoded))
    
    # Compute cosine similarity matrix
    similarity_matrix = cosine_similarity(combined_features)
    
    # Return as DataFrame with CustomerID as index/columns
    sim_df = pd.DataFrame(similarity_matrix, 
                          index=customers_df['CustomerID'], 
                          columns=customers_df['CustomerID'])
    return sim_df


In [132]:
# Example: Customer 200 has Mortgage. We recommend Credit Card. Now find best Credit Card variation

# Load the Customers.csv first
customers_df = pd.read_csv("../data/Customers.csv")

# Compute cosine similarity between all customers using their attributes
sim_df = compute_customer_similarity(customers_df)


## Test set

In [208]:
import pandas as pd

# Load the real data from the data folder
df_customers = pd.read_csv("../data/Customers.csv")
df_transactions = pd.read_csv("../data/Transactions.csv")

# Get each customer's first product (sorted by transaction ID or date)
df_first_product = df_transactions.sort_values(by="TransactionID").groupby("CustomerID").first().reset_index()
df_first_product = df_first_product[['CustomerID', 'Product']]
df_first_product.rename(columns={'Product': 'MainProduct'}, inplace=True)

# Merge with customers
df_customer_test = pd.merge(df_customers, df_first_product, on="CustomerID", how="inner")

# Take any 100 customers for the test set
df_customer_test = df_customer_test.sample(n=100, random_state=42)

# Save to CSV
df_customer_test.to_csv("../data/customer_test.csv", index=False)

print("✅ Test set with 100 customers and their main product saved as 'customer_test.csv'")


✅ Test set with 100 customers and their main product saved as 'customer_test.csv'


## Recommend Product variation based on Similar Customers

In [133]:
def recommend_variation_with_customer_similarity(customer_id, product, df_ratings, sim_df, top_n_similar_users=10):
    if customer_id not in sim_df.index:
        return None

    # Get most similar customers
    similar_customers = sim_df.loc[customer_id].sort_values(ascending=False)
    similar_customers = similar_customers.drop(index=customer_id).head(top_n_similar_users).index.tolist()

    # Filter ratings by these similar customers for the product
    similar_ratings = df_ratings[(df_ratings['CustomerID'].isin(similar_customers)) &
                                 (df_ratings['Product'] == product)]

    if similar_ratings.empty:
        return None

    # Get highest-rated variation
    top_variation = (similar_ratings.groupby('Variation')['Rating']
                     .mean()
                     .sort_values(ascending=False)
                     .idxmax())
    return top_variation


In [134]:
recommend_variation_with_customer_similarity(
    customer_id=1, 
    product="Mortgage", 
    df_ratings=df_ratings, 
    sim_df=sim_df,top_n_similar_users=10
)

'Reverse Mortgage'

In [168]:
def full_recommendation(customer_id, df_transactions, rules_df, df_ratings, sim_df, min_conf, min_lift):
    # Step 1: Find what the customer already has
    customer_products = df_transactions[df_transactions['CustomerID'] == customer_id]['Product'].unique().tolist()
    
    # Step 2: Get recommended products using Apriori rules
    recommended_products = recommend_next_products(customer_products, rules_df, min_conf, min_lift)

    # Step 3: Recommend best variation using customer-based similarity
    recommendations = []
    for prod in recommended_products:
        best_variation = recommend_variation_with_customer_similarity(
            customer_id, prod, df_ratings, sim_df, 10
        )
        recommendations.append({
            'Product': prod,
            'Variation': best_variation or "Default"
        })

    return recommendations


In [207]:
df_transactions = pd.read_csv("../data/Transactions.csv")
# rules = pd.read_csv("../data/rules.csv")  # assuming you've saved your Apriori rules

recos = full_recommendation(
    customer_id=6,
    df_transactions=df_transactions,
    rules_df=rules,
    df_ratings=df_ratings,
    sim_df=sim_df, min_conf=0.6, min_lift=2
)

pd.DataFrame(recos)

Unnamed: 0,Product,Variation
0,Investment Account,Robo Advisor
1,Student Loan,Consolidation
2,Retirement Account,Simple IRA
3,Home Equity Loan,Fixed Rate
4,Auto Loan,Refinance
