In [12]:
! py -m pip install mlxtend --upgrade



ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scikeras 0.4.1 requires packaging<22.0,>=0.21, but you have packaging 23.2 which is incompatible.



Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Collecting scikit-learn>=1.3.1 (from mlxtend)
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting joblib>=0.13.2 (from mlxtend)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 1.4/1.4 MB 23.2 MB/s eta 0:00:00
Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
   ---------------------------------------- 11.1/11.1 MB 18.8 MB/s eta 0:00:00
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, scikit-learn, mlxtend
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully unin

In [3]:
import numpy as np
import pandas as pd

# For reproducibility
np.random.seed(42)

n = 10000

# CustomerID: 1, 2, 3, ..., 10000
customer_id = np.arange(1, n + 1)

# Age: random integer values between 18 and 74
age = np.random.randint(18, 75, n)

# Income: Correlated with age.
# Base income scales linearly from $50,000 (age 18) to $500,000 (age 74).
income_base = 50000 + ((age - 18) / (74 - 18)) * (500000 - 50000)
# Add some noise to vary the income (mean 0, standard deviation $30,000)
noise_income = np.random.normal(0, 30000, n)
income = income_base + noise_income
# Ensure income stays within [$50,000, $500,000]
income = np.clip(income, 50000, 500000)
# Round income to the nearest 1000 so it always ends with '000'
income = np.round(income / 1000) * 1000

# Credit Score: Simulate a score between 300 and 850.
# We make it increase with age: baseline score is 300 at age 18 and 850 at age 74.
credit_score_base = 300 + (age - 18) * ((850 - 300) / (74 - 18))
noise_credit = np.random.normal(0, 30, n)
credit_score = credit_score_base + noise_credit
credit_score = np.clip(credit_score, 300, 850)
credit_score = np.round(credit_score).astype(int)

# Tenure: Number of years with the company, random integer between 1 and 20.
tenure = np.random.randint(1, 21, n)

# Segment: Determine customer segment.
# Default is "Mass Market". High income customers (income > $300,000) become "Affluent".
# Customers aged 60 or above have a 70% chance to be "Retiree", otherwise assign based on income.
segment = []
for a, inc in zip(age, income):
    if a >= 60:
        # 70% chance for older customers to be "Retiree"
        if np.random.rand() < 0.7:
            segment.append("Retiree")
        else:
            segment.append("Affluent" if inc > 300000 else "Mass Market")
    else:
        segment.append("Affluent" if inc > 300000 else "Mass Market")

# Create the DataFrame
df_customers = pd.DataFrame({
    'CustomerID': customer_id,
    'Age': age,
    'Income': income.astype(int),
    'Credit Score': credit_score,
    'Tenure': tenure,
    'Segment': segment
})

# Save the DataFrame to CSV
df_customers.to_csv("Customers.csv", index=False)

print("Mock Customers data generated and saved as 'Customers.csv'.")


Mock Customers data generated and saved as 'Customers.csv'.


In [7]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import csv
# For reproducibility
np.random.seed(42)

total_customers = 10000

# Ensure each customer has at least one transaction.
base_customer_ids = np.arange(1, total_customers + 1)

# Decide on extra transactions (e.g., 5000 additional transactions)
extra_transactions = 10000
extra_customer_ids = np.random.randint(1, total_customers + 1, extra_transactions)

# Combine to get a list where every customer appears at least once.
customer_ids = np.concatenate([base_customer_ids, extra_customer_ids])
num_transactions = len(customer_ids)

# TransactionID: Sequential IDs starting from 5001
transaction_ids = np.arange(5001, 5001 + num_transactions)

# Extensive list of products and their variations that large banks typically offer
products = {
    'Mortgage': ['Fixed Rate', 'Adjustable Rate', 'Hybrid Rate', 'Interest Only', 'Balloon Payment', 'Reverse Mortgage'],
    'Credit Card': ['Rewards', 'Cashback', 'Travel', 'Business', 'Student', 'Secured', 'Balance Transfer'],
    'Checking Account': ['Premium', 'Basic', 'Student', 'Senior', 'Interest Bearing'],
    'Savings Account': ['High Yield', 'Regular', 'Money Market', "Children's Savings"],
    'Certificate of Deposit': ['3-Month', '6-Month', '1-Year', '2-Year', '5-Year', '10-Year'],
    'Personal Loan': ['Secured', 'Unsecured', 'Debt Consolidation', 'Home Improvement'],
    'Auto Loan': ['New Car', 'Used Car', 'Refinance', 'Lease'],
    'Home Equity Loan': ['Fixed Rate', 'Variable Rate', 'HELOC'],
    'Investment Account': ['Managed', 'Self-Directed', 'Robo Advisor', 'Brokerage'],
    'Business Loan': ['Term Loan', 'Line of Credit', 'SBA Loan', 'Equipment Financing'],
    'Retirement Account': ['IRA', '401K', 'Roth IRA', 'SEP IRA', 'Simple IRA'],
    'Student Loan': ['Federal', 'Private', 'Refinance', 'Consolidation']
}

# List of product names for random selection
product_list = list(products.keys())
chosen_products = np.random.choice(product_list, num_transactions)
variations = [np.random.choice(products[prod]) for prod in chosen_products]

# Generate random dates between 2020-01-01 and 2025-03-31
start_date = datetime(2020, 1, 1)
end_date = datetime(2025, 3, 31)
days_range = (end_date - start_date).days
random_days = np.random.randint(0, days_range + 1, num_transactions)
dates = [start_date + timedelta(days=int(day)) for day in random_days]
# Format dates as MM/DD/YYYY (e.g., "08/04/2020")
formatted_dates = [d.strftime("%m/%d/%Y") for d in dates]

# Create the DataFrame
df_transactions = pd.DataFrame({
    'TransactionID': transaction_ids,
    'CustomerID': customer_ids,
    'Product': chosen_products,
    'Variation': variations,
    'Date': formatted_dates
})

# Ensure Date column is explicitly string
df_transactions['Date'] = df_transactions['Date'].astype(str)

# Save to CSV with all fields quoted so that formatting is preserved
df_transactions.to_csv("Transactions.csv", index=False, quoting=csv.QUOTE_ALL)
print("Mock Transactions data generated and saved as 'Transactions.csv'.")


Mock Transactions data generated and saved as 'Transactions.csv'.


In [50]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import csv

# For reproducibility
np.random.seed(42)

total_customers = 10000

# Ensure each customer has at least one transaction.
base_customer_ids = np.arange(1, total_customers + 1)
extra_transactions = 10000  # Increase extra transactions if needed
extra_customer_ids = np.random.randint(1, total_customers + 1, extra_transactions)
customer_ids = np.concatenate([base_customer_ids, extra_customer_ids])
num_transactions = len(customer_ids)

# TransactionID: Sequential IDs starting from 5001
transaction_ids = np.arange(5001, 5001 + num_transactions)

# Original products dictionary with variations
products = {
    'Mortgage': ['Fixed Rate', 'Adjustable Rate', 'Hybrid Rate', 'Interest Only', 'Balloon Payment', 'Reverse Mortgage'],
    'Credit Card': ['Rewards', 'Cashback', 'Travel', 'Business', 'Student', 'Secured', 'Balance Transfer'],
    'Checking Account': ['Premium', 'Basic', 'Student', 'Senior', 'Interest Bearing'],
    'Savings Account': ['High Yield', 'Regular', 'Money Market', "Children's Savings"],
    'Certificate of Deposit': ['3-Month', '6-Month', '1-Year', '2-Year', '5-Year', '10-Year'],
    'Personal Loan': ['Secured', 'Unsecured', 'Debt Consolidation', 'Home Improvement'],
    'Auto Loan': ['New Car', 'Used Car', 'Refinance', 'Lease'],
    'Home Equity Loan': ['Fixed Rate', 'Variable Rate', 'HELOC'],
    'Investment Account': ['Managed', 'Self-Directed', 'Robo Advisor', 'Brokerage'],
    'Business Loan': ['Term Loan', 'Line of Credit', 'SBA Loan', 'Equipment Financing'],
    'Retirement Account': ['IRA', '401K', 'Roth IRA', 'SEP IRA', 'Simple IRA'],
    'Student Loan': ['Federal', 'Private', 'Refinance', 'Consolidation']
}

# Define three product groups (fewer combinations per customer)
group1 = ['Mortgage', 'Credit Card', 'Business Loan']
group2 = ['Checking Account', 'Savings Account', 'Certificate of Deposit', 'Personal Loan']
group3 = ['Auto Loan', 'Home Equity Loan', 'Investment Account', 'Retirement Account', 'Student Loan']

groups = {1: group1, 2: group2, 3: group3}
all_products = group1 + group2 + group3  # complete list of products

# Assign each customer to one group at random
customer_group = {}
for cid in range(1, total_customers + 1):
    customer_group[cid] = np.random.choice([1, 2, 3])

# Generate chosen_products for each transaction:
# With 80% probability, choose from the customer's group.
chosen_products = []
for cid in customer_ids:
    if np.random.rand() < 0.8:
        prod = np.random.choice(groups[customer_group[cid]])
    else:
        prod = np.random.choice(all_products)
    chosen_products.append(prod)

# Generate variations for each chosen product from the original products dictionary
chosen_variations = [np.random.choice(products[prod]) for prod in chosen_products]

# Generate random dates between 2020-01-01 and 2025-03-31
start_date = datetime(2020, 1, 1)
end_date = datetime(2025, 3, 31)
days_range = (end_date - start_date).days
random_days = np.random.randint(0, days_range + 1, num_transactions)
dates = [start_date + timedelta(days=int(day)) for day in random_days]
formatted_dates = [d.strftime("%m/%d/%Y") for d in dates]

# Create the DataFrame
df_transactions = pd.DataFrame({
    'TransactionID': transaction_ids,
    'CustomerID': customer_ids,
    'Product': chosen_products,
    'Variation': chosen_variations,
    'Date': formatted_dates
})

# Ensure Date column is explicitly string and save CSV with quotes to preserve formatting
df_transactions['Date'] = df_transactions['Date'].astype(str)
df_transactions.to_csv("Transactions.csv", index=False, quoting=csv.QUOTE_ALL)
print("Modified mock Transactions data generated and saved as 'Transactions.csv'.")


Modified mock Transactions data generated and saved as 'Transactions.csv'.


In [51]:
import pandas as pd

# Load the Transactions data
df_transactions = pd.read_csv("Transactions.csv")

# Take a quick look at the data
print("Transactions Data Sample:")
print(df_transactions.head())


Transactions Data Sample:
   TransactionID  CustomerID      Product         Variation        Date
0           5001           1     Mortgage   Adjustable Rate  03/13/2025
1           5002           2     Mortgage  Reverse Mortgage  08/23/2024
2           5003           3  Credit Card           Secured  11/03/2023
3           5004           4     Mortgage        Fixed Rate  02/02/2024
4           5005           5  Credit Card            Travel  03/13/2025


In [52]:
# Group transactions by CustomerID and aggregate unique products into a list (basket)
basket = df_transactions.groupby('CustomerID')['Product'] \
                          .apply(lambda x: list(set(x))) \
                          .reset_index()

print("\nBasket Sample:")
print(basket.head)



Basket Sample:
<bound method NDFrame.head of       CustomerID                                            Product
0              1                                         [Mortgage]
1              2  [Investment Account, Mortgage, Auto Loan, Home...
2              3                            [Credit Card, Mortgage]
3              4                                         [Mortgage]
4              5    [Credit Card, Certificate of Deposit, Mortgage]
...          ...                                                ...
9995        9996                               [Investment Account]
9996        9997                  [Personal Loan, Checking Account]
9997        9998                                         [Mortgage]
9998        9999                          [Business Loan, Mortgage]
9999       10000                                         [Mortgage]

[10000 rows x 2 columns]>


In [53]:
from mlxtend.preprocessing import TransactionEncoder

# Transform the basket list into one-hot encoded format
te = TransactionEncoder()
te_array = te.fit(basket['Product']).transform(basket['Product'])
df_basket = pd.DataFrame(te_array, columns=te.columns_, index=basket['CustomerID'])
df_basket.reset_index(inplace=True)  # Optionally keep CustomerID as a column

print("\nOne-Hot Encoded Basket Sample:")
print(df_basket.head())



One-Hot Encoded Basket Sample:
   CustomerID  Auto Loan  Business Loan  Certificate of Deposit  \
0           1      False          False                   False   
1           2       True          False                   False   
2           3      False          False                   False   
3           4      False          False                   False   
4           5      False          False                    True   

   Checking Account  Credit Card  Home Equity Loan  Investment Account  \
0             False        False             False               False   
1             False        False              True                True   
2             False         True             False               False   
3             False        False             False               False   
4             False         True             False               False   

   Mortgage  Personal Loan  Retirement Account  Savings Account  Student Loan  
0      True          False              

In [56]:
from mlxtend.frequent_patterns import apriori, association_rules

# Drop CustomerID from the one-hot encoded DataFrame since it's not needed for Apriori
df_onehot = df_basket.drop(columns=['CustomerID'])

# Run the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df_onehot, min_support=0.005, use_colnames=True)

print("\nFrequent Itemsets Sample:")
print(frequent_itemsets)



Frequent Itemsets Sample:
    support                                           itemsets
0    0.1261                                        (Auto Loan)
1    0.1718                                    (Business Loan)
2    0.1466                           (Certificate of Deposit)
3    0.1516                                 (Checking Account)
4    0.1771                                      (Credit Card)
..      ...                                                ...
84   0.0108  (Savings Account, Checking Account, Personal L...
85   0.0056  (Investment Account, Retirement Account, Home ...
86   0.0050  (Investment Account, Home Equity Loan, Student...
87   0.0052  (Home Equity Loan, Retirement Account, Student...
88   0.0054  (Investment Account, Retirement Account, Stude...

[89 rows x 2 columns]


In [57]:
# Generate association rules from the frequent itemsets
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

print("\nAssociation Rules Sample:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])



Association Rules Sample:
                            antecedents  \
0                           (Auto Loan)   
1                    (Home Equity Loan)   
2                           (Auto Loan)   
3                  (Investment Account)   
4                  (Retirement Account)   
..                                  ...   
99   (Student Loan, Investment Account)   
100  (Retirement Account, Student Loan)   
101                (Investment Account)   
102                (Retirement Account)   
103                      (Student Loan)   

                                  consequents  support  confidence      lift  
0                          (Home Equity Loan)   0.0250    0.198255  1.627712  
1                                 (Auto Loan)   0.0250    0.205255  1.627712  
2                        (Investment Account)   0.0233    0.184774  1.461820  
3                                 (Auto Loan)   0.0233    0.184335  1.461820  
4                                 (Auto Loan)   0.0262    0.2