In [12]:
! py -m pip install mlxtend --upgrade



ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scikeras 0.4.1 requires packaging<22.0,>=0.21, but you have packaging 23.2 which is incompatible.



Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Collecting scikit-learn>=1.3.1 (from mlxtend)
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting joblib>=0.13.2 (from mlxtend)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 1.4/1.4 MB 23.2 MB/s eta 0:00:00
Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
   ---------------------------------------- 11.1/11.1 MB 18.8 MB/s eta 0:00:00
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, scikit-learn, mlxtend
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully unin

In [7]:
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import AzureChatOpenAI

In [None]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta

# Setup
fake = Faker()
np.random.seed(42)
random.seed(42)

# -----------------------
# 1. Generate Customers
# -----------------------
num_customers = 100
customer_data = []

for i in range(1, num_customers + 1):
    is_male = random.choice([True, False])
    first_name = fake.first_name_male() if is_male else fake.first_name_female()
    last_name = fake.last_name()
    gender = 'M' if is_male else 'F'
    dob = fake.date_of_birth(minimum_age=18, maximum_age=75)
    age = (datetime.now().date() - dob).days // 365
    income = random.randint(20000, 150000)
    credit_score = int(np.clip(np.random.normal(loc=600 + (income / 1000), scale=50), 300, 850))

    customer_data.append({
        "Customer_ID": f"CUST{i:04d}",
        "First_Name": first_name,
        "Last_Name": last_name,
        "Gender": gender,
        "Date_of_Birth": dob,
        "Age": age,
        "Email": fake.email(),
        "Phone": fake.phone_number(),
        "Address": fake.address().replace("\n", ", "),
        "City": fake.city(),
        "State": fake.state(),
        "Postal_Code": fake.postcode(),
        "Country": "USA",
        "Annual_Income": income,
        "Credit_Score": credit_score
    })

df_customers = pd.DataFrame(customer_data)
df_customers.to_csv("customers.csv", index=False)

# ----------------------------
# 2. Generate Transactions
# ----------------------------
transaction_data = []
categories = ['Grocery', 'Fuel', 'Dining', 'Travel', 'Medical', 'Baby', 'Education', 'Entertainment', 'Home', 'Salary']
merchants = {
    'Grocery': ['Walmart', 'Whole Foods', 'Kroger'],
    'Fuel': ['Shell', 'Exxon', 'Chevron'],
    'Dining': ['McDonald\'s', 'Starbucks', 'Chipotle'],
    'Travel': ['Delta', 'Airbnb', 'Uber'],
    'Medical': ['CVS', 'Walgreens', 'Urgent Care'],
    'Baby': ['BabyGap', 'Target', 'Pampers Store'],
    'Education': ['Coursera', 'Udemy', 'University Bookstore'],
    'Entertainment': ['Netflix', 'AMC Theatres', 'Spotify'],
    'Home': ['Home Depot', 'Lowe\'s', 'IKEA'],
    'Salary': ['Company Payroll', 'Direct Deposit']
}

txn_id = 1
for customer in df_customers['Customer_ID']:
    num_txns = random.randint(5, 15)
    for _ in range(num_txns):
        category = random.choice(categories)
        merchant = random.choice(merchants[category])
        amount = round(random.uniform(10, 2000) if category != 'Salary' else random.uniform(3000, 10000), 2)
        date = datetime.now() - timedelta(days=random.randint(0, 28))
        
        transaction_data.append({
            "Transaction_ID": f"TXN{txn_id:06d}",
            "Customer_ID": customer,
            "Timestamp": date.strftime("%Y-%m-%d %H:%M:%S"),
            "Merchant": merchant,
            "Category": category,
            "Amount": amount,
            "Description": f"{category} - {merchant}"
        })
        txn_id += 1

df_transactions = pd.DataFrame(transaction_data)
df_transactions.to_csv("transactions.csv", index=False)

# -------------------------
# 3. Generate Product Table
# -------------------------



TypeError: Random.randint() takes 3 positional arguments but 4 were given

In [3]:
import pandas as pd

# Load CSVs
df_customers = pd.read_csv("customers.csv")
df_transactions = pd.read_csv("transactions.csv")

# Ensure timestamp is in datetime format
df_transactions["Timestamp"] = pd.to_datetime(df_transactions["Timestamp"])

# Filter only last 4 weeks of transactions
latest_date = df_transactions["Timestamp"].max()
cutoff_date = latest_date - pd.Timedelta(days=28)
df_recent = df_transactions[df_transactions["Timestamp"] >= cutoff_date]

# Aggregate features per customer
features = df_recent.groupby("Customer_ID").agg(
    Total_Spend=("Amount", "sum"),
    Num_Transactions=("Transaction_ID", "count"),
    Avg_Txn_Amount=("Amount", "mean"),
    Max_Txn_Amount=("Amount", "max"),
    Has_Salary_Credit=("Category", lambda x: int("Salary" in x.values))
).reset_index()

# Pivot category spend into separate columns
category_spend = df_recent.pivot_table(
    index="Customer_ID",
    columns="Category",
    values="Amount",
    aggfunc="sum",
    fill_value=0
).add_prefix("Spend_").reset_index()

# Merge with customer demographics
df_feature_store = (
    features
    .merge(category_spend, on="Customer_ID", how="left")
    .merge(df_customers[["Customer_ID", "Age", "Annual_Income", "Credit_Score"]], on="Customer_ID", how="left")
)

# Save to CSV
df_feature_store.to_csv("feature_store.csv", index=False)


In [31]:
import pandas as pd

# Full product list with detailed columns
products = [
    ("P001", "Starter Credit Card", "Credit Card", "Low",
     "1% cashback on all purchases",
     "New credit users, low spenders",
     "Income > 20,000 and Credit Score > 600",
     "$25 cashback on first $300 spend"),
    
    ("P002", "Everyday Saver", "Savings Account", "Low",
     "4% interest, no maintenance fees",
     "Customers starting savings",
     "Open to all with minimum $100 deposit",
     "$50 bonus for maintaining $1,000 balance for 3 months"),
    
    ("P003", "Smart Shopper Card", "Credit Card", "Mid",
     "2% cashback on groceries & fuel",
     "High grocery and fuel spenders",
     "Income > 30,000 and Credit Score > 650",
     "$50 cashback on first $500 spend"),
    
    ("P004", "Smart Budget Account", "Savings + Budget", "Mid",
     "5% APY on goal-based savings",
     "Budget-conscious savers",
     "Income > 25,000",
     "Free premium budgeting tools for 6 months"),
    
    ("P005", "Travel Rewards Elite", "Credit Card", "High",
     "3x travel points, lounge access",
     "Frequent travelers",
     "Income > 70,000 and Credit Score > 700",
     "$200 travel voucher after $5,000 spend"),
    
    ("P006", "Elite Wealth Plan", "Investment", "High",
     "6-8% ROI, personal wealth advisor",
     "High-income professionals",
     "Income > 100,000 and Age > 30",
     "1% bonus returns for first year"),
    
    ("P007", "Family Future Plan", "Insurance/Savings", "Mid-High",
     "Child savings + life insurance combo",
     "Families, new parents",
     "Married, Age 25-45",
     "$100 bonus on first year premium"),
    
    ("P008", "Student Flex Account", "Savings", "Student",
     "No fees, education budgeting tools",
     "Students under 25",
     "Age < 25",
     "$100 top-up on monthly deposits over $200"),
    
    ("P009", "Home Advantage Loan", "Loan", "Mid-High",
     "Low-interest home loans",
     "First-time home buyers",
     "Home-related spend > $10,000",
     "Reduced interest rate for bundled insurance"),
    
    ("P010", "Retirement Essentials", "Wealth + Health", "Senior",
     "Pension fund + health perks",
     "Customers over 60",
     "Age > 60",
     "$500 wellness bonus for first year"),
    
    ("P011", "FlexFuel Card", "Credit Card", "Low-Mid",
     "3% cashback on fuel",
     "Regular commuters",
     "Fuel spend > $150/month",
     "$50 fuel voucher after $500 spend"),
    
    ("P012", "Digital Nomad Saver", "Savings Account", "Mid",
     "1.5% bonus APY for international use",
     "Frequent travelers, remote workers",
     "Travel spend > $500/month",
     "No international transfer fees for 6 months"),
    
    ("P013", "HealthSecure Plan", "Health + Savings", "Mid",
     "HSA integration, telehealth access",
     "Health-conscious individuals",
     "Medical spend > $200/month",
     "$200 credit toward medical expenses"),
    
    ("P014", "GreenLife Investment", "ESG Investment", "Mid-High",
     "5-9% returns in sustainable funds",
     "Eco-conscious investors",
     "Interest in ESG, Income > 40,000",
     "$100 green bonus + 0.5% bonus returns"),
    
    ("P015", "Weekend Explorer Card", "Credit Card", "Mid",
     "2.5% cashback on dining & entertainment",
     "Social and active lifestyle",
     "Dining + entertainment spend > $400/month",
     "Free concert ticket after $2,000 spend"),
    
    ("P016", "CashBuilder Certificate", "CD / Fixed Deposit", "Low-Mid",
     "4.75% fixed interest (6 months)",
     "Idle balance savers",
     "Idle funds > $5,000",
     "$25 bonus for auto-renewal"),
    
    ("P017", "Lifestyle Bundle Plus", "Bundle (3-in-1)", "High",
     "Credit + Wealth + Travel perks",
     "Affluent, multi-product users",
     "Income > 100,000",
     "$300 statement credit + concierge onboarding"),
    
    ("P018", "BabyStart Trust Plan", "Child Investment", "Mid",
     "Custodial account for education",
     "Parents with young children",
     "Children < 5 years",
     "1st year fees waived + $100 education bonus"),
    
    ("P019", "MoveSmart Relocation Loan", "Loan", "Mid",
     "0% for 6 months, flexible repayment",
     "Customers with large recent spends",
     "One-time spend > $5,000",
     "$150 moving voucher"),
    
    ("P020", "SideHustle Account", "Business Checking", "Low-Mid",
     "No fees, invoice management",
     "Freelancers, side businesses",
     "Self-employed, 3+ biz txns/month",
     "$75 bonus for linking payment gateway")
]

# Create DataFrame
columns = ["Product_ID", "Product_Name", "Product_Type", "Tier",
           "Features_Benefits", "Target_Behavior", "Eligibility_Criteria", "Special_Offer"]

df_products = pd.DataFrame(products, columns=columns)

# Save to CSV
df_products.to_csv("products.csv", index=False)

print("✅ products.csv created with detailed product information.")


✅ products.csv created with detailed product information.


In [37]:
import pandas as pd

# Load data
df_transactions = pd.read_csv("transactions.csv")
df_customers = pd.read_csv("customers.csv")
df_feature_store = pd.read_csv("feature_store.csv")

# Ensure Timestamp is datetime
df_transactions["Timestamp"] = pd.to_datetime(df_transactions["Timestamp"])

# 1️⃣ Calculate Aggregation_Days per customer
agg_days = df_transactions.groupby("Customer_ID").agg(
    First_Txn_Date=("Timestamp", "min"),
    Last_Txn_Date=("Timestamp", "max")
)
agg_days["Aggregation_Days"] = (agg_days["Last_Txn_Date"] - agg_days["First_Txn_Date"]).dt.days + 1
agg_days = agg_days[["Aggregation_Days"]].reset_index()

# 2️⃣ Calculate Spend Variability (std deviation of Amount)
spend_var = df_transactions.groupby("Customer_ID").agg(
    Spend_Variability=("Amount", "std")
).fillna(0).reset_index()

# 3️⃣ Calculate Salary_to_Spend_Ratio
salary_spend = df_transactions.groupby(["Customer_ID", "Category"]).agg(Total_Category_Spend=("Amount", "sum")).reset_index()
salary = salary_spend[salary_spend["Category"] == "Salary"][["Customer_ID", "Total_Category_Spend"]].rename(columns={"Total_Category_Spend": "Total_Salary"})
total_spend = df_feature_store[["Customer_ID", "Total_Spend"]]

salary_ratio = pd.merge(total_spend, salary, on="Customer_ID", how="left").fillna(0)
salary_ratio["Salary_to_Spend_Ratio"] = salary_ratio["Total_Salary"] / salary_ratio["Total_Spend"]
salary_ratio = salary_ratio[["Customer_ID", "Salary_to_Spend_Ratio"]]

# 4️⃣ Top Spend Category
top_category = df_transactions[df_transactions["Category"] != "Salary"].groupby(["Customer_ID", "Category"]).agg(
    Category_Spend=("Amount", "sum")
).reset_index()

top_spend_cat = top_category.loc[top_category.groupby("Customer_ID")["Category_Spend"].idxmax()]
top_spend_cat = top_spend_cat[["Customer_ID", "Category"]].rename(columns={"Category": "Top_Spend_Category"})

# 5️⃣ Idle Balance Estimate
income = df_customers[["Customer_ID", "Annual_Income"]]
idle_balance = pd.merge(total_spend, income, on="Customer_ID", how="left")
idle_balance["Idle_Balance_Estimate"] = idle_balance["Annual_Income"] - idle_balance["Total_Spend"]
idle_balance = idle_balance[["Customer_ID", "Idle_Balance_Estimate"]]

# Merge all new features
df_enhanced = df_feature_store.merge(agg_days, on="Customer_ID", how="left")
df_enhanced = df_enhanced.merge(spend_var, on="Customer_ID", how="left")
df_enhanced = df_enhanced.merge(salary_ratio, on="Customer_ID", how="left")
df_enhanced = df_enhanced.merge(top_spend_cat, on="Customer_ID", how="left")
df_enhanced = df_enhanced.merge(idle_balance, on="Customer_ID", how="left")

# Fill any remaining NaNs
df_enhanced = df_enhanced.fillna({
    "Spend_Variability": 0,
    "Salary_to_Spend_Ratio": 0,
    "Top_Spend_Category": "Unknown",
    "Aggregation_Days": 28  # Default if only 1 txn
})

# Save enhanced feature store
df_enhanced.to_csv("feature_store_enhanced.csv", index=False)

print("✅ Enhanced feature_store_enhanced.csv created.")


✅ Enhanced feature_store_enhanced.csv created.


In [38]:
import pandas as pd
import sqlite3

# Load CSVs
df_customers = pd.read_csv("customers.csv")
df_transactions = pd.read_csv("transactions.csv")
df_products = pd.read_csv("products.csv")
df_feature_store = pd.read_csv("feature_store_enhanced.csv")  # From earlier step

# Create SQLite DB
conn = sqlite3.connect("cross_selling.db")

# Write tables
df_customers.to_sql("customers", conn, if_exists="replace", index=False)
df_transactions.to_sql("transactions", conn, if_exists="replace", index=False)
df_products.to_sql("products", conn, if_exists="replace", index=False)
df_feature_store.to_sql("feature_store", conn, if_exists="replace", index=False)

conn.close()
print("✅ Data loaded into cross_selling.db")


✅ Data loaded into cross_selling.db


## LLM

In [None]:
import sqlite3
import pandas as pd
from langchain.agents import tool

conn = sqlite3.connect("cross_selling.db")

@tool
def fetch_customer_profile(name: str) -> str:
    """Fetch basic customer profile by full name."""
    df = pd.read_sql(f"SELECT * FROM customers WHERE First_Name || ' ' || Last_Name = '{name}'", conn)
    return df.to_json(orient="records") if not df.empty else "Customer not found."

@tool
def analyze_customer_behavior(customer_id: str) -> str:
    """Provides a detailed analysis of customer behavior, spending patterns, and financial signals."""
    df = pd.read_sql(f"SELECT * FROM feature_store_enhanced WHERE Customer_ID = '{customer_id}'", conn)
    if df.empty:
        return "No behavior data found for this customer."
    
    row = df.iloc[0]
    insights = []
    
    # Income & Credit Score Insights
    if row["Annual_Income"] > 100000:
        insights.append(f"High annual income: ${row['Annual_Income']}")
    if row["Credit_Score"] > 700:
        insights.append(f"Strong credit score: {row['Credit_Score']}")
    
    # Spending Patterns
    if row.get("Spend_Grocery", 0) > 500:
        insights.append(f"Grocery spend: ${row['Spend_Grocery']:.2f}")
    if row.get("Spend_Travel", 0) > 800:
        insights.append(f"Travel spend: ${row['Spend_Travel']:.2f}")
    if row.get("Spend_Fuel", 0) > 150:
        insights.append(f"Fuel spend: ${row['Spend_Fuel']:.2f}")
    if row.get("Spend_Medical", 0) > 200:
        insights.append(f"Medical expenses: ${row['Spend_Medical']:.2f}")

    # Salary & Disposable Income
    if row["Has_Salary_Credit"]:
        insights.append("Regular salary credits detected")
    if row["Salary_to_Spend_Ratio"] > 0.5:
        insights.append(f"Good disposable income (Salary to Spend Ratio: {row['Salary_to_Spend_Ratio']:.2f})")

    # Idle Balance Potential
    if row["Idle_Balance_Estimate"] > 5000:
        insights.append(f"Potential idle balance: ${row['Idle_Balance_Estimate']:.2f}")

    # Top Spending Category
    insights.append(f"Primary spending category: {row['Top_Spend_Category']}")

    # Aggregation Period
    insights.append(f"Observed over {row['Aggregation_Days']} days")

    return " | ".join(insights)

@tool
def fetch_product_catalog(dummy_input: str) -> str:
    """Returns the bank's product catalog for cross-selling."""
    df = pd.read_sql("SELECT * FROM products", conn)
    return df.to_json(orient="records")





In [4]:
OPENAI_DEPLOYMENT_ENDPOINT = "https://az-openai-document-question-answer-service.openai.azure.com/" 
OPENAI_API_KEY = "5d24331966b648738e5003caad552df8" 
OPENAI_API_VERSION = "2023-05-15"

OPENAI_DEPLOYMENT_NAME = "az-gpt_35_model"
OPENAI_MODEL_NAME="gpt-3.5-turbo"

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = "az-embedding_model" 
OPENAI_ADA_EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

encoding_name = "cl100k_base"

llm = AzureChatOpenAI(
                        temperature=0.1,
                        deployment_name=OPENAI_DEPLOYMENT_NAME,
                        model_name=OPENAI_MODEL_NAME,
                        azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
                        openai_api_version=OPENAI_API_VERSION,
                        openai_api_key=OPENAI_API_KEY            
                    )

In [34]:
system_prompt = """
You are an AI financial advisor specializing in personalized product recommendations.

Instructions:
1. Use the customer's **financial profile** and **spending behavior** to recommend up to 2 products.
2. Reference specific numbers in your reasoning:
   - Mention spend amounts (e.g., "$750 on groceries in 4 weeks").
   - Mention credit score, income, age, or salary detection if relevant.
3. Match products from the catalog based on these data points and the product features.
4. Do **NOT** start with "Hello [Customer Name]" or end with generic phrases like "feel free to ask".
5. Provide a clear, concise explanation:
   - State WHY each product is recommended.
   - Link customer behavior directly to product benefits.
6. Use bullet points if multiple recommendations.

Output Format Example:

Recommendation Summary:
- Based on grocery spend of $720 and fuel spend of $180, the 'Smart Shopper Card' is suitable due to 2% cashback on groceries.
- Travel spend of $1,400 and a credit score of 720 make 'Travel Rewards Elite' ideal for maximizing travel points.

"""

In [35]:
from langchain.agents.format_scratchpad.openai_tools import format_to_openai_tool_messages
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain.agents import AgentExecutor
import textwrap

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

tools = [fetch_customer_profile, analyze_customer_behavior, fetch_product_catalog]
llm_with_tools = llm.bind_tools(tools)

agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_to_openai_tool_messages(x["intermediate_steps"]),
    }
    | prompt
    | llm_with_tools
    | OpenAIToolsAgentOutputParser()
)

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

response = agent_executor.invoke({"input": "What do you recommend for Austin Jones"})

output_text = response.get('output', '') if isinstance(response, dict) else getattr(response, 'content', str(response))

# Pretty print with wrapping at 100 characters





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `fetch_customer_profile` with `{'name': 'Austin Jones'}`


[0m[36;1m[1;3m[{"Customer_ID":"CUST0003","First_Name":"Austin","Last_Name":"Jones","Gender":"M","Date_of_Birth":"1992-01-01","Age":33,"Email":"wrightkayla@example.net","Phone":"963-583-1637","Address":"442 Collier Throughway Apt. 210, Annachester, SC 13867","City":"North Rodney","State":"Ohio","Postal_Code":77034,"Country":"USA","Annual_Income":38289,"Credit_Score":670}][0m[32;1m[1;3m
Invoking: `analyze_customer_behavior` with `{'customer_id': 'CUST0003'}`


[0m[33;1m[1;3mHigh grocery spending: $958.81 | Regular salary credits detected[0m[32;1m[1;3m
Invoking: `fetch_product_catalog` with `{'dummy_input': 'dummy'}`


[0m[38;5;200m[1;3m[{"Product_ID":"P001","Product_Name":"Starter Credit Card","Product_Type":"Credit Card","Tier":"Low","Features_Benefits":"1% cashback on all purchases","Target_Behavior":"New credit users, low spenders","Eligibil

In [36]:
print("\n" + textwrap.fill(output_text, width=100))


Recommendation Summary: - Based on Austin Jones's high grocery spending of $958.81, I recommend the
**Smart Shopper Card**.   - This card offers 2% cashback on groceries, which aligns well with
Austin's significant grocery expenses.   - The eligibility criteria require an income of over
$30,000 and a credit score above 650, which fits Austin's financial profile.  Feel free to explore
this card for maximizing cashback on your grocery purchases!
