In [25]:
import pandas as pd

# Read expense data from CSV
df = pd.read_csv("/Users/sayems_mac/wealth_odyssey/Model/demo_expenses.csv")

# Convert date column to datetime
df['date'] = pd.to_datetime(df['date'])

# (Optional) Filter data for a specific user if needed
user_id = 1
user_expenses = df[df['user_id'] == user_id]

# Create a pivot table that sums spending per category over a period (say, monthly)
expense_features = user_expenses.pivot_table(
    index=pd.Grouper(key='date', freq='M'),
    columns='category',
    values='amount',
    aggfunc='sum',
    fill_value=0
)

print(expense_features.head())


category    entertainment  food  other  shopping  travel  utilities
date                                                               
2023-01-31             90   175     70       120     350        100


  index=pd.Grouper(key='date', freq='M'),


In [14]:
# Aggregate spending by category over a chosen time period (e.g., last 3 months)
recent_expenses = user_expenses[user_expenses['date'] > pd.Timestamp.now() - pd.DateOffset(months=3)]
category_totals = recent_expenses.groupby("category")["amount"].sum()

# Create a feature vector in a fixed order for key categories
categories = ['travel', 'food', 'entertainment', 'utilities']  # adjust as needed
feature_vector = [category_totals.get(cat, 0) for cat in categories]

print("Feature Vector:", feature_vector)



Feature Vector: [0, 0, 0, 0]


In [17]:
from sklearn.cluster import KMeans
import numpy as np

# For demonstration, suppose you have historical feature vectors for multiple time periods/users
# Each row is a feature vector for a time period for one user
X = np.array([
    [500, 100, 50, 200],   # e.g., high travel spending: likely "Traveller"
    [50, 600, 70, 100],    # high food spending: likely "Foodaholic"
    [400, 150, 80, 220],
    [70, 650, 60, 90]
])

# Cluster into two groups (for simplicity)
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X)

print("Cluster assignments:", clusters)

# Suppose after manual inspection we label cluster 0 as "Traveller" and cluster 1 as "Foodaholic"
def map_cluster_to_persona(cluster_label):
    mapping = {0: "Traveller", 1: "Foodaholic"}
    return mapping.get(cluster_label, "Unknown")

# For the current user's feature vector:
current_cluster = kmeans.predict([feature_vector])[0]
current_persona = map_cluster_to_persona(current_cluster)
print("Current Persona:", current_persona)


Cluster assignments: [1 0 1 0]
Current Persona: Foodaholic


In [24]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Load a pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Load your fine-tuning dataset (make sure your JSONL file is in the correct format)
dataset = load_dataset("json", data_files="/Users/sayems_mac/wealth_odyssey/Model/finetune_data.jsonl", split="train")

# Tokenize the dataset
def tokenize_function(examples):
    # For each pair of prompt and completion, concatenate them with a space in between.
    return tokenizer([p + " " + c for p, c in zip(examples["prompt"], examples["completion"])],
                     truncation=True)
    
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "completion"])

# Define data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./gpt2_finetuned")
tokenizer.save_pretrained("./gpt2_finetuned")


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.