In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import re
from collections import defaultdict
from tqdm import tqdm

In [None]:
# Training split of the Synthetic-Persona-Chat dataset, read straight from the
# Hugging Face URL below (so this cell needs network access to run).
url = "https://huggingface.co/datasets/google/Synthetic-Persona-Chat/resolve/main/data/Synthetic-Persona-Chat_train.csv"
df = pd.read_csv(url)

In [None]:
def fit_wave(arr):
    """
    Overwrite every value except the last with a tilted half sine wave.

    The wave is centred on the mean of ``arr[:-1]`` and uses half of that
    slice's range as its amplitude (1.0 when the slice is flat). A linear
    ramp is added so the curve would land on the original final value; the
    final element itself is kept as it was.

    Parameters:
        arr (array-like): 1-D sequence of numbers.

    Returns:
        np.ndarray: Float array holding the fitted values. Inputs with fewer
        than 3 elements are returned (as a float array) without changes.

    Note:
        The write is in place only when ``arr`` is already a float64 ndarray
        (including a view such as ``matrix[:, i]``), because ``np.asarray``
        then returns the same object. Any other input is copied, so only the
        returned array carries the result.
    """
    wave = np.asarray(arr, dtype=float)
    size = len(wave)
    if size < 3:
        return wave

    last = wave[-1]
    head = wave[:-1]  # everything that will be replaced

    # Centre and amplitude come from the values being replaced.
    center = np.mean(head)
    half_range = (np.max(head) - np.min(head)) / 2
    if half_range == 0:
        half_range = 1.0  # keep a visible wave for constant input

    # Half period of a sine (0 -> 1 -> 0), recentred and scaled.
    half_sine = np.sin(np.linspace(0, np.pi, size))
    shaped = (half_sine - 0.5) * 2 * half_range + center

    # Linear tilt that closes the gap between the wave's end and `last`.
    tilt = np.linspace(0, last - shaped[-1], size)
    target = shaped + tilt

    wave[:-1] = target[:-1]
    wave[-1] = last
    return wave

def generate_blurbs(df, n_samples=50000):
    """
    Build lower-cased sentence blurbs from 'Best Generated Conversation'.

    Each non-null conversation is split after '.', '?' or '!' followed by
    whitespace. Every piece is stripped, lower-cased, has "user N:" speaker
    labels removed, and has its whitespace collapsed to single spaces. Only
    pieces of 3 to 256 words are kept.

    Parameters:
        df (pd.DataFrame): Frame with a 'Best Generated Conversation' column.
        n_samples (int): Number of blurbs wanted.

    Returns:
        List[str]: ``n_samples`` blurbs drawn without replacement via
        ``random.sample``; if fewer are available, a message is printed and
        all of them are returned in extraction order.
    """
    sentence_break = re.compile(r'[.?!]\s+')
    speaker_tag = re.compile(r"user \d+: ?")

    def clean_words(fragment):
        # Lower-case first so the (lower-case) speaker pattern can match.
        lowered = fragment.strip().lower()
        return speaker_tag.sub("", lowered).split()

    all_blurbs = [
        " ".join(words)
        for convo in df["Best Generated Conversation"].dropna()
        for words in map(clean_words, sentence_break.split(convo.strip()))
        if 3 <= len(words) <= 256
    ]

    if len(all_blurbs) >= n_samples:
        return random.sample(all_blurbs, n_samples)

    print(f"Only {len(all_blurbs)} blurbs found.")
    return all_blurbs

def tokenize_blurbs(df, n_samples=50000):
    """
    Sample blurbs with ``generate_blurbs`` and split them into word tokens.

    Parameters:
        df (pd.DataFrame): Passed through to ``generate_blurbs``.
        n_samples (int): Passed through to ``generate_blurbs``.

    Returns:
        Tuple[List[List[str]], List[str]]: The token lists of every blurb that
        has more than one token, and the sorted vocabulary of those tokens.
    """
    word_pattern = re.compile(r'\b\w+\b')
    candidates = (
        word_pattern.findall(text.lower())
        for text in generate_blurbs(df, n_samples)
    )
    # Single-token blurbs cannot form a (word, next word) pair; drop them.
    tokenized_blurbs = [tokens for tokens in candidates if len(tokens) > 1]
    vocab = {token for tokens in tokenized_blurbs for token in tokens}
    return tokenized_blurbs, sorted(vocab)

def build_training_pairs(tokenized_blurbs):
    """
    Turn token lists into (word, next word) pairs.

    Parameters:
        tokenized_blurbs (List[List[str]]): One token list per blurb.

    Returns:
        List[Tuple[str, str]]: Adjacent-token pairs, in blurb order and then
        position order. Blurbs with fewer than two tokens contribute nothing.
    """
    return [
        bigram
        for tokens in tokenized_blurbs
        for bigram in zip(tokens, tokens[1:])
    ]

def init_embeds(df, embed_dim=2):
    """
    Create a random embedding for every word found in the three text columns.

    The 'user 1 personas', 'user 2 personas' and 'Best Generated Conversation'
    columns are joined per row (missing values become empty strings),
    lower-cased and split into ``\\w+`` tokens.

    Parameters:
        df (pd.DataFrame): Frame containing the three columns named above.
        embed_dim (int): Length of each embedding vector.

    Returns:
        dict: Maps each distinct word to a list of ``embed_dim`` floats drawn
        uniformly from [-1, 1) with ``np.random.uniform``.
    """
    text_columns = (
        "user 1 personas",
        "user 2 personas",
        "Best Generated Conversation",
    )
    filled = [df[name].fillna('') for name in text_columns]
    all_text = filled[0] + " " + filled[1] + " " + filled[2]

    word_pattern = re.compile(r'\b\w+\b')
    unique_words = set()
    for row in all_text:
        unique_words.update(word_pattern.findall(row.lower()))

    embedding_dict = {}
    for word in unique_words:
        embedding_dict[word] = np.random.uniform(-1, 1, size=embed_dim).tolist()
    return embedding_dict

def get_blurb_embedding_blob(blurb, embedding_dict):
    """
    Given a blurb and an embedding dictionary, return the embedding blob.

    Parameters:
        blurb (str): The input text (a short sentence or phrase).
        embedding_dict (dict): A dictionary mapping words to equal-length
            lists of floats (any embedding width, not just 2).

    Returns:
        np.ndarray: A (n_words, embed_dim) array with one row per ``\\w+``
        token of the lower-cased blurb. Words missing from the dictionary get
        a zero row. A blurb with no tokens yields an array of shape
        (0, embed_dim) so callers can still read ``.shape[1]``.
    """
    tokens = re.findall(r'\b\w+\b', blurb.lower())

    # Take the width from the dictionary itself so the zero fallback always
    # matches the real vectors; a fixed 2-element fallback makes the rows
    # ragged as soon as the dictionary holds wider embeddings. With an empty
    # dictionary there is nothing to infer from, so keep the historical 2.
    if embedding_dict:
        embed_dim = len(next(iter(embedding_dict.values())))
    else:
        embed_dim = 2
    zero_vector = [0.0] * embed_dim

    embeddings = [embedding_dict.get(word, zero_vector) for word in tokens]
    if not embeddings:
        return np.empty((0, embed_dim))
    return np.array(embeddings)

def update_embedding_dict_with_wave(words, blob, embedding_dict):
    """
    Wave-fit each embedding dimension of a blurb and store the rows back.

    Parameters:
        words (List[str]): The blurb's tokens, one per row of ``blob``.
        blob (array-like): (n_words, embed_dim) embeddings; copied to a float
            array, so the caller's object is not modified.
        embedding_dict (dict): Updated in place: each word is mapped to its
            fitted row as a list. A word that occurs several times ends up
            with the row of its last occurrence.

    Returns:
        None
    """
    fitted = np.array(blob, dtype=float)
    assert len(words) == fitted.shape[0], "Mismatch between words and blob rows."

    # Each column is passed as a float view, so fit_wave writes into `fitted`.
    for column in range(fitted.shape[1]):
        fit_wave(fitted[:, column])

    embedding_dict.update(
        (word, fitted[row].tolist()) for row, word in enumerate(words)
    )
def try_fit_all_blurbs(df, max_dim=100, n_samples=50000, tolerance=1e-3):
    """
    For growing embedding sizes, count blurbs that survive a wave fit.

    For each size from 2 to ``max_dim`` a fresh random embedding dictionary is
    built over the sampled blurbs' words. Every blurb's embedding matrix is
    wave-fitted column by column with ``fit_wave``, the fitted rows are
    written back to the dictionary, and the blurb counts as a success when the
    matrix looked up afterwards is within ``tolerance`` of the matrix taken
    before fitting.

    Parameters:
        df (pd.DataFrame): Passed through to ``generate_blurbs``.
        max_dim (int): Largest embedding size to try.
        n_samples (int): Passed through to ``generate_blurbs``.
        tolerance (float): Absolute tolerance for the before/after comparison.

    Returns:
        List[Tuple[int, int]]: (embedding size, success count) per size tried.
        The sweep stops early once every blurb succeeds at some size.
    """
    blurbs = generate_blurbs(df, n_samples)
    word_pattern = re.compile(r'\b\w+\b')

    # Tokenise once; the token lists do not depend on the embedding size.
    tokens_per_blurb = [word_pattern.findall(blurb.lower()) for blurb in blurbs]
    word_set = set()
    for tokens in tokens_per_blurb:
        word_set.update(tokens)

    success_counts = []

    for embed_dim in range(2, max_dim + 1):
        print(f"Trying embedding dimension: {embed_dim}")

        # Start every size from a fresh random dictionary.
        embedding_dict = {}
        for word in word_set:
            embedding_dict[word] = np.random.uniform(-1, 1, size=embed_dim).tolist()

        success = 0
        for blurb, tokens in zip(blurbs, tokens_per_blurb):
            blob = get_blurb_embedding_blob(blurb, embedding_dict)
            original_blob = blob.copy()

            try:
                for column in range(blob.shape[1]):
                    fit_wave(blob[:, column])

                for row, word in enumerate(tokens):
                    embedding_dict[word] = blob[row].tolist()

                reconstructed = get_blurb_embedding_blob(blurb, embedding_dict)
                matches = np.allclose(original_blob, reconstructed, atol=tolerance)
            except Exception:
                # Best effort: any failure just means this blurb is not counted.
                continue

            if matches:
                success += 1

        print(f"Success at dim {embed_dim}: {success}/{len(blurbs)}")
        success_counts.append((embed_dim, success))

        if success == len(blurbs):
            break

    return success_counts

def train_embeddings(pairs, vocab, dim, lr=0.01, epochs=10):
    """
    Train word embeddings so a linear map predicts the next word's embedding.

    For each (w1, w2) pair the prediction is ``W @ E[w1]`` and the loss is the
    squared error against ``E[w2]``. ``W`` and both embeddings are updated by
    plain SGD after every pair (the constant factor 2 of the squared-error
    gradient is left to ``lr``).

    Parameters:
        pairs (Iterable[Tuple[str, str]]): (word, next word) pairs. The
            caller's object is left untouched; shuffling happens on a copy.
        vocab (Sequence[str]): All words that can occur in ``pairs``; a word
            missing from it raises ``KeyError``.
        dim (int): Embedding size.
        lr (float): Learning rate.
        epochs (int): Number of passes over ``pairs``.

    Returns:
        Tuple[np.ndarray, np.ndarray, float]: ``E`` with shape
        (len(vocab), dim), ``W`` with shape (dim, dim), and the average
        per-pair loss of the last epoch. The loss is NaN when no training
        happened (empty ``pairs`` or ``epochs == 0``).

    Note:
        A later cell of this notebook defines another ``train_embeddings``
        whose third return value is ``word_to_idx`` instead of the loss; once
        that cell has run, this definition is shadowed.
    """
    word_to_idx = {word: i for i, word in enumerate(vocab)}
    n_words = len(vocab)

    # Embeddings: shape (vocab_size, dim)
    E = np.random.randn(n_words, dim) * 0.1
    W = np.random.randn(dim, dim) * 0.1  # Linear predictor

    # Private copy: random.shuffle works in place and would otherwise reorder
    # the list owned by the caller. Shuffling the same copy every epoch keeps
    # the visiting order identical to shuffling the original cumulatively.
    pairs = list(pairs)
    final_loss = float("nan")

    for epoch in range(epochs):
        if not pairs:
            break  # nothing to train on; avoid dividing by zero below

        total_loss = 0
        random.shuffle(pairs)

        for w1, w2 in pairs:
            i1, i2 = word_to_idx[w1], word_to_idx[w2]
            e1, e2 = E[i1], E[i2]

            pred = W @ e1
            error = pred - e2
            total_loss += np.sum(error ** 2)

            # All gradients are taken at the current point before any update.
            grad_W = np.outer(error, e1)
            grad_e1 = W.T @ error
            grad_e2 = -error

            W -= lr * grad_W
            E[i1] -= lr * grad_e1
            E[i2] -= lr * grad_e2  # the target embedding adapts too

        final_loss = total_loss / len(pairs)

    return E, W, final_loss

def optimize_dimensionality(df, max_dim=100, n_samples=50000, epochs=10):
    """
    Train next-word embeddings at every size from 2 to ``max_dim``.

    The blurbs are sampled and paired once; the same pairs are then used to
    train a fresh model per embedding size with ``train_embeddings``
    (learning rate 0.01).

    Parameters:
        df (pd.DataFrame): Passed through to ``tokenize_blurbs``.
        max_dim (int): Largest embedding size to train.
        n_samples (int): Passed through to ``tokenize_blurbs``.
        epochs (int): Training epochs per embedding size.

    Returns:
        List[Tuple[int, float]]: (embedding size, third value returned by
        ``train_embeddings``) for each size, in increasing size order.
    """
    tokenized_blurbs, vocab = tokenize_blurbs(df, n_samples)
    pairs = build_training_pairs(tokenized_blurbs)

    def run_one(dim):
        # Train one size and report its final loss as it finishes.
        print(f"Training with embedding dim = {dim}")
        _, _, final_loss = train_embeddings(pairs, vocab, dim, lr=0.01, epochs=epochs)
        print(f"  Final MSE Loss: {final_loss:.6f}")
        return dim, final_loss

    return [run_one(dim) for dim in range(2, max_dim + 1)]

In [None]:
E.shape

(3376, 100)

In [None]:
df

Unnamed: 0,user 1 personas,user 2 personas,Best Generated Conversation
0,I am 32.\nI do not want a job.\nI play video g...,My favorite drink is iced coffee.\nI have a bl...,User 1: Hi! I'm [user 1's name].\nUser 2: Hi [...
1,I am 32.\nI play video games all day.\nI still...,I have a ford f150.\nI like ford cars.\nMy tru...,"User 1: Hey, how's it going?\nUser 2: Good, I'..."
2,I am 32.\nI play video games all day.\nI still...,I can recite the movie young frankenstein word...,"User 1: Hi, my name is John. What's your name?..."
3,I write.\nI work at mcdonald s.\nI watch youtu...,I want to move.\nI don t like feeling controll...,User 1: Hi!\nUser 2: Hey!\nUser 1: What's up?\...
4,I am bald.\nI like to swim.\nMy favorite drink...,My favorite store is american eagle.\nI enjoy ...,User 1: Hello!\nUser 2: Hi!\nUser 1: What do y...
...,...,...,...
8933,"My parents used to work in politics , until th...",I am in a very intimate and loving relationshi...,"User 1: Hey, what's up?\nUser 2: Not much, jus..."
8934,"My parents used to work in politics , until th...",My mind is set on things above.\nI hate evil.\...,User 1: Hey!\nUser 2: Hello!\nUser 1: What do ...
8935,"I went on welfare last month , which makes me ...",I got married last year.\nI am a hair stylist....,"User 1: Hey, I'm [name].\nUser 2: Hi, I'm [nam..."
8936,"I went on welfare last month , which makes me ...",I wish i made more money.\nI have a strange ob...,"User 1: Hi, I'm [user 1].\n\nUser 2: Hi, I'm [..."


In [None]:
# Sweep embedding sizes 2..50 on 10,000 sampled blurbs (5 epochs per size),
# then print the final loss recorded for each size.
# NOTE(review): this needs the `train_embeddings` whose third return value is
# the final loss; a later cell redefines `train_embeddings` to return
# `word_to_idx` in that position, so run this before that cell.
results = optimize_dimensionality(df, max_dim=50, n_samples=10000, epochs=5)
for dim, loss in results:
    print(f"Dim {dim}: MSE loss = {loss:.5f}")

Training with embedding dim = 2


KeyboardInterrupt: 

In [None]:
# Wave-fit demo on a single blurb: `blob` holds its embeddings before the
# fit, `blob2` the embeddings looked up again after the dictionary update.
blurbs = generate_blurbs(df)
embedding_dict = init_embeds(df)

# Use the last sampled blurb as the example.
blurb = blurbs[-1]
tokens = re.findall(r'\b\w+\b', blurb.lower())
blob = get_blurb_embedding_blob(blurb, embedding_dict)

# Mutates embedding_dict in place; `blob` itself is copied inside the call.
update_embedding_dict_with_wave(tokens, blob, embedding_dict)
blob2 = get_blurb_embedding_blob(blurb, embedding_dict)

In [None]:
plt.plot([x[0] for x in blob])

In [None]:
plt.plot([x[0] for x in blob2])

In [None]:
blob

In [None]:
arr = np.array([1,2,1,0,3,5])

In [None]:
plt.plot(arr)

In [None]:
plt.plot(fit_wave(arr))

In [None]:
def tokenize_and_split_blurbs(df, n_samples=50000, test_ratio=0.2):
    """
    Sample blurbs, tokenise them and split them into train and test sets.

    Parameters:
        df (pd.DataFrame): Passed through to ``generate_blurbs``.
        n_samples (int): Passed through to ``generate_blurbs``.
        test_ratio (float): Fraction of the tokenised blurbs held out.

    Returns:
        Tuple[List[List[str]], List[List[str]], List[str]]: Train token lists,
        test token lists, and the sorted vocabulary. The vocabulary is built
        from all blurbs, i.e. it covers the test split as well.
    """
    word_pattern = re.compile(r'\b\w+\b')
    candidates = (
        word_pattern.findall(text.lower())
        for text in generate_blurbs(df, n_samples)
    )
    # Keep only blurbs that can produce at least one (word, next word) pair.
    tokenized = [tokens for tokens in candidates if len(tokens) > 1]
    vocab = {token for tokens in tokenized for token in tokens}

    random.shuffle(tokenized)
    cut = int(len(tokenized) * (1 - test_ratio))
    return tokenized[:cut], tokenized[cut:], sorted(vocab)

def build_pairs(tokenized_blurbs):
    """
    Collect (word, next word) pairs from every token list.

    Parameters:
        tokenized_blurbs (List[List[str]]): One token list per blurb.

    Returns:
        List[Tuple[str, str]]: Adjacent-token pairs in blurb order; lists with
        fewer than two tokens add nothing.
    """
    pairs = []
    for tokens in tokenized_blurbs:
        # Pair each token with its successor.
        pairs.extend(zip(tokens[:-1], tokens[1:]))
    return pairs
def train_embeddings(pairs, vocab, dim, lr=0.01, epochs=10):
    """
    Train word embeddings so a linear map predicts the next word's embedding.

    For each (w1, w2) pair the prediction is ``W @ E[w1]`` and the loss is the
    squared error against ``E[w2]``. ``W`` and both embeddings are updated by
    plain SGD after every pair, and the average per-pair loss is printed after
    each epoch.

    Parameters:
        pairs (Iterable[Tuple[str, str]]): (word, next word) pairs. The
            caller's object is left untouched; shuffling happens on a copy.
        vocab (Sequence[str]): All words that can occur in ``pairs``; a word
            missing from it raises ``KeyError``.
        dim (int): Embedding size.
        lr (float): Learning rate.
        epochs (int): Number of passes over ``pairs``.

    Returns:
        Tuple[np.ndarray, np.ndarray, dict]: ``E`` with shape
        (len(vocab), dim), ``W`` with shape (dim, dim), and the word -> row
        index mapping for ``E``.
    """
    word_to_idx = {w: i for i, w in enumerate(vocab)}
    E = np.random.randn(len(vocab), dim) * 0.1
    W = np.random.randn(dim, dim) * 0.1

    # Private copy: random.shuffle works in place and would otherwise reorder
    # the list owned by the caller.
    pairs = list(pairs)

    for epoch in range(epochs):
        random.shuffle(pairs)
        total_loss = 0

        for w1, w2 in pairs:
            i1, i2 = word_to_idx[w1], word_to_idx[w2]
            e1, e2 = E[i1], E[i2]

            pred = W @ e1
            error = pred - e2
            total_loss += np.sum(error ** 2)

            # Take every gradient at the current point BEFORE updating
            # anything; using the already-updated W for the e1 gradient would
            # mix two different parameter states in one step.
            grad_W = np.outer(error, e1)
            grad_e1 = W.T @ error
            grad_e2 = -error

            W -= lr * grad_W
            E[i1] -= lr * grad_e1
            E[i2] -= lr * grad_e2  # allow target to adjust (optional)

        # With no pairs the average is undefined; report NaN instead of
        # dividing by zero.
        avg_loss = total_loss / len(pairs) if pairs else float("nan")
        print(f"Epoch {epoch+1}: Loss = {avg_loss:.6f}")

    return E, W, word_to_idx
def evaluate_embeddings(test_pairs, E, W, word_to_idx):
    """
    Report the mean squared next-word prediction error over test pairs.

    For each (w1, w2) pair whose words are both in ``word_to_idx`` the error
    is ``mean((W @ E[w1] - E[w2]) ** 2)``, i.e. averaged over the embedding
    dimensions; pairs with an unknown word are skipped.

    Parameters:
        test_pairs (Iterable[Tuple[str, str]]): (word, next word) pairs.
        E (np.ndarray): Embedding matrix, one row per word.
        W (np.ndarray): Linear predictor applied to the first word's row.
        word_to_idx (dict): Maps a word to its row in ``E``.

    Returns:
        float: Average of the per-pair errors (also printed).
    """
    known_pairs = (
        (word_to_idx[w1], word_to_idx[w2])
        for w1, w2 in test_pairs
        if w1 in word_to_idx and w2 in word_to_idx
    )
    errors = [np.mean((W @ E[src] - E[dst]) ** 2) for src, dst in known_pairs]

    avg_mse = np.mean(errors)
    print(f"\nTest MSE: {avg_mse:.6f}")
    return avg_mse


In [None]:
# Train on adjacent-word pairs from the train split and score on the test split.
train_blurbs, test_blurbs, vocab = tokenize_and_split_blurbs(df, n_samples=10000)
train_pairs = build_pairs(train_blurbs)
test_pairs = build_pairs(test_blurbs)

# W has dim x dim entries, so dim = 1024 means about a million weights touched per pair.
dim = 1024
E, W, word_to_idx = train_embeddings(train_pairs, vocab, dim, lr=0.01, epochs=100)

test_loss = evaluate_embeddings(test_pairs, E, W, word_to_idx)


KeyboardInterrupt: 

In [None]:
def train_embeddings_with_growing_sequence_length(
    tokenized_blurbs,
    vocab,
    dim,
    lr=0.01,
    epochs=10,
    min_len=5,
    max_len=256,
):
    """
    Train next-word embeddings while gradually admitting longer blurbs.

    In each epoch only blurbs with 2 to ``curr_max_len`` tokens contribute
    (word, next word) pairs, where ``curr_max_len`` grows linearly from
    ``min_len`` in the first epoch to ``max_len`` in the last one. The model
    and update rule are: predict ``W @ E[w1]``, squared error against
    ``E[w2]``, per-pair SGD on ``W`` and both embeddings. One progress line
    is printed per epoch.

    Parameters:
        tokenized_blurbs (List[List[str]]): One token list per blurb.
        vocab (Sequence[str]): All words that can occur in the blurbs; a word
            missing from it raises ``KeyError``.
        dim (int): Embedding size.
        lr (float): Learning rate.
        epochs (int): Number of epochs. With a single epoch the full
            ``max_len`` is used straight away.
        min_len (int): Length cap in the first epoch.
        max_len (int): Length cap in the last epoch.

    Returns:
        Tuple[np.ndarray, np.ndarray, dict]: ``E`` with shape
        (len(vocab), dim), ``W`` with shape (dim, dim), and the word -> row
        index mapping for ``E``.
    """
    word_to_idx = {w: i for i, w in enumerate(vocab)}
    E = np.random.randn(len(vocab), dim) * 0.1
    W = np.random.randn(dim, dim) * 0.1

    for epoch in range(epochs):
        # Linearly increase max allowed blurb length. A one-epoch run has no
        # "first to last" span to interpolate over (and would divide by
        # zero), so it trains on the full length range.
        progress = epoch / (epochs - 1) if epochs > 1 else 1.0
        curr_max_len = int(min_len + (max_len - min_len) * progress)

        # Generate new training pairs for this epoch
        pairs = [
            (tokens[i], tokens[i + 1])
            for tokens in tokenized_blurbs
            if 2 <= len(tokens) <= curr_max_len
            for i in range(len(tokens) - 1)
        ]

        random.shuffle(pairs)
        total_loss = 0

        for w1, w2 in pairs:
            i1, i2 = word_to_idx[w1], word_to_idx[w2]
            e1, e2 = E[i1], E[i2]

            pred = W @ e1
            error = pred - e2
            total_loss += np.sum(error ** 2)

            # Take every gradient at the current point BEFORE updating
            # anything; using the already-updated W for the e1 gradient would
            # mix two different parameter states in one step.
            grad_W = np.outer(error, e1)
            grad_e1 = W.T @ error
            grad_e2 = -error

            W -= lr * grad_W
            E[i1] -= lr * grad_e1
            E[i2] -= lr * grad_e2

        # Early epochs may admit no blurb at all; report NaN for them instead
        # of dividing by zero.
        avg_loss = total_loss / len(pairs) if pairs else float("nan")
        print(f"Epoch {epoch+1}/{epochs} | Max len: {curr_max_len:3d} | Loss: {avg_loss:.6f}")

    return E, W, word_to_idx


In [None]:
# Fresh train/test split, then train with a length cap that grows
# from 5 to 256 tokens over 10 epochs.
train_blurbs, test_blurbs, vocab = tokenize_and_split_blurbs(df, n_samples=10000)
E, W, word_to_idx = train_embeddings_with_growing_sequence_length(
    train_blurbs,
    vocab,
    dim=100,
    lr=0.1,
    epochs=10,
    min_len=5,
    max_len=256,
)


Epoch 1/10 | Max len:   5 | Loss: 0.243257
Epoch 2/10 | Max len:  32 | Loss: 0.102546
Epoch 3/10 | Max len:  60 | Loss: 0.046788
Epoch 4/10 | Max len:  88 | Loss: 0.030427
Epoch 5/10 | Max len: 116 | Loss: 0.021531
Epoch 6/10 | Max len: 144 | Loss: 0.015909
Epoch 7/10 | Max len: 172 | Loss: 0.012071
Epoch 8/10 | Max len: 200 | Loss: 0.009321
Epoch 9/10 | Max len: 228 | Loss: 0.007287
Epoch 10/10 | Max len: 256 | Loss: 0.005748


In [None]:
# Score the model on pairs from the held-out blurbs.
# Note: evaluate_embeddings averages the squared error over the embedding
# dimensions, whereas the training loop's printed loss sums it, so the two
# numbers are not on the same scale.
test_pairs = build_pairs(test_blurbs)
test_loss = evaluate_embeddings(test_pairs, E, W, word_to_idx)


Test MSE: 0.000408


In [None]:
E[0]

array([ 0.00753075,  0.0008178 , -0.00765929, -0.00109687,  0.00959058,
       -0.0064074 , -0.00690113, -0.00745902,  0.00530447,  0.00244912,
       -0.00290777, -0.00750715, -0.0067005 ,  0.00427543,  0.00366942,
       -0.00541915,  0.00667741,  0.00031248, -0.00304764, -0.01040911,
        0.00902738,  0.0128635 , -0.00966763,  0.00699592,  0.01205744,
       -0.00019397,  0.00601374, -0.00176726, -0.00910873, -0.00527945,
        0.00240791,  0.0117975 , -0.0043702 , -0.00461084,  0.00167314,
        0.00524811,  0.00700562,  0.00505662,  0.00815436, -0.00964976,
       -0.00113658,  0.00117697, -0.00255734, -0.0106756 , -0.01034122,
       -0.00712607, -0.00658268,  0.00892598,  0.00711222,  0.00806651,
        0.00290419,  0.00255786,  0.00865908, -0.00545585, -0.00017807,
       -0.00295799, -0.0106818 , -0.01397543, -0.00423442,  0.00098638,
        0.00211673, -0.00299921, -0.00784722,  0.00633694,  0.00743063,
       -0.01327763, -0.00838752, -0.00932254, -0.01048124, -0.00

In [None]:
len(vocab)

3361

In [None]:
def train_context_sum_embeddings(tokenized_blurbs, vocab, dim=100, lr=0.01, epochs=10):
    """
    Train embeddings so a scaled sum of the preceding words predicts a word.

    For position ``t`` in a blurb the context vector is the sum of the
    embeddings of the first ``t`` words divided by ``sqrt(t)``; the prediction
    is ``W @ context`` and the loss is the squared error against the
    embedding of word ``t``. ``W``, every context embedding and the target
    embedding are updated by SGD after each position. The average loss per
    position is printed after each epoch.

    Parameters:
        tokenized_blurbs (List[List[str]]): One token list per blurb. Blurbs
            with fewer than 3 tokens are skipped; tokens missing from
            ``vocab`` are dropped.
        vocab (Sequence[str]): Words that receive an embedding.
        dim (int): Embedding size.
        lr (float): Learning rate.
        epochs (int): Number of passes over the blurbs.

    Returns:
        Tuple[np.ndarray, np.ndarray, dict]: ``E`` with shape
        (len(vocab), dim), ``W`` with shape (dim, dim), and the word -> row
        index mapping for ``E``.
    """
    word_to_idx = {w: i for i, w in enumerate(vocab)}
    E = np.random.uniform(-1, 1, size=(len(vocab), dim))
    W = np.random.randn(dim, dim) * 0.1  # Linear projection from context to prediction

    for epoch in range(epochs):
        total_loss = 0
        count = 0

        for tokens in tokenized_blurbs:
            if len(tokens) < 3:
                continue

            idxs = [word_to_idx[tok] for tok in tokens if tok in word_to_idx]
            # NOTE(review): the range stops before the last index, so the
            # final word of a blurb is never used as a prediction target.
            # Confirm whether that is intended.
            for t in range(1, len(idxs) - 1):
                # Context sum (scaled)
                context_vec = np.sum(E[idxs[:t]], axis=0)
                context_vec *= 1 / np.sqrt(t)

                # Prediction target
                target_idx = idxs[t]
                target_vec = E[target_idx]

                # Predict and compute loss
                pred = W @ context_vec
                error = pred - target_vec
                loss = np.sum(error**2)
                total_loss += loss
                count += 1

                # Backpropagation (gradients taken before W is updated)
                grad_W = np.outer(error, context_vec)
                grad_context = W.T @ error

                W -= lr * grad_W
                # Distribute gradient back to all context embeddings; a word
                # occurring twice in the context is updated twice.
                grad_per_token = grad_context * (1 / np.sqrt(t))
                for i in idxs[:t]:
                    E[i] -= lr * grad_per_token
                E[target_idx] -= lr * (-error)

        # No usable blurb means no training step; report NaN instead of
        # dividing by zero.
        avg_loss = total_loss / count if count else float("nan")
        print(f"Epoch {epoch+1}/{epochs} | Avg Loss: {avg_loss:.6f}")

    return E, W, word_to_idx


In [None]:
# Context-sum variant: trained on all sampled blurbs (no train/test split here).
tokenized_blurbs, vocab = tokenize_blurbs(df, n_samples=10000)
E, W, word_to_idx = train_context_sum_embeddings(tokenized_blurbs, vocab, dim=100, epochs=10)


Epoch 1/10 | Avg Loss: 9.761951
Epoch 2/10 | Avg Loss: 4.867360
Epoch 3/10 | Avg Loss: 3.758375
Epoch 4/10 | Avg Loss: 3.152309
Epoch 5/10 | Avg Loss: 2.748661
Epoch 6/10 | Avg Loss: 2.452607
Epoch 7/10 | Avg Loss: 2.222463
Epoch 8/10 | Avg Loss: 2.036399
Epoch 9/10 | Avg Loss: 1.881647
Epoch 10/10 | Avg Loss: 1.750146


In [None]:
# Prophet forecasting sketch.
# NOTE(review): `df_prophet` and `future` are not defined anywhere in the
# visible notebook, so this cell cannot run on a fresh kernel as written.
# Presumably `df_prophet` must contain a 'marketing_spend' column for the
# regressor added below - confirm before running.
from prophet import Prophet

model = Prophet()
model.add_regressor('marketing_spend')  # optional
model.fit(df_prophet)

forecast = model.predict(future)


In [None]:
import numpy as np
import pandas as pd
import re
from prophet import Prophet
from statsmodels.tsa.statespace.sarimax import SARIMAX
from collections import defaultdict
