In [9]:
import os

from openai import OpenAI

# Read the API key from the OPENAI_API_KEY environment variable so a real
# credential never has to be pasted into the notebook. The literal placeholder
# is kept as the fallback, so behaviour is unchanged when the variable is unset.
client = OpenAI(
    base_url="https://openai.vocareum.com/v1",
    api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"),
)


Creating synthetic startup dataset...
Dataset shape: (50, 11)
Columns: ['name', 'type', 'category', 'country', 'status', 'market', 'funding_usd', 'founded_year', 'team_size', 'description', 'text']

Sample data:


Unnamed: 0,name,type,category,text
0,Startup_1,API Service,E-commerce,Startup_1 | Type: API Service | Category: E-co...
1,Startup_2,SaaS,Cybersecurity,Startup_2 | Type: SaaS | Category: Cybersecuri...
2,Startup_3,Mobile App,Healthtech,Startup_3 | Type: Mobile App | Category: Healt...
3,Startup_4,Marketplace,AI/ML,Startup_4 | Type: Marketplace | Category: AI/M...
4,Startup_5,B2C,Real Estate,Startup_5 | Type: B2C | Category: Real Estate ...



✅ Dataset ready with 50 rows!


In [14]:
!pip install kagglehub
!pip install tiktoken
!pip install -U scikit-learn
!pip install numpy
!pip install pandas
!pip install openai



Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.1 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.7.1


In [23]:
import kagglehub
import pandas as pd
import numpy as np
import json
import os

# Fetch the Kaggle dataset and report the directory it was placed in.
path = kagglehub.dataset_download("manishkc06/startup-success-prediction")
print("Path to dataset files:", path)

# List files in the downloaded directory to confirm the filename
print("Files in directory:", os.listdir(path))

# Construct the full path to the CSV file (os.path.join instead of manual "/" formatting)
csv_file_path = os.path.join(path, "startup data.csv")
df_raw = pd.read_csv(csv_file_path)
print(df_raw.shape)

# Clean a copy so df_raw keeps the data exactly as it was read.
df = df_raw.copy()

# Normalise headers: trimmed, lower-case, spaces replaced by underscores.
df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

# Discard rows that are entirely empty, then exact duplicate rows.
df = df.dropna(how="all").drop_duplicates()

if len(df) < 20:
    raise ValueError("Dataset does not contain at least 20 rows after cleaning.")

# Columns we would like to put into the per-row text, in display order.
candidate_cols = [
    "name", "category_list", "country_code", "state_code", "region", "city",
    "status", "funding_total_usd", "funding_rounds", "founded_at", "first_funding_at",
    "last_funding_at", "milestones", "relationships", "age_first_funding_year",
    "age_last_funding_year", "is_top500", "markets", "description"
]
# Only the candidates that this dataset actually contains are used.
existing = [c for c in candidate_cols if c in df.columns]

def row_to_text(row):
    """Render one DataFrame row as a ``"col: value | col: value"`` string.

    Only the columns named in the module-level ``existing`` list are used, in
    that order. A column is left out when its value is missing according to
    ``pd.notna``. Returns an empty string when no listed column has a value.
    """
    fields = ((col, row.get(col, None)) for col in existing)
    rendered = [f"{col}: {value}" for col, value in fields if pd.notna(value)]
    return " | ".join(rendered)

# Serialise every row into the "text" column using the selected columns.
df["text"] = df.apply(row_to_text, axis=1)

# Fallback: when fewer than 20 rows produced any text at all, replace the
# column with a JSON dump of each complete row instead.
non_empty_text = df["text"].replace("", np.nan).dropna()
if non_empty_text.shape[0] < 20:
    df["text"] = df.apply(
        lambda record: json.dumps(record.to_dict(), default=str),
        axis=1,
    )

# Sanity checks before the text is handed to later cells.
assert "text" in df.columns
assert len(df) >= 20

# Preview the first three generated texts.
df[["text"]].head(3)

Path to dataset files: /kaggle/input/startup-success-prediction
Files in directory: ['startup data.csv']
(923, 49)


Unnamed: 0,text
0,name: Bandsintown | state_code: CA | city: San...
1,name: TriCipher | state_code: CA | city: Los G...
2,name: Plixi | state_code: CA | city: San Diego...


In [24]:

import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
from openai import OpenAI

# Read the API key from the OPENAI_API_KEY environment variable so a real
# credential never has to be pasted into the notebook. The literal placeholder
# is kept as the fallback, so behaviour is unchanged when the variable is unset.
client = OpenAI(
    base_url="https://openai.vocareum.com/v1",
    api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"),
)

# Announce the start of the embedding stage.
print("Generating embeddings")
# Row texts as a plain list of str. The list position is what later links each
# text to its row in the embeddings array.
corpus = df["text"].astype(str).tolist()

def generate_embeddings(texts, model="text-embedding-3-small", api_client=None):
    """Embed each text with the embeddings endpoint, one request per text.

    Args:
        texts: Sequence of strings to embed.
        model: Name of the embedding model passed to the API.
        api_client: Optional object exposing ``embeddings.create(model=..., input=...)``.
            When omitted, the module-level ``client`` is used.

    Returns:
        ``np.ndarray`` with one row per input text, in input order. A text whose
        request failed gets an all-zero row, and the error is printed.

    The zero rows are sized from the embeddings that did succeed, so the result
    stays rectangular for any model. 1536 is only used as the width when every
    single request failed.
    """
    # Resolve the global lazily so an injected client never needs `client` to exist.
    active_client = client if api_client is None else api_client
    fallback_dim = 1536  # width used only when no request succeeded

    total = len(texts)
    embeddings = [None] * total
    failed_positions = []
    observed_dim = None

    for i, text in enumerate(texts):
        if i % 50 == 0:
            print(f"Processing text {i+1}/{total}")
        try:
            response = active_client.embeddings.create(
                model=model,
                input=text
            )
            vector = response.data[0].embedding
        except Exception as e:
            # Best-effort: record the failure and keep going; the slot is filled below.
            print(f"Error generating embedding for text {i}: {e}")
            failed_positions.append(i)
            continue
        embeddings[i] = vector
        if observed_dim is None:
            observed_dim = len(vector)

    # Fill failures only now, when the real embedding width is known.
    zero_dim = fallback_dim if observed_dim is None else observed_dim
    for i in failed_positions:
        embeddings[i] = [0.0] * zero_dim

    return np.array(embeddings)

# Embed every row text; row i of the result corresponds to corpus[i].
corpus_embeddings = generate_embeddings(corpus)
# Report the array shape as a quick check of the result.
print(f"Generated embeddings shape: {corpus_embeddings.shape}")
print("Embeddings generation completed!")

Generating embeddings
Processing text 1/923
Processing text 51/923
Processing text 101/923
Processing text 151/923
Processing text 201/923
Processing text 251/923
Processing text 301/923
Processing text 351/923
Processing text 401/923
Processing text 451/923
Processing text 501/923
Processing text 551/923
Processing text 601/923
Processing text 651/923
Processing text 701/923
Processing text 751/923
Processing text 801/923
Processing text 851/923
Processing text 901/923
Generated embeddings shape: (923, 1536)
Embeddings generation completed!


In [25]:
def cosine_similarity_manual(vec1, vec2):
    """Return the cosine similarity of two equal-length vectors.

    Args:
        vec1: First vector (any sequence accepted by ``np.dot``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (‖vec1‖ · ‖vec2‖)``, or ``0.0`` when either vector
        has zero norm.
    """
    dot_product = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # A zero vector has no direction. Dividing would yield NaN, which makes
        # any later sort on the score order results unpredictably.
        return 0.0
    return dot_product / denominator

def vector_similarity_search(query_embedding, corpus_embeddings, corpus_texts, top_k=5):
    """Rank corpus entries by cosine similarity to a query vector.

    Every vector in ``corpus_embeddings`` is scored against ``query_embedding``
    with ``cosine_similarity_manual``. The ``top_k`` highest-scoring entries
    are returned, best first, as dicts with two keys: ``'text'`` (taken from
    ``corpus_texts`` at the same position) and ``'similarity_score'``.
    """
    scored = [
        (cosine_similarity_manual(query_embedding, vector), position)
        for position, vector in enumerate(corpus_embeddings)
    ]

    # Stable descending sort on the score only, so tied entries keep corpus order.
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)

    return [
        {'text': corpus_texts[position], 'similarity_score': score}
        for score, position in ranked[:top_k]
    ]

def embedding_based_retriever(query, corpus_texts, corpus_embeddings, top_k=5):
    """Return the ``top_k`` corpus texts most similar to ``query``.

    The query is embedded with ``generate_embeddings``, then ranked against
    ``corpus_embeddings`` by ``vector_similarity_search``. Only the matched
    texts are returned, best match first; the scores are dropped.
    """
    query_vector = generate_embeddings([query])[0]
    matches = vector_similarity_search(query_vector, corpus_embeddings, corpus_texts, top_k)

    texts = []
    for match in matches:
        texts.append(match['text'])
    return texts


In [26]:

# System prompt for the baseline assistant, which gets no dataset context.
BASIC_SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and accurately."

# System prompt for the dataset-grounded assistant: one sentence per entry,
# joined with single spaces.
CUSTOM_SYSTEM_PROMPT = " ".join((
    "You are a Startup Idea Validator and Business Plan Assistant.",
    "Use the provided startup dataset context to ground your answers.",
    "Prioritize evidence on funding, market, team, and traction.",
    "Identify assumptions, ask for missing info if critical, and give actionable next steps.",
))

def build_custom_user_prompt(question, retrieved_context):
    """Assemble the user message for the dataset-grounded assistant.

    The message has three sections separated by blank lines: the retrieved
    context (one ``- `` bullet per item, bullets separated by blank lines),
    the user's question, and a fixed list of answering instructions.
    """
    bullets = "\n\n".join(f"- {snippet}" for snippet in retrieved_context)
    instructions = "\n".join((
        "Instructions:",
        "- Cite relevant signals from the context (funding, market, team, geography).",
        "- Provide at least 3 concrete next steps for validation.",
        "- If context is thin, state limitations and what extra data is needed.",
    ))
    sections = (
        f"Context from startup dataset:\n{bullets}",
        f"User Question:\n{question}",
        instructions,
    )
    return "\n\n".join(sections)


In [27]:

# Chat-completions model used by both ask_basic and ask_custom.
MODEL_NAME = "gpt-4o-mini"

def ask_basic(question):
    """Answer ``question`` with the baseline system prompt and no dataset context.

    Sends one chat-completion request to ``MODEL_NAME`` at temperature 0.3 and
    returns the text content of the first choice.
    """
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": BASIC_SYSTEM_PROMPT},
            {"role": "user", "content": question},
        ],
        temperature=0.3,
    )
    return completion.choices[0].message.content

def ask_custom(question):
    """Answer ``question`` using retrieved dataset rows as context.

    The five corpus texts most similar to the question are retrieved from the
    module-level ``corpus`` / ``corpus_embeddings`` and wrapped into a user
    prompt by ``build_custom_user_prompt``. That prompt is sent with the custom
    system prompt to ``MODEL_NAME`` at temperature 0.3. Returns the text
    content of the first choice.
    """
    context_rows = embedding_based_retriever(question, corpus, corpus_embeddings, top_k=5)
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": CUSTOM_SYSTEM_PROMPT},
            {"role": "user", "content": build_custom_user_prompt(question, context_rows)},
        ],
        temperature=0.3,
    )
    return completion.choices[0].message.content

# Evaluation questions put to both the baseline and the dataset-grounded assistant.
questions = [
    "Evaluate the success likelihood for a B2C wellness app in United States raising a seed round. What signals from similar startups matter most? Give Citations from the dataset.",
    "What market and funding risks should a SaaS startup in the US consider before Series A, and how can they mitigate them?",
]

# Collect both answers per question; the baseline call is made before the grounded one.
results = []
for q in questions:
    basic = ask_basic(q)
    custom = ask_custom(q)
    results.append(dict(question=q, basic_answer=basic, custom_answer=custom))

# Print each question followed by the two answers, one item per line.
for i, r in enumerate(results, 1):
    print(
        f"\n=== Question {i} ===",
        r["question"],
        "\n--- Basic Answer ---",
        r["basic_answer"],
        "\n--- Custom Answer ---",
        r["custom_answer"],
        sep="\n",
    )

Processing text 1/1
Processing text 1/1

=== Question 1 ===
Evaluate the success likelihood for a B2C wellness app in United States raising a seed round. What signals from similar startups matter most? Give Citations from the dataset.

--- Basic Answer ---
Evaluating the success likelihood for a B2C wellness app in the United States raising a seed round involves analyzing several key factors and signals from similar startups. Here are the most relevant considerations:

1. **Market Demand**: The wellness industry has seen significant growth, with consumers increasingly prioritizing health and wellness. According to the Global Wellness Institute, the wellness economy was valued at $4.5 trillion in 2018 and continues to expand. A strong market demand for wellness solutions is a positive signal.

2. **User Engagement Metrics**: Startups that demonstrate high user engagement (e.g., daily active users, session length) tend to have better retention rates. Metrics from successful wellness apps

In [28]:

def chat_loop():
    """Run an interactive question/answer loop until the user quits.

    Each non-blank question is answered twice and both answers are printed:
    first by ``ask_basic``, then by ``ask_custom``. The loop ends when the user
    types ``exit`` or ``quit`` (any letter case) or when stdin is closed.
    Blank input is ignored rather than sent to the model. An exception raised
    while answering is printed and the loop carries on with the next question.
    """
    print("Startup Idea Validator Chatbot (type 'exit' to quit)")
    while True:
        try:
            q = input("\nYour question: ").strip()
        except EOFError:
            # stdin was closed: leave cleanly instead of crashing out of the loop.
            print("Goodbye!")
            break
        if q.lower() in {"exit", "quit"}:
            print("Goodbye!")
            break
        if not q:
            # Nothing was asked; skip the API round-trips and prompt again.
            continue
        try:
            print("\n[Basic]")
            print(ask_basic(q))
            print("\n[Custom - uses dataset context]")
            print(ask_custom(q))
        except Exception as e:
            print("Error:", e)

# One-off demonstration question for the dataset-grounded assistant.
demo_question = (
    "Based on similar startups, what early traction should a fintech app show before approaching seed investors in United States? "
    "Give answer along with citations from the dataset."
)

# Header first, then the answer: the two prints stay separate because
# ask_custom prints its own progress line in between.
print("Custom query with dataset context:", end="\n\n")
print(ask_custom(demo_question))

Custom query with dataset context:

Processing text 1/1
To effectively approach seed investors in the United States, a fintech app should demonstrate early traction through several key indicators, as evidenced by the dataset of similar startups:

### Relevant Signals from the Dataset

1. **Funding**: 
   - Startups like KickApps and AppTrigger received significant funding ($39 million and $21.5 million respectively) within a few years of their founding. This indicates that early traction can often be linked to securing substantial initial investments.
   - For instance, KickApps had three funding rounds, suggesting a strong investor interest and confidence in its growth potential.

2. **Milestones**: 
   - The number of milestones achieved can be a strong indicator of traction. For example, KickApps achieved five milestones, while Appature and Appstores.com achieved two milestones each. These milestones could include product launches, user acquisition targets, or partnerships, which ar