In [13]:
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path

from openai import OpenAI

# API key comes from the environment. When OPENAI_API_KEY is unset, the literal
# placeholder below is used instead, so requests will only succeed once a real
# key has been exported.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

# Client bound to the Vocareum-hosted endpoint rather than the library default.
client = OpenAI(api_key=OPENAI_API_KEY, base_url="https://openai.vocareum.com/v1")


In [5]:
!pip install kagglehub
import kagglehub

path = kagglehub.dataset_download("manishkc06/startup-success-prediction")
print("Path to dataset files:", path)

data_dir = Path(path)
list(data_dir.iterdir())


Collecting kagglehub
  Downloading kagglehub-0.3.12-py3-none-any.whl.metadata (38 kB)
Downloading kagglehub-0.3.12-py3-none-any.whl (67 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.12



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Downloading from https://www.kaggle.com/api/v1/datasets/download/manishkc06/startup-success-prediction?dataset_version_number=1...


100%|██████████| 64.1k/64.1k [00:00<00:00, 222kB/s]

Extracting files...
Path to dataset files: C:\Users\KHAVIN S\.cache\kagglehub\datasets\manishkc06\startup-success-prediction\versions\1





[WindowsPath('C:/Users/KHAVIN S/.cache/kagglehub/datasets/manishkc06/startup-success-prediction/versions/1/startup data.csv')]

In [6]:
# Collect every CSV under the download folder. The list is sorted because the
# iteration order of rglob is not guaranteed, and the first entry is the one loaded.
csv_files = sorted(data_dir.rglob("*.csv"))
if not csv_files:
    raise FileNotFoundError("No CSV files found in the downloaded dataset folder.")

# Raw frame is kept untouched; all cleaning happens on a copy.
df_raw = pd.read_csv(csv_files[0])
print(df_raw.shape)

df = df_raw.copy()

# Normalise headers: trim whitespace, lower-case, spaces -> underscores.
df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

# Drop rows that are entirely empty, then exact duplicate rows.
df = df.dropna(how="all").drop_duplicates()

# The rest of the notebook expects a corpus of at least 20 rows.
if len(df) < 20:
    raise ValueError("Dataset does not contain at least 20 rows after cleaning.")

# Columns that may be included in each row's text summary, in output order.
# Names use the normalised (lower-case, underscore) header form.
candidate_cols = [
    "name",
    "category_list",
    "country_code",
    "state_code",
    "region",
    "city",
    "status",
    "funding_total_usd",
    "funding_rounds",
    "founded_at",
    "first_funding_at",
    "last_funding_at",
    "milestones",
    "relationships",
    "age_first_funding_year",
    "age_last_funding_year",
    "is_top500",
    "markets",
    "description",
]

# Only the candidates actually present in the cleaned frame are used.
existing = [col for col in candidate_cols if col in df.columns]

def row_to_text(row):
    """Render one record as ``"col: value | col: value"`` text.

    Iterates over the module-level ``existing`` column list, looks each column
    up with ``row.get`` and skips values that pandas treats as missing.

    Args:
        row: Object exposing ``.get(key, default)`` (a DataFrame row when used
            with ``df.apply(..., axis=1)``).

    Returns:
        The joined string, or ``""`` when no listed column has a value.
    """
    pieces = []
    for col in existing:
        value = row.get(col, None)
        if pd.isna(value):
            continue
        pieces.append(f"{col}: {value}")
    # Joining an empty list already yields "", so no special case is needed.
    return " | ".join(pieces)

# One flat "col: value | col: value" summary string per row.
df["text"] = df.apply(row_to_text, axis=1)

# Rows whose summary is non-empty (boolean mask instead of a replace/dropna round-trip).
non_empty_text = df.loc[df["text"] != "", "text"]
if len(non_empty_text) < 20:
    # Fallback: too few usable summaries, so serialise every row as JSON instead.
    # The "text" column is dropped first; otherwise the discarded summary string
    # would be embedded inside each JSON document.
    df["text"] = df.drop(columns="text").apply(
        lambda r: json.dumps(r.to_dict(), default=str), axis=1
    )

# Sanity checks before the column is used as the retrieval corpus.
assert "text" in df.columns
assert len(df) >= 20

df[["text"]].head(3)


(923, 49)


Unnamed: 0,text
0,name: Bandsintown | state_code: CA | city: San...
1,name: TriCipher | state_code: CA | city: Los G...
2,name: Plixi | state_code: CA | city: San Diego...


In [7]:
import tiktoken

def simple_retriever(query, docs, top_k=5):
    """Rank documents by how many distinct lower-cased tokens they share with the query.

    Tokens are produced by whitespace splitting of ``str(...).lower()``; no
    punctuation handling is applied. Documents with equal scores keep their
    original relative order, and documents with zero overlap are still eligible.

    Args:
        query: Search text (converted with ``str``).
        docs: Indexable sequence of documents (each converted with ``str`` for scoring).
        top_k: Number of leading results to keep after ranking.

    Returns:
        List of the original ``docs`` entries for the highest-scoring positions.
    """
    query_terms = set(str(query).lower().split())
    overlaps = [
        (len(query_terms & set(str(doc).lower().split())), pos)
        for pos, doc in enumerate(docs)
    ]
    # sorted() is stable, so ties stay in corpus order even with reverse=True.
    ranked = sorted(overlaps, key=lambda pair: pair[0], reverse=True)
    return [docs[pos] for _, pos in ranked[:top_k]]

# Retrieval corpus: every row's text summary as a plain Python string.
corpus = [str(entry) for entry in df["text"]]


In [17]:
# System prompt for the baseline assistant (no dataset context).
BASIC_SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and accurately."

# System prompt for the dataset-grounded assistant; one sentence per entry,
# joined with single spaces.
CUSTOM_SYSTEM_PROMPT = " ".join(
    [
        "You are a Startup Idea Validator and Business Plan Assistant.",
        "Use the provided startup dataset context to ground your answers.",
        "Prioritize evidence on funding, market, team, and traction.",
        "Identify assumptions, ask for missing info if critical, and give actionable next steps.",
    ]
)

def build_custom_user_prompt(question, retrieved_context):
    """Assemble the user message for the dataset-grounded assistant.

    The message has three sections separated by blank lines: the retrieved
    snippets as ``- `` bullets (themselves separated by blank lines), the
    user's question, and a fixed instruction list.

    Args:
        question: The user's question text.
        retrieved_context: Iterable of context snippets to present as bullets.

    Returns:
        The complete prompt string.
    """
    bullet_list = "\n\n".join(f"- {snippet}" for snippet in retrieved_context)
    instructions = "\n".join(
        [
            "Instructions:",
            "- Cite relevant signals from the context (funding, market, team, geography).",
            "- Provide at least 3 concrete next steps for validation.",
            "- If context is thin, state limitations and what extra data is needed.",
        ]
    )
    sections = [
        "Context from startup dataset:\n" + bullet_list,
        "User Question:\n" + f"{question}",
        instructions,
    ]
    return "\n\n".join(sections)

# Chat model used by both assistants.
MODEL_NAME = "gpt-4o-mini"
# Sampling temperature shared by both assistants.
TEMPERATURE = 0.3
# Default number of dataset rows retrieved as context for ask_custom.
DEFAULT_TOP_K = 5


def _chat(system_prompt, user_content):
    """Send one system + user exchange to the chat model and return the reply text.

    Args:
        system_prompt: Content of the system message.
        user_content: Content of the user message.

    Returns:
        The first choice's message content, or ``""`` when the API returns no
        content for that message.
    """
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        temperature=TEMPERATURE,
    )
    # The message content is nullable in the API response; normalise so callers
    # always receive a string.
    return resp.choices[0].message.content or ""


def ask_basic(question):
    """Answer ``question`` with the generic system prompt and no dataset context.

    Args:
        question: The user's question text.

    Returns:
        The model's reply text.
    """
    return _chat(BASIC_SYSTEM_PROMPT, question)


def ask_custom(question, top_k=DEFAULT_TOP_K):
    """Answer ``question`` using rows retrieved from the startup corpus as context.

    Args:
        question: The user's question text.
        top_k: Number of corpus rows to retrieve and include in the prompt.

    Returns:
        The model's reply text.
    """
    retrieved = simple_retriever(question, corpus, top_k=top_k)
    user_prompt = build_custom_user_prompt(question, retrieved)
    return _chat(CUSTOM_SYSTEM_PROMPT, user_prompt)

# Evaluation questions put to both assistants.
questions = [
    "Evaluate the success likelihood for a B2C wellness app in United States raising a seed round. What signals from similar startups matter most? Give Citations from the dataset.",
    "What market and funding risks should a SaaS startup in the US consider before Series A, and how can they mitigate them?",
]

# For each question, collect the baseline answer first, then the dataset-grounded one.
results = []
for q in questions:
    basic, custom = ask_basic(q), ask_custom(q)
    results.append(dict(question=q, basic_answer=basic, custom_answer=custom))

# Side-by-side dump of both answers, one numbered section per question.
for i, r in enumerate(results, start=1):
    print(
        f"\n=== Question {i} ===",
        r["question"],
        "\n--- Basic Answer ---",
        r["basic_answer"],
        "\n--- Custom Answer ---",
        r["custom_answer"],
        sep="\n",
    )

def chat_loop():
    """Interactive prompt that answers each question with both assistants.

    Reads questions from standard input until the user types ``exit`` or
    ``quit`` (case-insensitive), or until input ends or is interrupted. Blank
    lines are ignored so they do not trigger API calls. Errors raised while
    answering are printed and the loop continues with the next question.

    Returns:
        None.
    """
    print("Startup Idea Validator Chatbot (type 'exit' to quit)")
    while True:
        try:
            q = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or a user interrupt ends the session instead of
            # surfacing a traceback.
            print("\nGoodbye!")
            break
        if not q:
            # Nothing was asked; prompt again without calling the model.
            continue
        if q.lower() in {"exit", "quit"}:
            print("Goodbye!")
            break
        try:
            print("\n[Basic]")
            print(ask_basic(q))
            print("\n[Custom - uses dataset context]")
            print(ask_custom(q))
        except Exception as e:
            # Best-effort: report the failure and keep the chat session alive.
            print("Error:", e)

# Single demonstration query for the dataset-grounded assistant.
demo_question = " ".join(
    [
        "Based on similar startups, what early traction should a fintech app show",
        "before approaching seed investors in United States? Give answer along with citations from the dataset.",
    ]
)

# Header is printed before the request is made, then the answer follows.
print("Custom query with dataset context:\n")
print(ask_custom(demo_question))





=== Question 1 ===
Evaluate the success likelihood for a B2C wellness app in United States raising a seed round. What signals from similar startups matter most? Give Citations from the dataset.

--- Basic Answer ---
Evaluating the success likelihood for a B2C wellness app in the United States raising a seed round involves analyzing several key signals from similar startups. Here are the most important factors to consider:

1. **Market Demand**: The wellness industry has seen significant growth, particularly in mental health, fitness, and nutrition. According to a report by Grand View Research, the global wellness market is expected to reach $4.24 trillion by 2026, indicating strong consumer interest.

2. **User Engagement Metrics**: Successful wellness apps often demonstrate high user engagement metrics, such as daily active users (DAUs) and retention rates. Startups that can show a clear path to user acquisition and retention are more likely to attract investment.

3. **Monetization 