In [43]:
import pandas as pd
import numpy as np
import os
import httpx
from pydantic import BaseModel
from tqdm import tqdm


# load dotenv
from dotenv import load_dotenv

load_dotenv()

True

In [None]:
class OrganizationProject(BaseModel):
    """One project idea belonging to an organization.

    Instances are built from the per-project dicts returned by ``query``
    (``OrganizationProject(**project)``), so the field names match the keys
    of that JSON schema. All three fields are required by this model (no
    defaults), so a dict missing any of them fails validation.
    """

    # Name of the project.
    project_name: str
    # Short description of the project.
    summary: str
    # Difficulty level of the project, kept as a free-form string.
    difficulty: str


class Organization(BaseModel):
    """An organization row from the organizations CSV plus data gathered for it.

    The first seven fields are filled from the CSV columns when the file is
    loaded; ``projects`` and ``jina_response`` are filled later by the
    processing loop, and the whole model is dumped to JSON per organization.
    """

    # CSV column "Organization"; also used to build the output JSON file name.
    name: str
    # CSV column "Short Description".
    description: str
    # CSV column "URL".
    gsoc_url: str
    # CSV column "Ideas Link"; this is the page fetched through r.jina.ai.
    ideas_url: str
    # CSV column "Logo".
    logo: str
    # CSV column "Technologies", parsed from its bracketed list string.
    technologies: list[str]
    # CSV column "Topics", parsed from its bracketed list string.
    topics: list[str]
    # Project ideas for this organization; starts empty when loaded from the CSV.
    projects: list[OrganizationProject]
    # Raw text returned by the r.jina.ai fetch of ``ideas_url``; None until fetched.
    jina_response: str | None = None


# Read the organizations CSV and convert every row into an Organization model.


def _parse_list_field(raw) -> list[str]:
    """Parse a stringified list cell such as ``"['a', 'b']"`` into ``['a', 'b']``.

    Args:
        raw: The raw cell value. Expected to be a string; pandas yields a float
            NaN for empty cells, which is treated as "no items".

    Returns:
        The items in order. Empty entries are dropped, so ``"[]"`` yields ``[]``
        rather than ``['']``.
    """
    if not isinstance(raw, str):
        # Missing cell (NaN) or unexpected type: there is nothing to split.
        return []
    cleaned = raw.replace("[", "").replace("]", "").replace("'", "")
    return [item for item in cleaned.split(", ") if item]


organizations: list[Organization] = []

df = pd.read_csv("gsoc_2025_organizations.csv")
for _, row in df.iterrows():
    organizations.append(
        Organization(
            name=row["Organization"],
            description=row["Short Description"],
            gsoc_url=row["URL"],
            ideas_url=row["Ideas Link"],
            logo=row["Logo"],
            technologies=_parse_list_field(row["Technologies"]),
            topics=_parse_list_field(row["Topics"]),
            # Projects are filled in later by the processing loop.
            projects=[],
        )
    )
print(len(organizations))

185


In [36]:
# The OpenAI API key is read from the OPENAI_API_KEY environment variable (e.g. via .env); never hardcode or print it.
import json
from openai import OpenAI


# Read the key from the environment (populated by load_dotenv()). Do NOT print
# it: notebook outputs are saved with the file and would leak the secret.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your environment or .env file"
    )
# Pass the key explicitly instead of patching the attribute after construction.
client = OpenAI(api_key=OPENAI_API_KEY)


def query(context):
    """Ask the OpenAI chat completions API to summarize the projects in ``context``.

    The request uses a strict JSON-schema response format so that every
    returned project carries ``project_name``, ``summary`` and ``difficulty``,
    which are the three fields ``OrganizationProject`` requires.

    Args:
        context: Text to summarize (the fetched ideas page of an organization).

    Returns:
        A list of dicts, one per project, each with the keys ``project_name``,
        ``summary`` and ``difficulty``.

    Raises:
        ValueError: If the API returns a message without content (for example
            a refusal), or the content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        KeyError: If the decoded JSON has no ``"projects"`` key.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant in summarizing the context.",
        },
        {
            "role": "user",
            "content": f"""Please summarize the following context with bullet points and a short description for each point:
            {context}
            
            On each project focus on:
            - What is the project about?
            - What are the main goals?
            """,
        },
    ]

    project_schema = {
        "type": "object",
        "properties": {
            "project_name": {
                "description": "The name of the project",
                "type": "string",
            },
            "summary": {
                "description": "Short description of the project",
                "type": "string",
            },
            "difficulty": {
                "description": "Difficulty level of the project",
                "type": "string",
            },
        },
        # All three keys are required: OrganizationProject has no default for
        # "difficulty", so an omitted key would fail validation downstream.
        "required": ["project_name", "summary", "difficulty"],
        "additionalProperties": False,
    }

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "project_summary_schema",
                # Strict mode makes the API enforce the schema instead of
                # treating it as a hint.
                "strict": True,
                "schema": {
                    "type": "object",
                    "properties": {
                        "projects": {
                            "type": "array",
                            "items": project_schema,
                        }
                    },
                    "required": ["projects"],
                    "additionalProperties": False,
                },
            },
        },
    )

    content = response.choices[0].message.content
    if content is None:
        # Happens when the model refuses or returns no text; fail with a clear
        # message instead of a TypeError from json.loads(None).
        raise ValueError("OpenAI returned no content; cannot parse project summary")
    return json.loads(content)["projects"]

[REDACTED: an OpenAI API key was printed in this cell output. Revoke and rotate that key.]


In [48]:

from time import sleep

# Upper bound on attempts per organization, so a persistent failure (bad URL,
# invalid API key, unwritable path) cannot loop forever and keep calling the
# paid API.
MAX_ATTEMPTS = 5
# Pause between attempts, in seconds.
RETRY_DELAY_SECONDS = 5

pbar = tqdm(sorted(organizations, key=lambda x: x.name))

for organization in pbar:
    output_path = f"organizations/{organization.name}.json"

    # The JSON file doubles as the "already processed" marker, which makes the
    # cell safe to re-run after an interruption.
    if os.path.exists(output_path):
        pbar.set_description(f"Skipping {organization.name} as already processed")
        continue

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            pbar.set_description(f"Processing {organization.name}")

            # r.jina.ai returns a text rendering of the page at the given URL.
            jina_url = f"https://r.jina.ai/{organization.ideas_url}"
            jina_response = httpx.get(jina_url, timeout=60)
            pbar.set_description(f"Got response from Jina {jina_response.status_code}")
            if jina_response.status_code != 200:
                # A non-200 status is not retried; move on to the next organization.
                pbar.set_description(
                    f"Error getting response from Jina {jina_response.status_code}"
                )
                break

            projects = query(jina_response.text)
            pbar.set_description("Got response from OpenAI")

            # Validate every project BEFORE touching the organization, and
            # assign rather than append, so a failed attempt followed by a
            # retry cannot leave duplicated projects behind.
            parsed_projects = [OrganizationProject(**project) for project in projects]
            organization.jina_response = jina_response.text
            organization.projects = parsed_projects

            # Write one JSON file holding all the organization's information.
            os.makedirs("organizations", exist_ok=True)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(organization.model_dump_json())
            break
        except Exception as e:
            print(e)
            if attempt == MAX_ATTEMPTS:
                pbar.set_description(
                    f"Giving up on {organization.name} after {MAX_ATTEMPTS} attempts"
                )
            else:
                pbar.set_description(f"Error processing {organization.name} will retry")
                sleep(RETRY_DELAY_SECONDS)

    

Skipping webpack as already processed: 100%|██████████| 185/185 [00:01<00:00, 118.05it/s]                                            
