In [1]:
# Libraries
from google import genai
import pandas as pd
import yaml
from pyhere import here
import time
from google.api_core.exceptions import ResourceExhausted

In [7]:
# Define paths
paths = {
    "api" : here("donor_analysis/eda/hand/gemini_api.yaml"),
    "ntee_descrip" : here("donor_analysis/eda/hand/ntee_descrip.csv"),
    "missions_to_classify" : here("donor_analysis/import_clean/output/ntee_to_classify.csv"),
    "missions_already_done" : here("donor_analysis/eda/output/ntee_c_misiones_predicted_1.csv"),
    "output": here("donor_analysis/eda/output/")}

In [8]:
# Set Gemini API key
with open(paths["api"], "r") as file:
    gemini_api = yaml.safe_load(file)

client = genai.Client(api_key = gemini_api)

In [11]:
# Read test data
missions_to_classify_pre = pd.read_csv(paths["missions_to_classify"])
missions_already_done = pd.read_csv(paths["missions_already_done"])

# Filter out rows where rfc appears in the 'rfc' column of missions_already_done
missions_to_classify = missions_to_classify_pre[~missions_to_classify_pre['rfc'].isin(missions_already_done['rfc'])]

assert len(missions_to_classify) + len(missions_already_done) == len(missions_to_classify_pre), \
    "Mismatch in number of missions after filtering. Check the filtering logic."

# Read NTEE codes data
ntee_codes = pd.read_csv(paths["ntee_descrip"])

print(missions_to_classify)

              rfc                               razon_social  \
173  FHM190830319                 FUNDACION HEIFER MEXICO AC   
174  FHS201105IN9           FUNDACION HOSPITAL SAN PANCHO AC   
175  FIC180719B33                FUNDACION IVAN COCHEGRUS AC   
176  FID240119E72     FUNDACION INTERDISCIPLINARIA DONAJI AC   
177  FIM190703G54  FUNDACION INTEGRAL MEDICA DE OCCIDENTE AC   
..            ...                                        ...   
368  XTE220222LA1                           XPLORERS TEAM AC   
369  XUS210721HG9                          XU POR USTEDES AC   
370  YFT2211283J5              YAOTZIN FUEGO Y TERRITORIO AC   
371  YKI220131MG5                            YUCATAN KIDS AC   
372  YOA191002GG0                                YOAMOGDL AC   

                                                mision valores  ntee  ntee_2  
173  TRABAJAR CON COMUNIDADES PARA ALIVIAR EL HAMBR...     NaN   NaN     NaN  
174  LOGRAR EQUIPAR EL HOSPITAL PRINCIPAL DE LA CIU...     NaN   NaN     

In [12]:
# Format the NTEE data into a single string for context
ntee_context_lines = ["Here is a list of NTEE codes, their descriptions, and definitions:"]
for index, row in ntee_codes.iterrows():
    # Using the cleaned column names from the previous script: ntee_code, description, definition
    ntee_context_lines.append(f"{row.ntee_code}: {row.description} - {row.definition}")
ntee_context_string = "\n".join(ntee_context_lines)

# Construct the initial context message
# This message will be prepended to every request to provide context.
initial_context_message = {
    "role": "user",
    "parts": [{"text": ntee_context_string}]
}

In [16]:
# Create empty list to store responses
responses = []

# Iterate over each row in the test data
for row in missions_to_classify.itertuples():
    while True:
        try:
            # Construct the prompt for the current NGO
            ngo_prompt = (
                f"Based on the provided NTEE codes, what is the NTEE code "
                f"of the Mexican NGO {row.razon_social} with the mission: "
                f"{row.mision}? Give only the 3-character NTEE code without any other text."
                f"If there is insufficient information to determine the NTEE code, "
                f"respond with an empty string."
            )

            # Combine the initial NTEE context with the current NGO-specific prompt.
            # This creates a 'chat history' for each generate_content call.
            full_contents = [
                initial_context_message,
                {"role": "user", "parts": [{"text": ngo_prompt}]}
            ]

            # Generate responses using the full_contents list
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=full_contents,
            )
            responses.append(response.text)
            print(f"NGO: {row.razon_social}, Mission: {row.mision}, Predicted NTEE: {response.text}")
            time.sleep(5)  # Wait 5 seconds between requests
            break
        except Exception as e:
            if "RESOURCE_EXHAUSTED" in str(e):
                print("Rate limit hit, waiting 60 seconds...")
                time.sleep(60)
            else:
                print(f"An error occurred for NGO {row.razon_social}: {e}")
                # Depending on your needs, you might want to append None or an error message
                # to responses and continue, or re-raise the exception.
                responses.append(f"ERROR: {e}")
                break # Break out of the while loop for this row if a non-rate-limit error occurs

NGO: FUNDACION HEIFER MEXICO AC, Mission: TRABAJAR CON COMUNIDADES PARA ALIVIAR EL HAMBRE Y LA POBREZA CUIDANDO DE LA TIERRA, Predicted NTEE: Q33

Rate limit hit, waiting 60 seconds...
Rate limit hit, waiting 60 seconds...
Rate limit hit, waiting 60 seconds...


KeyboardInterrupt: 

In [None]:
responses = [response.replace("\n", "") for response in responses]  # Clean up responses

missions_classified = missions_to_classify.copy()[['rfc', 'razon_social', 'mision']]
missions_classified['predicted_ntee_code'] = responses
missions_classified['gemini'] = 1

print(missions_classified)

ValueError: Length of values (173) does not match length of index (373)

In [None]:
responses = [response.replace("\n", "") for response in responses]

# Start index of responses
i_start = 0
# End index of responses
i_end = len(responses)

missions_classified = missions_to_classify.copy().iloc[i_start:i_end][['rfc', 'razon_social', 'mision']]

missions_classified['predicted_ntee_code'] = responses
missions_classified['gemini'] = 1
print(missions_classified)

173
173
              rfc                                       razon_social  \
0    AAA990305QJ1  ASOCIACION POR AMOR Y AMISTAD UN RAYITO DE LUZ...   
1    AAC150428V11                         ALQUIMIA ARTE Y CULTURA AC   
2    AAC221220JH5               ASOCIACION ANCLA COMUNIDAD PARRAL AC   
3    AAF2011257C1                                AMOR AL AFLIGIDO AC   
4    AAL230227N90                                  AUTISMO ALDAMA AC   
..            ...                                                ...   
168  FFE200929J2A                                FUNDACION FERBEL AC   
169  FFU2311104J8                     FUNDACION FORJANDO UN ANGEL AC   
170  FGC2310102M6                              FUNDACION G CAABSA AC   
171  FHA220601AY0                          FUNDACION HE AQUI AGUA AC   
172  FHD1912189V4               FUNDACION HUMANISTICA DEMOCRATICA AC   

                                                mision predicted_ntee_code  \
0    BRINDAR UN SERVICIODE EXCELENTE CALIDAD A PE

In [None]:
# Save the updated dataframe to a CSV file
missions_classified.to_csv(paths["output"] / "ntee_c_misiones_predicted_2.csv", index=False)