In [None]:
import os
from getpass import getpass
from typing import Dict, List

import pandas as pd
from groq import Groq

# Get a free API key from https://console.groq.com/keys and export it as
# GROQ_API_KEY before starting the kernel. Credentials must never be written
# into the notebook itself: a notebook is shared and committed together with
# its source and outputs. Any key that was previously pasted here should be
# treated as leaked and revoked in the Groq console.
if not os.environ.get("GROQ_API_KEY"):
    # Interactive fallback; getpass does not echo the value into the cell output.
    os.environ["GROQ_API_KEY"] = getpass("GROQ_API_KEY: ")

# Model identifiers passed as the `model` argument of chat completions.
LLAMA3_70B_INSTRUCT = "llama3-70b-8192"
LLAMA3_8B_INSTRUCT = "llama3-8b-8192"

DEFAULT_MODEL = LLAMA3_70B_INSTRUCT

# Built without arguments, so the client relies on GROQ_API_KEY set above.
client = Groq()


In [None]:
from typing import List, Optional
import json
from enum import Enum
from pydantic import BaseModel
from groq import Groq

# Client used by get_spell() for chat completions. The first cell already
# builds an equivalent `client`, so this is a second instance.
# NOTE(review): the variable reuses the name of the `groq` package.
groq = Groq()

# Allowed values for Spells.team_level. The string values (not the member
# names) are what main() stores in the "Level" column via `.value`.
# Documented with comments rather than a docstring on purpose: this enum is
# part of the JSON schema that get_spell() embeds in its prompt.
class TeamLevel(Enum):
    Level1 = "Senior"
    Level2 = "Reserve"
    Level3 = "Youth"

# Shape of the JSON object the LLM is asked to return for one managerial spell.
# get_spell() embeds this model's JSON schema in the system prompt and validates
# the reply against it. Documented with comments rather than a docstring on
# purpose, so the schema text sent to the model stays exactly as it is.
class Spells(BaseModel):
    club_name: str
    team_level: TeamLevel
    manager: str
    # Both dates are plain strings; no date format is enforced here.
    end_of_spell: str
    start_of_spell: str
    league: str
    tier: str
    # NOTE(review): Optional without a default. Under pydantic v2 (implied by the
    # model_validate_json / model_json_schema calls) the key must still be
    # present in the reply, although its value may be null — confirm intended.
    sacked: Optional[bool]
    # presumably a probability in [0, 1]; the range is not validated here.
    probability_of_sacking_true: Optional[float]


def get_spell(club_name: str, end_of_spell: str) -> Spells:
    """Ask the LLM for the managerial spell at a club that ended on a given date.

    Args:
        club_name: Club (or team) name inserted verbatim into the user prompt.
        end_of_spell: End date of the spell, inserted verbatim into the user prompt.

    Returns:
        The model's JSON reply parsed and validated as a ``Spells`` instance.

    Nothing is caught here: errors raised by the Groq client or by pydantic
    validation propagate to the caller.
    """
    # The JSON schema is handed to the model; pretty printing improves results.
    schema_text = json.dumps(Spells.model_json_schema(), indent=2)
    system_prompt = (
        "You are a football statistician that outputs football facts about managerial spells in JSON.\n"
        f" The JSON object must use the schema: {schema_text}"
    )
    user_prompt = f"Fetch football facts about the managerial spells at {club_name} which ended in {end_of_spell}."

    response = groq.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model=DEFAULT_MODEL,
        temperature=0,
        top_p=0.9,
        # JSON mode (response_format below) does not support streaming.
        stream=False,
        response_format={"type": "json_object"},
    )

    reply_json = response.choices[0].message.content
    return Spells.model_validate_json(reply_json)

# Single-row smoke test: Italy U21, spell ending 2008-09-10.
spell = get_spell("Italy U21", "2008-09-10")
# One value per line: club, team level, manager.
print(spell.club_name, spell.team_level.value, spell.manager, sep="\n")

# Extracting information from managerial spells

In [11]:
# Load the three result files written by earlier runs.
part1, part2, part3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "LLM_output_augmented.csv",
        "LLM_output_augmented_part2.csv",
        "LLM_output_augmented_part3.csv",
    )
)

# Columns filled in by the LLM run (plus the error column).
subset_columns = [
    'manager',
    'Level',
    'league',
    'tier',
    'sacked',
    'probability_of_sacking_true',
    'error',
]

# Show the rows of part1 in which at least one of those columns has a value.
part1_has_value = part1[subset_columns].notna().any(axis=1)
part1[part1_has_value]

Unnamed: 0,club_name,end_of_spell,manager,Level,league,tier,sacked,probability_of_sacking_true,error


In [14]:
# Number of part2 rows in which none of the result columns was filled in.
# The filtered frame that used to be built on the line above was discarded
# (only a cell's last expression is displayed), so it has been removed, and the
# direct `.__len__()` call is replaced by counting the True values of the mask.
part2_all_missing = part2[subset_columns].isna().all(axis=1)
print(int(part2_all_missing.sum()))

49661


## Groq limits

Rate Limits
Rate limits act as control measures to regulate how frequently a user or application can make requests within a given timeframe.
Current rate limits for chat completions:
You can view the current rate limits for chat completions in your organization settings.

The team is working on introducing paid tiers with stable and increased rate limits in the near future.
Status code & rate limit headers
We set the following x-ratelimit headers to inform you on current rate limits applicable to the API key and associated organization.

The following headers are set (values are illustrative):

| Header | Value | Notes |
| --- | --- | --- |
| `retry-after` | `2` | In seconds |
| `x-ratelimit-limit-requests` | `14400` | Always refers to Requests Per Day (RPD) |
| `x-ratelimit-limit-tokens` | `18000` | Always refers to Tokens Per Minute (TPM) |
| `x-ratelimit-remaining-requests` | `14370` | Always refers to Requests Per Day (RPD) |
| `x-ratelimit-remaining-tokens` | `17997` | Always refers to Tokens Per Minute (TPM) |
| `x-ratelimit-reset-requests` | `2m59.56s` | Always refers to Requests Per Day (RPD) |
| `x-ratelimit-reset-tokens` | `7.66s` | Always refers to Tokens Per Minute (TPM) |

When the rate limit is reached we return a 429 Too Many Requests HTTP status code.

Note, retry-after is only set if you hit the rate limit and status code 429 is returned. The other headers are always included.

In [None]:
# Close a progress bar left open by an interrupted run. On a fresh kernel no
# bar exists yet (it is created in a later cell), so look the name up instead
# of failing with NameError during "Restart & Run All".
_stale_progress_bar = globals().get("progress_bar")
if _stale_progress_bar is not None:
    _stale_progress_bar.close()

In [None]:
import pandas as pd
from tqdm import tqdm
import time
from groq import BadRequestError, InternalServerError
from pydantic import ValidationError
import logging  # Import logging module
# Rows that still need an answer. `df_all_na_subset` is a boolean-mask slice of
# another frame, and main() writes into `inputs` with .loc, so take an explicit
# copy: that avoids SettingWithCopyWarning and makes ownership of the data clear.
# NOTE(review): `df_all_na_subset` is only defined in a cell further down, so
# this cell depends on execution order.
inputs = df_all_na_subset.copy()

# Create a progress bar with one tick per input row
progress_bar = tqdm(total=len(inputs))
    

def main():
    """Query the LLM for every row of ``inputs`` and store the answers in place.

    For each row, ``get_spell(club_name, end_of_spell)`` is tried up to
    ``max_retries`` times:

    * on success the result columns ("manager", "Level", "league", "tier",
      "sacked", "probability_of_sacking_true") are filled in;
    * ``BadRequestError`` / ``ValidationError`` trigger an immediate retry;
    * ``InternalServerError`` triggers a retry after an exponentially growing
      delay that starts again at one second for every row.

    If every attempt fails, the row's "error" column receives the last failure
    (the exception text, or "InternalServerError"). A row that eventually
    succeeds gets no error written. The shared ``progress_bar`` advances by
    exactly one per row regardless of outcome.

    Reads and mutates the notebook-level ``inputs`` and ``progress_bar``;
    returns nothing. Any other exception propagates to the caller.
    """
    max_retries = 3  # Maximum number of attempts per row
    initial_retry_delay = 1  # Seconds to wait after the first server error

    # Iterate over the rows still to be processed
    for i, row in inputs.iterrows():
        club = row["club_name"]
        date = row["end_of_spell"]

        # Backoff starts over for each row, so one flaky row cannot inflate the
        # waiting time of all the rows after it.
        retry_delay = initial_retry_delay
        last_error = None

        for attempt in range(max_retries):
            try:
                spell = get_spell(club, date)
            except (BadRequestError, ValidationError) as e:
                # Malformed request or a reply that does not fit the schema:
                # remember the reason and try again straight away.
                last_error = str(e)
                continue
            except InternalServerError:
                last_error = "InternalServerError"
                # Only wait when another attempt will actually follow.
                if attempt + 1 < max_retries:
                    logging.warning(f"InternalServerError encountered. Retrying in {retry_delay} seconds (attempt {attempt + 1}/{max_retries})")
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Double the delay for the next retry
                continue

            # Success: copy the answer into the frame.
            inputs.loc[i, "manager"] = spell.manager
            inputs.loc[i, "Level"] = spell.team_level.value
            inputs.loc[i, "league"] = spell.league
            inputs.loc[i, "tier"] = spell.tier
            inputs.loc[i, "sacked"] = spell.sacked
            inputs.loc[i, "probability_of_sacking_true"] = spell.probability_of_sacking_true
            break
        else:
            # Every attempt failed: record the real reason for the last failure.
            logging.error(f"Failed to fetch data for club '{club}' after {max_retries} attempts.")
            inputs.loc[i, 'error'] = last_error

        # Exactly one tick per row, whether it succeeded or not.
        progress_bar.update(1)

if __name__ == "__main__":
    # Run the augmentation first, then close the bar and save. Closing and
    # saving before main() would write the still-unprocessed frame to disk.
    try:
        main()
    finally:
        # Also reached on an error or a manual interrupt, so the rows completed
        # so far are not lost.
        progress_bar.close()
        inputs.to_csv("LLM_output_augmented.csv", index=False)


In [None]:
df = inputs
subset_columns = [
    'manager',
    'Level',
    'league',
    'tier',
    'sacked',
    'probability_of_sacking_true',
    'error',
]

# True for rows in which every result column is still empty.
all_na_mask = df[subset_columns].isna().all(axis=1)

# Rows still to be processed; used as the input of the processing cell.
df_all_na_subset = df[all_na_mask]
print(df_all_na_subset)

# Rows with at least one value go to part3; the complete frame goes to part2.
inputs[~all_na_mask].to_csv("LLM_output_augmented_part3.csv", index=False)
inputs.to_csv("LLM_output_augmented_part2.csv", index=False)