In [11]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading tqdm-4.66.4-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.66.4


In [29]:
import os
from typing import Dict, List
from groq import Groq
import pandas as pd
# Get a free API key from https://console.groq.com/keys
# SECURITY: never store the key in the notebook. It is read from the
# GROQ_API_KEY environment variable; if that is not set, it is requested
# interactively (input is not echoed and is not written into the notebook).
# NOTE(review): a key that was previously committed in this cell must be
# considered leaked and should be revoked in the Groq console.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass

    os.environ["GROQ_API_KEY"] = getpass("GROQ_API_KEY: ")

# Model identifiers passed as `model=` to the chat completions endpoint.
LLAMA3_70B_INSTRUCT = "llama3-70b-8192"
LLAMA3_8B_INSTRUCT = "llama3-8b-8192"
MIXTRAL_8x7B_INSTRUCT= "mixtral-8x7b-32768"

# Model used by get_spell() unless another one is requested.
DEFAULT_MODEL = LLAMA3_8B_INSTRUCT

# Groq() is constructed without arguments, so it relies on the GROQ_API_KEY
# environment variable prepared above.
client = Groq()


In [30]:
from typing import List, Optional
import json
from enum import Enum
from pydantic import BaseModel
from groq import Groq

# Client instance used by get_spell() below. NOTE(review): this is a second
# client next to `client` created in the previous cell; one of the two looks
# redundant — confirm before removing either name.
groq = Groq()

# Squad level that a managerial spell refers to.
# The member values ("Senior", "Reserve", "Youth") are what the model is asked
# to return for `Spells.team_level`, and what main() stores in the "Level" column.
# NOTE(review): documented with comments rather than a docstring because
# pydantic appears to copy class docstrings into the generated JSON schema,
# which would change the prompt built in get_spell() — confirm.
class TeamLevel(Enum):
    Level1 = "Senior"
    Level2 = "Reserve"
    Level3 = "Youth"    

# Schema of one managerial spell as returned by the LLM.
# get_spell() embeds `Spells.model_json_schema()` in the system prompt and
# validates the model's JSON reply against this class.
# NOTE(review): kept as comments (no docstring) because pydantic appears to copy
# a model docstring into the JSON schema `description`, which would alter the
# prompt — confirm.
class Spells(BaseModel):
    # Name of the club (or national team) the spell belongs to.
    club_name: str
    # Squad level, restricted to the TeamLevel values.
    team_level: TeamLevel
    # Manager name as reported by the model.
    manager: str
    # Spell boundaries as free-form strings; no date format is enforced here.
    end_of_spell: str
    start_of_spell: str
    # League name and tier, both plain strings (tier is not validated as a number).
    league: str
    tier: str
    # Nullable fields. NOTE(review): with pydantic v2, Optional[...] without a
    # default presumably means the key must be present but may be null — confirm.
    sacked: Optional[bool]
    probability_of_sacking_true: Optional[float]


def get_spell(club_name: str, end_of_spell: str, model: Optional[str] = None) -> Spells:
    """Ask the Groq chat model for one managerial spell and parse its JSON reply.

    NOTE(review): the answer is generated by the language model from the prompt
    alone; nothing here looks the facts up, so results should be verified.

    Args:
        club_name: Club (or national team) name, interpolated into the user prompt.
        end_of_spell: End date of the spell, interpolated into the prompt as given.
        model: Groq model identifier. When omitted, ``DEFAULT_MODEL`` is used.

    Returns:
        A ``Spells`` instance validated from the content of the first choice.

    Raises:
        pydantic.ValidationError: If the reply does not satisfy the ``Spells`` schema.
        Exceptions raised by the Groq client propagate unchanged.
    """
    # Pass the json schema to the model. Pretty printing improves results.
    schema = json.dumps(Spells.model_json_schema(), indent=2)
    system_prompt = (
        "You are a football statistician that outputs football facts about managerial spells in JSON.\n"
        f" The JSON object must use the schema: {schema}"
    )
    user_prompt = f"Fetch football facts about the managerial spells at {club_name} which ended in {end_of_spell}."

    chat_completion = groq.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        # Resolved at call time so the default follows the config cell.
        model=model if model is not None else DEFAULT_MODEL,
        temperature=0,
        top_p=0.9,
        # Streaming is not supported in JSON mode
        stream=False,
        # Enable JSON mode by setting the response format
        response_format={"type": "json_object"},
    )

    # Read the reply once and validate that same text.
    content = chat_completion.choices[0].message.content
    return Spells.model_validate_json(content)

# Single-row smoke test: Italy U21, spell ending on 2008-09-10.
spell = get_spell("Italy U21", "2008-09-10")
# One value per line: club, squad level, manager.
print(spell.club_name, spell.team_level.value, spell.manager, sep="\n")

Italy U21
Youth
Piero Ausilio


# Extracting information from managerial spells

## Groq limits

Rate Limits
Rate limits act as control measures to regulate how frequently a user or application can make requests within a given timeframe.
**Current rate limits for chat completions:** you can view the current rate limits for chat completions in your organization settings.

The team is working on introducing paid tiers with stable and increased rate limits in the near future.
Status code & rate limit headers
We set the following x-ratelimit headers to inform you on current rate limits applicable to the API key and associated organization.

The following headers are set (values are illustrative):

| Header | Value | Notes |
| --- | --- | --- |
| `retry-after` | 2 | In seconds |
| `x-ratelimit-limit-requests` | 14400 | Always refers to Requests Per Day (RPD) |
| `x-ratelimit-limit-tokens` | 18000 | Always refers to Tokens Per Minute (TPM) |
| `x-ratelimit-remaining-requests` | 14370 | Always refers to Requests Per Day (RPD) |
| `x-ratelimit-remaining-tokens` | 17997 | Always refers to Tokens Per Minute (TPM) |
| `x-ratelimit-reset-requests` | 2m59.56s | Always refers to Requests Per Day (RPD) |
| `x-ratelimit-reset-tokens` | 7.66s | Always refers to Tokens Per Minute (TPM) |

When the rate limit is reached we return a 429 Too Many Requests HTTP status code.

Note, retry-after is only set if you hit the rate limit and status code 429 is returned. The other headers are always included.

In [31]:
import pandas as pd
from tqdm import tqdm
import time
from groq import BadRequestError, InternalServerError
from pydantic import ValidationError
import logging

#inputs = pd.read_csv("chunk_2.csv")
# NOTE(review): `subset` is not defined in any cell above this one in the visible
# notebook; it is created in the cells further down (In [24] / In [6]). This
# cell therefore depends on those having been run first and would raise
# NameError on a fresh kernel executed top to bottom.
# main() below reads and updates this frame in place.
inputs = subset
def main(
    max_retries: int = 3,
    retry_delay: float = 1,
    output_path: str = "chunk_0_processed_pt3.csv",
) -> None:
    """Query the LLM for every row of the global ``inputs`` frame and save it.

    For each row, ``get_spell(club_name, end_of_spell)`` is called and the
    returned fields are written back into ``inputs`` in place. Failures are
    recorded in the ``error`` column instead of stopping the run:

    * ``BadRequestError`` / ``ValidationError`` are not retried; their message
      is stored.
    * ``InternalServerError`` and ``RateLimitError`` (HTTP 429) are retried with
      exponential backoff; after ``max_retries`` attempts the exception class
      name is stored.

    Args:
        max_retries: Maximum number of attempts per row.
        retry_delay: Initial delay in seconds between attempts; doubled after
            each failed attempt and reset for every row.
        output_path: CSV file the (possibly partial) results are written to.
    """
    # Imported here so the fix is self-contained in this cell.
    from groq import RateLimitError

    try:
        # The context manager closes the progress bar even if the loop fails.
        with tqdm(total=len(inputs)) as progress_bar:
            # Iterate over the rows of the input frame
            for i, row in inputs.iterrows():
                club = row["club_name"]
                date = row["end_of_spell"]
                # Start every row from the initial delay; otherwise the backoff
                # from an earlier row would keep growing for the whole run.
                delay = retry_delay
                last_error = None

                for attempt in range(max_retries):
                    try:
                        spell = get_spell(club, date)
                    except (BadRequestError, ValidationError) as e:
                        # Not transient: record the message and move on.
                        inputs.loc[i, 'error'] = str(e)
                        break
                    except (InternalServerError, RateLimitError) as e:
                        last_error = e
                        # Only wait when another attempt will actually follow.
                        if attempt + 1 < max_retries:
                            logging.warning(f"{type(e).__name__} encountered. Retrying in {delay} seconds (attempt {attempt + 1}/{max_retries})")
                            time.sleep(delay)
                            delay *= 2  # Double the delay for the next retry
                    else:
                        # Save the output into the frame
                        inputs.loc[i, "manager"] = spell.manager
                        inputs.loc[i, "Level"] = spell.team_level.value
                        inputs.loc[i, "league"] = spell.league
                        inputs.loc[i, "tier"] = spell.tier
                        inputs.loc[i, "sacked"] = spell.sacked
                        inputs.loc[i, "probability_of_sacking_true"] = spell.probability_of_sacking_true
                        break  # Exit the retry loop if successful
                else:
                    # All attempts were used up by transient errors.
                    logging.error(f"Failed to fetch data for club '{club}' after {max_retries} attempts.")
                    inputs.loc[i, 'error'] = (
                        type(last_error).__name__ if last_error is not None else "InternalServerError"
                    )

                # Exactly one tick per row, whatever the outcome.
                progress_bar.update(1)
    finally:
        # Persist whatever has been collected, even when the run is interrupted
        # or an unexpected error escapes, so completed rows are not lost.
        inputs.to_csv(output_path, index=False)

In [32]:

# Start the batch run defined in main().
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# this guard presumably always passes here — confirm if the code is moved to a module.
if __name__ == "__main__":
    main()


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

In [33]:
# Load the three partial result files, stack them in order and write the
# combined table (without the index) to a single CSV.
part_files = (
    "chunk_0_processed_pt1.csv",
    "chunk_0_processed_pt2.csv",
    "chunk_0_processed_pt3.csv",
)
ch1, ch2, ch3 = map(pd.read_csv, part_files)
processed = pd.concat([ch1, ch2, ch3])
processed.to_csv("chunk_0_processed.csv", index=False)

In [22]:
# Columns that main() fills in (results plus the error marker).
subset_columns = [
    'manager',
    'Level',
    'league',
    'tier',
    'sacked',
    'probability_of_sacking_true',
    'error',
]
# Display the rows where at least one of those columns is populated.
inputs[inputs[subset_columns].notna().any(axis=1)]

Unnamed: 0,club_name,end_of_spell,manager,Level,league,tier,sacked,probability_of_sacking_true,error
3924,1.FC Nuremberg,2019-11-05,Damir Canadi,Senior,2. Bundesliga,2,True,0.85,
3925,Atromitos,2019-05-18,Damir Canadi,Senior,Super League Greece,1,True,0.85,
3926,Rapid Vienna,2017-04-09,Zoran Barisic,Senior,Austrian Bundesliga,1,True,0.85,
3927,SCR Altach,2016-11-10,Damir Canadi,Senior,Austrian Bundesliga,1,True,0.85,
3928,FC Lustenau,2013-01-06,Kurt Jara,Senior,Austrian Football Bundesliga,1,True,0.80,
...,...,...,...,...,...,...,...,...,...
5124,SV Wehen,2002-11-03,Günter Sebert,Senior,Regionalliga Süd,3,True,0.80,
5125,FC Augsburg,1999-06-30,Gerd Schädler,Senior,2. Bundesliga,2,True,0.80,
5126,B. Neunkirchen,1996-04-16,Werner Kern,Senior,2. Bundesliga,2,True,0.80,
5127,Eintracht Trier,1994-02-23,Werner Kern,Senior,2. Bundesliga,2,True,0.85,


In [24]:
# True for rows that already have at least one result/error column filled in.
has_result = inputs[subset_columns].notna().any(axis=1)
# Processed rows are saved as part 2 ...
inputs[has_result].to_csv("chunk_0_processed_pt2.csv", index=False)
# ... and the still-empty rows become the next batch to process.
subset = inputs[~has_result]
subset

Unnamed: 0,club_name,end_of_spell,manager,Level,league,tier,sacked,probability_of_sacking_true,error
5129,FC 08 Homburg,1989-06-30,,,,,,,
5130,B. Neunkirchen,1990-06-30,,,,,,,
5131,FC 08 Homburg,1988-02-06,,,,,,,
5132,FC 08 Homburg,1987-06-30,,,,,,,
5133,FC Stätzling,2016-06-30,,,,,,,
...,...,...,...,...,...,...,...,...,...
9691,FC Martigues,2003-06-30,,,,,,,
9692,Cannes,2001-06-30,,,,,,,
9693,FC Gueugnon,1998-06-30,,,,,,,
9694,SEC Bastia,1991-06-30,,,,,,,


In [6]:
df = inputs
# Columns that main() fills in (results plus the error marker).
subset_columns = [
    'manager',
    'Level',
    'league',
    'tier',
    'sacked',
    'probability_of_sacking_true',
    'error',
]
# Rows in which every one of those columns is still empty, i.e. not yet processed.
untouched = inputs[subset_columns].isna().all(axis=1)
subset = inputs[untouched]
print(subset)
# inputs.to_csv("LLM_output_augmented_part2.csv", index=False)
# part1 = pd.read_csv("LLM_output_augmented.csv")
# part2 = pd.read_csv("LLM_output_augmented_part2.csv")
# part3 = pd.read_csv("LLM_output_augmented_part3.csv")
# subset_columns=['manager','Level','league','tier','sacked','probability_of_sacking_true','error']
# part1[~part1[subset_columns].isna().all(axis=1)]

             club_name end_of_spell manager Level league tier sacked  \
810    Cagliari Calcio   2025-06-30     NaN   NaN    NaN  NaN    NaN   
811            Watford   2022-01-24     NaN   NaN    NaN  NaN    NaN   
812          Sampdoria   2021-06-30     NaN   NaN    NaN  NaN    NaN   
813            AS Roma   2019-06-30     NaN   NaN    NaN  NaN    NaN   
814             Fulham   2019-02-28     NaN   NaN    NaN  NaN    NaN   
...                ...          ...     ...   ...    ...  ...    ...   
9691       Real Oviedo   1953-05-04     NaN   NaN    NaN  NaN    NaN   
9692          Zaragoza   1951-06-30     NaN   NaN    NaN  NaN    NaN   
9693         CD Málaga   1949-06-30     NaN   NaN    NaN  NaN    NaN   
9694  Deportivo Coruña   1948-06-30     NaN   NaN    NaN  NaN    NaN   
9695       Hércules CF   1947-06-30     NaN   NaN    NaN  NaN    NaN   

     probability_of_sacking_true error  
810                          NaN   NaN  
811                          NaN   NaN  
812         