In [4]:
import re
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
import pandas as pd
from langchain_core.output_parsers import StrOutputParser
import statistics
import timeout_decorator
import time
import numpy as np

In [5]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import sys, os

import configparser

credential_file = "credentials.ini"
credential_config = configparser.ConfigParser()
credential_config.read(credential_file)

ISDM_API_KEY = credential_config['ISDM']["ISDM_API_KEY"]

# Set the environment variables from shell environment
OPENAI_API_KEY = ISDM_API_KEY
OPENAI_CHAT_MODEL = "solidrust/Codestral-22B-v0.1-hf-AWQ"
OPENAI_CHAT_API_URL = "https://isdm-chat.crocc.meso.umontpellier.fr/openai"
model = OPENAI_CHAT_MODEL

llm = ChatOpenAI(
    model=OPENAI_CHAT_MODEL,
    openai_api_key=OPENAI_API_KEY,
    openai_api_base=OPENAI_CHAT_API_URL,
)

from langchain_core.output_parsers import StrOutputParser
parser = StrOutputParser()

In [6]:
template = """
You are a statician working for the European commission at EUROSTAT. 
You have to give the average income per inhabitant by NUTS_2 levels.
Don't compute it, just guess the income.
Only answer with the average income (not a range, just the income) without any other words.
Example of answer: "30000", with no thousands separator.

Question: What is the average income per inhabitant for {NUTS_ID}
"""

prompt = PromptTemplate.from_template(template)
# result = prompt.format(NUTS_ID = "Auvergne")
# result


In [7]:
chain = prompt | llm | parser

@timeout_decorator.timeout(10, use_signals=False)  # Set the timeout to 10 seconds
def invoke_with_timeout(NUTS_ID):
    return chain.invoke({"NUTS_ID": NUTS_ID})

def prediction(NUTS_ID):
    try:
        result = invoke_with_timeout(NUTS_ID)  # Try to invoke within the timeout
    except timeout_decorator.timeout_decorator.TimeoutError:
        print(f"Timeout occurred for NUTS_ID: {NUTS_ID}. Retrying...")
        time.sleep(2)  # Optional sleep time before retrying
        try:
            result = invoke_with_timeout(NUTS_ID)  # Retry the operation
        except:
            return np.nan
    number = re.search(r'\d+', result).group()
    number = int(number)
    return number

def average_prediction(NUTS_ID):
    number_prediction = 3
    predictions = []

    for _ in range(number_prediction):
        prediction_value = prediction(NUTS_ID)
        predictions.append(prediction_value)

    # Filter out np.nan values from predictions
    predictions = [p for p in predictions if not np.isnan(p)]

    if predictions:  
        average = sum(predictions) / len(predictions)
        deviation = statistics.stdev(predictions) if len(predictions) > 1 else 0  
    else:
        average, deviation = np.nan, np.nan 

    return average, deviation

In [8]:
result = prediction("Auvergne")
result

Timeout occurred for NUTS_ID: Auvergne. Retrying...


35000

In [9]:
result = average_prediction("Ile de France")
result

(37333.333333333336, 6806.8592855540455)

## Load data from Eurostats

In [12]:
import pandas as pd

year = 2017
NUTS_Level = 2

df = pd.read_csv(f"./output/gdp_{year}_nuts_{NUTS_Level}.csv")

In [18]:
import pycountry

df["country"] = df["CNTR_CODE"].apply(lambda code: pycountry.countries.get(alpha_2=code).name if pycountry.countries.get(alpha_2=code) else "Unknown")
df["country"].unique()

array(['Austria', 'Albania', 'Belgium', 'Bulgaria', 'Switzerland',
       'Czechia', 'Cyprus', 'Germany', 'Denmark', 'Unknown', 'Estonia',
       'Spain', 'France', 'Finland', 'Croatia', 'Hungary', 'Italy',
       'Ireland', 'Norway', 'Netherlands', 'Montenegro',
       'North Macedonia', 'Lithuania', 'Malta', 'Luxembourg', 'Latvia',
       'Romania', 'Poland', 'Portugal', 'Serbia', 'Türkiye', 'Sweden',
       'Slovakia', 'Slovenia'], dtype=object)

In [None]:
from tqdm import tqdm
tqdm.pandas()

df[[f"{year}_predicted", f"{year}_deviation"]] = df["NUTS_NAME"].progress_apply(average_prediction).apply(pd.Series)


In [None]:
df.to_csv(f'./output/gdp_{year}_nuts_{NUTS_Level}_llm_{model.split("/")[1].split("-")[0]}.csv')

In [None]:
df["2017_predicted"]

0       35000
1       27000
2          11
3          12
4       25000
        ...  
1695    25000
1696    25000
1697    25000
1698    25000
1699    25000
Name: 2017_predicted, Length: 1700, dtype: int64