### LAI time series imputation - Demonstration 

### Load data 

In [None]:
import joblib 
import pandas as pd 
import numpy as np 
from imputation import * 


# Dataset to process; it selects the sub-folder used by every input path below.
country = 'france'

# Pre-computed inputs. The leading '...' segments are placeholders that must be
# replaced with the local data directory.
# NOTE: np.load(allow_pickle=True) and joblib.load unpickle arbitrary objects;
# only point them at files you produced yourself or otherwise trust.
df_missing = pd.read_csv(f'...../{country}/df_missing.csv')
df_missing_scaled = pd.read_csv(f'..../{country}/df_missing_scaled.csv')
low_missing_values_combined_lai_df = pd.read_csv(f'.../{country}/low_missing_values_combined_lai_df.csv')
low_missing_values_series_array = np.load(f"..../{country}/low_missing_values_series_array.npy", allow_pickle=True)
scaler = joblib.load(f'..../{country}/scaler.pkl')

# Settings forwarded to the helpers imported from `imputation`.
parameters = {
    'time_column' : 'time',
    'sep' : ',',
    'header' : 0,
    'preprocessing': False,
    'index': False,
    'train_params': {
        "gap_type": "random",
        "miss_perc": 0.1,
        "gap_length": 7,
        "max_gap_length": 7,
        "max_gap_count": 3,
        "random_seed": 1
    }
}

time_column = parameters['time_column']
header = parameters['header']
sep = parameters['sep']
preprocessing = parameters['preprocessing']
index = parameters['index']
train_params = parameters['train_params']

# errors='ignore' keeps this call working when the CSV was written without the
# pandas index column, matching the guarded clean-up loop further down.
df_scaled, scaler = dataframe_scaler(df_input=low_missing_values_combined_lai_df.drop(columns=['Unnamed: 0'], errors='ignore'),
                                time_column=time_column,
                                header=header,
                                sep=sep,
                                preprocessing=preprocessing,
                                index=index)

# The scaler returned by dataframe_scaler above is discarded on purpose: the
# persisted one is reloaded so that it is the one used for inverse scaling.
scaler = joblib.load(f'.../{country}/scaler.pkl')

# Remove the index column that pandas writes when a frame is saved with its index.
for frame in [df_missing, df_missing_scaled, low_missing_values_combined_lai_df, df_scaled]:
    if "Unnamed: 0" in frame.columns:
        frame.drop(columns=["Unnamed: 0"], inplace=True)

  @autocast(enabled=False)
  @autocast(enabled=False)


### State-of-the-art deep learning algorithms for time series imputation

In [None]:
# Configuration for the deep-learning imputers. The top-level keys are
# forwarded to run_imputation; "params" holds one hyper-parameter set per
# entry of "algorithms".
parameters = dict(
    time_column='time',
    sep=',',
    header=0,
    is_multivariate=False,
    areaVStime=0,
    preprocessing=False,
    index=False,
    algorithms=["TimesNet", "saits", "csdi"],
    params=dict(
        saits=dict(
            n_layers=2,
            d_model=32,
            d_ffn=128,
            n_heads=2,
            d_k=16,
            d_v=128,
            dropout=0,
            epochs=200,
            batch_size=32,
            ORT_weight=1,
            MIT_weight=1,
            attn_dropout=0.3,
            diagonal_attention_mask=True,
            num_workers=0,
            patience=20,
            lr=0.001224014111277117,
            device="cuda:1",
        ),
        TimesNet=dict(
            n_layers=1,
            d_model=128,
            d_ffn=32,
            n_heads=1,
            top_k=2,
            n_kernels=3,
            dropout=0.3,
            epochs=200,
            batch_size=32,
            attn_dropout=0.4,
            apply_nonstationary_norm=True,
            num_workers=0,
            patience=20,
            lr=0.0015351577260316839,
            device="cuda:1",
        ),
        csdi=dict(
            n_layers=4,
            n_channels=32,
            n_heads=4,
            d_time_embedding=64,
            d_feature_embedding=3,
            d_diffusion_embedding=64,
            is_unconditional=False,
            beta_start=0.0001,
            beta_end=0.1,
            epochs=10,
            batch_size=32,
            n_diffusion_steps=50,
            num_workers=0,
            patience=3,
            lr=0.001,
            device="cuda:1",
        ),
    ),
)

# Unpack into the module-level names that the following cells read.
time_column = parameters['time_column']
header = parameters['header']
sep = parameters['sep']
is_multivariate = parameters['is_multivariate']
areaVStime = parameters['areaVStime']
preprocessing = parameters['preprocessing']
index = parameters['index']
algorithms = parameters['algorithms']
params = parameters['params']

# The scaled frame is the model input; pass df_missing instead to work on the
# unscaled values.
dict_of_imputed_dfs = run_imputation(
    missing=df_missing_scaled,
    algorithms=algorithms,
    params=params,
    time_column=time_column,
    header=header,
    sep=sep,
    is_multivariate=is_multivariate,
    areaVStime=areaVStime,
    preprocessing=preprocessing,
    index=index,
)

In [None]:
# Evaluation mode read by the next cell; the values it handles are False, True and 'both'.
scaling = 'both'
# Name looked up in globals() by the next cell to decide whether the metrics
# dict already exists or has to be created.
variable_name = 'results'

In [None]:
import matplotlib.pyplot as plt


# Score every imputed dataframe against the reference data and store the
# metrics in the module-level ``results`` dict under "<algorithm>_tuned".
# ``results`` is created on the first run and extended on later runs.
#
# ``scaling`` selects which frames are compared:
#   False  -> frames are used as loaded, without inverse scaling;
#   True   -> imputed frames, df_scaled and df_missing are inverse-scaled first;
#   'both' -> like True, but the masked frame is df_missing_scaled.


def _inverse_scale(df_input):
    """Run dataframe_inverse_scaler on df_input with the notebook-wide settings."""
    return dataframe_inverse_scaler(df_input=df_input,
                                    scaler=scaler,
                                    time_column=time_column,
                                    header=header,
                                    sep=sep,
                                    preprocessing=preprocessing,
                                    index=index)


# Equality (not identity) is used on purpose so 0/1 keep behaving like
# False/True, exactly as in the previous if/elif chain.
if scaling == False:

    imputed_by_algorithm = dict_of_imputed_dfs
    # The reference frame is used with all of its columns in this mode.
    reference_source = low_missing_values_combined_lai_df
    reference_index_column = df_missing.columns[0]
    masked_source = df_missing
    masked_index_column = df_missing.columns[0]

elif scaling == True or scaling == 'both':

    dict_of_imputed_dfs_inversed = {
        key: _inverse_scale(imputed) for key, imputed in dict_of_imputed_dfs.items()
    }
    df_scaled_inversed = _inverse_scale(df_scaled)

    # NOTE(review): with scaling == True the *unscaled* df_missing is passed to
    # the inverse scaler; confirm this is intended ('both' uses df_missing_scaled).
    masked_input = df_missing if scaling == True else df_missing_scaled
    df_missing_inversed = _inverse_scale(masked_input)

    imputed_by_algorithm = dict_of_imputed_dfs_inversed
    reference_source = df_scaled_inversed[low_missing_values_series_array]
    reference_index_column = df_scaled.columns[0]
    masked_source = df_missing_inversed
    masked_index_column = masked_input.columns[0]

else:
    # Previously an unknown value silently produced no results at all.
    raise ValueError(f"Unsupported scaling value {scaling!r}; expected False, True or 'both'.")

## Check if the variable exists in the global namespace #it could be also the locals()
if variable_name in globals():
    print(f"The variable '{variable_name}' exists.")
else:
    print(f"The variable '{variable_name}' does not exist. We create it")
    results = {}

for algorithm, imputed_full in imputed_by_algorithm.items():

    # The frames are rebuilt for every algorithm so compute_metrics always
    # receives fresh objects.
    new_df_reference = reference_source.set_index(reference_index_column)
    new_df_missing = masked_source[low_missing_values_series_array].set_index(masked_index_column)

    imputed_df = imputed_full[low_missing_values_series_array]
    new_imputed_df = imputed_df.set_index(imputed_df.columns[0])

    # Computed once and reused for both storing and printing.
    metrics = compute_metrics(new_df_reference, new_df_missing, new_imputed_df)
    results[algorithm + "_" + "tuned"] = dict(metrics)

    print(algorithm, metrics, '\n')

### LLM-based time series imputation 

In [None]:
from utility_functions import *
import pandas as pd 
import numpy as np 
from imputation import * 
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from typing import Optional
from typing import List
from langchain_groq import ChatGroq
from langchain_core.pydantic_v1 import BaseModel, Field, validator
import os 
import uuid
from typing import Dict
from utility_functions import * 
import time 
from tenacity import RetryError
from langchain_core.messages import (AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage)
import json 
from datetime import datetime
from tenacity import retry, stop_after_attempt, wait_fixed
import argparse
from langchain_ollama import ChatOllama
from ollama import chat
from pydantic import BaseModel
from typing import List 
import math 
import numpy as np
import faiss
from langchain import hub
import uuid
from langchain_core.documents import Document
import geopandas as gpd 
from langchain_core.runnables import RunnableLambda
from LLM4TS_imputation.examples_creation import generate_llm_examples
from LLM4TS_imputation.utils import handle_llm_output


def inspect(state):
    """Debug tap for a chain: echo `state` to stdout and hand it back unchanged."""
    print(state)
    return state


def impute_missing_values(series):
    """Return `series` with every NaN replaced by the mean of its non-NaN values."""
    is_missing = np.isnan(series)
    fill_value = np.nanmean(series)
    return np.where(is_missing, fill_value, series)

# Convert a time series into a fixed-length embedding
def extract_features(series):
    """Summarise `series` as 7 statistics.

    The returned array holds, in order: mean, standard deviation, minimum,
    maximum, 25th percentile, median and 75th percentile.
    """
    lower_quartile, median, upper_quartile = np.percentile(series, [25, 50, 75])
    summary = [
        np.mean(series),
        np.std(series),
        np.min(series),
        np.max(series),
        lower_quartile,
        median,
        upper_quartile,
    ]
    return np.array(summary)

# Extract embeddings for all time series in the dataset
def build_embeddings(time_series_data):
    """Apply extract_features to each series and collect the results in one array.

    Alternatives that were tried and left disabled: using the raw series as
    the embedding, or the mean-imputed series (impute_missing_values).
    """
    feature_rows = list(map(extract_features, time_series_data))
    return np.array(feature_rows)

# Build a FAISS index
def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index holding the rows of `embeddings`.

    Args:
        embeddings: 2-D array-like with one embedding per row.

    Returns:
        A faiss.IndexFlatL2 containing every row of `embeddings`.
    """
    # FAISS works on float32, C-contiguous matrices; the feature extractor
    # yields float64, so convert explicitly instead of relying on the wrapper.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)  # L2 distance
    index.add(vectors)
    return index

# Query FAISS for k most similar time series
def query_similar_time_series(query_series, index, time_series_data, k):
    """Look up the k indexed series whose features are closest to `query_series`.

    Args:
        query_series: values of the series to search for; it is summarised
            with extract_features before the lookup.
        index: FAISS index to search.
        time_series_data: sequence the index rows refer to, in the same order.
        k: number of neighbours requested.

    Returns:
        (neighbours, indices, distances): the matching entries of
        `time_series_data`, followed by the raw `indices` and `distances`
        arrays returned by `index.search`.
    """
    # FAISS expects a float32, C-contiguous (1, dim) query matrix.
    query_embedding = np.ascontiguousarray(
        extract_features(query_series).reshape(1, -1), dtype=np.float32
    )
    distances, indices = index.search(query_embedding, k)
    # NOTE: FAISS pads `indices` with -1 when fewer than k vectors are indexed;
    # such entries would select the last series here, so keep k <= index.ntotal.
    return [time_series_data[i] for i in indices[0]], indices, distances

def build_rag(df_missing):
    """Build the retrieval structures used to find similar series.

    This function used to be defined twice in a row; only the second
    definition was ever effective, and this single definition keeps its
    behaviour.

    Args:
        df_missing: dataframe with a 'time' column and one column per series.

    Returns:
        (time_series_data, column_names, name_to_index, index_to_name,
        embeddings, index): the series as rows of an array, the series column
        labels, the label<->row mappings, the per-series embeddings and the
        FAISS index built from them.
    """
    # Drop the time axis *before* interpolating: interpolating the
    # non-numeric 'time' column is deprecated in pandas and does nothing useful.
    series_df = df_missing.drop(columns=['time'])
    column_names = series_df.columns  # Store column names

    # Linear interpolation fills interior gaps; time steps that still contain
    # a NaN in any series afterwards are dropped for all series.
    time_series_data = series_df.interpolate(method='linear').dropna().T.values

    name_to_index = {name: idx for idx, name in enumerate(column_names)}  # Mapping column names to numeric indices
    index_to_name = {idx: name for name, idx in name_to_index.items()}  # Reverse mapping for lookup
    embeddings = build_embeddings(time_series_data)
    index = build_faiss_index(embeddings)
    return time_series_data, column_names, name_to_index, index_to_name, embeddings, index


####################
### LLM prompting 
####################

@retry(stop=stop_after_attempt(3), wait=wait_fixed(3))
def make_request(df_missing, serie, missing_date, chain):
    """Ask the LLM chain for the LAI value of one missing date.

    The prompt lists the non-missing (time, value) rows of `serie` and asks
    for the value at `missing_date[0]`. The call is retried by the decorator
    (3 attempts, 3 seconds apart).

    Returns:
        (llm_response, elapsed_time): the chain output and the seconds spent
        in this attempt.
    """
    started_at = time.time()

    observed_rows = df_missing[['time', serie]].dropna().values
    human_message_content = (
        f"The following array represents the Leaf Area Index values for a crop for certain dates: [{observed_rows}]. \n"
        f"    Estimate the float number of Leaf Area Index value for the requested date {missing_date[0]}. "
        "Return the float LAI value, the date itself, and give explanation."
    )

    print('PROMPT GIVEN TO LLM:', human_message_content)

    # Invoke the model with langchain
    request_payload = {"msgs": [HumanMessage(content=human_message_content)]}
    llm_response = chain.invoke(request_payload)

    return llm_response, time.time() - started_at

#######################################################

class AnswerandJustification(BaseModel):
    '''An answer to the user question '''

    # Structured-output schema: passed to model.with_structured_output(...) and
    # instantiated by create_examples to build few-shot examples.
    # NOTE(review): the class docstring above is presumably sent to the model as
    # the schema description, so treat it as runtime text - confirm before editing.

    # Disabled earlier variant: a single `answer: str` field holding the whole
    # list of (date, LAI value) pairs with the missing value imputed.
    answer_value: float  # the estimated Leaf Area Index value
    answer_date: str  # the date the estimate was requested for
    answer_explanation: str  # free-text justification of the estimate

def create_examples(example, requested_date):
    """Build one few-shot example: a prompt and the structured answer to it.

    Args:
        example: the (date, value) observations shown to the model.
        requested_date: sequence whose first item is the date to estimate and
            whose second item is the value used as the expected answer.

    Returns:
        (msgs, answer): the prompt text and the matching
        AnswerandJustification instance.
    """
    target_date, target_value = requested_date[0], requested_date[1]

    msgs = f"The following array represents the Leaf Area Index values for a crop for certain dates: {example}. Estimate the float number Leaf Area Index value for the requested date {target_date}. Return the float LAI value, the date itself and give explanation."

    # The prompt used to be passed here as an extra `msgs=` argument as well;
    # the model declares no such field, so it is no longer sent.
    expected_answer = AnswerandJustification(
        answer_date=target_date,
        answer_value=target_value,
        answer_explanation=f'I guess that the missing LAI value for that date is {target_value} because of the season',
    )
    print('EXAMPLE MESSAGES -> create_examples function:', 'answer_date:', target_date, 'answer_value:', target_value)
    return msgs, expected_answer

def tool_example_to_messages(example: Dict) -> List[BaseMessage]:
    """Turn one example dict into a human / AI tool-call / tool-reply message list.

    `example` must provide "input" (the human text) and "tool_calls" (objects
    exposing `.json()`); an optional "tool_outputs" list supplies the tool
    replies, otherwise a fixed confirmation string is used for every call.
    """
    conversation: List[BaseMessage] = [HumanMessage(content=example["input"])]

    # One OpenAI-style function call per tool-call object, each with a fresh id.
    serialized_calls = [
        {
            "id": str(uuid.uuid4()),
            "type": "function",
            "function": {
                "name": call.__class__.__name__,
                "arguments": call.json(),
            },
        }
        for call in example["tool_calls"]
    ]
    conversation.append(
        AIMessage(content="", additional_kwargs={"tool_calls": serialized_calls})
    )

    default_replies = ["You have correctly called this tool."] * len(serialized_calls)
    replies = example.get("tool_outputs") or default_replies
    conversation.extend(
        ToolMessage(content=reply, tool_call_id=call["id"])
        for reply, call in zip(replies, serialized_calls)
    )
    return conversation

###########################################################################

#example_msgs = [msg for ex in examples for msg in tool_example_to_messages(ex)]

# Boolean frame used by the main loop to pick, per series, the dates that the
# LLM has to estimate (computed from the reference frame and df_missing).
mask = find_new_nan_mask(df1=low_missing_values_combined_lai_df, df2=df_missing)

###########################################################################

##############
## Groq API 
############## 
# Alternative hosted backend; uncomment and set the API key to use it.
# model_name = "llama3-70b-8192" # "llama-3.3-70b-versatile" #"llama3-70b-8192" #llama-3.1-8b-instant
# os.environ["GROQ_API_KEY"] = "put your api key here"
# model = ChatGroq(temperature=0, model=model_name)

##############
## Ollama
##############
# Local backend: base_url below is a placeholder and must be replaced with the
# URL of a running Ollama server.
model_name = 'llama3.1'
model = ChatOllama(
                model=model_name,
                temperature=0,
                max_tokens=None,
                keep_alive=-1,
                format="json" , 
                base_url = 'if you run with Ollama you need to install locally Ollama and put the base url here'
            )

# Every response is parsed into an AnswerandJustification instance.
structured_model = model.with_structured_output(AnswerandJustification)

###########################
## RAG
###########################

# NOTE: `index` here rebinds the module-level `index` that earlier cells set
# from parameters['index'] and pass to the scaler helpers; re-run those cells'
# setup before re-running the evaluation cell.
time_series_data, column_names, name_to_index, index_to_name, embeddings, index = build_rag(df_missing)


###########################
## Main Loop 
###########################

# Per-series outputs of the loop below, all keyed by series (column) name:
#   llm_responses                -> list of [date, value] (or [date, exception] on failure)
#   llm_responses_time_per_serie -> list of [date, seconds] (NaN seconds on failure)
#   llm_explanations             -> explanation of the most recent successful answer
#   distances_dict               -> list of [date, FAISS distances of the retrieved series]
llm_responses = {}
llm_responses_time_per_serie = {}
llm_explanations = {}
distances_dict = {}

for i, (serie, serie_values) in enumerate(df_missing.items()):

    if serie != 'time':

        llm_responses[serie] = []

        a = df_missing[['time', serie]]
        b = mask[['time', serie]]

        llm_responses_time_per_serie[serie] = []

        distances_dict[serie] = []

        # Rows of (time, value) for which the mask flags this series.
        for missing_date in a[b.iloc[:, 1]].values:

            print('\n')
            print('ITERATION:', i, 'SERIE:', serie, 'MISSING DATE:', missing_date)

            ###########################
            ## RAG usage
            ###########################
            query_series = df_missing[serie].dropna().values.flatten()

            k = 2
            similar_series, indices, distances = query_similar_time_series(query_series, index, time_series_data, k)
            similar_series_names = [index_to_name[idx] for idx in indices[0]]  # Convert numeric indices to column names
            distances_dict[serie].append([missing_date[0], distances[0]])

            examples = []
            num_of_examples = 0

            # A dedicated loop variable is used here: the previous `i` shadowed
            # the outer enumerate counter and corrupted the ITERATION printout.
            for similar_name in similar_series_names:

                fin_arr = df_missing[['time', similar_name]].dropna().values

                context_ex = fin_arr[fin_arr[:, 0]==missing_date[0]]

                if len(context_ex) == 0:
                    print(f'EXAMPLE MESSAGES -> for serie {similar_name} Didnt find the same date in the available data')
                    # Define the date to match
                    date_to_match = missing_date[0]

                    # Convert the target date to a datetime object
                    target_date = datetime.strptime(date_to_match, '%Y-%m-%d')

                    # Convert the first column of dates to datetime objects
                    dates = np.array([datetime.strptime(date, '%Y-%m-%d') for date in fin_arr[:, 0]])

                    # Find the index of the closest date
                    date_diffs = np.abs(dates - target_date)
                    closest_index = np.argmin(date_diffs)

                    # The closest available row becomes the example's answer.
                    # NOTE(review): the example context is reduced to that same
                    # row, so the answer is visible in the example prompt -
                    # confirm whether `!=` (exclude it) was intended.
                    random_row = fin_arr[closest_index]
                    fin_arr = fin_arr[fin_arr[:, 0]==random_row[0]]

                else:
                    print(f'EXAMPLE MESSAGES -> for serie {similar_name} Found the same date in the available data')
                    # Use the row with the requested date as the example's
                    # answer and remove it from the example context.
                    random_row = context_ex[0]
                    # Delete rows where the first column matches missing_date[0]
                    fin_arr = fin_arr[fin_arr[:, 0] != missing_date[0]]

                ## number of examples that i want to give to the llm
                if num_of_examples < 2:
                    msgs, answer_list_length = create_examples(example=fin_arr, requested_date=random_row)
                    examples.append({"input": msgs, "tool_calls": [answer_list_length]})
                    num_of_examples +=1

            example_msgs = [msg for ex in examples for msg in tool_example_to_messages(ex)]
            print('EXAMPLE MESSAGES:', example_msgs)

            #####################################
            ## Prompt template
            #####################################
            template = """
            human: {msgs}
            system: "You are a data scientist specialized in ecological timeseries modeling."
            """
            prompt = ChatPromptTemplate.from_template(template)

            ##########################################################
            ## Chain generation using examples retrieved with RAG 
            ##########################################################

            # NOTE(review): the template only declares {msgs}, so the `examples`
            # partial below does not appear to reach the model - confirm, and if
            # so add a MessagesPlaceholder("examples") to the prompt.
            chain = {"msgs": RunnablePassthrough()}| RunnableLambda(inspect)  | prompt.partial(examples=example_msgs) | structured_model 

            ######################################
            ## # Invoke the model
            ######################################

            # Default for the failure path: previously `elapsed_time` was
            # undefined (NameError in `finally`) or left over from the previous
            # request when make_request raised.
            elapsed_time = np.nan

            try:
                # Invoke the model
                llm_response, elapsed_time = make_request(df_missing, serie, missing_date, chain)

                print('llm_response answer_value:', llm_response.answer_value)
                print('llm_response answer_date:', llm_response.answer_date)
                print('llm_response answer_explanation:', llm_response.answer_explanation)

                print('similarity distances:', distances)
                print('Real value:', low_missing_values_combined_lai_df.loc[low_missing_values_combined_lai_df["time"] == missing_date[0], serie].iloc[0])

                # Only the explanation of the latest date is kept per series.
                llm_explanations[serie] = llm_response.answer_explanation
                response = [llm_response.answer_date, llm_response.answer_value]

            except Exception as e:
                # Keep going: the failure is stored in place of the value so it
                # can be counted and handled after the loop.
                response = [missing_date[0], e]
                print(f"Error: {e}")
            finally:
                # Always record the response and elapsed time
                print('RESPONSE:', response)
                llm_responses[serie].append(response)
                llm_responses_time_per_serie[serie].append([missing_date[0], elapsed_time])


#############################
### Handling the LLM output
#############################
# Post-process the raw LLM answers and unpack the summary returned by
# handle_llm_output into module-level names.
after_handling_output = handle_llm_output(df_missing, llm_responses)
df_imputed = after_handling_output['df_imputed']
llm_responses_updated = after_handling_output['llm_responses_updated']
number_of_error_prediction_from_LLM = after_handling_output['number_of_error_prediction_from_LLM']
number_of_error_prediction_from_LLM_list = after_handling_output['number_of_error_prediction_from_LLM_list']
number_of_none_prediction_from_LLM = after_handling_output['number_of_none_prediction_from_LLM']

print('Number of Errors collected in the LLMs response:', number_of_error_prediction_from_LLM, '\n',
          'Number of None collected from LLM:', number_of_none_prediction_from_LLM)


### Visualization of the results 

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Set to 'True' (string) to order the bars of every plot from highest to lowest.
sorted_fig = 'True'

# One column per entry of `results`, one row per metric.
df = pd.DataFrame(results)

# Bar plot for each metric
metrics = df.index

# Three plots per row. At least a 3x3 grid is created (same figure as before
# for up to 9 metrics); extra rows are added when there are more metrics
# instead of failing with an IndexError.
n_cols = 3
n_rows = max(3, -(-len(metrics) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))

# Flatten the axes for easier indexing
axes = axes.flatten()

# Loop through the metrics to create bar plots
for ax, metric in zip(axes, metrics):
    values = df.loc[metric]
    if sorted_fig == 'True':
        # Sort the data for the current metric
        values = values.sort_values(ascending=False)
    values.plot(kind='bar', ax=ax, title=metric)
    ax.set_ylabel(metric)

# Hide any unused subplots
for unused_ax in axes[len(metrics):]:
    fig.delaxes(unused_ax)

# Adjust layout to avoid overlapping
plt.tight_layout()
plt.show()
