In [1]:
import sys
sys.path.insert(0, 'utils')

import pandas as pd
import matplotlib.pyplot as plt
import ds_charts as ds
from ds_charts import HEIGHT
import numpy as np
from pandas.plotting import register_matplotlib_converters
import seaborn as sns
import json
import jstyleson
import collections
import copy
import datetime
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import StandardScaler, MinMaxScaler ,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import timeit
from math import radians, cos, sin, asin, sqrt
from rdflib import Graph, Namespace, URIRef, Literal
import re
import csv

# sys.stdout = open('Results/dankfe_1_timings.txt', 'a')


# Case-study identifier; it is interpolated into the folder paths below and
# into the file names read by the loading cell.
base_name = "covid"
# Folder the base CSV is read from.
data_folder = f'data/{base_name}/America/'
# Folder the model JSON, the ontology RDF and options.json are read from.
model_folder = f'data/{base_name}/'
# Echo the resolved folders so the run log shows which inputs were used.
print(data_folder)
print(model_folder)


  from pandas.core.computation.check import NUMEXPR_INSTALLED


data/covid/America/
data/covid/


In [2]:
# Get dataset
start_time = timeit.default_timer()
# The covid case study has a second date column ('first_date'); any other
# base_name only parses 'current_date'.
date_columns = ['current_date', 'first_date'] if base_name == "covid" else ['current_date']
data = pd.read_csv(f'{data_folder}{base_name}_base.csv', parse_dates=date_columns, infer_datetime_format=True)
# print('read csv')
read_time = timeit.default_timer() - start_time

# Getting ER Model of case study data.
# Every JSON file is opened in a `with` block so its handle is closed as soon
# as it has been parsed (previously the handles were left open).
with open(f'{model_folder}{base_name}_model.json') as model_file:
    model = jstyleson.load(model_file)

# Getting Ontology of case study data
ontology = Graph()
ontology.parse(f'{model_folder}{base_name}_model.rdf')

# Getting options JSON
with open(f'{model_folder}options.json') as options_file:
    options = jstyleson.load(options_file)

# Getting variable template JSON
with open('utils/var_template.json') as template_file:
    template = jstyleson.load(template_file)

In [3]:
# Name of the label column. checkBalancing reads this as a module-level global.
target = 'high_risk_2w'

# target_count = data[target].value_counts()
# positive_class = target_count.idxmin()
# negative_class = target_count.idxmax()
# print('Minority class=', positive_class, ':', target_count[positive_class])
# print('Majority class=', negative_class, ':', target_count[negative_class])
# print('Proportion:', round(target_count[positive_class] / target_count[negative_class], 2), ': 1')

In [4]:
# entity_name_list = [entity['name'] for entity in model['entities']]
# # Raise Error if entities do not match columns:
# if collections.Counter(entity_name_list) != collections.Counter(data.columns.to_list()):
#     raise ValueError("Entities and Columns do not have the same size.")

# Remove all non-observed columns
#not_observed = [entity['name'] for entity in model['entities'] if not entity['observed']]

# Get names of columns to create
#created_vars = [relation['output'] for relation in model['relations']]

# Convert date entities to timestamp
#date_columns = [entity['name'] for entity in model['entities'] if entity['type'] == 'datetime']
# for col in date_columns:
#     data[col] = pd.to_datetime(data[col], format = "%Y-%m-%d")
# data['current_date'] = pd.to_datetime(data['current_date'], format = "%Y-%m-%d %H:%M:%S")
# data['current_date'] = pd.to_datetime(data['current_date'], format = "%d/%m/%Y")
# data['first_date'] = pd.to_datetime(data['first_date'], format = "%d/%m/%Y")
#data_observed = data.drop(columns = not_observed)
# data_observed = data_observed.sample(frac = 0.1, random_state = 0).reset_index(drop = True)
# data_observed = data_observed[:1000]

In [5]:
# print(data.dtypes)
# ds.get_variable_types(data)

In [6]:
# Data Preparation

def data_preparation(dataset,options,target):
    """First preparation stage: clean the target, encode booleans, impute.

    Rows whose `target` value is missing are dropped (and the index is
    rebuilt), boolean columns are label-encoded through `encodeLabels`, and
    missing values are handled by `checkMissingValues` unless
    options['checkMissingValues'] is 'none'.

    Returns the prepared DataFrame.
    """
    target_has_gaps = dataset[target].isna().any()
    if target_has_gaps:
        dataset = dataset.dropna(subset=[target]).reset_index(drop=True)

    dataset = encodeLabels(dataset)

    mv_method = options['checkMissingValues']
    if mv_method == 'none':
        return dataset
    return checkMissingValues(dataset, mv_method)
    # generate Variables
    # if options['checkScaling'] != 'none':
    #     dataset = checkScaling(dataset,options['checkScaling'])
    # # divide into train and test
    # data_train, data_test = train_test_split(dataset,train_size=0.7,test_size=0.3,stratify=dataset[target],random_state=1)
    # if options['checkBalancing']:
    #     data_train = checkBalancing(data_train)
    # return data_train,data_test

def data_preparation_2(dataset,options,target):
    """Second preparation stage: scale, split 70/30 and balance the train part.

    Scaling runs through `checkScaling` unless options['checkScaling'] is
    'none'. The split is stratified on `target` with a fixed random_state.
    Only the training partition is passed to `checkBalancing`, and only when
    options['checkBalancing'] is truthy.

    Returns (data_train, data_test), both with a fresh 0..n-1 index.
    """
    scaling_method = options['checkScaling']
    if scaling_method != 'none':
        dataset = checkScaling(dataset, scaling_method)

    partitions = train_test_split(
        dataset,
        train_size=0.7,
        test_size=0.3,
        stratify=dataset[target],
        random_state=1,
    )
    data_train, data_test = partitions

    if options['checkBalancing']:
        data_train = checkBalancing(data_train)

    return data_train.reset_index(drop=True), data_test.reset_index(drop=True)

def encodeLabels(dataset):
    """Label-encode every boolean column of `dataset` in place.

    The encoder is re-fitted on each column separately. The same (mutated)
    DataFrame object is returned.
    """
    encoder = LabelEncoder()
    for bool_col in dataset.select_dtypes(include='bool').columns:
        dataset[bool_col] = encoder.fit_transform(dataset[bool_col])
    return dataset

def checkScaling(dataset,method):
    """Scale the numeric columns of `dataset` and return a new DataFrame.

    Parameters
    ----------
    dataset : DataFrame to scale (not modified).
    method : 'zscore' (StandardScaler) or 'minmax' (MinMaxScaler to [0, 1]).

    Returns
    -------
    DataFrame with the scaled numeric columns first, followed by the binary,
    date and symbolic columns unchanged. When there is no numeric column the
    input is returned as is.

    Raises
    ------
    ValueError if `method` is not one of the supported names (previously this
    surfaced as an unbound-variable error further down).
    """
    # print(f"Checking scaling: method {method}")
    if method == 'zscore':
        scaler = StandardScaler(with_mean=True, with_std=True, copy=True)
    elif method == 'minmax':
        scaler = MinMaxScaler(feature_range=(0,1), copy=True)
    else:
        raise ValueError(f"Unsupported scaling method: {method!r} (expected 'zscore' or 'minmax')")

    # Look the groups up by key, as the other helpers in this file do, instead
    # of unpacking .values() positionally (the positional names here disagreed
    # with the order used in checkMissingValues).
    var_types = ds.get_variable_types(dataset)
    numeric_vars = var_types['Numeric']
    other_vars = [var_types['Binary'], var_types['Date'], var_types['Symbolic']]

    if len(numeric_vars) == 0:
        # Nothing to scale; fitting a scaler on zero columns would raise.
        return dataset

    scaled_values = scaler.fit(dataset[numeric_vars]).transform(dataset[numeric_vars])
    scaled_numeric = pd.DataFrame(scaled_values, index = dataset.index, columns = numeric_vars)
    data_scaled = pd.concat([scaled_numeric] + [dataset[cols] for cols in other_vars], axis = 1)
    return data_scaled

def checkBalancing(dataset, target_name=None):
    """Balance the least and most frequent classes of `dataset`.

    Class counts are taken from `dataset` itself (they used to be read from
    the global `data`, so a train partition was balanced using the counts of
    the full, unsplit frame).

    Parameters
    ----------
    dataset : DataFrame containing the label column.
    target_name : label column name; defaults to the module-level `target`.

    Returns
    -------
    `dataset` unchanged when it is empty or the minority/majority ratio is
    above 0.66; otherwise a new DataFrame made of the resampled minority rows
    followed by the (possibly undersampled) majority rows. Rows belonging to
    any other class are not included in the balanced result.
    """
    class_cap = 25000  # per-class size used when a class is large enough
    label = target if target_name is None else target_name

    target_count = dataset[label].value_counts()
    if target_count.empty:
        print("Checking balancing: No balancing required.")
        return dataset

    min_class = target_count.idxmin()
    max_class = target_count.idxmax()
    n_min = target_count[min_class]
    n_max = target_count[max_class]

    if n_min / n_max > 0.66:
        print("Checking balancing: No balancing required.")
        return dataset

    data_bal = dataset.copy(deep=True)
    df_min = data_bal[data_bal[label] == min_class]
    df_max = data_bal[data_bal[label] == max_class]

    if n_min >= class_cap and n_max >= class_cap:
        print("Checking balancing: Undersampling both classes - both over 25000")
        df_min = df_min.sample(n=class_cap, replace = False, random_state = 1)
        df_max = df_max.sample(n=class_cap, replace = False, random_state = 1)
    elif n_min < class_cap and n_max >= class_cap:
        print("Checking balancing: Oversampling min class and undersampling max class - one over 25000")
        df_min = df_min.sample(n=class_cap, replace = True, random_state = 1)
        df_max = df_max.sample(n=class_cap, replace = False, random_state = 1)
    else:
        print("Checking balancing: Oversampling min class - none over 25000")
        df_min = df_min.sample(n=len(df_max), replace = True, random_state = 1)

    return pd.concat([df_min,df_max],axis = 0)

def checkMissingValues(dataset,method):
    """Impute missing values and return a new DataFrame.

    With method 'auto', numeric columns are filled with the column median and
    symbolic and binary columns with the most frequent value. Date columns
    are carried over untouched.

    Parameters
    ----------
    dataset : DataFrame to impute (not modified).
    method : imputation policy; only 'auto' is implemented.

    Returns
    -------
    DataFrame with columns ordered numeric, symbolic, binary, date.

    Raises
    ------
    ValueError if `method` is not 'auto' (previously this surfaced as an
    unbound-variable error at the final concat).
    """
    # print(f"Checking MVs: method {method}")
    if method != 'auto':
        raise ValueError(f"Unsupported missing-value method: {method!r} (only 'auto' is implemented)")

    var_types = ds.get_variable_types(dataset)
    # (variable group, SimpleImputer strategy), in output column order.
    plan = (('Numeric', 'median'), ('Symbolic', 'most_frequent'), ('Binary', 'most_frequent'))

    parts = []
    for group, strategy in plan:
        columns = var_types[group]
        if len(columns) == 0:
            continue
        imp = SimpleImputer(strategy=strategy, missing_values=np.nan, copy=True)
        parts.append(pd.DataFrame(imp.fit_transform(dataset[columns]), index=dataset.index, columns=columns))

    parts.append(dataset[var_types['Date']])
    data_mv = pd.concat(parts, axis=1)
    return data_mv

# data_prepared_train, data_prepared_test = data_preparation(data,options,target)

In [7]:
# Automatic variable generation

def generateAutoVariables(dataset,options,template,model):
    """Append automatically generated relations to `model` and return it.

    Date-part relations are added when options['generateDates'] is truthy,
    and grouped summary relations when options['generateFiveSummary'] lists
    at least one column. The relations are added to the `model` passed in
    (the helpers used to write to the global `model` regardless of this
    argument).
    """
    if options['generateDates']:
        model = generateDates(dataset, template['dates'], model)
    if len(options['generateFiveSummary']) != 0:
        model = generateFiveSummary(
            dataset,
            template['five_summary'],
            options['groupby'],
            options['generateFiveSummary'],
            model,
        )
    return model

def generateDates(dataset,template,target_model=None):
    """Instantiate the date template once per date column of `dataset`.

    Each template entry is deep-copied, its 'output' is prefixed with the
    column name and the column is appended to its 'inputs'. The new relations
    are appended to target_model['relations'].

    `target_model` defaults to the module-level `model`, which keeps the old
    three-argument call working.
    """
    if target_model is None:
        target_model = model  # backward-compatible fallback to the global
    new_datevars = []
    for date_col in ds.get_variable_types(dataset)['Date']:
        # Fresh copy per column so the shared template is never mutated.
        for new_var in copy.deepcopy(template):
            new_var['output'] = f"{date_col}_{new_var['output']}"
            new_var['inputs'].append(date_col)
            new_datevars.append(new_var)
    # print(f"Adding {len(new_datevars)} new variables")
    target_model['relations'] = target_model['relations'] + new_datevars
    return target_model

def generateFiveSummary(dataset,template,groupby,num_vars,target_model=None):
    """Instantiate the summary template for every (group, column) pair.

    For each entry of `groupby` and each column in `num_vars`, every template
    entry is deep-copied, renamed to '<column>_<output>_per_<group>', given
    the column as input and the group as its 'groupby'. The new relations are
    appended to target_model['relations']. `dataset` is not used.

    `target_model` defaults to the module-level `model`, which keeps the old
    four-argument call working.
    """
    if target_model is None:
        target_model = model  # backward-compatible fallback to the global
    # num_vars = ds.get_variable_types(dataset)['Numeric']
    new_numvars = []
    for group in groupby:
        for num_col in num_vars:
            # Fresh copy per pair so the shared template is never mutated.
            for new_var in copy.deepcopy(template):
                new_var['output'] = f"{num_col}_{new_var['output']}_per_{group}"
                new_var['inputs'].append(num_col)
                new_var['groupby'] = group
                new_numvars.append(new_var)
    # print(f"Adding {len(new_numvars)} new variables")
    target_model['relations'] = target_model['relations'] + new_numvars
    return target_model

In [19]:
# Feature Generation
# Holiday calendar; the 'date' column is parsed as YYYY-MM-DD.
# NOTE(review): the file is named holidays_europe.csv while data_folder points
# at .../America/ — confirm this is the intended calendar.
holidays = pd.read_csv('holidays_europe.csv')
holidays['date'] =  pd.to_datetime(holidays['date'],format="%Y-%m-%d")

# (period id, (start, end)) pairs cutting the reference day 2000-01-01 into
# eight consecutive three-hour windows.
day_periods = [(1, (datetime.datetime(2000,1,1,0,0,0),  datetime.datetime(2000,1,1,2,59,59))),
           (2, (datetime.datetime(2000,1,1,3,0,0),  datetime.datetime(2000,1,1,5,59,59))),
           (3, (datetime.datetime(2000,1,1,6,0,0),  datetime.datetime(2000,1,1,8,59,59))),
           (4, (datetime.datetime(2000,1,1,9,0,0),  datetime.datetime(2000,1,1,11,59,59))),
           (5, (datetime.datetime(2000,1,1,12,0,0),  datetime.datetime(2000,1,1,14,59,59))),
           (6, (datetime.datetime(2000,1,1,15,0,0),  datetime.datetime(2000,1,1,17,59,59))),
           (7, (datetime.datetime(2000,1,1,18,0,0),  datetime.datetime(2000,1,1,20,59,59))),
           (8, (datetime.datetime(2000,1,1,21,0,0),  datetime.datetime(2000,1,1,23,59,59)))]

# (tier id, (start, end)) windows on the same reference day; a tier id can
# cover several windows. Presumably tiers 1-3 are price bands — TODO confirm.
energy_prices = [(1, (datetime.datetime(2000,1,1,22,0,0),  datetime.datetime(2000,1,1,23,59,59))),
           (1, (datetime.datetime(2000,1,1,0,0,0),  datetime.datetime(2000,1,1,7,59,59))),
           (2, (datetime.datetime(2000,1,1,8,0,0),  datetime.datetime(2000,1,1,8,59,59))),
           (2, (datetime.datetime(2000,1,1,10,30,0),  datetime.datetime(2000,1,1,17,59,59))),
           (2, (datetime.datetime(2000,1,1,20,30,0),  datetime.datetime(2000,1,1,21,59,59))),
           (3, (datetime.datetime(2000,1,1,9,0,0),  datetime.datetime(2000,1,1,10,29,59))),
           (3, (datetime.datetime(2000,1,1,18,0,0),  datetime.datetime(2000,1,1,20,29,59)))]

# (season id, (first day, last day)) ranges within the reference year 2000;
# id 1 appears twice because it wraps around the year end.
# NOTE(review): id 2 starts on March 21 here, whereas determine_season below
# switches to "Spring" on March 20 — confirm which boundary is intended.
seasons = [(1, (datetime.date(2000,  1,  1),  datetime.date(2000,  3, 20))),
           (2, (datetime.date(2000,  3, 21),  datetime.date(2000,  6, 20))),
           (3, (datetime.date(2000,  6, 21),  datetime.date(2000,  9, 22))),
           (4, (datetime.date(2000,  9, 23),  datetime.date(2000, 12, 20))),
           (1, (datetime.date(2000, 12, 21),  datetime.date(2000, 12, 31)))]

# Reference coordinate pair; presumably (latitude, longitude) — TODO confirm.
center_baltimore = (39.30746849825375, -76.61560625253648)

# Energy source names.
energy_types = ["coal","nuclear","ccgt","wind","pumped","hydro","biomass","oil","solar","ocgt"]

# Subset of energy_types treated as renewable.
renewable_types = ["wind","pumped","hydro","biomass","solar"]

# (numeric code, label) pairs for crime categories.
crime_type = [
    (1, 'LARCENY'), (2,'LARCENY FROM AUTO'), (3,'AUTO THEFT'), (11,'ROBBERY - STREET'), (12,'ROBBERY - COMMERCIAL'), (13,'ROBBERY - CARJACKING'),(14,'ROBBERY - RESIDENCE'),(15,'BURGLARY'),(21,'COMMON ASSAULT'),(31,'ASSAULT BY THREAT'),(32,'AGG. ASSAULT'),(41,'ARSON'),(51,'SHOOTING'),(52,'RAPE'),(61,'HOMICIDE')
]
# (numeric code, label) pairs for weapon categories.
weapon_type = [
    (0, 'NONE'), (1,'HANDS'), (2,'OTHER'), (3,'KNIFE'), (4,'FIREARM')
]

# Kept data for operations: dankfe_2 declares this global and stores the
# current group's rows in it before each operation is evaluated.
temp_data = None

def determine_season(day, month):
    """Return the season name ("Spring", "Summer", "Autumn" or "Winter").

    Boundaries: Spring starts on March 20, Summer on June 21, Autumn on
    September 23 and Winter on December 21. Anything that matches none of the
    first three ranges is reported as "Winter".
    """
    if (month == 3 and day >= 20) or month in (4, 5) or (month == 6 and day < 21):
        return "Spring"
    if (month == 6 and day >= 21) or month in (7, 8) or (month == 9 and day < 23):
        return "Summer"
    if (month == 9 and day >= 23) or month in (10, 11) or (month == 12 and day < 21):
        return "Autumn"
    return "Winter"

def process_numeric_variables(data):
    """Add five whole-column summary columns for every numeric variable.

    For each numeric column `c` (as reported by ds.get_variable_types), the
    columns c_min, c_max, c_median, c_std and c_mean are added to `data`,
    each holding a single value repeated on every row. `data` is modified in
    place and returned.
    """
    for column in ds.get_variable_types(data)['Numeric']:
        as_float = data[column].astype(float)
        # Dict order fixes the order in which the new columns are created.
        summary = {
            "min": as_float.min(),
            "max": as_float.max(),
            "median": np.median(as_float),
            "std": np.std(as_float),
            "mean": np.mean(as_float),
        }
        for stat_name, stat_value in summary.items():
            data[f"{column}_{stat_name}"] = stat_value
    return data

def process_date_variables(data, row, idx):
    """Write the day, month, year and season of each date column for one row.

    `row` supplies the date values and `idx` is the index label of the row to
    update in `data`. For every date column `c` the cells c_day, c_month,
    c_year and c_season are set at that row. `data` is modified in place and
    returned.
    """
    for column in ds.get_variable_types(data)['Date']:
        # Round-trip through a date-only string, so any time part is dropped.
        as_text = row[column].strftime('%Y-%m-%d')
        parsed = datetime.datetime.strptime(as_text, "%Y-%m-%d")
        parts = (
            ("day", parsed.day),
            ("month", parsed.month),
            ("year", parsed.year),
            ("season", determine_season(parsed.day, parsed.month)),
        )
        for suffix, value in parts:
            data.at[idx, f"{column}_{suffix}"] = value

    return data
    

def process_binary_variables(data):
    """Recode every binary column to 0/1 in place.

    The first value returned by unique() becomes 0 and the second becomes 1.
    A column whose unique() result does not have exactly two entries is left
    untouched and a warning is printed. Returns `data`.
    """
    for column in ds.get_variable_types(data)['Binary']:
        levels = data[column].unique()
        if len(levels) != 2:
            print(f"Warning: Binary variable '{column}' does not have exactly two unique values.")
            continue
        first_level, second_level = levels
        data[column] = data[column].map({first_level: 0, second_level: 1})
    return data

def process_symbolic_variable(data, encoding_type):
    """Encode the symbolic columns of `data`.

    'one_hot' returns a new frame built by pd.get_dummies; 'label' replaces
    each symbolic column in place with its category codes. Any other value
    prints a message and returns `data` unchanged.
    """
    symbolic_vars = ds.get_variable_types(data)['Symbolic']

    if encoding_type == 'one_hot':
        return pd.get_dummies(data, columns=symbolic_vars)

    if encoding_type == 'label':
        for column in symbolic_vars:
            data[column] = data[column].astype('category').cat.codes
        return data

    print('Invalid encoding type')
    return data


# DANKFE 5
def dankfe_5(ontology,data):
    """Run the generic feature pipeline and return the resulting frame.

    Steps, in order: whole-column numeric summaries, 0/1 recoding of binary
    columns, per-row date parts, then one-hot encoding of symbolic columns.
    `ontology` is accepted but not used by this variant.

    The one-hot step builds a new DataFrame, so the result must be taken from
    the return value (the function used to return nothing, which discarded
    the encoded frame).
    """
    data = process_numeric_variables(data)
    data = process_binary_variables(data)
    for index, row in data.iterrows():
        data = process_date_variables(data,row, index)
    data = process_symbolic_variable(data, 'one_hot')
    return data




# DANKFE 2
def dankfe_2(model,loop_dataset,edit_dataset):
    """Compute every relation of `model`, including group-by relations.

    Relations are evaluated in list order. A relation whose inputs are not
    yet columns of `loop_dataset` is deferred until the others have been
    tried, so relations may depend on each other's outputs. New columns are
    written to `edit_dataset`, and `loop_dataset` is refreshed from it after
    each relation.

    Parameters
    ----------
    model : dict with a 'relations' list. Each relation provides 'output',
        'inputs', 'groupby', 'constraint' and 'operations', plus 'needsRows'
        when 'groupby' is non-empty. The list itself is not reordered.
    loop_dataset : DataFrame the rows are read from. The row loop addresses
        rows with .loc[0 .. len-1], so it needs a 0..n-1 index.
    edit_dataset : DataFrame that receives the new columns (modified in place).

    Raises
    ------
    ValueError if a full pass computes nothing, i.e. the remaining relations
    need inputs that will never exist (the previous implementation looped
    forever in that case).
    """
    global temp_data
    # Work on a copy so deferring relations never reorders model['relations'].
    pending = list(model['relations'])

    while len(pending) != 0:
        deferred = []
        progressed = False
        for current_relation in pending:
            newvar_name = current_relation['output']
            inputs = current_relation['inputs']
            groupby = current_relation['groupby']
            if not set(inputs).issubset(loop_dataset.columns):
                # Inputs not available yet: retry after the other relations.
                deferred.append(current_relation)
                continue

            print(newvar_name)
            start_time = timeit.default_timer()
            # The constraint is evaluated per row by pd.eval, which resolves
            # the name `row` from the calling frame; keep that variable name.
            if len(current_relation['constraint']) != 0:
                constraint = "row." + current_relation['constraint']
            else:
                constraint = "True"

            if len(groupby) == 0: #LAMBDA: no row dependence
                for index, op in enumerate(current_relation['operations']):
                    if index == 0:
                        edit_dataset[newvar_name] = edit_dataset.apply(lambda row: get_operation(op,*zip(inputs,row[inputs])) if pd.eval(constraint, target = row) else np.nan, axis = 1)
                    else:
                        # Chained operations receive a single (name, value)
                        # pair, as in dankfe_1; zip() over the name string and
                        # a scalar value was a bug.
                        edit_dataset[newvar_name] = edit_dataset.apply(lambda row: get_operation(op,(newvar_name,row[newvar_name])) if pd.eval(constraint, target = row) else np.nan, axis = 1)
            else:
                needed_rows = current_relation['needsRows']
                for i in range(len(loop_dataset)):
                    row = loop_dataset.loc[i]
                    same_group = loop_dataset[loop_dataset[groupby] == row.loc[groupby]]
                    # temp_data is a module-level global holding the rows of
                    # the current group that the operation may look at.
                    if needed_rows == 'all':
                        temp_data = same_group
                    elif needed_rows < 0:
                        temp_data = same_group.loc[i + needed_rows+1:i]
                    else:
                        temp_data = same_group.loc[i:i + needed_rows-1]

                    if pd.eval(constraint, target = row) == False:
                        edit_dataset.loc[i,newvar_name] = np.nan
                        continue

                    for opIndex, op in enumerate(current_relation['operations']):
                        if opIndex == 0:
                            input_values = [row.loc[inp] for inp in inputs]
                            edit_dataset.loc[i,newvar_name] = get_operation(op,*zip(inputs,input_values))
                        else:
                            edit_dataset.loc[i,newvar_name] = get_operation(op,(newvar_name,edit_dataset.loc[i,newvar_name]))

            loop_dataset = edit_dataset.copy(deep=True)
            progressed = True
            print(timeit.default_timer() - start_time)

        if not progressed:
            unresolved = [relation['output'] for relation in deferred]
            raise ValueError(f"Cannot compute relations {unresolved}: their inputs are never available in the dataset.")
        pending = deferred


# DANKFE 1
def dankfe_1(model,dataset):
    """Compute the row-wise (non group-by) relations of `model` on `dataset`.

    Relations with a non-empty 'groupby' are skipped. The remaining ones are
    evaluated in list order; a relation whose inputs are not yet columns of
    `dataset` is deferred until the others have been tried, so relations may
    depend on each other's outputs. New columns are written to `dataset` in
    place.

    Parameters
    ----------
    model : dict with a 'relations' list. Each relation provides 'output',
        'inputs', 'groupby', 'constraint' and 'operations'. The list itself
        is not reordered.
    dataset : DataFrame that receives the new columns.

    Raises
    ------
    ValueError if a full pass computes nothing, i.e. the remaining relations
    need inputs that will never exist.

    The previous implementation kept rotating group-by relations (and
    relations with unavailable inputs) to the back of the queue, so it never
    terminated once only such relations were left.
    """
    # Copy and filter: group-by relations are out of scope for this variant,
    # and deferring relations must not reorder model['relations'].
    pending = [relation for relation in model['relations'] if len(relation['groupby']) == 0]

    while len(pending) != 0:
        deferred = []
        progressed = False
        for current_relation in pending:
            newvar_name = current_relation['output']
            inputs = current_relation['inputs']
            if not set(inputs).issubset(dataset.columns):
                # Inputs not available yet: retry after the other relations.
                deferred.append(current_relation)
                continue

            print(newvar_name)
            start_time = timeit.default_timer()
            # The constraint is evaluated per row by pd.eval, which resolves
            # the name `row` from the calling frame; keep that variable name.
            if len(current_relation['constraint']) != 0:
                constraint = "row." + current_relation['constraint']
            else:
                constraint = "True"

            for index, op in enumerate(current_relation['operations']):
                if index == 0:
                    # First operation: one (input name, value) pair per input.
                    dataset[newvar_name] = dataset.apply(lambda row: get_operation(op,*zip(inputs,row[inputs])) if pd.eval(constraint, target = row) else np.nan, axis = 1)
                else:
                    # Chained operation: fed the value produced so far.
                    dataset[newvar_name] = dataset.apply(lambda row: get_operation(op,(newvar_name,row[newvar_name])) if pd.eval(constraint, target = row) else np.nan, axis = 1)

            progressed = True
            print(timeit.default_timer() - start_time)

        if not progressed:
            unresolved = [relation['output'] for relation in deferred]
            raise ValueError(f"Cannot compute relations {unresolved}: their inputs are never available in the dataset.")
        pending = deferred




# DANKFE 4
def dankfe_4(ontology,data):
    """Run the five ontology-driven stages in sequence, timing each one.

    Order: decompositions, mapping, algebraic, aggregations, compositions.
    After every stage its label and elapsed seconds are printed. Returns the
    frame produced by the last stage.
    """
    def _timed(label, stage, frame):
        # Run one stage on `frame`, then report how long it took.
        began = timeit.default_timer()
        result = stage(ontology, frame)
        print(label)
        print(timeit.default_timer() - began)
        return result

    data = _timed('Decomp op time:', decompositions, data)
    data = _timed('Mapping op time:', mapping, data)
    data = _timed('Algebric op time:', algebraic, data)
    data = _timed('Aggregation op time:', aggregations, data)
    data = _timed('Composition op time:', compositions, data)
    return data


def decompositions(ontology, data):
    """Apply every decomposition rule of the ontology to every row of `data`.

    `extract_variables_decomp` yields a mapping from rule name to source
    column. For each row and rule, `apply_decomp_rules` returns a sequence
    whose first two items are the new column name and its value, which is
    written into `data` at that row. `data` is modified in place and returned.
    """
    rule_sources = extract_variables_decomp(ontology)

    for idx, row in data.iterrows():
        for rule_name in rule_sources:
            outcome = apply_decomp_rules(row[rule_sources[rule_name]], rule_name)
            new_column, new_value = outcome[0], outcome[1]
            data.at[idx, new_column] = new_value

    return data

def mapping(ontology, data):
    """Apply every mapping rule of the ontology to every row of `data`.

    `extract_variables_mapping` yields a mapping from rule name to source
    column. For each row and rule, `apply_mapping_rules` returns a sequence
    whose first two items are the new column name and its value, which is
    written into `data` at that row. `data` is modified in place and returned.
    """
    rule_sources = extract_variables_mapping(ontology)

    for idx, row in data.iterrows():
        for rule_name in rule_sources:
            outcome = apply_mapping_rules(row[rule_sources[rule_name]], rule_name)
            new_column, new_value = outcome[0], outcome[1]
            data.at[idx, new_column] = new_value

    return data

def aggregations(ontology, data):
    """Add one per-group aggregate column for every aggregation rule.

    `extract_variables_aggregation` yields a mapping from the new column name
    to a sequence (operation, value column, group column). Every row receives
    the aggregate of the value column over the rows sharing its group value.

    Supported operations: 'sum', 'avg', 'max', 'min' and 'std' (population
    standard deviation, ddof=0). Rules with any other operation are ignored.
    `data` is modified in place and returned.

    Defects fixed relative to the row-by-row version:
    * a dict shared across rules rewrote previously computed aggregate columns
      with a stale value from the last processed row;
    * the 'min' branch indexed the row with the whole rule mapping and failed;
    * 'max' started from 0, which is wrong when every value in a group is
      negative.
    """
    rules = extract_variables_aggregation(ontology)

    for new_column in rules:
        rule = rules[new_column]
        operation, value_col, group_col = rule[0], rule[1], rule[2]

        if operation in ('sum', 'avg', 'std'):
            # Values are truncated to integers before aggregating, matching
            # the int() conversion applied row by row in the previous code.
            # NOTE(review): this drops the fractional part of float inputs —
            # confirm that is intended.
            values = data[value_col].astype(int)
        elif operation in ('max', 'min'):
            values = data[value_col]
        else:
            # Unknown operation: nothing is generated for this rule.
            continue

        per_group = values.groupby(data[group_col])
        if operation == 'sum':
            aggregated = per_group.transform('sum')
        elif operation == 'avg':
            aggregated = per_group.transform('mean')
        elif operation == 'std':
            aggregated = per_group.transform(lambda group: group.std(ddof=0))
        elif operation == 'max':
            aggregated = per_group.transform('max')
        else:
            aggregated = per_group.transform('min')

        data[new_column] = aggregated

    return data


def compositions(ontology, data):
    """Extract the SWRL rules stored in `ontology` and apply the composition ones.

    Rule bodies and rule heads are collected in two separate passes over the
    graph and then paired by position. Rules whose head name starts with
    'Algebric' are discarded. The remaining rules are keyed by the part of the
    head name after its first underscore and handed, together with `data`, to
    `apply_composition_rules_to_dataset`, whose result is returned.
    """
    swrl_rules = {}

    body_atoms = []

    # One iteration per swrl:body triple; `o` is the head node of the RDF list
    # holding that body's atoms.
    for s, p, o in ontology.triples((None, URIRef('http://www.w3.org/2003/11/swrl#body'), None)):
        atom_list = [o]
        # Walk the rdf:first / rdf:rest chain. atom_list ends up alternating
        # list node, atom, list node, atom, ...
        while True:
            # Note: this inner loop rebinds s, p, o.
            for s, p, o in ontology.triples((atom_list[-1], URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#first'), None)):
                atom_list.append(o)
            # atom_list[-2] is the list node preceding the atom just appended
            # (assumes each list node has exactly one rdf:first — TODO confirm).
            rest = next(ontology.objects(atom_list[-2], URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#rest')))
            if rest == URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#nil'):
                break
            atom_list.append(rest)

        atom_info = {}
        arguments = {}
        list_builtin = []
        # Maps an atom's argument1 name to the class-predicate name bound to it.
        variables_connection = {}
        atom_info['argument_variables'] = arguments
        # List nodes match neither branch below and are skipped.
        for atom in atom_list:
            if (atom, URIRef('http://www.w3.org/2003/11/swrl#classPredicate'), None) in ontology:
                # Keep only the last path segment of each URI.
                variable = next(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#classPredicate'))).split('/')[-1]
                argument = next(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#argument1'))).split('/')[-1]
                variables_connection[argument] = variable
                atom_info['argument_variables'][variable] = variable
            elif (atom, URIRef('http://www.w3.org/2003/11/swrl#builtin'), None) in ontology:
                builtin = next(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#builtin'))).split('/')[-1]
                # The first assignment is immediately overwritten by the second.
                arguments = next(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#arguments'))).split('/')[-1]
                arguments = list(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#arguments')))
                for arg in arguments:
                    # One single-key dict per argument list: {builtin name: [operands]}.
                    list_items = {}
                    list_items[builtin.split('#')[-1]] = []
                    for item in ontology.items(arg):
                        # A URI operand is replaced by the class-predicate name
                        # bound to it when known, else by its last path segment.
                        if isinstance(item, URIRef):
                            if str(item).split('/')[-1] in variables_connection:
                                list_items[builtin.split('#')[-1]].append(variables_connection[str(item).split('/')[-1]])
                            else:
                                list_items[builtin.split('#')[-1]].append(str(item).split('/')[-1])
                        # A literal operand is stored as its string form.
                        elif isinstance(item, Literal):
                            list_items[builtin.split('#')[-1]].append(str(item))
                    list_builtin.append(list_items)
                atom_info['builtins'] = list_builtin
        body_atoms.append(atom_info)


    head_atoms = []

    # Same list walk for every swrl:head triple; only class-predicate names are kept.
    for s, p, o in ontology.triples((None, URIRef('http://www.w3.org/2003/11/swrl#head'), None)):
        atom_list = [o]
        while True:
            for s, p, o in ontology.triples((atom_list[-1], URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#first'), None)):
                atom_list.append(o)
            rest = next(ontology.objects(atom_list[-2], URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#rest')))
            if rest == URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#nil'):
                break
            atom_list.append(rest)

        atom_info = {}
        arguments = {}
        list_builtin = []
        for atom in atom_list:
            if (atom, URIRef('http://www.w3.org/2003/11/swrl#classPredicate'), None) in ontology:
                variable = next(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#classPredicate'))).split('/')[-1]
                atom_info[variable] = variable
        head_atoms.append(atom_info)


    # Pair the i-th head with the i-th body, keyed by the head's first
    # class-predicate name.
    # NOTE(review): this relies on the body and head triples being iterated in
    # the same rule order — confirm the graph guarantees that.
    for item in head_atoms:
        key = next(iter(item))  
        swrl_rules[key] = body_atoms.pop(0)
    
    # Rules named 'Algebric...' are not compositions; drop them here.
    keys_to_delete = [key for key in swrl_rules if key.startswith('Algebric')]
    for key in keys_to_delete:
        del swrl_rules[key]

    # Re-key each remaining rule by the text after the first underscore of its
    # name; a name without an underscore raises IndexError here.
    composition_rules = {}
    for key, value in swrl_rules.items():
        new_key = key.split('_', 1)[1]
        composition_rules[new_key] = value
 
    data = apply_composition_rules_to_dataset(data, composition_rules)

    return data

def algebraic(ontology, data):
    """Collect the algebraic SWRL rules stored in ``ontology`` and apply them to ``data``.

    Every ``swrl:body`` list is turned into a dict of the form
    ``{'argument_variables': {name: name, ...}, 'builtins': [{builtin: [args...]}, ...]}``
    (the ``'builtins'`` key is only set when the body contains at least one
    builtin atom). Every ``swrl:head`` list is turned into a dict keyed by the
    class-predicate names it contains. Bodies and heads are then paired by
    position, rules whose head name starts with ``'Composition'`` are dropped,
    the text up to the first underscore is stripped from the remaining names,
    and the result is passed to ``apply_algebric_rules_to_dataset``.

    Parameters
    ----------
    ontology : graph object queried through ``triples``, ``objects``, ``items``
        and the ``in`` operator (an rdflib ``Graph`` in this notebook).
    data : dataset handed unchanged to ``apply_algebric_rules_to_dataset``.

    Returns
    -------
    The value returned by ``apply_algebric_rules_to_dataset``.

    Raises
    ------
    StopIteration
        When a ``next(...)`` lookup finds nothing (e.g. a list node without
        ``rdf:rest`` or a head without any class atom).
    IndexError
        When a kept head name contains no underscore, or when there are more
        heads than bodies.
    """
    swrl_rules = {}

    body_atoms = []

    # Pass 1: one entry in body_atoms per swrl:body triple found in the graph.
    for s, p, o in ontology.triples((None, URIRef('http://www.w3.org/2003/11/swrl#body'), None)):
        # Walk the RDF list: atom_list alternates list node, its rdf:first item,
        # the next list node, its item, ... until rdf:rest is rdf:nil.
        atom_list = [o]
        while True:
            for s, p, o in ontology.triples((atom_list[-1], URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#first'), None)):
                atom_list.append(o)
            # atom_list[-2] is the list node whose item was just appended.
            rest = next(ontology.objects(atom_list[-2], URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#rest')))
            if rest == URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#nil'):
                break
            atom_list.append(rest)

        atom_info = {}
        arguments = {}
        list_builtin = []
        # Maps a SWRL variable name (last URI segment) to the class predicate it is bound to.
        variables_connection = {}
        atom_info['argument_variables'] = arguments
        for atom in atom_list:
            if (atom, URIRef('http://www.w3.org/2003/11/swrl#classPredicate'), None) in ontology:
                # Class atom: remember which predicate its argument variable stands for.
                variable = next(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#classPredicate'))).split('/')[-1]
                argument = next(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#argument1'))).split('/')[-1]
                variables_connection[argument] = variable
                atom_info['argument_variables'][variable] = variable
            elif (atom, URIRef('http://www.w3.org/2003/11/swrl#builtin'), None) in ontology:
                # Builtin atom: record {builtin_name: [resolved arguments]}.
                builtin = next(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#builtin'))).split('/')[-1]
                # The value computed on the next line is immediately overwritten by the line after it.
                arguments = next(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#arguments'))).split('/')[-1]
                arguments = list(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#arguments')))
                for arg in arguments:
                    list_items = {}
                    # The builtin name is the fragment after '#'.
                    list_items[builtin.split('#')[-1]] = []
                    for item in ontology.items(arg):
                        # Resource item: store the class predicate it is bound to when known,
                        # otherwise the last segment of its URI.
                        if isinstance(item, URIRef):
                            if str(item).split('/')[-1] in variables_connection:
                                list_items[builtin.split('#')[-1]].append(variables_connection[str(item).split('/')[-1]])
                            else:
                                list_items[builtin.split('#')[-1]].append(str(item).split('/')[-1])
                        # Literal item: store its string value.
                        elif isinstance(item, Literal):
                            list_items[builtin.split('#')[-1]].append(str(item))
                    list_builtin.append(list_items)
                atom_info['builtins'] = list_builtin
        body_atoms.append(atom_info)


    head_atoms = []

    # Pass 2: one entry in head_atoms per swrl:head triple, using the same list walk.
    for s, p, o in ontology.triples((None, URIRef('http://www.w3.org/2003/11/swrl#head'), None)):
        atom_list = [o]
        while True:
            for s, p, o in ontology.triples((atom_list[-1], URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#first'), None)):
                atom_list.append(o)
            rest = next(ontology.objects(atom_list[-2], URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#rest')))
            if rest == URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#nil'):
                break
            atom_list.append(rest)

        atom_info = {}
        # `arguments` and `list_builtin` are not used in the head pass.
        arguments = {}
        list_builtin = []
        for atom in atom_list:
            if (atom, URIRef('http://www.w3.org/2003/11/swrl#classPredicate'), None) in ontology:
                variable = next(ontology.objects(atom, URIRef('http://www.w3.org/2003/11/swrl#classPredicate'))).split('/')[-1]
                atom_info[variable] = variable
        head_atoms.append(atom_info)


    # Pair the i-th head with the i-th body; the first class predicate of the head names the rule.
    # NOTE(review): this assumes the body and head passes visit the rules in the same order - confirm.
    for item in head_atoms:
        key = next(iter(item))
        swrl_rules[key] = body_atoms.pop(0)

    # Composition rules are handled elsewhere; keep only the remaining ones.
    keys_to_delete = [key for key in swrl_rules if key.startswith('Composition')]
    for key in keys_to_delete:
        del swrl_rules[key]

    # Strip everything up to and including the first underscore from each rule name.
    algebric_rules = {}
    for key, value in swrl_rules.items():
        new_key = key.split('_', 1)[1]
        algebric_rules[new_key] = value

    data = apply_algebric_rules_to_dataset(data, algebric_rules)

    return data

       
def apply_algebric_rules_to_dataset(data, swrl_rules):
    """Run the algebraic rules over every row of ``data`` and store the results.

    Each row yielded by ``data.iterrows()`` is passed to
    ``apply_algebric_rules_to_row``; every (key, value) pair of the row that
    comes back is written into ``data`` at that row's label.

    Parameters
    ----------
    data : object exposing ``iterrows()`` and ``.at`` (a DataFrame in this notebook).
    swrl_rules : rule mapping forwarded to ``apply_algebric_rules_to_row``.

    Returns
    -------
    The same ``data`` object, updated in place.
    """
    for row_label, record in data.iterrows():
        updated_record = apply_algebric_rules_to_row(record, swrl_rules)

        # Copy every entry back, including the ones the rules did not touch.
        for column, cell in updated_record.items():
            data.at[row_label, column] = cell

    return data


def apply_composition_rules_to_dataset(data, composition_rules):
    """Run the composition rules over every row of ``data`` and store the results.

    Each row yielded by ``data.iterrows()`` is passed, together with the whole
    dataset, to ``apply_composition_rules_to_row``; every (key, value) pair of
    the row that comes back is written into ``data`` at that row's label.

    Parameters
    ----------
    data : object exposing ``iterrows()`` and ``.at`` (a DataFrame in this notebook).
    composition_rules : rule mapping forwarded to ``apply_composition_rules_to_row``.

    Returns
    -------
    The same ``data`` object, updated in place.
    """
    for row_label, record in data.iterrows():
        updated_record = apply_composition_rules_to_row(record, composition_rules, data)

        # Copy every entry back, including the ones the rules did not touch.
        for column, cell in updated_record.items():
            data.at[row_label, column] = cell

    return data


def apply_composition_rules_to_row(row, composition_rules,data):
    """Evaluate every composition rule on one row and store each result in the row.

    Parameters
    ----------
    row : mapping-like record (supports ``row[key]`` and ``row[key] = value``);
        it is modified in place.
    composition_rules : dict mapping a rule name to a dict whose ``'builtins'``
        entry is a list of single-key dicts ``{operation: [target, left, right]}``.
        All entries but the last are intermediate steps; the last one produces
        the value stored under the rule name.
    data : dataset; only read by the ``'abs'`` final operation, which stores the
        mean of the whole ``left`` column.

    Returns
    -------
    The same ``row`` object with one new entry per rule whose final operation
    is one of ``subtract``, ``divide``, ``abs``, ``multiply``, ``greaterThan``.
    Rules with any other final operation leave the row untouched.
    """
    # Intermediate values are shared by all the rules evaluated in this call.
    partials = {}

    for rule_name, rule in composition_rules.items():
        steps = rule['builtins']
        last_step = steps[-1]

        # Intermediate steps always read both operands from the row.
        for step in steps[:-1]:
            step_kind = list(step.keys())[0]
            target, left, right = step[step_kind]
            if row[left] == '' or row[right] == '':
                # An empty operand makes the intermediate value unknown.
                partials[target] = None
            elif step_kind == 'divide':
                try:
                    partials[target] = float(row[left]) / float(row[right])
                except ZeroDivisionError:
                    partials[target] = None
            elif step_kind == 'multiply':
                partials[target] = float(row[left]) * float(row[right])
            elif step_kind == 'subtract':
                partials[target] = float(row[left]) - float(row[right])

        final_kind = list(last_step.keys())[0]
        final_args = last_step[final_kind]

        if final_kind == 'subtract':
            _, left, right = final_args
            row[rule_name] = int(row[left]) - int(row[right])
        elif final_kind == 'divide':
            _, left, right = final_args
            try:
                quotient = int(row[left]) / int(row[right])
            except ZeroDivisionError:
                quotient = None
            row[rule_name] = quotient
        elif final_kind == 'abs':
            _, left, right = final_args
            row[rule_name] = np.mean(data[left].to_list())
        elif final_kind == 'multiply':
            _, left, right = final_args
            if left in partials and partials[left] is not None:
                # Left operand is a known intermediate; right is a literal or a column.
                if right.isdigit():
                    row[rule_name] = partials[left] * int(right)
                else:
                    row[rule_name] = partials[left] * row[right]
            elif right in partials and partials[right] is not None:
                # Right operand is a known intermediate; left is a literal or a column.
                if left.isdigit():
                    row[rule_name] = int(left) * partials[right]
                else:
                    row[rule_name] = row[left] * partials[right]
            elif left in partials or right in partials:
                # Reaching here means the referenced intermediate is None.
                row[rule_name] = None
            elif left.isdigit():
                row[rule_name] = int(left) * int(row[right])
            elif right.isdigit():
                row[rule_name] = int(row[left]) * int(right)
            else:
                row[rule_name] = int(row[left]) * int(row[right])
        elif final_kind == 'greaterThan':
            _, left, right = final_args
            if left in partials and partials[left] is not None:
                if right.isdigit():
                    row[rule_name] = partials[left] > int(right)
                else:
                    row[rule_name] = partials[left] > int(row[right])
            elif right in partials and partials[right] is not None:
                if left.isdigit():
                    row[rule_name] = int(left) > partials[right]
                else:
                    row[rule_name] = int(row[left]) > partials[right]
            elif left in partials or right in partials:
                # Reaching here means the referenced intermediate is None.
                row[rule_name] = None
            elif left.isdigit():
                row[rule_name] = int(left) > int(row[right])
            elif right.isdigit():
                row[rule_name] = int(row[left]) > int(right)
            else:
                row[rule_name] = int(row[left]) > int(row[right])
        # Add branches for other final operations if needed

    return row


def apply_algebric_rules_to_row(row, algebric_rules):
    """Evaluate every algebraic rule on one row and store each result in the row.

    Parameters
    ----------
    row : mapping-like record (supports ``row[key]`` and ``row[key] = value``);
        it is modified in place.
    algebric_rules : dict mapping a rule name to a dict whose ``'builtins'``
        entry is a list of single-key dicts ``{operation: [target, left, right]}``.
        All entries but the last are intermediate steps (``divide``,
        ``multiply``, ``subtract``); the last one produces the value stored
        under the rule name.

    Returns
    -------
    The same ``row`` object with one new entry per rule whose final operation
    was handled (``add``, ``sin``, ``multiply``, ``divide``, ``subtract``).

    Notes
    -----
    * ``add`` sums every column listed in the module-level ``energy_types`` or
      ``renewable_types`` (whichever contains ``left``); otherwise nothing is stored.
    * ``sin`` stores the ``haversine`` distance between the row's two operands
      and the module-level ``center_baltimore`` point. The operands are
      converted with ``float`` so that fractional coordinates are not truncated.
    * ``subtract`` on two date columns stores the whole-month difference
      ``right - left``; string dates are parsed as ``%d/%m/%Y``.
      NOTE(review): the numeric branches compute ``left - right`` - confirm the
      opposite sign for dates is intended.
    * A division by zero, or an intermediate value that is ``None``, yields ``None``.
    """
    # Intermediate values are shared by all the rules evaluated in this call.
    intermediate_results = {}

    for rule_name, rule in algebric_rules.items():
        operations = rule['builtins']
        final_operation = operations[-1]

        # Perform intermediate operations (operands are always read from the row)
        for ops in operations[:-1]:
            op_type = list(ops.keys())[0]
            args = ops[op_type]
            arg1, arg2, arg3 = args
            if row[arg2] == '' or row[arg3] == '':
                # An empty operand makes the intermediate value unknown.
                intermediate_results[arg1] = None
                continue
            if op_type == 'divide':
                try:
                    result = float(row[arg2]) / float(row[arg3])
                except ZeroDivisionError:
                    result = None
                intermediate_results[arg1] = result
            elif op_type == 'multiply':
                result = float(row[arg2]) * float(row[arg3])
                intermediate_results[arg1] = result
            elif op_type == 'subtract':
                result = float(row[arg2]) - float(row[arg3])
                intermediate_results[arg1] = result

        final_op_type = list(final_operation.keys())[0]
        final_op_args = final_operation[final_op_type]

        # Perform final operation
        if final_op_type == 'add':
            arg1, arg2, arg3 = final_op_args
            if arg2 in energy_types:
                result = 0
                for energy in energy_types:
                    result = result + int(row[energy])
                row[rule_name] = result
                continue
            if arg2 in renewable_types:
                result = 0
                for energy in renewable_types:
                    result = result + int(row[energy])
                row[rule_name] = result
                continue
        if final_op_type == 'sin':
            arg1, arg2, arg3 = final_op_args
            # float(), not int(): truncating coordinates to whole degrees would
            # collapse nearby points onto the same location.
            result = haversine(float(row[arg2]),float(row[arg3]),center_baltimore[0],center_baltimore[1])
            row[rule_name] = result
            continue
        if final_op_type == 'multiply':
            arg1, arg2, arg3 = final_op_args
            if arg2 in intermediate_results and intermediate_results[arg2] is not None:
                if arg3.isdigit():
                    result = intermediate_results[arg2] * int(arg3)
                else:
                    result = intermediate_results[arg2] * row[arg3]
                row[rule_name] = result
                continue
            if arg3 in intermediate_results and intermediate_results[arg3] is not None:
                if arg2.isdigit():
                    result = int(arg2) * intermediate_results[arg3]
                else:
                    result = row[arg2] * intermediate_results[arg3]
                row[rule_name] = result
                continue
            if arg2 in intermediate_results and intermediate_results[arg2] is None:
                row[rule_name] = None
                continue
            if arg3 in intermediate_results and intermediate_results[arg3] is None:
                row[rule_name] = None
                continue
            if arg2.isdigit():
                result = int(arg2) * int(row[arg3])
                row[rule_name] = result
                continue
            if arg3.isdigit():
                result = int(row[arg2]) * int(arg3)
                row[rule_name] = result
                continue
            result = int(row[arg2]) * int(row[arg3])
            row[rule_name] = result
            continue
        if final_op_type == 'divide':
            arg1, arg2, arg3 = final_op_args
            if arg2 in intermediate_results and intermediate_results[arg2] is not None:
                if arg3.isdigit():
                    try:
                        result = intermediate_results[arg2] / int(arg3)
                    except ZeroDivisionError:
                        result = None
                    row[rule_name] = result
                    continue
                else:
                    try:
                        result = intermediate_results[arg2] / int(row[arg3])
                    except ZeroDivisionError:
                        result = None
                    row[rule_name] = result
                    continue
            if arg3 in intermediate_results and intermediate_results[arg3] is not None:
                if arg2.isdigit():
                    try:
                        result = int(arg2) / intermediate_results[arg3]
                    except ZeroDivisionError:
                        result = None
                    row[rule_name] = result
                    continue
                else:
                    try:
                        result = int(row[arg2]) / intermediate_results[arg3]
                    except ZeroDivisionError:
                        result = None
                    row[rule_name] = result
                    continue
            if arg2 in intermediate_results and intermediate_results[arg2] is None:
                row[rule_name] = None
                continue
            if arg3 in intermediate_results and intermediate_results[arg3] is None:
                row[rule_name] = None
                continue
            if arg2.isdigit():
                try:
                    result = int(arg2) / int(row[arg3])
                except ZeroDivisionError:
                    result = None
                row[rule_name] = result
                continue
            if arg3.isdigit():
                try:
                    result = int(row[arg2]) / int(arg3)
                except ZeroDivisionError:
                    result = None
                row[rule_name] = result
                continue
            try:
                result = int(row[arg2]) / int(row[arg3])
            except ZeroDivisionError:
                result = None
            row[rule_name] = result
            continue
        if final_op_type == 'subtract':
            arg1, arg2, arg3 = final_op_args
            if arg2 in intermediate_results and intermediate_results[arg2] is not None:
                if arg3.isdigit():
                    result = intermediate_results[arg2] - int(arg3)
                else:
                    result = intermediate_results[arg2] - row[arg3]
                row[rule_name] = result
                continue
            if arg3 in intermediate_results and intermediate_results[arg3] is not None:
                if arg2.isdigit():
                    result = int(arg2) - intermediate_results[arg3]
                else:
                    result = row[arg2] - intermediate_results[arg3]
                row[rule_name] = result
                continue
            if arg2 in intermediate_results and intermediate_results[arg2] is None:
                row[rule_name] = None
                continue
            if arg3 in intermediate_results and intermediate_results[arg3] is None:
                row[rule_name] = None
                continue
            if arg2.isdigit():
                result = int(arg2) - int(row[arg3])
                row[rule_name] = result
                continue
            if arg3.isdigit():
                result = int(row[arg2]) - int(arg3)
                row[rule_name] = result
                continue
            if is_date(row[arg2]):
                # Date operands: store the number of whole months from arg2 to arg3.
                if isinstance(row[arg2],str):
                    date1 = datetime.datetime.strptime(row[arg2], "%d/%m/%Y")
                    date2 = datetime.datetime.strptime(row[arg3], "%d/%m/%Y")
                    result = (date2.year - date1.year) * 12 + date2.month - date1.month
                    row[rule_name] = result
                    continue
                # Non-string dates are normalised through an ISO string first.
                value1 = row[arg2]
                value2 = row[arg3]
                aux1 = value1.strftime('%Y-%m-%d')
                aux2 = value2.strftime('%Y-%m-%d')
                date1 = datetime.datetime.strptime(aux1, "%Y-%m-%d")
                date2 = datetime.datetime.strptime(aux2, "%Y-%m-%d")
                result = (date2.year - date1.year) * 12 + date2.month - date1.month
                row[rule_name] = result
                continue
            result = int(row[arg2]) - int(row[arg3])
            row[rule_name] = result
            continue
        # Add conditions for other final operations if needed

    return row


def extract_variables_decomp(ontology):
    """Map each decomposition rule's target name to its source name.

    Reads every ``DASE_RULE`` value in ``ontology`` and keeps those whose text
    starts with ``"Decomposition"``. In each one, the first name captured
    after ``___`` (and followed by a parenthesised argument list) is taken as
    the source, and the first name captured after ``"-> "`` as the target.

    Returns
    -------
    dict ``{target_name: source_name}``; rules where either pattern finds
    nothing are skipped.
    """
    # TODO: make this hard-coded namespace more general later.
    namespace = Namespace("http://www.semanticweb.org/rodrigo/ontologies/2024/1/covid/")

    derived_from = {}

    for _, _, rule_text in ontology.triples((None, namespace["DASE_RULE"], None)):
        if not rule_text.startswith("Decomposition"):
            continue

        sources = re.findall(r"___(\w+)\([^)]*\)", rule_text)
        targets = re.findall(r"-> (\w+)\(", rule_text)

        if sources and targets:
            derived_from[targets[0]] = sources[0]

    return derived_from

def extract_variables_mapping(ontology):
    """Map each mapping rule's target name to its source name.

    Reads every ``DASE_RULE`` value in ``ontology`` and keeps those whose text
    starts with ``"Mapping"``. In each one, the first name captured after
    ``___`` (and followed by a parenthesised argument list) is taken as the
    source, and the first name captured after ``"-> "`` as the target.

    Returns
    -------
    dict ``{target_name: source_name}``; rules where either pattern finds
    nothing are skipped.
    """
    # TODO: make this hard-coded namespace more general later.
    namespace = Namespace("http://www.semanticweb.org/rodrigo/ontologies/2024/1/covid/")

    derived_from = {}

    for _, _, rule_text in ontology.triples((None, namespace["DASE_RULE"], None)):
        if not rule_text.startswith("Mapping"):
            continue

        sources = re.findall(r"___(\w+)\([^)]*\)", rule_text)
        targets = re.findall(r"-> (\w+)\(", rule_text)

        if sources and targets:
            derived_from[targets[0]] = sources[0]

    return derived_from


def extract_variables_aggregation(ontology):
    """Map each aggregation rule's target name to ``[operation, source, ...]``.

    Reads every ``DASE_RULE`` value in ``ontology`` and keeps those whose text
    starts with ``"Aggregation"``. For each one the list starts with the first
    name captured after ``___`` followed by every name captured after ``"^ "``;
    whenever one of those names is a known operation (``'sum'``, ``'avg'``) it
    is moved to the front, with the previous first element right after it.

    Rules where any of the three patterns finds nothing are skipped instead of
    raising ``IndexError``/``KeyError``, and an operation that is already the
    first element is left where it is instead of raising ``ValueError``.

    Returns
    -------
    dict ``{target_name: [names...]}``.
    """
    # TODO: make this hard-coded namespace more general later.
    namespace = Namespace("http://www.semanticweb.org/rodrigo/ontologies/2024/1/covid/")

    aggregations = {}
    operations = ['sum', 'avg'] #add more operations like stdev etc

    # Iterate over the triples to find the DASE_RULE section
    for _, _, rule_text in ontology.triples((None, namespace["DASE_RULE"], None)):
        if not rule_text.startswith("Aggregation"):
            continue

        # Use regex to extract the variables
        match1 = re.findall(r"___(\w+)\([^)]*\)", rule_text)
        match2 = re.findall(r'\^ (\w+)\(', rule_text)
        match3 = re.findall(r"-> (\w+)\(", rule_text)

        # Nothing can be recorded unless all three patterns matched.
        if not (match1 and match2 and match3):
            continue

        variables = [match1[0], *match2]
        aggregations[match3[0]] = variables

        # Move each operation name to the front, keeping the old head next to it.
        for n in range(len(variables)):
            if variables[n] not in operations:
                continue
            if n == 0:
                # Already in front; swapping it with itself would drop an element.
                continue
            tmp = variables[n]
            aux = variables[0]
            variables.remove(aux)
            variables.remove(tmp)
            variables.insert(0, tmp)
            variables.insert(1, aux)
    return aggregations

def is_date(string):
    """Tell whether ``string`` is a date, or text in a supported date format.

    Parameters
    ----------
    string : any value. Text is accepted when it parses with one of
        ``%Y-%m-%d``, ``%d/%m/%Y`` or ``%m/%d/%Y``. Other objects are accepted
        when they expose ``strftime`` and their ``%Y-%m-%d`` rendering parses
        with one of those formats.

    Returns
    -------
    bool. Floats (which includes NaN), objects without ``strftime`` (ints,
    ``None``, ...) and objects whose ``strftime`` raises ``ValueError`` or
    ``TypeError`` yield ``False`` instead of an exception.
    """
    formats = ["%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y"] #Add more formats if needed

    def _parses(text):
        # True as soon as one of the supported formats accepts the text.
        for fmt in formats:
            try:
                datetime.datetime.strptime(text, fmt)
                return True
            except ValueError:
                pass
        return False

    if isinstance(string, float):
        return False
    if isinstance(string, str):
        return _parses(string)

    # Anything else must behave like a date object to qualify.
    formatter = getattr(string, 'strftime', None)
    if formatter is None:
        return False
    try:
        rendered = formatter('%Y-%m-%d')
    except (ValueError, TypeError):
        return False
    return _parses(rendered)

def apply_decomp_rules(value, new_variable):
    """Derive the decomposition variable ``new_variable`` from ``value``.

    For date values (as decided by ``is_date``):

    * ``current_day`` / ``current_month`` / ``current_year`` give the matching
      calendar field; string dates are parsed as ``%d/%m/%Y``;
    * ``day_period`` and ``energy_price`` look the value up with
      ``getDayPeriod`` in the module-level ``day_periods`` / ``energy_prices``.

    For non-date values:

    * ``PM10_safe`` gives ``value >= 150`` and ``SO2_safe`` gives ``value >= 0.14``.

    Returns
    -------
    ``[new_variable, derived_value]``, or ``None`` when no case applies.
    """
    calendar_fields = (
        ('current_day', 'day'),
        ('current_month', 'month'),
        ('current_year', 'year'),
    )

    if is_date(value):
        if isinstance(value, str):
            for label, field in calendar_fields:
                if new_variable == label:
                    parsed = datetime.datetime.strptime(value, "%d/%m/%Y")
                    return [new_variable, getattr(parsed, field)]

        # Non-string dates (and strings with another target) go through an ISO rendering.
        iso_text = value.strftime('%Y-%m-%d')
        for label, field in calendar_fields:
            if new_variable == label:
                parsed = datetime.datetime.strptime(iso_text, "%Y-%m-%d")
                return [new_variable, getattr(parsed, field)]

        if new_variable == 'day_period':
            return [new_variable, getDayPeriod(value, day_periods)]
        if new_variable == 'energy_price':
            return [new_variable, getDayPeriod(value, energy_prices)]
    else:
        if new_variable == 'PM10_safe':
            return [new_variable, value >= 150]
        if new_variable == 'SO2_safe':
            return [new_variable, value >= 0.14]
    #add more cases if needed
    return None

def apply_mapping_rules(value, new_variable):
    """Derive the mapping variable ``new_variable`` from ``value``.

    Only the ``'Season'`` variable is supported, and only for date values (as
    decided by ``is_date``). String dates are parsed as ``%d/%m/%Y``; other
    dates go through their ``%Y-%m-%d`` rendering. The season boundaries are:
    Spring 20 Mar - 20 Jun, Summer 21 Jun - 22 Sep, Autumn 23 Sep - 20 Dec,
    Winter otherwise.

    Returns
    -------
    ``[new_variable, season_name]``, or ``None`` when no case applies.
    """
    if not is_date(value):
        #add more cases if needed
        return None

    if isinstance(value, str) and new_variable == 'Season':
        moment = datetime.datetime.strptime(value, "%d/%m/%Y")
    else:
        iso_text = value.strftime('%Y-%m-%d')
        if new_variable == 'Season':
            moment = datetime.datetime.strptime(iso_text, "%Y-%m-%d")
        else:
            #add more cases if needed
            return None

    month = moment.month
    day = moment.day
    if (month == 3 and day >= 20) or month in (4, 5) or (month == 6 and day < 21):
        season = "Spring"
    elif (month == 6 and day >= 21) or month in (7, 8) or (month == 9 and day < 23):
        season = "Summer"
    elif (month == 9 and day >= 23) or month in (10, 11) or (month == 12 and day < 21):
        season = "Autumn"
    else:
        season = "Winter"

    return [new_variable, season]
        
    
def get_operation(code,*values):
    """Evaluate the operation named ``code`` on ``values``.

    Parameters
    ----------
    code : operation name (e.g. ``'+'``, ``'/'``, ``'getYear'``, ``'getCases100k'``).
    *values : indexable pairs; most operations use element ``[1]`` of each
        pair as the operand, while the column statistics (``getAverage``,
        ``getMax``, ``getMin``, ``getStd``, ``getMedian``) use element ``[0]``
        of the first pair as a column name.

    Returns
    -------
    The operation's result. For an unknown ``code`` a function returning its
    own arguments as a tuple is returned instead of a value.
    """
    def val(position):
        # Operand carried by the pair at ``position``.
        return values[position][1]

    # Arithmetic and comparisons
    if code == '+':
        return np.sum([pair[1] for pair in values])
    if code == 'positive_sum':
        return np.sum([pair[1] for pair in values if pair[1] >= 0])
    if code == 'negative_sum':
        return np.sum([pair[1] for pair in values if pair[1] <= 0])
    if code == '-':
        return val(0) - val(1)
    if code == '*':
        return np.prod([pair[1] for pair in values])
    if code == '/':
        return round(val(0) / val(1),2)
    if code == '>=':
        return val(0) >= val(1)

    # Date differences and their fields
    if code == 'datediff':
        return relativedelta(val(0),val(1))
    if code == 'years':
        return val(0).years
    if code == 'months':
        return val(0).years * 12 + val(0).months

    # Plain attribute reads on the first operand
    for name, attribute in (
        ('getHour', 'hour'),
        ('getDay', 'day'),
        ('getMonth', 'month'),
        ('getYear', 'year'),
        ('getWeekday', 'dayofweek'),
    ):
        if code == name:
            return getattr(val(0), attribute)

    # Look-ups against module-level tables
    if code == 'getSeason':
        return getSeason(val(0),seasons)
    if code == 'getDayPeriod':
        return getDayPeriod(val(0),day_periods)
    if code == 'getHoliday':
        return generateHoliday(val(0),val(1),holidays)
    if code == 'getEnergyPrice':
        return getDayPeriod(val(0),energy_prices)
    if code == 'divide_by_30':
        return val(0) / 30

    # Column statistics take the column name, not the operand
    if code == 'getAverage':
        return generateAverage(values[0][0])
    if code == 'getMax':
        return generateMax(values[0][0])
    if code == 'getMin':
        return generateMin(values[0][0])
    if code == 'getStd':
        return generateStd(values[0][0])
    if code == 'getMedian':
        return generateMedian(values[0][0])

    if code == 'generateAvg_2weeks':
        return generateAverage2Weeks(val(0),val(1))
    if code == 'generateSum_2weeks':
        return generateCumulative2Weeks(val(0),val(1))
    if code == 'generateHighRisk_2weeks':
        return generateHighRisk(val(0),val(1),14)
    if code == 'getLastYearTemp':
        return generateLastYearTemp(val(0),12)

    # The three per-100k codes share one formula
    if code in ('generateAvg_2w_100k', 'generateSum_2w_100k', 'getCases100k'):
        return val(0) * 100000 / val(1)

    if code == 'generateEstadio_8ed':
        return generateFromTable(val(0),val(1),estadio)
    if code == 'generateN_8ed':
        return generateN8ed(val(0))
    if code == 'getDistanceBaltimore':
        return haversine(val(0),val(1),center_baltimore[0],center_baltimore[1])
    if code == 'getCrimeType':
        return [x[0] for x in crime_type if x[1] == val(0)][0]
    if code == 'getWeapon':
        return [x[0] for x in weapon_type if x[1] == val(0)][0]
    if code == 'getCurrentRisk':
        return val(0) > 120

    # "first operand >= threshold" flags
    for name, threshold in (
        ('getAverageDiffPos', 0),
        ('generatePM25_safe', 35),
        ('generatePM10_safe', 150),
        ('generateSO2_safe', 0.14),
    ):
        if code == name:
            return val(0) >= threshold

    return lambda *x : x

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points given in degrees.

    Uses the haversine formula with a sphere radius of 6372.8 (presumably
    kilometres, so the result would be in kilometres - confirm).

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    Returns
    -------
    float distance, in the same unit as the radius.
    """
    sphere_radius = 6372.8

    delta_lat = radians(lat2 - lat1)
    delta_lon = radians(lon2 - lon1)
    phi1 = radians(lat1)
    phi2 = radians(lat2)

    # Squared half-chord length between the two points.
    half_chord_sq = sin(delta_lat/2)**2 + cos(phi1)*cos(phi2)*sin(delta_lon/2)**2
    central_angle = 2*asin(sqrt(half_chord_sq))
    return sphere_radius * central_angle

def generateFromTable(t,n,table):
    """Look up ``table[t][n]``: select the entry for ``t``, then index it with ``n``."""
    entry = table[t]
    return entry[n]

def generateN8ed(gg_p):
    """Bucket ``gg_p`` into a category from 1 to 5.

    ``0`` gives 1, ``1..2`` gives 2, ``3..6`` gives 3, ``7..15`` gives 4 and
    every other value (including negatives and values between the ranges)
    gives 5.
    NOTE(review): the name suggests an "8th edition" N category computed from a
    positive-node count - confirm.
    """
    if gg_p == 0:
        return 1

    # (inclusive lower bound, inclusive upper bound, category)
    for lowest, highest, category in ((1, 2, 2), (3, 6, 3), (7, 15, 4)):
        if gg_p >= lowest and gg_p <= highest:
            return category

    return 5

def generateAverage(column):
    """Mean of ``column`` in the module-level ``temp_data``."""
    column_values = temp_data[column].to_list()
    return np.mean(column_values)
def generateMax(column):
    """Maximum of ``column`` in the module-level ``temp_data``."""
    column_values = temp_data[column].to_list()
    return np.max(column_values)
def generateMin(column):
    """Minimum of ``column`` in the module-level ``temp_data``."""
    column_values = temp_data[column].to_list()
    return np.min(column_values)
def generateStd(column):
    """Standard deviation (``np.std``) of ``column`` in the module-level ``temp_data``."""
    column_values = temp_data[column].to_list()
    return np.std(column_values)
def generateMedian(column):
    """Median of ``column`` in the module-level ``temp_data``."""
    column_values = temp_data[column].to_list()
    return np.median(column_values)

def getSeason(date,season_dict):
    """Return the label of the first interval that contains ``date``.

    The year of ``date`` is replaced by 2000 before comparing, so the interval
    bounds are expected to be expressed in the year 2000.

    Parameters
    ----------
    date : object with ``replace(year=...)`` that can be compared to the bounds.
    season_dict : iterable of ``(label, (start, end))`` entries; both bounds
        are inclusive.

    Raises
    ------
    StopIteration
        When no interval contains the date.
    """
    anchored = date.replace(year=2000)
    containing = (
        label
        for label, (opens, closes) in season_dict
        if opens <= anchored <= closes
    )
    return next(containing)
def getDayPeriod(date,season_dict):
    """Return the label of the first interval that contains the time of ``date``.

    The calendar part of ``date`` is replaced by 2000-01-01 before comparing,
    so only the remaining fields (the time of day) decide the match and the
    interval bounds are expected to lie on 2000-01-01.

    Parameters
    ----------
    date : object with ``replace(year=..., month=..., day=...)`` that can be
        compared to the bounds.
    season_dict : iterable of ``(label, (start, end))`` entries; both bounds
        are inclusive.

    Raises
    ------
    StopIteration
        When no interval contains the value.
    """
    anchored = date.replace(year=2000,month=1,day=1)
    containing = (
        label
        for label, (opens, closes) in season_dict
        if opens <= anchored <= closes
    )
    return next(containing)

def generateAverage2Weeks(date,cases):
    """Mean of the ``'cases'`` column of the module-level ``temp_data``.

    Neither ``date`` nor ``cases`` is read.
    NOTE(review): presumably the caller restricts ``temp_data`` to the two-week
    window beforehand - confirm.
    """
    window_cases = temp_data['cases'].to_list()
    return np.mean(window_cases)

def generateHighRisk(date,sum_2weeks,offset):
    """Tell whether the record ``offset`` days after ``date`` is high risk.

    Looks in the module-level ``temp_data`` for rows whose ``'current_date'``
    equals ``date + offset`` days and compares the first match's
    ``'sum_2w_100k'`` with 120.0. ``sum_2weeks`` is not read.

    Returns
    -------
    ``np.nan`` when no row matches, otherwise the result of
    ``sum_2w_100k >= 120.0`` for the first matching row.
    """
    target_day = date + datetime.timedelta(days=offset)
    same_day_rows = temp_data[temp_data['current_date'] == target_day]
    if len(same_day_rows) > 0:
        return same_day_rows.iloc[0]['sum_2w_100k'] >= 120.0
    return np.nan

def generateLastYearTemp(date,offset):
    """Temperature recorded ``offset`` months before ``date``.

    Looks in the module-level ``temp_data`` for rows whose ``'current_date'``
    equals ``date`` minus ``offset`` months.

    Returns
    -------
    ``np.nan`` when no row matches, otherwise the ``'temperature'`` of the
    first matching row.
    """
    earlier_day = date - relativedelta(months=offset)
    same_day_rows = temp_data[temp_data['current_date'] == earlier_day]
    if len(same_day_rows) > 0:
        return same_day_rows.iloc[0]['temperature']
    return np.nan

def generateCumulative2Weeks(date,cases):
    """Sum of the ``'cases'`` column of the module-level ``temp_data``.

    Neither ``date`` nor ``cases`` is read.
    NOTE(review): presumably the caller restricts ``temp_data`` to the two-week
    window beforehand - confirm.
    """
    window_cases = temp_data['cases'].to_list()
    return np.sum(window_cases)

def generateHoliday(date,country,holiday_data):
    """Tell whether ``holiday_data`` lists ``date`` as a holiday in ``country``.

    Selects the rows whose ``'date'`` and ``'country'`` both match, then
    reduces them with ``.any().all()``: the result is truthy only when every
    column of the selection holds at least one truthy value, and falsy when
    nothing matches.
    """
    same_day = holiday_data['date'] == date
    same_country = holiday_data['country'] == country
    matching_rows = holiday_data.loc[same_day & same_country]
    return matching_rows.any().all()

In [20]:
# Report the CSV loading time measured in the data-loading cell (`read_time`).
print('read csv')
print(read_time)

# First data preparation step (the original note says "MVs", presumably missing values - confirm)
start_time = timeit.default_timer()
data_observed = data_preparation(data,options,target)
print('Data Preparation 1 Time')
print(timeit.default_timer() - start_time)

# Add automatic variables to ER Model before generation (currently disabled).
# The timer restarted below is never reported while the prints that use it stay commented out.
start_time = timeit.default_timer()
#model = generateAutoVariables(data_observed,options,template,model)
#print(timeit.default_timer() - start_time)

# Previous pipeline variant, kept for reference.
#data_observed = dankfe_4(ontology,data_observed)
#print('DANKFE4 time')
#print(timeit.default_timer() - start_time)

# Active step: run dankfe_5 on the prepared data using the ontology (its duration is not printed).
data_observed = dankfe_5(ontology,data_observed)

# Scaling and Balancing (currently disabled)
#start_time = timeit.default_timer()
#data_observed_train, data_observed_test = data_preparation_2(data_observed,options,target)
#print('Data Preparation 2 Time')
#print(timeit.default_timer() - start_time)

# Saving the train/test splits to CSV (currently disabled)
#print('Saving csv Time')
#start_time = timeit.default_timer()
#data_observed_train.to_csv(f'{data_folder}{base_name}_dankfe4_train.csv',index=False)
#data_observed_test.to_csv(f'{data_folder}{base_name}_dankfe4_test.csv',index=False)
#print(timeit.default_timer() - start_time)
#sys.stdout.close()

read csv
0.18906160000005912
Data Preparation 1 Time
0.05146280000008119
