# How to test various featurings

In [None]:
import json
from predictsignauxfaibles.transformers import print_featuring_for_model_conf, get_featuring, apply_log, apply_sqrt

# 1. List of features
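You can build the list of variable names by hand, or derive it from the file `variables.json`, which can be downloaded with the command below (replace `<port>` with the port of your SOCKS proxy). The next cell expects the file to be located at `output/data/variables.json`: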

```
curl --proxy socks5h://localhost:<port> -OL https://raw.githubusercontent.com/signaux-faibles/opensignauxfaibles/master/js/reduce.algo2/docs/variables.json -o variables.json
```

In [None]:
# Load the variable descriptions; each entry is expected to carry a "name" key.
with open("output/data/variables.json", "r", encoding="utf-8") as f:
    variables = json.load(f)

# Every variable name except the target `outcome` is a candidate feature.
# NOTE: the set literal {"outcome"} is required here -- set("outcome") would
# build a set of the individual characters ({"o", "u", "t", ...}) and would
# therefore never exclude the "outcome" variable itself.
# Sorting makes the order of the features reproducible from run to run.
features = sorted({x["name"] for x in variables} - {"outcome"})

# 2. Exploration of relevant featuring
For each variable taken separately, the function `get_featuring` provides the optimal transformation of that single variable for explaining the variable `outcome` in a simple logistic model. These transformations still need to be tested in the SF model afterwards.

In [None]:
# Transformations that are tried on each feature.
candidate_transformations = [apply_log, apply_sqrt]

res = get_featuring(features, candidate_transformations)
# Bare expression so that the notebook displays the result.
res

# 3. Print the transformation of each feature for which one is relevant, ready to be plugged into a model_conf.py.

In [None]:
# Print the featuring stored in `res`; presumably the output is formatted so
# that it can be pasted into a model_conf.py -- verify against the
# `predictsignauxfaibles.transformers` module.
print_featuring_for_model_conf(res)

# 4. (optional) Build/Export train and test datasets

In [None]:
import logging
logging.getLogger().setLevel(logging.INFO)
from predictsignauxfaibles.utils import load_conf

# Start from the "default" configuration and cap both datasets to a small
# sample so that the export stays quick.
conf = load_conf("default")

# The sample size is given as an int rather than as the float literal 1e4.
train = conf.TRAIN_DATASET
train.sample_size = 10_000

test = conf.TEST_DATASET
test.sample_size = 10_000

# Prefix of the exported files ("<savepath>_train.csv" and
# "<savepath>_test.csv"); set it to None to skip the export.
savepath = "output/data/featuring"

# Fetch both datasets and fail early if either of them comes back empty.
train.fetch_data().raise_if_empty()
test.fetch_data().raise_if_empty()
logging.info("Succesfully loaded Features data from MongoDB")

if savepath is not None:
    # index=False: by default `to_csv` also writes the row index as an unnamed
    # first column, which `pd.read_csv` then loads back as a spurious
    # "Unnamed: 0" column when the extract is re-read.
    train.data.to_csv(f"{savepath}_train.csv", index=False)
    test.data.to_csv(f"{savepath}_test.csv", index=False)
    logging.info(f"Saved Features extract to {savepath}")

# 5. Get data from csv

In [None]:
import logging
logging.getLogger().setLevel(logging.INFO)

import pandas as pd

from predictsignauxfaibles.config import IGNORE_NA
from predictsignauxfaibles.pipelines import run_pipeline
from predictsignauxfaibles.utils import load_conf
from predictsignauxfaibles.evaluate import evaluate

# Common prefix of the two CSV extracts ("<csvpath>_train.csv" and
# "<csvpath>_test.csv").
csvpath = "output/data/featuring"

train_filepath = f"{csvpath}_train.csv"
test_filepath = f"{csvpath}_test.csv"

# Read the train extract first, then the test extract, logging each load.
loaded = {}
for split, filepath in (("train", train_filepath), ("test", test_filepath)):
    loaded[split] = pd.read_csv(filepath)
    logging.info(f"Succesfully loaded {split} data from {filepath}")

train_data = loaded["train"]
test_data = loaded["test"]

# 6. Evaluate a model with and without featuring and compare performance

In [None]:
def evaluate_to_compare(train_data, test_data, conf_name: str = "default") -> dict:
    """Fit the model of a configuration on given data and report its AUCPR.

    The configuration `conf_name` is loaded with `load_conf`; its train and
    test datasets are fed with the provided data instead of fetching it. The
    SIRENs found in the train data are passed to `test.remove_siren`, both
    datasets are cleaned (`replace_missing_data`, `remove_na`) and run through
    `conf.TRANSFO_PIPELINE`, then `conf.MODEL_PIPELINE` is fitted on the train
    data and evaluated on the test dataset with `conf.EVAL_BETA`.

    Args:
        train_data: training data; must provide the columns "siren" and
            "outcome" (assumed to be a pandas DataFrame).
        test_data: test data (assumed to be a pandas DataFrame).
        conf_name: name of the configuration to load.

    Returns:
        A dict with the keys "conf_name" and "aucpr".
    """
    conf = load_conf(conf_name)

    train = conf.TRAIN_DATASET
    train.sample_size = 10_000

    test = conf.TEST_DATASET
    test.sample_size = 10_000

    # Work on copies: the cleaning and transformation steps below may modify
    # the data in place, and this function is meant to be called several times
    # on the same frames to compare configurations. Copying guarantees that
    # every call starts from the untouched input.
    train.data = train_data.copy()
    test.data = test_data.copy()

    # Hand the SIRENs of the train set to the test set so that it can drop them.
    train_siren_set = train.data["siren"].unique().tolist()
    test.remove_siren(train_siren_set)

    train.replace_missing_data().remove_na(ignore=IGNORE_NA)
    train.data = run_pipeline(train.data, conf.TRANSFO_PIPELINE)

    test.replace_missing_data().remove_na(ignore=IGNORE_NA)
    test.data = run_pipeline(test.data, conf.TRANSFO_PIPELINE)

    model_pp = conf.MODEL_PIPELINE
    fit = model_pp.fit(train.data, train.data["outcome"])

    eval_metrics = evaluate(fit, test, conf.EVAL_BETA)
    return {
        "conf_name": conf_name,
        "aucpr": eval_metrics["aucpr"],
    }

In [None]:
# Evaluate the baseline configuration and the one with featuring on the very
# same train/test data, then show both results one after the other.
perf_default = evaluate_to_compare(train_data, test_data, conf_name="default")
perf_default_with_featuring = evaluate_to_compare(
    train_data, test_data, conf_name="default_with_featuring"
)
for perf in (perf_default, perf_default_with_featuring):
    print(perf)