In [1]:
import pandas as pd
import pickle
from concurrent.futures import ThreadPoolExecutor


from adl import DataProcessor
from adl import AnomalyDetector, CosineSimilarityCalculator, PropensityScorer
from adl import compare_results

# Process and load data for the different methods

In [2]:
# Relative path to the Wuerth dataset (CSV file); handed to DataProcessor.
DATA_PATH = "../01_data/dataset_wuerth.csv"

In [3]:
# One shared DataProcessor for DATA_PATH; every dataset variant in this cell is
# produced by calling its process_data method.
processor = DataProcessor(DATA_PATH)

def combined_condition(df):
    """Return a row mask: maximum and current dunning level both exceed 2.

    Written as a named function instead of a lambda bound to a name (PEP 8),
    so tracebacks and reprs show ``combined_condition``.

    Parameters
    ----------
    df : DataFrame-like
        Must provide the columns ``dunning_level_max`` and
        ``dunning_level_current``.

    Returns
    -------
    Element-wise boolean result, true where both columns are greater than 2.
    """
    return (df["dunning_level_max"] > 2) & (df["dunning_level_current"] > 2)
def _process_variants(scale):
    """Build the four dataset variants for one scaling option.

    The variants are always requested from ``processor`` in the order
    full, remove, extract, both, so the log lines printed by
    ``process_data`` keep their original sequence.

    Parameters
    ----------
    scale : None or str
        Forwarded unchanged as ``scale`` to ``processor.process_data``
        (this notebook uses ``None``, ``"normalize"`` and ``"standardize"``).

    Returns
    -------
    tuple
        ``(full, remove, extract, both)`` results of ``process_data``.
    """
    full = processor.process_data(mode=None, scale=scale)
    remove = processor.process_data(
        mode="remove", scale=scale, conditions=combined_condition
    )
    extract = processor.process_data(
        mode="extract", scale=scale, target_variable="flag_new_orsyshelf"
    )
    # NOTE(review): "both" only receives `conditions`; presumably the target
    # variable falls back to a default inside DataProcessor -- confirm.
    both = processor.process_data(
        mode="both", scale=scale, conditions=combined_condition
    )
    return full, remove, extract, both


# For Anomaly Detection
data_full, data_remove, data_extract, data_both = _process_variants(None)

# For Cosine Similarity
(
    data_full_normalized,
    data_remove_normalized,
    data_extract_normalized,
    data_both_normalized,
) = _process_variants("normalize")

# For Propensity scoring
(
    data_full_standardized,
    data_remove_standardized,
    data_extract_standardized,
    data_both_standardized,
) = _process_variants("standardize")

No scaling applied
No scaling applied
No scaling applied
No scaling applied
Normalizing applied
Normalizing applied
Normalizing applied
Normalizing applied
Unbiased standardizing applied
Unbiased standardizing applied
Unbiased standardizing applied
Unbiased standardizing applied


# Predicting potential customers
## Anomaly Detection

1. Calculating anomalies over 500 variations and counting how often a customer has been selected 
2. Storing the ordered data, i.e. the customers whose selection count is above a certain threshold together with that count, in `01_data/data_out` (to save computation time when evaluating our method)

*Note: To compute the anomaly counts, uncomment the `run_anomaly()` call at the end of the following cell*

In [4]:
MAX_CPU = 4  # max_workers of the thread pool used in run_anomaly

anomaly_detector = AnomalyDetector()  # shared by every process_and_save call

count_threshold = [250, 200, 150, 100]  # one CSV is written per value and dataset


# Calculate cust_id_counts for each dataframe and directly save them to csv to
# save computational time when calling the script again


def process_and_save(data, i, suffix):
    """Extract the anomalies of ``data`` and persist them as a CSV file.

    Parameters
    ----------
    data
        Dataset handed unchanged to ``anomaly_detector.extract_anomalies``.
    i
        Forwarded as ``count_threshold``; also becomes part of the file name.
    suffix
        Label of the dataset variant; becomes part of the file name.
    """
    target_file = f"../01_data/data_out/anomaly_{suffix}_{i}.csv"
    flagged = anomaly_detector.extract_anomalies(data, count_threshold=i)
    flagged.to_csv(target_file, index=False)


def run_anomaly():
    """Compute and store the anomaly CSVs for every dataset and threshold.

    One ``process_and_save`` task is submitted per (threshold, dataset) pair,
    in the order full, remove, extract, both for each entry of
    ``count_threshold``, and executed on a pool of at most ``MAX_CPU``
    threads.

    Raises
    ------
    Exception
        The first exception raised by any task, in submission order. The
        executor still waits for the remaining tasks before it propagates.
    """
    variants = (
        (data_full, "full"),
        (data_remove, "remove"),
        (data_extract, "extract"),
        (data_both, "both"),
    )
    with ThreadPoolExecutor(max_workers=MAX_CPU) as executor:
        futures = [
            executor.submit(process_and_save, data, threshold, suffix)
            for threshold in count_threshold
            for data, suffix in variants
        ]
        # submit() only stores a task's exception on its future; asking for
        # the result is what makes a failed task visible to the caller.
        for future in futures:
            future.result()


# running will take some time -> results stored in data_out
# run_anomaly()

Load the anomaly detection results from CSV to save computation time:

In [5]:
# A customer is classified as an anomaly if it was detected in at least 30% of
# the iterations (150 of the 500 variations). The value must be one of the
# thresholds for which run_anomaly wrote CSV files.
count = 150

# One stored result per dataset variant, loaded in a fixed order.
anomaly_full, anomaly_remove, anomaly_extract, anomaly_both = (
    pd.read_csv(
        f"../01_data/data_out/anomaly_{variant}_{count}.csv", index_col=False
    )
    for variant in ("full", "remove", "extract", "both")
)

## Cosine Similarity

1. Compute the pairwise cosine similarity of each non orsy customer with every orsy customer and store those
2. Store either the average cosine similarity or the count of how often the cosine similarity exceeded a certain threshold (the latter was used here); this returns a dataframe with the ordered cust_ids
3. The top n non orsy customers are returned where n equals the number of customers who exceeded the threshold in anomaly detection

In [6]:
csc = CosineSimilarityCalculator()

# Each normalized dataset is paired with the anomaly result of the same
# variant; the number of rows of that result is passed as n_best.
cosine_full, cosine_remove, cosine_extract, cosine_both = (
    csc.get_both_cosine_metrics(
        normalized, threshold=0.8, n_best=reference.shape[0], sort_by="count"
    )
    for normalized, reference in (
        (data_full_normalized, anomaly_full),
        (data_remove_normalized, anomaly_remove),
        (data_extract_normalized, anomaly_extract),
        (data_both_normalized, anomaly_both),
    )
)

## Propensity Scoring

1. Get the tuned hyperparameters from `03_model_selection.ipynb`
2. Perform gradient boosting and extract the top n non orsy customers according to their likelihood of being an orsy customer.
3. The top n non orsy customers are returned where n equals the number of customers who exceeded the threshold in anomaly detection

In [7]:
def _load_best_params(variant):
    """Return the unpickled object stored for the dataset variant ``variant``.

    Reads ``../01_data/data_model_eval/<variant>_best_params.pkl``.
    Unpickling can execute arbitrary code, so only load files that were
    written by this project.
    """
    with open(f"../01_data/data_model_eval/{variant}_best_params.pkl", "rb") as file:
        return pickle.load(file)


full_best_params = _load_best_params("full")
remove_best_params = _load_best_params("remove")
extract_best_params = _load_best_params("extract")
both_best_params = _load_best_params("both")

In [8]:
propensity_scorer = PropensityScorer()

# Per variant: standardized dataset, anomaly result (its number of rows is
# passed as n_best) and the loaded parameters, which are unpacked as keyword
# arguments.
gb_full, gb_remove, gb_extract, gb_both = (
    propensity_scorer.gradient_boosting(
        standardized, n_best=reference.shape[0], **best_params
    )
    for standardized, reference, best_params in (
        (data_full_standardized, anomaly_full, full_best_params),
        (data_remove_standardized, anomaly_remove, remove_best_params),
        (data_extract_standardized, anomaly_extract, extract_best_params),
        (data_both_standardized, anomaly_both, both_best_params),
    )
)

# Compare Models

1. For each data set we compare which non orsy customer was predicted as an orsy customer
2. A 1 behind a cust_id means the customer was predicted by only one model, a 2 by two models, and so on. There are also columns indicating which model predicted someone to be a potential orsy customer

In [9]:
df_names = ["cosine", "anomaly", "propensity"]

# For every dataset variant the three result frames are listed in the same
# order as the labels in df_names.
common_full, common_remove, common_extract, common_both = (
    compare_results(dataframes=frames, df_names=df_names)
    for frames in (
        [cosine_full, anomaly_full, gb_full],
        [cosine_remove, anomaly_remove, gb_remove],
        [cosine_extract, anomaly_extract, gb_extract],
        [cosine_both, anomaly_both, gb_both],
    )
)

# Export data 

In [10]:
# Write one comparison table per dataset variant, without the index column.
for variant, common in (
    ("full", common_full),
    ("remove", common_remove),
    ("extract", common_extract),
    ("both", common_both),
):
    common.to_csv(f"../01_data/data_out/common_{variant}.csv", index=False)