# Imports

In [None]:
import os
import sys

# Make the local actableai-ml checkout importable from this notebook.
# The location can be overridden through the ACTABLEAI_ML_PATH environment
# variable; when it is unset the original checkout path is used.
ACTABLEAI_ML_PATH = os.environ.get("ACTABLEAI_ML_PATH", "/home/axen/projects/actableai-ml")

# Guard the insertion so re-running this cell does not keep growing sys.path.
if ACTABLEAI_ML_PATH not in sys.path:
    sys.path.insert(0, ACTABLEAI_ML_PATH)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix, mean_squared_error, r2_score
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.linear_model import LinearRegression, SGDRegressor, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder as sk_OneHotEncoder
from sklearn.preprocessing import Normalizer as sk_Normalizer

from actableai.utils.debiasing import debias_dataframe, _make_residuals
from actableai.tasks.regression import AAIRegressionTask

In [None]:
# Set up matplotlib and seaborn
%matplotlib inline
sns.set()
plt.rcParams['figure.figsize'] = [12, 8]

pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Load and process Dataset

In [None]:
# TO CHANGE: path to the CSV file loaded below with `pd.read_csv`. It is a
# relative path, so it is resolved against the notebook's working directory;
# point it at your local copy of the dataset.
dataset_path = "../../../data/law_data.csv"

In [None]:
# Load the dataset and hide the target of a random 20% of the rows.
df_law = pd.read_csv(dataset_path)

# 80/20 split sizes; the test size absorbs the rounding remainder.
full_size = df_law.shape[0]
train_size = int(0.8 * full_size)
test_size = full_size - train_size

# Fixed seed so the same rows are held out on every run.
df_test = df_law.sample(n=test_size, random_state=0)

# "ZFYA_exp" mirrors "ZFYA" but is NaN on the held-out rows; the original
# "ZFYA" column is kept intact as the ground truth.
df_law["ZFYA_exp"] = df_law["ZFYA"].mask(df_law.index.isin(df_test.index))

In [None]:
# Rows with a visible "ZFYA_exp" are used for training; rows where it is NaN
# are the held-out ones.
print("training_size:", df_law["ZFYA_exp"].notna().sum())
print("testing_size:", df_law["ZFYA_exp"].isna().sum())

In [None]:
print("Bias in data")
# One density of the target per "race" group; common_norm=False scales each
# group's density independently of the others.
sns.kdeplot(x="ZFYA_exp", hue="race", data=df_law, common_norm=False)

# Train Regression Model

This notebook relies on the regression task returning the training data (`training_table`) in its results, alongside the prediction and validation tables.

In [None]:
# Column to predict (NaN on the held-out rows).
target = "ZFYA_exp"

# Input columns handed to the regression task.
features = [
    "LSAT",
    "UGPA",
]

# Column(s) passed to the task as `debiasing_features`.
debiasing_features = [
    "race",
]

# Column(s) passed to the task as `debiased_features`; here, every feature.
debiased_features = [
    "LSAT",
    "UGPA",
]

In [None]:
# Arguments for the regression task: the full frame (held-out rows have a NaN
# target), the debiasing configuration defined above, and both prediction
# quantiles set to None.
run_options = {
    "df": df_law,
    "target": target,
    "features": features,
    "debiasing_features": debiasing_features,
    "debiased_features": debiased_features,
    "presets": "medium_quality_faster_train",
    "prediction_quantile_low": None,
    "prediction_quantile_high": None,
}

task = AAIRegressionTask()
results = task.run(**run_options)

In [None]:
# Pull the prediction and validation tables out of the task output.
result_tables = results["data"]
df_results = result_tables["prediction_table"]
df_validation = result_tables["validation_table"]

In [None]:
# Build the training frame with explicit column names:
#   <feature>       -> <feature>_residuals
#   <feature>_orig  -> <feature>
# NOTE(review): this assumes the training table stores the debiased values
# under the original feature names and the untouched values under
# "<feature>_orig" -- confirm against AAIRegressionTask.
#
# The renames are done on a copy (no inplace=True) so the table held in
# `results` is left untouched and re-running this cell always yields the same
# columns instead of renaming the already-renamed ones a second time.
df_train = (
    results["data"]["training_table"]
    .rename(columns={column: f"{column}_residuals" for column in debiased_features})
    .rename(columns={f"{column}_orig": column for column in debiased_features})
)

In [None]:
# Tag every table with the split it comes from, then stack them into a single
# frame (train rows first, then validation, then test).
df_results["dataset"] = "test"
df_validation["dataset"] = "val"
df_train["dataset"] = "train"

# pd.concat replaces the chained DataFrame.append calls (DataFrame.append was
# removed in pandas 2.0). It already returns a new frame, so no extra copy is
# needed; ignore_index=True gives the result a fresh 0..n-1 index.
df_full_results = pd.concat(
    [df_train, df_validation, df_results],
    ignore_index=True,
)

## Train Basic Linear Regression

In [None]:
def make_preprocessor():
    """Build the column-wise preprocessing step used by the baseline model.

    Numeric columns are standardised with ``StandardScaler``; every other
    column is one-hot encoded, with categories unseen during fitting ignored
    at transform time.

    Returns:
        ColumnTransformer: a new, unfitted transformer.
    """
    return ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), make_column_selector(dtype_include="number")),
            # The encoder's default output is sparse, so the `sparse=True`
            # keyword (deprecated, then removed in newer scikit-learn) is omitted.
            ("txt", sk_OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_exclude="number")),
        ]
    )

In [None]:
# Baseline model: column preprocessing -> Normalizer -> ordinary least squares.
lr = make_pipeline(
    make_preprocessor(),
    sk_Normalizer(),
    LinearRegression(),
)

In [None]:
# Fit the baseline on the residualised features of the training rows.
# Features and target are taken from the same filtered rows so they are
# aligned by construction, rather than relying on `df_train` and
# `df_full_results` happening to share the same row order.
df_lr_train = df_full_results[df_full_results["dataset"] == "train"]
lr = lr.fit(df_lr_train[["LSAT_residuals", "UGPA_residuals"]], df_lr_train["ZFYA_exp"])

In [None]:
# Predict with the baseline on the validation and test rows only; training
# rows keep NaN in the new "ZFYA_exp_lr_predicted" column.
eval_rows = df_full_results["dataset"].isin(["val", "test"])
df_full_results.loc[eval_rows, "ZFYA_exp_lr_predicted"] = lr.predict(
    df_full_results.loc[eval_rows, ["LSAT_residuals", "UGPA_residuals"]]
)

## Original vs Residuals Densities

In [None]:
# Long format: one row per (sample, column) pair so the plots below can facet
# on "col" while keeping "race" and "dataset" as identifiers.
df_residuals = pd.melt(
    df_full_results,
    id_vars=["race", "dataset"],
    value_vars=["LSAT_residuals", "UGPA_residuals", "LSAT", "UGPA"],
    var_name="col",
    value_name="value",
)

In [None]:
# Per-race densities of each feature next to its residualised version,
# laid out two panels per row with independent axes.
sns.displot(
    kind="kde",
    data=df_residuals,
    x="value",
    hue="race",
    col="col",
    col_wrap=2,
    col_order=["UGPA", "UGPA_residuals", "LSAT", "LSAT_residuals"],
    common_norm=False,
    facet_kws=dict(sharex=False, sharey=False),
)

In [None]:
# Same densities as above, additionally split by dataset: one row of panels
# per "dataset" value and one column per feature.
sns.displot(
    kind="kde",
    data=df_residuals,
    x="value",
    hue="race",
    row="dataset",
    col="col",
    col_order=["UGPA", "UGPA_residuals", "LSAT", "LSAT_residuals"],
    common_norm=False,
    facet_kws=dict(sharex=False, sharey=False),
)

## Predictions Densities

In [None]:
# Long format of the true target and both models' predictions, restricted to
# the validation and test rows.
df_predictions = pd.melt(
    df_full_results[df_full_results["dataset"].isin(["val", "test"])],
    id_vars=["race", "dataset"],
    value_vars=["ZFYA", "ZFYA_exp_predicted", "ZFYA_exp_lr_predicted"],
    var_name="col",
    value_name="value",
)

In [None]:
# Per-race densities of the true target and of each model's predictions:
# one row of panels per dataset, one column per plotted quantity.
sns.displot(
    kind="kde",
    data=df_predictions,
    x="value",
    hue="race",
    row="dataset",
    col="col",
    common_norm=False,
    facet_kws=dict(sharex=False, sharey=False),
)

## Compare RMSE and R²

In [None]:
# Score both models against the true "ZFYA" on the validation and test rows.
# The rows are collected in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic anyway.
error_records = []

for model_name, model in zip(["autogluon", "linear_regression"], ["ZFYA_exp_predicted", "ZFYA_exp_lr_predicted"]):
    for dataset in ["val", "test"]:
        # Filter once per (model, dataset) pair and reuse for both metrics.
        df_subset = df_full_results[df_full_results["dataset"] == dataset]
        y_true = df_subset["ZFYA"]
        y_pred = df_subset[model]

        error_records.append({
            "model": model_name,
            "metric": "rmse",
            "dataset": dataset,
            # RMSE as the square root of the MSE; the `squared=False` keyword
            # of mean_squared_error is deprecated in recent scikit-learn.
            "value": np.sqrt(mean_squared_error(y_true, y_pred)),
        })

        error_records.append({
            "model": model_name,
            "metric": "r2",
            "dataset": dataset,
            "value": r2_score(y_true, y_pred),
        })

# Passing the column list keeps the expected columns even with no records.
df_errors = pd.DataFrame(error_records, columns=["model", "metric", "dataset", "value"])

In [None]:
# One bar chart per metric, comparing the models on each dataset; the x axes
# are independent because the metrics are on different scales.
sns.catplot(
    kind="bar",
    data=df_errors,
    x="value",
    y="dataset",
    hue="model",
    col="metric",
    sharex=False,
)