In [2]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import joblib
import re
import sys
sys.path.insert(0, '../')
import numpy as np

# Datasets
from aif360.datasets import MEPSDataset19
from aif360.datasets import CompasDataset

# Fairness metrics
from aif360.metrics import BinaryLabelDatasetMetric

# Explainers
from aif360.explainers import MetricTextExplainer

# Scalers
from sklearn.preprocessing import StandardScaler


# Bias mitigation techniques
from aif360.algorithms.preprocessing import Reweighing,DisparateImpactRemover
from aif360.algorithms.preprocessing import LFR
from aif360.algorithms.preprocessing import OptimPreproc


  warn_deprecated('vmap', 'torch.vmap')


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# dataset_orig_compas = MEPSDataset19()


dataset_orig_compas = CompasDataset()




In [5]:
# dataset_orig_panel19_train = MEPSDataset19()


dataset_orig_compas_train = CompasDataset()



In [56]:
type(dataset_orig_compas_train)



aif360.datasets.compas_dataset.CompasDataset

In [58]:
print(dataset_orig_compas_train.feature_names)


['sex', 'age', 'race', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count', 'age_cat=25 - 45', 'age_cat=Greater than 45', 'age_cat=Less than 25', 'c_charge_degree=F', 'c_charge_degree=M', 'c_charge_desc=Abuse Without Great Harm', 'c_charge_desc=Agg Abuse Elderlly/Disabled Adult', 'c_charge_desc=Agg Assault W/int Com Fel Dome', 'c_charge_desc=Agg Battery Grt/Bod/Harm', 'c_charge_desc=Agg Fleeing and Eluding', 'c_charge_desc=Agg Fleeing/Eluding High Speed', 'c_charge_desc=Aggr Child Abuse-Torture,Punish', 'c_charge_desc=Aggrav Battery w/Deadly Weapon', 'c_charge_desc=Aggrav Child Abuse-Agg Battery', 'c_charge_desc=Aggrav Child Abuse-Causes Harm', 'c_charge_desc=Aggrav Stalking After Injunctn', 'c_charge_desc=Aggravated Assault', 'c_charge_desc=Aggravated Assault W/Dead Weap', 'c_charge_desc=Aggravated Assault W/dead Weap', 'c_charge_desc=Aggravated Assault W/o Firearm', 'c_charge_desc=Aggravated Assault w/Firearm', 'c_charge_desc=Aggravated Battery', 'c_charge_desc=Aggra

In [57]:
dataset_orig_compas_train.features

array([[ 0., 69.,  0., ...,  0.,  0.,  0.],
       [ 0., 34.,  0., ...,  0.,  0.,  0.],
       [ 0., 24.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0., 57.,  0., ...,  0.,  0.,  0.],
       [ 1., 33.,  0., ...,  0.,  0.,  0.],
       [ 1., 23.,  0., ...,  0.,  0.,  0.]])

In [23]:
# sens_ind = 0
# sens_attr = dataset_orig_panel19_train.protected_attribute_names[sens_ind]
# unprivileged_groups = [{sens_attr: v} for v in
#                     dataset_orig_panel19_train.unprivileged_protected_attributes[sens_ind]]
# privileged_groups = [{sens_attr: v} for v in
#                     dataset_orig_panel19_train.privileged_protected_attributes[sens_ind]]

In [45]:
sens_ind = 1
sens_attr_comp = dataset_orig_compas_train.protected_attribute_names[sens_ind]
unprivileged_groups_compas = [{sens_attr_comp: v} for v in
                    dataset_orig_compas_train.unprivileged_protected_attributes[sens_ind]]
privileged_groups_compas = [{sens_attr_comp: v} for v in
                    dataset_orig_compas_train.privileged_protected_attributes[sens_ind]]

In [46]:
sens_attr_comp

'race'

In [43]:
privileged_groups_compas

[{'race': 1.0}]

In [42]:
unprivileged_groups_compas

[{'race': 0.0}]

In [47]:
metric_orig_panel19_train = BinaryLabelDatasetMetric(
        dataset_orig_compas_train,
        unprivileged_groups=unprivileged_groups_compas,
        privileged_groups=privileged_groups_compas)

In [48]:
explainer_orig_panel19_train = MetricTextExplainer(metric_orig_panel19_train)


In [49]:
test_name=['Mean Difference','Consistency','Statistical Parity Difference','Disparate Impact']
test_definitions=['difference between mean values of two labels','Individual fairness metric that measures how similar the labels are for similar instances.','Difference in selection rates.','ratio of positive outcomes in the unprivileged group divided by the ratio of positive outcomes in the privileged group.']
test_results=[explainer_orig_panel19_train.mean_difference(),explainer_orig_panel19_train.consistency(),explainer_orig_panel19_train.statistical_parity_difference(),explainer_orig_panel19_train.disparate_impact()]
test_status=['Bias Detected','Bias Not Detected','Bias Detected','Bias Detected']
df=pd.DataFrame({'Test Name':test_name,'Test Definitions':test_definitions,'Test Results':test_results,'Test Status':test_status})


In [52]:
test_results

['Mean difference (mean label value on unprivileged instances - mean label value on privileged instances): -0.09713793951315464',
 'Consistency (Zemel, et al. 2013): [0.67630939]',
 'Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): -0.09713793951315464',
 'Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 0.8403836674666473']

In [64]:
RW = Reweighing(unprivileged_groups=unprivileged_groups_compas,privileged_groups=privileged_groups_compas)
dataset_transf_compas_train_rw = RW.fit_transform(dataset_orig_compas_train)

In [81]:
lfr_model = LFR(unprivileged_groups=unprivileged_groups_compas, 
                privileged_groups=privileged_groups_compas)

# Fit the model and transform the dataset
lfr_model.fit(dataset_orig_compas_train)
dataset_transf_compas_train_lfr = lfr_model.transform(dataset_orig_compas_train, threshold = 0.1)
dataset_transf_compas_train_lfr = dataset_orig_compas_train.align_datasets(dataset_transf_compas_train_lfr)


In [74]:
# import numpy as np

# def get_distortion_meps(vold, vnew):
#     # Initialize distortion score
#     distortion_score = 0.0

#     # Define weights for different categories of attributes
#     sensitive_weight = 3.0
#     health_status_weight = 2.0
#     socio_economic_weight = 1.0
#     behavior_weight = 1.5

#     # Sensitive attributes
#     for attr in ['sex', 'age', 'race']:
#         if vold[attr] != vnew[attr]:
#             distortion_score += sensitive_weight

#     # Health status indicators
#     health_attrs = ['PCS42', 'MCS42', 'K6SUM42', 'HIBPDX', 'DIABDX', 'CHDDX', 'ANGIDX', 'MIDX', 'OHRTDX', 'STRKDX', 'EMPHDX', 'CANCERDX', 'JTPAIN', 'ARTHDX', 'ASTHDX', 'ADHDADDX']
#     for attr in health_attrs:
#         # Assuming health status attributes are numerical and a difference in value indicates a change in health status
#         distortion_score += health_status_weight * abs(vold.get(attr, 0) - vnew.get(attr, 0))

#     # Socioeconomic and environmental factors
#     socio_attrs = ['REGION=1', 'REGION=2', 'REGION=3', 'REGION=4', 'MARRY', 'FTSTU', 'EMPST', 'POVCAT', 'INSCOV']
#     for attr in socio_attrs:
#         if vold[attr] != vnew[attr]:
#             distortion_score += socio_economic_weight

#     # Health-related behaviors
#     behavior_attrs = ['ADSMOK42', 'WLKLIM', 'ACTLIM', 'SOCLIM', 'COGLIM']
#     for attr in behavior_attrs:
#         if vold[attr] != vnew[attr]:
#             distortion_score += behavior_weight

#     return distortion_score

In [60]:
def get_distortion_compas(vold, vnew):
    # Initialize distortion score
    distortion_score = 0.0

    # Define weights for different categories of attributes
    sensitive_weight = 3.0
    criminal_history_weight = 2.0
    age_category_weight = 1.5
    charge_degree_weight = 1.0
    specific_charge_weight = 0.5  # Lower weight as there are many specific charges

    # Sensitive attributes
    for attr in ['sex', 'age', 'race']:
        if vold[attr] != vnew[attr]:
            distortion_score += sensitive_weight

    # Criminal history indicators
    criminal_attrs = ['juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count']
    for attr in criminal_attrs:
        distortion_score += criminal_history_weight * abs(vold.get(attr, 0) - vnew.get(attr, 0))

    # Age category
    age_cats = ['age_cat=25 - 45', 'age_cat=Greater than 45', 'age_cat=Less than 25']
    for attr in age_cats:
        if vold.get(attr, 0) != vnew.get(attr, 0):
            distortion_score += age_category_weight

    # Charge degree
    charge_degrees = ['c_charge_degree=F', 'c_charge_degree=M']
    for attr in charge_degrees:
        if vold.get(attr, 0) != vnew.get(attr, 0):
            distortion_score += charge_degree_weight

    # Specific charges, just check if there's any change
    charge_changes = sum(vold.get(attr, 0) != vnew.get(attr, 0) for attr in vold if attr.startswith('c_charge_desc='))
    distortion_score += specific_charge_weight * charge_changes

    return distortion_score


In [35]:
# DI = DisparateImpactRemover()
# dataset_transf_compas_train_di = DI.fit_transform(dataset_orig_panel19_train)

In [66]:
DI = DisparateImpactRemover()
dataset_transf_compas_train_di = DI.fit_transform(dataset_orig_compas_train)

In [36]:
# metric_transf_panel19_train = BinaryLabelDatasetMetric(
# dataset_transf_panel19_train_rw,
# unprivileged_groups=unprivileged_groups,
# privileged_groups=privileged_groups)
# explainer_transf_panel19_train = MetricTextExplainer(metric_transf_panel19_train)
# test_results_rw=[explainer_transf_panel19_train.mean_difference()
#                ,explainer_transf_panel19_train.consistency()
#                ,explainer_transf_panel19_train.statistical_parity_difference()
#                ,explainer_transf_panel19_train.disparate_impact()]

In [67]:
metric_transf_compas_train = BinaryLabelDatasetMetric(
dataset_transf_compas_train_rw,
unprivileged_groups=unprivileged_groups_compas,
privileged_groups=privileged_groups_compas)
explainer_transf_compas_train = MetricTextExplainer(metric_transf_compas_train)
test_results_rw=[explainer_transf_compas_train.mean_difference()
               ,explainer_transf_compas_train.consistency()
               ,explainer_transf_compas_train.statistical_parity_difference()
               ,explainer_transf_compas_train.disparate_impact()]

In [37]:
# metric_transf_panel19_train = BinaryLabelDatasetMetric(
# dataset_transf_panel19_train_di,
# unprivileged_groups=unprivileged_groups,
# privileged_groups=privileged_groups)
# explainer_transf_panel19_train = MetricTextExplainer(metric_transf_panel19_train)
# test_results_di=[explainer_transf_panel19_train.mean_difference()
#                ,explainer_transf_panel19_train.consistency()
#                ,explainer_transf_panel19_train.statistical_parity_difference()
#                ,explainer_transf_panel19_train.disparate_impact()]

In [68]:
metric_transf_compas_train = BinaryLabelDatasetMetric(
dataset_transf_compas_train_di,
unprivileged_groups=unprivileged_groups_compas,
privileged_groups=privileged_groups_compas)
explainer_transf_compas_train = MetricTextExplainer(metric_transf_compas_train)
test_results_di=[explainer_transf_compas_train.mean_difference()
               ,explainer_transf_compas_train.consistency()
               ,explainer_transf_compas_train.statistical_parity_difference()
               ,explainer_transf_compas_train.disparate_impact()]

In [21]:
# metric_transf_panel19_train = BinaryLabelDatasetMetric(
# dataset_transf_panel19_train_lfr,
# unprivileged_groups=unprivileged_groups,
# privileged_groups=privileged_groups)
# explainer_transf_panel19_train = MetricTextExplainer(metric_transf_panel19_train)
# test_results_lfr=[explainer_transf_panel19_train.mean_difference()
#                ,explainer_transf_panel19_train.consistency()
#                ,explainer_transf_panel19_train.statistical_parity_difference()
#                ,explainer_transf_panel19_train.disparate_impact()]

In [82]:
metric_transf_compas_train = BinaryLabelDatasetMetric(
dataset_transf_compas_train_lfr,
unprivileged_groups=unprivileged_groups_compas,
privileged_groups=privileged_groups_compas)
explainer_transf_compas_train = MetricTextExplainer(metric_transf_compas_train)
test_results_lfr=[explainer_transf_compas_train.mean_difference()
               ,explainer_transf_compas_train.consistency()
               ,explainer_transf_compas_train.statistical_parity_difference()
               ,explainer_transf_compas_train.disparate_impact()]

  return metric_fun(privileged=False) / metric_fun(privileged=True)


In [70]:
test_results

['Mean difference (mean label value on unprivileged instances - mean label value on privileged instances): -0.09713793951315464',
 'Consistency (Zemel, et al. 2013): [0.67630939]',
 'Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): -0.09713793951315464',
 'Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 0.8403836674666473']

In [71]:
test_results_rw

['Mean difference (mean label value on unprivileged instances - mean label value on privileged instances): 1.1102230246251565e-16',
 'Consistency (Zemel, et al. 2013): [0.67630939]',
 'Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): 1.1102230246251565e-16',
 'Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 1.0000000000000002']

In [84]:
test_results_lfr

['Mean difference (mean label value on unprivileged instances - mean label value on privileged instances): 0.0',
 'Consistency (Zemel, et al. 2013): [1.]',
 'Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): 0.0',
 'Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): nan']

In [73]:
test_results_di

['Mean difference (mean label value on unprivileged instances - mean label value on privileged instances): -0.09713793951315464',
 'Consistency (Zemel, et al. 2013): [0.66774769]',
 'Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): -0.09713793951315464',
 'Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 0.8403836674666473']

In [42]:
# features = dataset_transf_panel19_train_rw.features
# label = dataset_transf_panel19_train_rw.labels.ravel()  # Flatten the label array if necessary
# weights = dataset_transf_panel19_train_rw.instance_weights
# feature_names = dataset_transf_panel19_train_rw.feature_names
# df_rw = pd.DataFrame(features, columns=feature_names)
# df_rw['label'] = label
# df_rw['weights'] = weights


In [77]:
features = dataset_transf_compas_train_rw.features
label = dataset_transf_compas_train_rw.labels.ravel()  # Flatten the label array if necessary
weights = dataset_transf_compas_train_rw.instance_weights
feature_names = dataset_transf_compas_train_rw.feature_names
df_rw = pd.DataFrame(features, columns=feature_names)
df_rw['label'] = label
df_rw['weights'] = weights


In [78]:
df_rw

Unnamed: 0,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat=25 - 45,age_cat=Greater than 45,age_cat=Less than 25,...,c_charge_desc=Viol Injunction Protect Dom Vi,c_charge_desc=Viol Pretrial Release Dom Viol,c_charge_desc=Viol Prot Injunc Repeat Viol,c_charge_desc=Violation License Restrictions,c_charge_desc=Violation Of Boater Safety Id,c_charge_desc=Violation of Injunction Order/Stalking/Cyberstalking,c_charge_desc=Voyeurism,c_charge_desc=arrest case no charge,label,weights
0,0.0,69.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.064676
1,0.0,34.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.932297
2,0.0,24.0,0.0,0.0,0.0,1.0,4.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.932297
3,0.0,44.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.064676
4,0.0,41.0,1.0,0.0,0.0,0.0,14.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.163658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6162,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.064676
6163,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.064676
6164,0.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.064676
6165,1.0,33.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.064676


In [44]:
features = dataset_orig_panel19_train.features
label = dataset_orig_panel19_train.labels
feature_names = dataset_orig_panel19_train.feature_names
dataset_orig_panel19_train_df = pd.DataFrame(features, columns=feature_names)
dataset_orig_panel19_train_df['label'] = label
dataset_orig_panel19_train_df

Unnamed: 0,AGE,RACE,PCS42,MCS42,K6SUM42,REGION=1,REGION=2,REGION=3,REGION=4,SEX=1,...,EMPST=4,POVCAT=1,POVCAT=2,POVCAT=3,POVCAT=4,POVCAT=5,INSCOV=1,INSCOV=2,INSCOV=3,label
0,53.0,1.0,25.93,58.47,3.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,56.0,1.0,20.42,26.57,17.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,23.0,1.0,53.12,50.33,7.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3.0,1.0,-1.00,-1.00,-1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,27.0,0.0,-1.00,-1.00,-1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15825,25.0,0.0,56.71,62.39,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
15826,25.0,0.0,56.71,62.39,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
15827,2.0,1.0,-1.00,-1.00,-1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15828,54.0,0.0,43.97,42.45,24.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [79]:
# # Assuming `dataset_transf_panel19_train_lfr` is your LFR transformed dataset
# features_lfr = dataset_transf_panel19_train_lfr.features
# label_lfr = dataset_transf_panel19_train_lfr.labels.ravel()  # Flatten the label array if necessary
# feature_names_lfr = dataset_transf_panel19_train_lfr.feature_names

# # Create a DataFrame
# df_lfr = pd.DataFrame(features_lfr, columns=feature_names_lfr)
# df_lfr['label'] = label_lfr


features = dataset_transf_compas_train_lfr.features
label = dataset_transf_compas_train_lfr.labels.ravel()  # Flatten the label array if necessary
weights = dataset_transf_compas_train_lfr.instance_weights
feature_names = dataset_transf_compas_train_lfr.feature_names
df_lfr = pd.DataFrame(features, columns=feature_names)
df_lfr['label'] = label
df_lfr['weights'] = weights


In [80]:
df_lfr

Unnamed: 0,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat=25 - 45,age_cat=Greater than 45,age_cat=Less than 25,...,c_charge_desc=Viol Injunction Protect Dom Vi,c_charge_desc=Viol Pretrial Release Dom Viol,c_charge_desc=Viol Prot Injunc Repeat Viol,c_charge_desc=Violation License Restrictions,c_charge_desc=Violation Of Boater Safety Id,c_charge_desc=Violation of Injunction Order/Stalking/Cyberstalking,c_charge_desc=Voyeurism,c_charge_desc=arrest case no charge,label,weights
0,0.521109,0.625498,0.716708,0.432452,0.533850,0.582301,0.445268,0.535120,0.653046,0.358865,...,0.545693,0.468199,0.480423,0.303290,0.520715,0.533933,0.408816,0.398183,1.0,1.0
1,0.523721,0.618111,0.719904,0.434879,0.531925,0.582281,0.440209,0.538981,0.647618,0.353103,...,0.541876,0.470039,0.481520,0.298589,0.526050,0.535778,0.411733,0.393008,1.0,1.0
2,0.521773,0.618395,0.721806,0.438287,0.533270,0.582259,0.444720,0.532756,0.644685,0.352959,...,0.546447,0.470344,0.484824,0.298436,0.524345,0.530726,0.411325,0.392385,1.0,1.0
3,0.522205,0.624398,0.716270,0.432044,0.534721,0.583117,0.442769,0.535952,0.651580,0.355646,...,0.545672,0.469955,0.480438,0.299492,0.522338,0.536884,0.408060,0.392491,1.0,1.0
4,0.516227,0.620342,0.726517,0.444027,0.531929,0.579084,0.457035,0.522022,0.644180,0.361615,...,0.551542,0.464862,0.489937,0.307846,0.517226,0.515481,0.413552,0.405947,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6162,0.525380,0.615405,0.719757,0.431285,0.530067,0.580368,0.434852,0.542806,0.648857,0.352716,...,0.536541,0.467504,0.480691,0.295075,0.525745,0.540234,0.412472,0.387391,1.0,1.0
6163,0.524266,0.616218,0.720508,0.433675,0.531036,0.581009,0.437997,0.539684,0.647388,0.352701,...,0.539669,0.468499,0.482022,0.296207,0.525471,0.537118,0.412211,0.389155,1.0,1.0
6164,0.521599,0.625426,0.716216,0.432378,0.534636,0.583074,0.444325,0.535372,0.652339,0.357115,...,0.546255,0.469557,0.480299,0.301559,0.521685,0.535433,0.408159,0.395590,1.0,1.0
6165,0.521538,0.623086,0.718106,0.434657,0.535028,0.583082,0.444573,0.533102,0.648802,0.354578,...,0.547564,0.470453,0.482579,0.298797,0.522425,0.534039,0.408527,0.391653,1.0,1.0


In [47]:
# Assuming `dataset_transf_panel19_train_di` is your DI transformed dataset
features_di = dataset_transf_panel19_train_di.features
label_di = dataset_transf_panel19_train_di.labels.ravel()  # Flatten the label array if necessary
feature_names_di = dataset_transf_panel19_train_di.feature_names

# Create a DataFrame
df_di = pd.DataFrame(features_di, columns=feature_names_di)
df_di['label'] = label_di
df_di

Unnamed: 0,AGE,RACE,PCS42,MCS42,K6SUM42,REGION=1,REGION=2,REGION=3,REGION=4,SEX=1,...,EMPST=4,POVCAT=1,POVCAT=2,POVCAT=3,POVCAT=4,POVCAT=5,INSCOV=1,INSCOV=2,INSCOV=3,label
0,53.0,1.0,25.93,58.47,3.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,56.0,1.0,20.42,26.53,17.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,23.0,1.0,52.92,50.28,7.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3.0,1.0,-1.00,-1.00,-1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,27.0,0.0,-1.00,-1.00,-1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15825,25.0,0.0,56.68,62.11,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
15826,25.0,0.0,56.68,62.11,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
15827,2.0,1.0,-1.00,-1.00,-1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15828,54.0,0.0,43.43,42.45,24.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [48]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

# Assuming 'df' is your Pandas DataFrame with features, 'label', and 'weights' columns
df=dataset_orig_panel19_train_df
# Split the DataFrame into features, labels, and weights

X = df.drop(['label'], axis=1)
y = df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = LogisticRegression()

# Train the model using the sample_weight parameter
# We need to extract the corresponding weights for the training samples
train_indices = X_train.index
train_weights = weights[train_indices]

model.fit(X_train, y_train)

# Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Probability predictions for log loss calculation
y_train_pred_proba = model.predict_proba(X_train)
y_test_pred_proba = model.predict_proba(X_test)

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Calculate log loss
train_loss = log_loss(y_train, y_train_pred_proba)
test_loss = log_loss(y_test, y_test_pred_proba)

# Print errors and losses
print(f"Training Accuracy: {train_accuracy}, Training Log Loss: {train_loss}")
print(f"Testing Accuracy: {test_accuracy}, Testing Log Loss: {test_loss}")

# Plotting
plt.figure(figsize=(10, 5))

# Accuracy plot
plt.subplot(1, 2, 1)
plt.bar(['Train Accuracy', 'Test Accuracy'], [train_accuracy, test_accuracy], color=['blue', 'green'])
plt.ylabel('Accuracy')
plt.title('Train vs Test Accuracy')

# Log Loss plot
plt.subplot(1, 2, 2)
plt.bar(['Train Log Loss', 'Test Log Loss'], [train_loss, test_loss], color=['blue', 'green'])
plt.ylabel('Log Loss')
plt.title('Train vs Test Log Loss')

plt.tight_layout()
plt.show()


Training Accuracy: 0.8678932406822489, Training Log Loss: 0.3226097093813417
Testing Accuracy: 0.8610233733417562, Testing Log Loss: 0.33582932969930285


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  plt.show()


In [63]:
df.head(1).to_csv('meps.csv')

In [49]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

# Assuming 'df' is your Pandas DataFrame with features, 'label', and 'weights' columns

# Split the DataFrame into features, labels, and weights
X = df_rw.drop(['label', 'weights'], axis=1)
y = df_rw['label']
weights = df_rw['weights']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model_rw = LogisticRegression()

# Train the model using the sample_weight parameter
# We need to extract the corresponding weights for the training samples
train_indices = X_train.index
train_weights = weights[train_indices]

model_rw.fit(X_train, y_train, sample_weight=train_weights)

# Predictions
y_train_pred = model_rw.predict(X_train)
y_test_pred = model_rw.predict(X_test)

# Probability predictions for log loss calculation
y_train_pred_proba = model_rw.predict_proba(X_train)
y_test_pred_proba = model_rw.predict_proba(X_test)

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Calculate log loss
train_loss = log_loss(y_train, y_train_pred_proba)
test_loss = log_loss(y_test, y_test_pred_proba)

# Print errors and losses
print(f"Training Accuracy: {train_accuracy}, Training Log Loss: {train_loss}")
print(f"Testing Accuracy: {test_accuracy}, Testing Log Loss: {test_loss}")

# Plotting
plt.figure(figsize=(10, 5))

# Accuracy plot
plt.subplot(1, 2, 1)
plt.bar(['Train Accuracy', 'Test Accuracy'], [train_accuracy, test_accuracy], color=['blue', 'green'])
plt.ylabel('Accuracy')
plt.title('Train vs Test Accuracy')

# Log Loss plot
plt.subplot(1, 2, 2)
plt.bar(['Train Log Loss', 'Test Log Loss'], [train_loss, test_loss], color=['blue', 'green'])
plt.ylabel('Log Loss')
plt.title('Train vs Test Log Loss')

plt.tight_layout()
plt.show()


Training Accuracy: 0.8663929248262792, Training Log Loss: 0.3352042516295803
Testing Accuracy: 0.8578648136449779, Testing Log Loss: 0.3442758714994835


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  plt.show()


In [50]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

# Assuming 'df' is your Pandas DataFrame with features, 'label', and 'weights' columns

# Split the DataFrame into features, labels, and weights
X = df_di.drop(['label'], axis=1)
y = df_di['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model_di = LogisticRegression()

# Train the model using the sample_weight parameter
# We need to extract the corresponding weights for the training samples
train_indices = X_train.index
train_weights = weights[train_indices]

model_di.fit(X_train, y_train, sample_weight=train_weights)

# Predictions
y_train_pred = model_di.predict(X_train)
y_test_pred = model_di.predict(X_test)

# Probability predictions for log loss calculation
y_train_pred_proba = model_di.predict_proba(X_train)
y_test_pred_proba = model_di.predict_proba(X_test)

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Calculate log loss
train_loss = log_loss(y_train, y_train_pred_proba)
test_loss = log_loss(y_test, y_test_pred_proba)

# Print errors and losses
print(f"Training Accuracy: {train_accuracy}, Training Log Loss: {train_loss}")
print(f"Testing Accuracy: {test_accuracy}, Testing Log Loss: {test_loss}")

# Plotting
plt.figure(figsize=(10, 5))

# Accuracy plot
plt.subplot(1, 2, 1)
plt.bar(['Train Accuracy', 'Test Accuracy'], [train_accuracy, test_accuracy], color=['blue', 'green'])
plt.ylabel('Accuracy')
plt.title('Train vs Test Accuracy')

# Log Loss plot
plt.subplot(1, 2, 2)
plt.bar(['Train Log Loss', 'Test Log Loss'], [train_loss, test_loss], color=['blue', 'green'])
plt.ylabel('Log Loss')
plt.title('Train vs Test Log Loss')

plt.tight_layout()
plt.show()


Training Accuracy: 0.8656822488945041, Training Log Loss: 0.3377688207025086
Testing Accuracy: 0.8572331017056223, Testing Log Loss: 0.3468531284456185


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  plt.show()


In [59]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

# Assuming 'df_lfr' is your Pandas DataFrame with features and 'label' columns, 
# resulting from the LFR transformation

# Split the DataFrame into features and labels
X = df_lfr.drop(['label'], axis=1)
y = df_lfr['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the model
model_lfr = LogisticRegression()

# Train the model (no sample weights are needed as the data is already transformed by LFR)
model_lfr.fit(X_train, y_train, sample_weight=train_weights)

# Predictions
y_train_pred = model_lfr.predict(X_train)
y_test_pred = model_lfr.predict(X_test)

# Probability predictions for log loss calculation
y_train_pred_proba = model_lfr.predict_proba(X_train)
y_test_pred_proba = model_lfr.predict_proba(X_test)

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Calculate log loss
train_loss = log_loss(y_train, y_train_pred_proba)
test_loss = log_loss(y_test, y_test_pred_proba)

# Print errors and losses
print(f"Training Accuracy: {train_accuracy}, Training Log Loss: {train_loss}")
print(f"Testing Accuracy: {test_accuracy}, Testing Log Loss: {test_loss}")

# Plotting
plt.figure(figsize=(10, 5))

# Accuracy plot
plt.subplot(1, 2, 1)
plt.bar(['Train Accuracy', 'Test Accuracy'], [train_accuracy, test_accuracy], color=['blue', 'green'])
plt.ylabel('Accuracy')
plt.title('Train vs Test Accuracy')

# Log Loss plot
plt.subplot(1, 2, 2)
plt.bar(['Train Log Loss', 'Test Log Loss'], [train_loss, test_loss], color=['blue', 'green'])
plt.ylabel('Log Loss')
plt.title('Train vs Test Log Loss')

plt.tight_layout()
plt.show()


Training Accuracy: 0.5713044851547694, Training Log Loss: 0.6770592058568611
Testing Accuracy: 0.5713834491471889, Testing Log Loss: 0.6836813366149022


  plt.show()
