# Tutorial 1: Matching

## 0. Import libraries 

In [1]:
import warnings 
from hypex import Matcher
from hypex.dataset import DataGenerator

warnings.simplefilter(action='ignore', category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


## 1. Create or upload your dataset  
In this case we will create a random dataset with a known effect size.  
If you have your own dataset, go to part 2.


In [3]:
# Synthetic dataset with two targets; feature_2 and feature_3 are listed
# as the columns that receive missing values.
data = DataGenerator(
    na_columns=['feature_3', 'feature_2'],
    num_features=2,
    num_targets=2,
)
data.df

Unnamed: 0,info_1,info_2,feature_1,feature_2,feature_3,feature_4,treatment,target_1,target_2
0,9508,Q,female,Deposit,,0.0,0.0,-0.215974,-0.215974
1,1783,Q,female,,-1.946330,3.0,1.0,5.204231,5.204231
2,2815,U,female,Investment,0.423735,3.0,1.0,8.012220,3.079220
3,6961,Q,male,Credit,0.183354,1.0,1.0,4.699710,4.699710
4,13036,U,female,Deposit,-1.145800,2.0,0.0,0.504827,0.504827
...,...,...,...,...,...,...,...,...,...
4995,1753,Q,male,Credit,-1.032884,3.0,1.0,3.797474,0.979399
4996,13009,U,female,Deposit,-2.286353,1.0,0.0,-1.164799,-1.164799
4997,10744,U,male,Credit,-0.076360,3.0,0.0,2.785862,2.785862
4998,12343,U,female,Deposit,1.524106,0.0,1.0,4.919137,4.919137


In [4]:
# Column labels of the generated DataFrame.
data.df.columns

Index(['info_1', 'info_2', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'treatment', 'target_1', 'target_2'],
      dtype='object')

In [5]:
# Number of rows per treatment value.
data.df['treatment'].value_counts()

treatment
0.0    2528
1.0    2472
Name: count, dtype: int64

In [6]:
# Count of missing values in each column.
data.df.isna().sum()

info_1         0
info_2         0
feature_1      0
feature_2    500
feature_3    500
feature_4      0
treatment      0
target_1       0
target_2       0
dtype: int64

## 2. Matching  
### 2.0 Init params
`info_col` is used to define informative attributes that should not be part of matching, such as user_id.  
These columns are still stored explicitly in the table, so that you can compare rows directly after the computation.

In [9]:
# Column roles are read from the generator's attributes.
treatment = data.treatment_name
outcome = data.target_names

# Only the first info column is passed as info_col.
info_col = [data.info_col_names[0]]

### 2.1 Simple matching
This is the easiest way to initialize and calculate metrics on a Matching task.  
Use it when you are clear about each attribute or if you don't have any additional task conditions (e.g. strict equality for certain features).

In [10]:
# Baseline Matcher: only the data and the column roles are supplied,
# every other parameter keeps its default.
model = Matcher(
    input_data=data.df,
    outcome=outcome,
    treatment=treatment,
    info_col=info_col,
)
# estimate() yields the effect table, a dict of quality checks and the matched rows.
results, quality_results, df_matched = model.estimate()

[23.11.2023 15:44:50 | hypex | INFO]: Number of NaN values filled with zeros: 1000
Get treated index: 100%|██████████| 5000/5000 [00:00<00:00, 26720.59it/s]  


In [11]:
# Effect estimates (ATE, ATC, ATT) for each outcome.
results

Unnamed: 0,effect_size,std_err,p-val,ci_lower,ci_upper,outcome
ATE,2.652815,0.068678,0.0,2.518206,2.787425,target_1
ATC,2.623115,0.077757,0.0,2.470711,2.775519,target_1
ATT,2.683189,0.076139,0.0,2.533956,2.832422,target_1
ATE,-0.114397,0.08251,0.17,-0.276117,0.047322,target_2
ATC,-0.13336,0.092808,0.15,-0.315265,0.048544,target_2
ATT,-0.095005,0.092118,0.3,-0.275556,0.085546,target_2


In [12]:
# Names of the quality checks returned by estimate().
quality_results.keys()

dict_keys(['psi', 'ks_test', 'smd', 'repeats'])

In [13]:
# KS-test results, one column per matching direction.
quality_results['ks_test']

Unnamed: 0,match_control_to_treat,match_treat_to_control
feature_3,0.902708,0.991299
feature_4,1.0,1.0


In [14]:
# Matched table: each row's own columns next to the `*_matched` columns of its match.
df_matched

Unnamed: 0,index,feature_3,feature_4,info_2_U,feature_1_male,feature_2_Credit,feature_2_Deposit,feature_2_Investment,feature_3_matched,feature_4_matched,...,feature_2_Investment_matched,index_matched,target_1,target_1_matched,target_1_matched_bias,target_2,target_2_matched,target_2_matched_bias,treatment,treatment_matched
0,1783,-1.946330,3.0,0,0,0,0,0,-1.621690,3.0,...,0.0,[5185],5.204231,2.240444,3.278713,5.204231,2.240444,3.450316,1,0
1,2815,0.423735,3.0,1,0,0,0,1,0.439773,3.0,...,1.0,[9157],8.012220,4.175422,3.852356,3.079220,8.624334,-5.521078,1,0
2,6961,0.183354,1.0,0,1,1,0,0,0.213917,1.0,...,0.0,[7645],4.699710,1.186502,3.542858,4.699710,1.186502,3.559013,1,0
3,10432,0.000000,3.0,1,1,1,0,0,0.000000,3.0,...,0.0,"[8854, 2941, 4294, 5695, 2998, 1624]",7.690724,2.652024,5.038700,7.690724,5.111347,2.579377,1,0
4,10222,0.090388,1.0,0,0,0,0,1,0.088173,1.0,...,1.0,[4336],2.617214,0.582712,2.032352,0.461260,3.671266,-3.213325,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2523,2902,0.842208,2.0,0,0,0,0,1,0.843187,2.0,...,1.0,[14560],4.095340,6.403056,2.305748,6.402154,3.407937,-2.995618,0,1
2524,4228,1.204038,2.0,1,0,1,0,0,1.111896,2.0,...,0.0,[11908],3.917107,6.630113,2.898269,7.444792,2.129195,-5.183684,0,1
2525,13009,-2.286353,1.0,1,0,0,1,0,-2.454503,1.0,...,0.0,[8008],-1.164799,-3.272139,-1.769257,-1.164799,-0.811464,0.594060,0,1
2526,10744,-0.076360,3.0,1,1,1,0,0,-0.086061,3.0,...,0.0,[2806],2.785862,7.040315,4.273958,2.785862,1.707816,-1.064157,0,1


In [15]:
# Rows whose info_2_U value differs from the value of their match.
df_matched.loc[df_matched['info_2_U'].ne(df_matched['info_2_U_matched'])]

Unnamed: 0,index,feature_3,feature_4,info_2_U,feature_1_male,feature_2_Credit,feature_2_Deposit,feature_2_Investment,feature_3_matched,feature_4_matched,...,feature_2_Investment_matched,index_matched,target_1,target_1_matched,target_1_matched_bias,target_2,target_2_matched,target_2_matched_bias,treatment,treatment_matched


### 2.2 Matching with a fixed variable  
Used when you have categorical feature(s) that you want to compare by strict equality.  
`group_col` is used for strict comparison of categorical features.  
In our case there is only one such attribute.  
If there are several such attributes, you should combine them into a single column and use that.

In [16]:
# Column on which a row and its match must agree exactly.
group_col = 'info_2'

In [17]:
# Same setup as the simple model, plus group_col for strict equality on that column.
model = Matcher(
    input_data=data.df,
    outcome=outcome,
    treatment=treatment,
    info_col=info_col,
    group_col=group_col,
)
# Rebinds results, quality_results and df_matched to the grouped model's output.
results, quality_results, df_matched = model.estimate()

[23.11.2023 15:45:52 | hypex | INFO]: Number of NaN values filled with zeros: 1000
Get treated index by group U: 100%|██████████| 4/4 [00:00<00:00, 31.27it/s]  


In [18]:
# Effect estimates from the model that uses group_col.
results

Unnamed: 0,effect_size,std_err,p-val,ci_lower,ci_upper,outcome
ATE,2.653028,0.068668,0.0,2.518438,2.787617,target_1
ATC,2.62312,0.077746,0.0,2.470737,2.775502,target_1
ATT,2.683613,0.076124,0.0,2.534411,2.832815,target_1
ATE,-0.113997,0.082506,0.17,-0.27571,0.047715,target_2
ATC,-0.133361,0.092808,0.15,-0.315265,0.048542,target_2
ATT,-0.094195,0.092107,0.31,-0.274724,0.086334,target_2


In [19]:
# Matched table from the grouped model; info_2 is kept as a plain column here.
df_matched

Unnamed: 0,index,feature_3,feature_4,feature_1_male,feature_2_Credit,feature_2_Deposit,feature_2_Investment,info_2,feature_3_matched,feature_4_matched,...,info_2_matched,index_matched,target_1,target_1_matched,target_1_matched_bias,target_2,target_2_matched,target_2_matched_bias,treatment,treatment_matched
0,1783,-1.946330,3.0,0,0,0,0,Q,-1.621690,3.0,...,Q,[5185],5.204231,2.240444,3.279149,5.204231,2.240444,3.450538,1,0
1,6961,0.183354,1.0,1,1,0,0,Q,0.213917,1.0,...,Q,[7645],4.699710,1.186502,3.542899,4.699710,1.186502,3.559034,1,0
2,10222,0.090388,1.0,0,0,0,1,Q,0.088173,1.0,...,Q,[4336],2.617214,0.582712,2.032349,0.461260,3.671266,-3.213327,1,0
3,13003,-0.551919,3.0,0,0,1,0,Q,-0.570587,3.0,...,Q,[13732],6.290954,2.216480,4.056339,6.290954,5.221042,1.041921,1,0
4,1555,-0.452623,1.0,1,0,0,1,Q,-0.462746,1.0,...,Q,[6115],1.772837,-0.206367,1.969371,1.772837,2.843365,-1.085705,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2523,11620,-0.514484,0.0,0,0,1,0,U,-0.554516,0.0,...,U,[14434],0.059858,-2.290294,-2.269874,0.059858,-2.290294,-2.292803,0,1
2524,4228,1.204038,2.0,0,1,0,0,U,1.111896,2.0,...,U,[11908],3.917107,6.630113,2.897786,7.444792,2.129195,-5.183594,0,1
2525,13009,-2.286353,1.0,0,0,1,0,U,-2.454503,1.0,...,U,[8008],-1.164799,-3.272139,-1.770139,-1.164799,-0.811464,0.594225,0,1
2526,10744,-0.076360,3.0,1,1,0,0,U,-0.086061,3.0,...,U,[2806],2.785862,7.040315,4.273907,2.785862,1.707816,-1.064148,0,1


In [20]:
# Rows whose info_2 value differs from the value of their match.
df_matched.loc[df_matched['info_2'].ne(df_matched['info_2_matched'])]

Unnamed: 0,index,feature_3,feature_4,feature_1_male,feature_2_Credit,feature_2_Deposit,feature_2_Investment,info_2,feature_3_matched,feature_4_matched,...,info_2_matched,index_matched,target_1,target_1_matched,target_1_matched_bias,target_2,target_2_matched,target_2_matched_bias,treatment,treatment_matched


## 3. Results  
### 3.1 ATE, ATT, ATC

In [21]:
# Effect estimates from the most recent estimate() call (the grouped model).
results

Unnamed: 0,effect_size,std_err,p-val,ci_lower,ci_upper,outcome
ATE,2.653028,0.068668,0.0,2.518438,2.787617,target_1
ATC,2.62312,0.077746,0.0,2.470737,2.775502,target_1
ATT,2.683613,0.076124,0.0,2.534411,2.832815,target_1
ATE,-0.113997,0.082506,0.17,-0.27571,0.047715,target_2
ATC,-0.133361,0.092808,0.15,-0.315265,0.048542,target_2
ATT,-0.094195,0.092107,0.31,-0.274724,0.086334,target_2


### 3.2 SMD, PSI, KS-test, repeats

In [22]:
# Names of the quality checks available for the grouped model.
quality_results.keys()

dict_keys(['psi', 'ks_test', 'smd', 'repeats'])

In [23]:
# PSI check per feature, reported separately for the treated and untreated sides.
quality_results['psi']

Unnamed: 0,column_treated,anomaly_score_treated,check_result_treated,column_untreated,anomaly_score_untreated,check_result_untreated
0,feature_1_male_treated,0.0,OK,feature_1_male_untreated,0.0,OK
1,feature_2_Credit_treated,0.0,OK,feature_2_Credit_untreated,0.0,OK
2,feature_2_Deposit_treated,0.0,OK,feature_2_Deposit_untreated,0.0,OK
3,feature_2_Investment_treated,0.0,OK,feature_2_Investment_untreated,0.0,OK
4,feature_3_treated,0.0,OK,feature_3_untreated,0.0,OK
5,feature_4_treated,0.0,OK,feature_4_untreated,0.0,OK
6,info_2_treated,0.0,OK,info_2_untreated,0.0,OK


In [24]:
# KS-test results, one column per matching direction.
quality_results['ks_test']

Unnamed: 0,match_control_to_treat,match_treat_to_control
feature_3,0.902708,0.991299
feature_4,1.0,1.0


In [25]:
# 'repeats' entry: one value per matching direction.
quality_results['repeats']

{'match_control_to_treat': 0.4, 'match_treat_to_control': 0.38}

### 3.3 Validation
Validates estimated effect:
1. by replacing real treatment (`random_treatment`) with random placebo treatment.
 Estimated effect must drop to zero;
2. by adding random feature (`random_feature`). Estimated effect shouldn't change
significantly, p-val < 0.05;
3. estimates effect on subset of data (`subset_refuter`) (default fraction is 0.8). Estimated effect
shouldn't change significantly, p-val < 0.05.

In [26]:
# Placebo check on the ATT: the real treatment is swapped for a random one, 10 simulations.
model.validate_result(
    refuter="random_treatment",
    effect_type="att",
    n_sim=10,
)

100%|██████████| 10/10 [00:07<00:00,  1.35it/s]


{'target_1': [-0.022073635054381528, 0.0],
 'target_2': [-0.01543623282222804, 0.1337448453290817]}

## 4. Save model

In [27]:
# Save the model under a relative path (resolved against the current working directory).
model.save("test_model.pickle")

In [28]:
# Reload the saved model into a second object.
# NOTE(review): presumably pickle-based given the file extension; unpickling can
# execute arbitrary code, so only load files you created or trust.
model2 = Matcher.load("test_model.pickle")

In [29]:
# Results stored on the reloaded model.
model2.results

Unnamed: 0,effect_size,std_err,p-val,ci_lower,ci_upper,outcome
ATE,2.653028,0.068668,0.0,2.518438,2.787617,target_1
ATC,2.62312,0.077746,0.0,2.470737,2.775502,target_1
ATT,2.683613,0.076124,0.0,2.534411,2.832815,target_1
ATE,-0.113997,0.082506,0.17,-0.27571,0.047715,target_2
ATC,-0.133361,0.092808,0.15,-0.315265,0.048542,target_2
ATT,-0.094195,0.092107,0.31,-0.274724,0.086334,target_2


In [30]:
# Results on the original model, for comparison with the reloaded one.
model.results

Unnamed: 0,effect_size,std_err,p-val,ci_lower,ci_upper,outcome
ATE,2.653028,0.068668,0.0,2.518438,2.787617,target_1
ATC,2.62312,0.077746,0.0,2.470737,2.775502,target_1
ATT,2.683613,0.076124,0.0,2.534411,2.832815,target_1
ATE,-0.113997,0.082506,0.17,-0.27571,0.047715,target_2
ATC,-0.133361,0.092808,0.15,-0.315265,0.048542,target_2
ATT,-0.094195,0.092107,0.31,-0.274724,0.086334,target_2
