# Tutorial 1: Matching

## 0. Import libraries 

In [32]:
import pandas as pd
import numpy as np
from hypex import Matcher
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## 1. Create or upload your dataset  
In this tutorial we create a random dataset with a known effect size.  
If you already have your own dataset, skip to part 2.


In [33]:
from hypex.utils.tutorial_data_creation import create_test_data

In [34]:
# Build the synthetic tutorial dataset; missing values are requested for the
# 'age' and 'gender' columns (rs is presumably the random seed — confirm in hypex docs).
df = create_test_data(
    rs=42,
    na_step=45,
    nan_cols=['age', 'gender'],
)
df

Unnamed: 0,user_id,signup_month,treat,pre_spends,post_spends,age,gender,industry
0,0,0,0,504.5,422.777778,,F,Logistics
1,2,0,0,485.0,434.000000,34.0,,E-commerce
2,4,0,0,488.5,420.111111,33.0,F,E-commerce
3,5,0,0,475.0,408.333333,52.0,M,E-commerce
4,6,0,0,494.0,423.666667,33.0,M,Logistics
...,...,...,...,...,...,...,...,...
5468,9992,0,0,456.5,404.888889,34.0,M,E-commerce
5469,9993,0,0,503.0,431.888889,64.0,F,Logistics
5470,9994,0,0,480.0,411.444444,67.0,F,E-commerce
5471,9996,0,0,453.0,406.888889,27.0,M,Logistics


In [35]:
df.columns

Index(['user_id', 'signup_month', 'treat', 'pre_spends', 'post_spends', 'age',
       'gender', 'industry'],
      dtype='object')

In [36]:
# Count rows per value of the treatment flag to see the control/treated split.
df['treat'].value_counts()

treat
0    5002
1     471
Name: count, dtype: int64

In [37]:
# Number of rows with a missing 'gender' value.
df['gender'].isna().sum()

122

## 2. Matching  
### 2.0 Init params
`info_col` lists informative attributes that should not take part in matching, such as `user_id`.  
These columns are still stored explicitly in the table, so that you can compare records directly after the computation.

In [38]:
# Informative columns that are excluded from the matching features.
info_col = ["user_id"]

# Names of the outcome column and of the treatment-indicator column.
outcome = "post_spends"
treatment = "treat"

# additional weight to feature pre_spends
weights = dict(pre_spends=1)

### 2.1 Simple matching
This is the easiest way to initialize a Matching task and calculate its metrics.  
Use it when you are clear about each attribute, or when you don't have any additional task conditions (such as strict equality for certain features).

In [39]:
# Baseline matcher: default settings plus the feature weights defined above.
model = Matcher(
    input_data=df,
    outcome=outcome,
    treatment=treatment,
    info_col=info_col,
    weights=weights,
)
# estimate() yields the effect table, the quality metrics and the matched frame.
results, quality_results, df_matched = model.estimate()

[07.11.2023 18:16:48 | hypex | INFO]: Number of NaN values filled with zeros: 244


  0%|          | 0/5473 [00:00<?, ?it/s]

In [40]:
results

Unnamed: 0,effect_size,std_err,p-val,ci_lower,ci_upper,outcome
ATE,-318.030575,0.555596,0.0,-319.119544,-316.941606,post_spends
ATC,-357.426446,0.566596,0.0,-358.536975,-356.315917,post_spends
ATT,100.351906,0.664084,0.0,99.0503,101.653511,post_spends


In [41]:
quality_results.keys()

dict_keys(['psi', 'ks_test', 'smd', 'repeats'])

In [42]:
quality_results['ks_test']

Unnamed: 0,match_control_to_treat,match_treat_to_control
age,1.0,0.039739
pre_spends,1.0,0.135706
signup_month,2.070021e-282,0.0


In [43]:
df_matched

Unnamed: 0,index,signup_month,pre_spends,age,gender_F,gender_M,industry_Logistics,signup_month_matched,pre_spends_matched,age_matched,gender_F_matched,gender_M_matched,industry_Logistics_matched,index_matched,post_spends,post_spends_matched,post_spends_matched_bias,treat,treat_matched
0,32,3,481.5,63.0,0,1,0,0.0,481.5,63.0,0.0,1.0,0.0,[5880],511.333333,410.666667,100.666667,1,0
1,38,3,493.0,20.0,0,1,0,0.0,492.5,20.0,0.0,1.0,0.0,[323],519.666667,402.111111,117.544834,1,0
2,92,3,511.0,40.0,1,0,1,0.0,509.5,40.0,1.0,0.0,1.0,[8850],535.777778,426.111111,109.634501,1,0
3,117,3,478.0,56.0,1,0,1,0.0,478.5,55.0,1.0,0.0,1.0,[1547],529.000000,414.777778,114.198069,1,0
4,119,3,470.0,57.0,1,0,1,0.0,471.0,57.0,1.0,0.0,1.0,[280],523.444444,410.222222,113.243666,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4997,9992,0,456.5,34.0,0,1,0,3.0,463.0,31.0,0.0,1.0,0.0,[9179],404.888889,515.555556,-347.308697,0,1
4998,9993,0,503.0,64.0,1,0,1,3.0,500.5,63.0,1.0,0.0,1.0,[5564],431.888889,521.111111,-368.480546,0,1
4999,9994,0,480.0,67.0,1,0,0,3.0,483.0,64.0,1.0,0.0,0.0,[9225],411.444444,514.222222,-355.092587,0,1
5000,9996,0,453.0,27.0,0,1,1,3.0,456.0,23.0,0.0,1.0,1.0,[6115],406.888889,518.000000,-346.760554,0,1


In [44]:
# Select matched pairs whose 'industry_Logistics' flag differs from that of their match.
df_matched[df_matched['industry_Logistics'] != df_matched['industry_Logistics_matched']]

Unnamed: 0,index,signup_month,pre_spends,age,gender_F,gender_M,industry_Logistics,signup_month_matched,pre_spends_matched,age_matched,gender_F_matched,gender_M_matched,industry_Logistics_matched,index_matched,post_spends,post_spends_matched,post_spends_matched_bias,treat,treat_matched
4308,8634,0,459.0,31.0,0,0,0,3.0,466.5,33.0,0.0,0.0,1.0,[2998],428.333333,525.222222,-361.654016,0,1


### 2.2 Matching with a fixed variable  
Use this when you have categorical feature(s) that must be matched by strict equality.  
`group_col` names the categorical feature used for the strict comparison.  
In our case there is only one such attribute.  
If there are several such attributes, combine them into a single column and use that column.

In [45]:
group_col = "industry"

In [46]:
# Matcher with strict grouping on `group_col`; no feature weights are passed here.
model = Matcher(
    input_data=df,
    outcome=outcome,
    treatment=treatment,
    info_col=info_col,
    group_col=group_col,
)
# estimate() yields the effect table, the quality metrics and the matched frame.
results, quality_results, df_matched = model.estimate()

[07.11.2023 18:16:52 | hypex | INFO]: Number of NaN values filled with zeros: 244


  0%|          | 0/4 [00:00<?, ?it/s]

In [47]:
results

Unnamed: 0,effect_size,std_err,p-val,ci_lower,ci_upper,outcome
ATE,-317.208572,0.552844,0.0,-318.292147,-316.124997,post_spends
ATC,-356.533377,0.563853,0.0,-357.638529,-355.428224,post_spends
ATT,100.419181,0.657298,0.0,99.130877,101.707484,post_spends


In [48]:
# Select matched pairs whose 'industry' differs from that of their match;
# the recorded output below contains no rows.
df_matched[df_matched['industry'] != df_matched['industry_matched']]

Unnamed: 0,index,signup_month,pre_spends,age,gender_F,gender_M,industry,signup_month_matched,pre_spends_matched,age_matched,gender_F_matched,gender_M_matched,industry_matched,index_matched,post_spends,post_spends_matched,post_spends_matched_bias,treat,treat_matched


## 3. Results  
### 3.1 ATE, ATT, ATC

In [49]:
results

Unnamed: 0,effect_size,std_err,p-val,ci_lower,ci_upper,outcome
ATE,-317.208572,0.552844,0.0,-318.292147,-316.124997,post_spends
ATC,-356.533377,0.563853,0.0,-357.638529,-355.428224,post_spends
ATT,100.419181,0.657298,0.0,99.130877,101.707484,post_spends


### 3.2 SMD, PSI, KS-test, repeats

In [50]:
quality_results.keys()

dict_keys(['psi', 'ks_test', 'smd', 'repeats'])

In [51]:
quality_results['psi']

Unnamed: 0,column_treated,anomaly_score_treated,check_result_treated,column_untreated,anomaly_score_untreated,check_result_untreated
0,age_treated,0.0,OK,age_untreated,0.02,OK
1,gender_F_treated,0.0,OK,gender_F_untreated,0.0,OK
2,gender_M_treated,0.0,OK,gender_M_untreated,0.0,OK
3,industry_treated,0.0,OK,industry_untreated,0.0,OK
4,pre_spends_treated,0.01,OK,pre_spends_untreated,0.01,OK
5,signup_month_treated,18.42,NOK,signup_month_untreated,0.0,OK


In [52]:
quality_results['ks_test']

Unnamed: 0,match_control_to_treat,match_treat_to_control
age,1.0,0.039739
pre_spends,0.9999999,0.129535
signup_month,2.070021e-282,0.0


In [53]:
quality_results['repeats']

{'match_control_to_treat': 0.93, 'match_treat_to_control': 0.09}

### 3.3 Validation
Validates the estimated effect:
1. by replacing the real treatment with a random placebo treatment (`random_treatment`).
 The estimated effect must drop to zero;
2. by adding a random feature (`random_feature`). The estimated effect shouldn't change
significantly, p-val < 0.05;
3. by estimating the effect on a subset of the data (`subset_refuter`) (default fraction is 0.8). The estimated effect
shouldn't change significantly, p-val < 0.05.

In [59]:
# Placebo check on the ATT estimate, repeated over 10 simulations.
model.validate_result(
    refuter="random_treatment",
    effect_type="att",
    n_sim=10,
)

  0%|          | 0/10 [00:00<?, ?it/s]

{'post_spends': [-0.09534094192547984, 0.0]}

In [24]:
# Informative columns that are excluded from the matching features.
info_col = ["user_id"]

treatment = "treat"
# additional weight to feature pre_spends
weights = dict(pre_spends=10)

In [25]:
model = Matcher(input_data=df, outcome=outcome, treatment=treatment, info_col=info_col, weights=weights)

[07.11.2023 18:12:49 | hypex | INFO]: Number of NaN values filled with zeros: 244


In [26]:
# you may specify threshold in order to receive only pair with 5% difference in post_spends
no_replacemet_df = model.match_no_rep(threshold=0.05) 

LinAlgError: Singular matrix

In [None]:
no_replacemet_df.head()

In [None]:
no_replacemet_df.shape

## 4. Save model

In [62]:
# Persist the matcher to a file so it can be reloaded later.
model.save("test_model.pickle")

In [63]:
# Reload the matcher from the file written above.
# NOTE(review): the file name suggests pickle serialization — only load files from trusted sources.
model2 = Matcher.load("test_model.pickle")

In [64]:
# Results stored on the reloaded matcher; compare with `model.results` in the next cell.
model2.results

Unnamed: 0,effect_size,std_err,p-val,ci_lower,ci_upper,outcome
ATE,-317.208572,0.552844,0.0,-318.292147,-316.124997,post_spends
ATC,-356.533377,0.563853,0.0,-357.638529,-355.428224,post_spends
ATT,100.419181,0.657298,0.0,99.130877,101.707484,post_spends


In [65]:
# Results stored on the original matcher, shown for comparison with the reloaded copy.
model.results

Unnamed: 0,effect_size,std_err,p-val,ci_lower,ci_upper,outcome
ATE,-317.208572,0.552844,0.0,-318.292147,-316.124997,post_spends
ATC,-356.533377,0.563853,0.0,-357.638529,-355.428224,post_spends
ATT,100.419181,0.657298,0.0,99.130877,101.707484,post_spends
