# Matching without replacement

## 0. Import libraries 

In [1]:
from hypex import Matcher

## 1. Create or upload your dataset  
In this case we will create a random dataset with a known effect size.  
If you have your own dataset, go to part 2.


In [2]:
from hypex.utils.tutorial_data_creation import create_test_data

In [3]:
# Generate the tutorial dataset: 10,000 users with a fixed seed (rs=42) so the
# run is reproducible. Missing values are injected into 'age' and 'gender'.
# NOTE(review): na_step presumably controls the spacing of injected NaNs — confirm.
df = create_test_data(
    num_users=10000,
    rs=42,
    na_step=45,
    nan_cols=['age', 'gender'],
)
df

Unnamed: 0,user_id,signup_month,treat,pre_spends,post_spends,age,gender,industry
0,0,0,0,504.5,422.777778,,F,Logistics
1,1,4,1,500.0,506.333333,51.0,,E-commerce
2,2,0,0,485.0,434.000000,56.0,F,Logistics
3,3,8,1,452.0,468.111111,46.0,M,E-commerce
4,4,0,0,488.5,420.111111,56.0,M,Logistics
...,...,...,...,...,...,...,...,...
9995,9995,2,1,482.0,501.666667,31.0,M,Logistics
9996,9996,0,0,453.0,406.888889,53.0,M,Logistics
9997,9997,0,0,461.0,415.111111,52.0,F,E-commerce
9998,9998,10,1,491.5,439.222222,22.0,M,E-commerce


In [4]:
# List the column names available in the generated dataset.
df.columns

Index(['user_id', 'signup_month', 'treat', 'pre_spends', 'post_spends', 'age',
       'gender', 'industry'],
      dtype='object')

In [5]:
# Check group balance: number of rows for each value of the 'treat' flag.
df['treat'].value_counts()

treat
0    5002
1    4998
Name: count, dtype: int64

In [6]:
# Count the missing values in the 'gender' column.
df['gender'].isna().sum()

223

## 2. Matching without replacement
### 2.0 Init params
`info_col` lists informative attributes that should not take part in matching, such as `user_id`.  
These columns are still stored explicitly in the resulting table, so that you can compare records directly after the computation.

In [7]:
# Column holding the group flag (values 0/1 in this dataset).
treatment = 'treat'
# Column holding the target metric whose effect is estimated.
outcome = 'post_spends'

# Informative columns: excluded from matching but kept in the result table.
info_col = ['user_id']

### 2.1 Matching
This is the easiest way to initialize a Matching task and calculate its metrics.  
Use it when the role of each attribute is clear and you have no additional task conditions (for example, strict equality for certain features).

In [8]:
# Build the matcher with default settings; only the dataset and the column
# roles defined above are passed in.
model = Matcher(
    input_data=df,
    outcome=outcome,
    treatment=treatment,
    info_col=info_col,
)

[21.06.2024 15:24:11 | hypex | INFO]: Number of NaN values filled with zeros: 446


In [9]:
# Run matching without replacement and keep the resulting table.
# NOTE(review): rows left without a partner appear to get NaN in the
# *_matched columns — confirm against the library documentation.
df_matched = model.match_no_rep()

In [10]:
# Display the matched table: each original column next to its *_matched counterpart.
df_matched

Unnamed: 0,signup_month,treat,pre_spends,post_spends,age,gender_F,gender_M,industry_Logistics,user_id,signup_month_matched,treat_matched,pre_spends_matched,post_spends_matched,age_matched,gender_F_matched,gender_M_matched,industry_Logistics_matched,user_id_matched
0,4,1,500.0,506.333333,51.0,0,0,0,1,0.0,0.0,453.5,415.222222,42.0,0.0,1.0,0.0,5255.0
1,8,1,452.0,468.111111,46.0,0,1,0,3,0.0,0.0,497.5,415.777778,59.0,1.0,0.0,0.0,7072.0
2,11,1,476.5,441.888889,34.0,0,1,0,9,0.0,0.0,504.5,427.888889,46.0,0.0,1.0,0.0,6270.0
3,8,1,498.5,461.444444,60.0,1,0,0,11,0.0,0.0,490.0,414.777778,18.0,0.0,1.0,0.0,1512.0
4,8,1,497.0,461.777778,24.0,0,1,1,14,0.0,0.0,486.0,427.777778,41.0,1.0,0.0,1.0,9397.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9991,0,0,496.5,426.111111,0.0,1,0,0,225,,,,,,,,,
9992,0,0,499.5,406.333333,56.0,0,0,0,9946,10.0,1.0,489.5,436.333333,46.0,0.0,0.0,0.0,9991.0
9993,0,0,474.0,435.111111,28.0,0,1,1,8397,2.0,1.0,482.0,501.666667,31.0,0.0,1.0,1.0,9995.0
9994,0,0,492.5,411.555556,25.0,0,1,0,5675,10.0,1.0,491.5,439.222222,22.0,0.0,1.0,0.0,9998.0
