# Matching without replacement

## 0. Import libraries 

In [1]:
from lightautoml.addons.hypex import Matcher

## 1. Create or upload your dataset  
In this case we will create a random dataset with a known effect size.  
If you have your own dataset, go to part 2.


In [2]:
from lightautoml.addons.hypex.utils.tutorial_data_creation import create_test_data

In [3]:
# Generate the tutorial dataset; NaNs are requested for the `age` and `gender` columns.
# NOTE(review): `rs` is presumably the random seed and `na_step` the NaN spacing — confirm in the helper.
df = create_test_data(
    num_users=10000,
    rs=42,
    na_step=45,
    nan_cols=['age', 'gender'],
)
df

Unnamed: 0,user_id,signup_month,treat,pre_spends,post_spends,age,gender,industry
0,0,0,0,504.5,422.777778,,F,Logistics
1,1,4,1,500.0,506.333333,51.0,,E-commerce
2,2,0,0,485.0,434.000000,56.0,F,Logistics
3,3,8,1,452.0,468.111111,46.0,M,E-commerce
4,4,0,0,488.5,420.111111,56.0,M,Logistics
...,...,...,...,...,...,...,...,...
9995,9995,2,1,482.0,501.666667,31.0,M,Logistics
9996,9996,0,0,453.0,406.888889,53.0,M,Logistics
9997,9997,0,0,461.0,415.111111,52.0,F,E-commerce
9998,9998,10,1,491.5,439.222222,22.0,M,E-commerce


In [4]:
# List the column names of the generated dataset
df.columns

Index(['user_id', 'signup_month', 'treat', 'pre_spends', 'post_spends', 'age',
       'gender', 'industry'],
      dtype='object')

In [5]:
# Count how many rows carry each value of the `treat` column
df['treat'].value_counts()

treat
0    5002
1    4998
Name: count, dtype: int64

In [6]:
# Number of missing values in the `gender` column
df['gender'].isna().sum()

223

## 2. Matching  without replacement
### 2.0 Init params
`info_col` is used to define informative attributes, such as `user_id`, that should not be part of matching.  
These columns are still stored explicitly in the resulting table, so that you can compare records directly after the computation.

In [7]:
# Column holding the treatment flag and column holding the outcome metric
treatment = 'treat'
outcome = 'post_spends'

# Informative columns passed to the matcher as `info_col`
info_col = ['user_id']

### 2.1 Matching
This is the easiest way to initialize and calculate metrics for a matching task.  
Use it when you are clear about each attribute or if you don't have any additional task conditions (e.g. strict equality for certain features).

In [8]:
# Baseline matcher: only the data and the column roles are supplied,
# every other setting is left at its default.
model = Matcher(
    input_data=df,
    outcome=outcome,
    treatment=treatment,
    info_col=info_col,
)

[18.12.2024 22:04:52 | hypex | INFO]: Number of NaN values filled with zeros: 446


In [9]:
# Run the matching without replacement that this notebook demonstrates and keep the result.
# NOTE(review): exact pairing rules of `match_no_rep` are not visible here — see the Matcher docs.
df_matched = model.match_no_rep()

In [10]:
# Display the matched table; matched counterparts appear in the `*_matched` columns
df_matched

Unnamed: 0,signup_month,treat,pre_spends,post_spends,age,gender_F,gender_M,industry_Logistics,user_id,signup_month_matched,treat_matched,pre_spends_matched,post_spends_matched,age_matched,gender_F_matched,gender_M_matched,industry_Logistics_matched,user_id_matched
0,0,0,504.5,422.777778,0.0,1,0,1,0,4,1,522.5,509.777778,0.0,1,0,1,4095
1,4,1,500.0,506.333333,51.0,0,0,0,1,0,0,488.0,420.333333,50.0,0,0,0,3916
2,0,0,485.0,434.000000,56.0,1,0,1,2,2,1,492.5,510.777778,56.0,1,0,1,6057
3,8,1,452.0,468.111111,46.0,0,1,0,3,0,0,453.5,415.222222,42.0,0,1,0,5255
4,0,0,488.5,420.111111,56.0,0,1,1,4,8,1,488.0,472.111111,59.0,0,1,1,6874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9991,2,1,482.0,501.666667,31.0,0,1,1,9995,0,0,474.0,435.111111,28.0,0,1,1,8397
9992,0,0,453.0,406.888889,53.0,0,1,1,9996,4,1,459.0,518.777778,57.0,0,0,1,8416
9993,0,0,461.0,415.111111,52.0,1,0,0,9997,7,1,459.5,472.111111,51.0,1,0,0,5727
9994,10,1,491.5,439.222222,22.0,0,1,0,9998,0,0,492.5,411.555556,25.0,0,1,0,5675
