# MatchVariables


MatchVariables() ensures that the columns in the test set are identical to those
in the train set.

If the test set contains additional columns, they are dropped. Alternatively, if the
test set lacks columns that were present in the train set, they will be added with a
value determined by the user, for example np.nan.

In [63]:
import numpy as np
import pandas as pd

from feature_engine.preprocessing import MatchVariables

In [64]:
# # Load titanic dataset from OpenML

# def load_titanic(filepath='../data/titanic.csv'):
#     # data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
#     data = pd.read_csv(filepath)
#     data = data.replace('?', np.nan)
#     data['cabin'] = data['cabin'].astype(str).str[0]
#     data['pclass'] = data['pclass'].astype('O')
#     data['age'] = data['age'].astype('float')
#     data['fare'] = data['fare'].astype('float')
#     data['embarked'].fillna('C', inplace=True)
#     data.drop(
#         # labels=['name', 'ticket', 'boat', 'body', 'home.dest'],
#         labels=['name', 'ticket'],
#         axis=1, inplace=True,
#     )
#     return data

# data = load_titanic()
# # data.head()
# # data.shape

# # separate the dataset into train and test

# train = data.iloc[0:1000, :]
# test = data.iloc[1000:, :]

# train.shape, test.shape

In [65]:
def load_titanic(train_path='../data/titanic-3/train.csv', test_path='../data/titanic-3/test.csv'):
    """Load the Titanic train and test sets and apply the same clean-up to both.

    Parameters
    ----------
    train_path, test_path : str, path object or file-like
        Location of each CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    train, test : tuple of pd.DataFrame
        The preprocessed data sets, without the ``name`` and ``ticket`` columns.
    """
    def preprocess_df(df):
        """Return a cleaned copy of ``df``."""
        # '?' marks missing data in the raw file; ``replace`` returns a new
        # frame, so the frame passed in is never modified.
        df = df.replace('?', np.nan)
        # Keep only the first character of the cabin code.
        df['cabin'] = df['cabin'].astype(str).str[0]
        df['pclass'] = df['pclass'].astype('O')
        df['age'] = df['age'].astype('float')
        df['fare'] = df['fare'].astype('float')
        # Assign the result back instead of ``fillna(..., inplace=True)`` on
        # the column selection: the chained in-place form is deprecated and
        # does not update ``df`` when pandas Copy-on-Write is active.
        df['embarked'] = df['embarked'].fillna('C')
        return df.drop(columns=['name', 'ticket'])

    # Apply identical preprocessing to both data sets.
    train = preprocess_df(pd.read_csv(train_path))
    test = preprocess_df(pd.read_csv(test_path))

    return train, test

In [66]:
# Load both data sets and report their dimensions.
train, test = load_titanic()
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (891, 10)
Test shape: (418, 9)


In [67]:
# Set up the transformer. Per the feature_engine documentation,
# missing_values="ignore" lets fit/transform accept data that contains NaN
# instead of raising an error.
match_cols = MatchVariables(missing_values="ignore")

# Learn the names and order of the variables in the train set.
match_cols.fit(train)

In [68]:
# The fitted transformer stores the names of the variables seen in the
# train set, in their original order.
# NOTE(review): `input_features_` looks like an earlier name for this
# attribute -- confirm against the installed feature_engine version.
match_cols.feature_names_in_

['passengerid',
 'survived',
 'pclass',
 'sex',
 'age',
 'sibsp',
 'parch',
 'fare',
 'cabin',
 'embarked']

## 1 - Some columns are missing in the test set

In [69]:
# Display the fitted transformer.
match_cols

In [70]:
# Remove two of the train-set columns from the test set for the demo.
test_t = test.drop(columns=["sex", "age"])

In [71]:
# The reduced test set has two columns fewer than the original test set.
test_t.shape

(418, 7)

In [72]:
# Preview the reduced test set: "sex" and "age" are no longer present.
test_t.head()

Unnamed: 0,passengerid,pclass,sibsp,parch,fare,cabin,embarked
0,892,3,0,0,7.8292,n,Q
1,893,3,1,0,7.0,n,S
2,894,2,0,0,9.6875,n,Q
3,895,3,0,0,8.6625,n,S
4,896,3,1,1,12.2875,n,S


In [73]:
# The transformer adds back every train-set column that is absent from the
# input, filled with missing values, and reports which ones it added.
test_tt = match_cols.transform(test_t)

# Blank line between the transformer's printed message and the table.
print()
test_tt.head()

The following variables are added to the DataFrame: ['sex', 'survived', 'age']



Unnamed: 0,passengerid,survived,pclass,sex,age,sibsp,parch,fare,cabin,embarked
0,892,,3,,,0,0,7.8292,n,Q
1,893,,3,,,1,0,7.0,n,S
2,894,,2,,,0,0,9.6875,n,Q
3,895,,3,,,0,0,8.6625,n,S
4,896,,3,,,1,1,12.2875,n,S


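Note how the missing columns were added back to the transformed test set, with
missing values, in the position (i.e., order) in which they were in the train set.
Besides `sex` and `age`, which we dropped above, `survived` was also added: the
original test set has 9 columns against the train set's 10 because it does not
contain the target.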

Similarly, if the test set contained additional columns, those would be removed:

## 2 - Test set contains variables not present in the train set

In [74]:
# Add two columns that the train set never contained.
for col_name, value in {"new_col1": 5, "new_col2": "test"}.items():
    test_t[col_name] = value

test_t.head()

Unnamed: 0,passengerid,pclass,sibsp,parch,fare,cabin,embarked,new_col1,new_col2
0,892,3,0,0,7.8292,n,Q,5,test
1,893,3,1,0,7.0,n,S,5,test
2,894,2,0,0,9.6875,n,Q,5,test
3,895,3,0,0,8.6625,n,S,5,test
4,896,3,1,1,12.2875,n,S,5,test


In [75]:
# Set up the transformer again, this time with a different fill value:
# columns that have to be added will contain 0.
match_cols = MatchVariables(
    fill_value=0, missing_values="ignore",
)

# Learn the names and order of the variables in the train set.
match_cols.fit(train)

In [76]:
# Align the test set with the train set: missing columns are added and
# columns unknown to the train set are dropped.
test_tt = match_cols.transform(test_t)

# Blank line between the transformer's printed messages and the table.
print()
test_tt.head()

The following variables are added to the DataFrame: ['sex', 'survived', 'age']
The following variables are dropped from the DataFrame: ['new_col2', 'new_col1']



Unnamed: 0,passengerid,survived,pclass,sex,age,sibsp,parch,fare,cabin,embarked
0,892,0,3,0,0,0,0,7.8292,n,Q
1,893,0,3,0,0,1,0,7.0,n,S
2,894,0,2,0,0,0,0,9.6875,n,Q
3,895,0,3,0,0,0,0,8.6625,n,S
4,896,0,3,0,0,1,1,12.2875,n,S


Note how the columns that were present in the test set but not in the train set were dropped. This time, the missing variables were added back into the dataset with the value 0 rather than with missing values.