In [5]:
import pickle
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [6]:
# Load the raw dataset from a path relative to the notebook's working directory.
data = pd.read_csv('data/raw/data.csv')
# Display (rows, columns) of the freshly loaded frame.
data.shape

(16880, 21)

Let's remove the columns with null values as discussed in the EDA notebook.ipynb

In [7]:
# Drop the two columns flagged for null values; rebind the name rather than mutating in place.
data = data.drop(columns=['C', 'K'])

For the categorical feature 'J', we found that many countries have very few observations, so we create a new category called OTHERS to group the countries with too little data.

In [8]:
threshold = 0.02  # 2%: minimum share of rows a country needs to keep its own category
freq = data['J'].value_counts(normalize=True)  # relative frequency of each value of 'J'
rare_categories = freq[freq < threshold].index  # values whose share is below the threshold
data['J'] = data['J'].replace(rare_categories, 'OTHERS')  # collapse them into a single bucket
data['J'].value_counts(normalize=True)  # display the resulting distribution

J
AR        0.552666
BR        0.262322
MX        0.140166
OTHERS    0.044846
Name: proportion, dtype: float64

Since there are only a few categories, we can use one-hot encoding; ordinal encoding would impose a false order on them.

In [10]:
# NOTE(review): with drop='first' plus handle_unknown='ignore', a value the encoder
# did not see during fit is presumably encoded as all zeros -- the same row pattern
# as the dropped first category. Confirm that is acceptable for the deployed API.
encoder_onehot = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
    drop='first',
)

# Fit and encode in one pass; column names come from the fitted encoder and the
# original row index is kept so the result lines up with `data`.
data_onehot = pd.DataFrame(
    encoder_onehot.fit_transform(data[['J']]),
    index=data.index,
    columns=encoder_onehot.get_feature_names_out(['J']),
)

# Persist the fitted encoder into the deployment API's artifacts folder.
# NOTE(review): only the encoder is pickled; the rare-value -> 'OTHERS' replacement
# is applied to the DataFrame beforehand and is not part of this artifact -- verify
# that the serving code reproduces it.
with open('./4_deployment/api/artifacts/onehot_encoder.pkl', 'wb') as artifact_file:
    pickle.dump(encoder_onehot, artifact_file)

data_onehot

Unnamed: 0,J_BR,J_MX,J_OTHERS
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
...,...,...,...
16875,1.0,0.0,0.0
16876,1.0,0.0,0.0
16877,1.0,0.0,0.0
16878,1.0,0.0,0.0


In [11]:
# Swap the raw 'J' column for its one-hot columns; both frames share the same row index.
data = pd.concat(
    [data.drop(columns='J'), data_onehot],
    axis='columns',
)
data.columns

Index(['A', 'B', 'D', 'E', 'F', 'G', 'H', 'I', 'L', 'M', 'N', 'O', 'P', 'Q',
       'R', 'S', 'Monto', 'Fraude', 'J_BR', 'J_MX', 'J_OTHERS'],
      dtype='object')

Again, we fix the comma/dot number-formatting problem, as we did in the EDA.

In [12]:
# Strip the ',' characters from the text-typed numeric columns and cast them to float.
# NOTE(review): this assumes ',' is only a thousands separator -- TODO confirm.
data = data.assign(**{
    col: data[col].str.replace(",", "").astype(float)
    for col in ('Q', 'R', 'Monto')
})

In [13]:
# Transposed preview of the first rows, so each column is shown as a row.
data.head().T

Unnamed: 0,0,1,2,3,4
A,0.0,0.0,0.0,9.0,0.0
B,10.0,10.0,7.0,16.0,8.0
D,0.0,0.0,0.0,0.0,0.0
E,0.0,0.0,1.0,0.0,0.0
F,0.0,0.0,0.0,0.0,0.0
G,0.0,0.0,0.0,0.0,0.0
H,0.0,0.0,0.0,0.0,0.0
I,0.0,0.0,1.0,0.0,0.0
L,0.0,0.0,0.0,0.0,0.0
M,3.0,1.0,3.0,3.0,1.0


In [14]:
# Shape check after the steps above: two columns dropped, 'J' replaced by three one-hot columns.
data.shape

(16880, 21)

Saving the data as Parquet will make it easier to read later:

In [None]:
# Write the processed frame to Parquet; index=False leaves the row index out of the file.
data.to_parquet('data/processed/data.parquet', index=False)