In [1]:
import polars as pl
import torch

In [2]:
# Prefer the GPU, but fall back to the CPU so the notebook still starts on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Every tensor created without an explicit device from here on is placed on `device`.
torch.set_default_device(device)

In [16]:
def check_cuda_memory(device_index=0):
    """Print a short memory report for one CUDA device.

    Parameters
    ----------
    device_index : int
        Index of the CUDA device to inspect. Defaults to 0.

    Notes
    -----
    ``torch.cuda.memory_reserved`` is the total held by PyTorch's caching
    allocator and already contains the bytes reported by
    ``torch.cuda.memory_allocated``. "Free" is therefore ``total - reserved``;
    subtracting the allocated bytes as well would count them twice.
    Memory used by other processes is not visible to these counters.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available on this system")
        return

    # All three counters are in bytes.
    total_memory = torch.cuda.get_device_properties(device_index).total_memory
    allocated_memory = torch.cuda.memory_allocated(device_index)
    # Held by the caching allocator (in-use tensors plus cached, currently unused blocks).
    cached_memory = torch.cuda.memory_reserved(device_index)

    # Reserved memory is a superset of allocated memory, so only subtract it once.
    free_memory = total_memory - cached_memory

    # Convert to more readable units (GB)
    total_gb = total_memory / 1e9
    allocated_gb = allocated_memory / 1e9
    cached_gb = cached_memory / 1e9
    free_gb = free_memory / 1e9

    print("CUDA Memory Status:")
    print(f"Total: {total_gb:.2f} GB")
    print(f"Allocated: {allocated_gb:.2f} GB ({allocated_gb/total_gb*100:.1f}%)")
    print(f"Cached: {cached_gb:.2f} GB ({cached_gb/total_gb*100:.1f}%)")
    print(f"Free: {free_gb:.2f} GB ({free_gb/total_gb*100:.1f}%)")

# Call the function to check memory
check_cuda_memory()

CUDA Memory Status:
Total: 6.10 GB
Allocated: 0.00 GB (0.0%)
Cached: 0.00 GB (0.0%)
Free: 6.10 GB (100.0%)


In [15]:
torch.cuda.empty_cache()

In [18]:
data = pl.read_csv('data/merchant_locs_train.csv', separator=',', quote_char='"')

In [17]:
import polars as pl
import numpy as np

def shuffled_train_test_split(df, test_size=0.2, seed=42):
    """
    Create a shuffled train-test split using Polars DataFrame.

    Parameters:
    -----------
    df : polars.DataFrame
        The input DataFrame to split
    test_size : float
        Proportion of the dataset to include in the test split (0.0 to 1.0)
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    train_df, test_df : tuple of polars.DataFrame
        The train and test splits of the input DataFrame

    Raises:
    -------
    ValueError
        If ``test_size`` is not within [0.0, 1.0].
    """
    # Reject nonsensical proportions up front; a value outside [0, 1] would
    # otherwise turn into a negative or oversized slice index below.
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0.0 and 1.0, got {test_size!r}")

    n_rows = df.height

    # A private RandomState yields the same numbers as np.random.seed(seed)
    # followed by np.random.rand(n_rows), without reseeding NumPy's global
    # generator for the rest of the notebook.
    random_values = np.random.RandomState(seed).rand(n_rows)

    # Pick a temporary column name that cannot overwrite a real column.
    shuffle_key = "_random_"
    while shuffle_key in df.columns:
        shuffle_key = "_" + shuffle_key

    # Attach the random key and sort by it to shuffle the rows.
    shuffled_df = df.with_columns(pl.Series(shuffle_key, random_values)).sort(shuffle_key)

    # The first (1 - test_size) share of shuffled rows becomes the training set.
    split_idx = int(n_rows * (1 - test_size))

    train_df = shuffled_df[:split_idx].drop(shuffle_key)
    test_df = shuffled_df[split_idx:].drop(shuffle_key)

    return train_df, test_df

# Example usage

In [19]:
train_df, test_df = shuffled_train_test_split(data, test_size=0.1, seed=42)

In [21]:
train_df.write_csv("data/merchant_locs_offline_online_train.csv")

In [23]:
test_df.write_csv("data/merchant_locs_offline_online_validation.csv")

In [5]:
data['online_offline_flag'].value_counts()

online_offline_flag,count
str,u32
"""ONLINE""",3378
"""OFFLINE""",6622


In [7]:
data

source,merchant_name,merchant_location,online_offline_flag
str,str,str,str
"""CREDIT_CARD""","""TRAVELOKA-1223435704 JAKARTA I…",,"""ONLINE"""
"""QRIS_WONDR""","""PECEL LELE AYAM KREMES MA""","""BOGOR""","""OFFLINE"""
"""QRIS_WONDR""","""MAKAM SYEKH AHMAD MUTAMAK""","""PATI""","""OFFLINE"""
"""QRIS_WONDR""","""KEDAI HENOT""","""KOTA PALEMBANG""","""OFFLINE"""
"""QRIS_WONDR""","""PKM DUKUH KUPANG""","""SURABAYA""","""OFFLINE"""
…,…,…,…
"""QRIS_WONDR""","""MERAH PUTIH LAUNDRY FA...""","""SURAKARTA""","""OFFLINE"""
"""QRIS_WONDR""","""BUBUR AYAM SANTANI""","""BOGOR""","""OFFLINE"""
"""QRIS_WONDR""","""CINTA DAMAI PUTRA BAHAGIA""","""MAJALENGKA (K""","""OFFLINE"""
"""QRIS_WONDR""","""LUMPIA BASAH MAMAH DZAKI""","""BOGOR (KAB)""","""OFFLINE"""


In [6]:
mapping = {"ONLINE": 1, "OFFLINE": 0}

In [7]:
data = data.with_columns(
    pl.col("online_offline_flag").replace_strict(mapping).alias("label")
)

In [10]:
import re
def count_digit(x):
    """Return how many characters of ``x`` report True for ``str.isdigit``."""
    digits_seen = 0
    for ch in x:
        if ch.isdigit():
            digits_seen += 1
    return digits_seen

# Hand-crafted features derived from the raw merchant name.
data = data.with_columns(
    # Share of characters that are digits. The `if x else 0.0` guard keeps an empty
    # name from raising ZeroDivisionError inside map_elements.
    pl.col("merchant_name").map_elements(lambda x: count_digit(x)/len(x) if x else 0.0, return_dtype = pl.Float32).alias("digit_ratio"),
    # Number of separate runs of consecutive digits.
    pl.col("merchant_name").map_elements(lambda x: len(re.findall(r'\d+',x)), return_dtype = pl.Int32).alias("num_blocks"),
    #pl.when(pl.col("merchant_location").is_null()).then(pl.lit(0)).otherwise(pl.lit(1)).alias("location_is_known"),
    # 1 when the name matches one of the hard-coded offline-merchant patterns, else 0.
    pl.when(pl.col("merchant_name").str.contains(r"((\bAFM\b)|(ALFAMART))|((\bIDM\b|INDOMARET))|(FAMILY\s?MART)|(HHB\s[A-Z0-9]{3})|(7-(11|ELEVEN))")).then(pl.lit(1)).otherwise(pl.lit(0)).alias("known_offline_merchant"),
    # 1 when the name matches one of the hard-coded online-merchant patterns, else 0.
    pl.when(pl.col("merchant_name").str.contains(r"\.CO(\.|M)?|(CO\.ID)|(\.(SG|JP))|(GOOGL)|(FACEB(OO)?K)|(BILL P(A)?YM(ENT)?)")).then(pl.lit(1)).otherwise(pl.lit(0)).alias("known_online_merchant"),
    # Count of characters that are neither ASCII letters, digits nor whitespace.
    pl.col("merchant_name").map_elements(lambda x: len(re.sub(r'[a-zA-Z0-9\s]','',x)), return_dtype = pl.Int32).alias("num_non_alphanumerics"),
)

In [11]:
data

source,merchant_name,merchant_location,online_offline_flag,label,digit_ratio,num_blocks,known_offline_merchant,known_online_merchant,num_non_alphanumerics
str,str,str,str,i64,f32,i32,i32,i32,i32
"""CREDIT_CARD""","""-TKP291822088JAKARTA ID""",,"""ONLINE""",1,0.391304,1,0,0,1
"""CREDIT_CARD""","""XL AXIATA 2DS *853957 JAKARTA …",,"""ONLINE""",1,0.194444,2,0,0,1
"""CREDIT_CARD""","""-TKP288134990JAKARTA ID""",,"""ONLINE""",1,0.391304,1,0,0,1
"""QRIS_WONDR""","""BRUNO PREMI""","""CIREBON""","""OFFLINE""",0,0.0,0,0,0,0
"""QRIS_WONDR""","""NASI LENGKO""","""JAKARTA TIMUR""","""OFFLINE""",0,0.0,0,0,0,0
…,…,…,…,…,…,…,…,…,…
"""CREDIT_CARD""","""GRAB* A-7COAA8GWWFW2 SOUTH JAK…",,"""ONLINE""",1,0.083333,3,0,0,2
"""QRIS_WONDR""","""NASI PECEL MASPERRI""","""BALIKPAPAN""","""OFFLINE""",0,0.0,0,0,0,0
"""QRIS_WONDR""","""AMPERA TARUNA FAMILY""","""SIDOARJO""","""OFFLINE""",0,0.0,0,0,0,0
"""QRIS_WONDR""","""AYAM BAKAR SHINAI, KLP DU""","""TANGERANG""","""OFFLINE""",0,0.0,0,0,0,1


In [11]:
from sklearn.model_selection import train_test_split
# Model inputs: the raw name (tokenized for BERT later) plus the hand-crafted numeric
# features. 'location_is_known' stays excluded, as before.
X = data.select(['merchant_name', 'digit_ratio', 'num_blocks', 'known_offline_merchant'])
y = data.get_column('label')

# 70/30 split with a fixed random_state so the partition is repeatable.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [13]:
X_test_raw

merchant_name,digit_ratio,num_blocks,known_offline_merchant
str,f32,i32,i32
"""LAB CITO BOGOR""",0.0,0,0
"""INDOMARET TU67 SALATIGA""",0.086957,1,1
"""TRAVELOKA3DS*1210524667 JAKART…",0.323529,2,0
"""-TKP292496391JAKARTA ID""",0.391304,1,0
"""TRAVELOKAC*1216941198 JAKARTA …",0.3125,1,0
…,…,…,…
"""EBAY O*08-12584-27572 408-3766…",0.611111,5,0
"""-TKP292218179JAKARTA ID""",0.391304,1,0
"""BATIK ATIK.""",0.0,0,0
"""KOPERASI GAJAYANA MALANG""",0.0,0,0


In [14]:
# bert auto tokenizer
import transformers, tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
bert_tokenizer = transformers.AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
bert_model = transformers.AutoModel.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

In [16]:
# Move the encoder to the notebook-wide `device` rather than a hard-coded 'cuda',
# matching how the model is placed in the experiment-tracking section.
bert_model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 312, padding_idx=0)
    (position_embeddings): Embedding(512, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-3): 4 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [17]:
# This tokenizer declares no maximum length, so `truncation=True` on its own does nothing
# (that is what the warning below reports). Cap the sequence length explicitly at the
# number of position embeddings the model has.
bert_input = bert_tokenizer(list(X_train_raw['merchant_name']), return_tensors="pt", padding=True, truncation=True,
                            max_length=bert_model.config.max_position_embeddings)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [18]:
# Get embeddings
with torch.no_grad():
    bert_output = bert_model(**bert_input)

In [19]:
bert_feature = bert_output.last_hidden_state[:, 0, :]

In [20]:
bert_feature.shape

torch.Size([7000, 312])

In [38]:
# preprocessing for tfidf: remove special characters, numbers, extra space, convert to lowercase
import re

def clean_merchant_name(x):
    """Normalise a merchant name for TF-IDF.

    Drops every character that is not an ASCII letter or whitespace, collapses
    each whitespace run to a single space, and lower-cases the result.
    Leading/trailing whitespace is collapsed but not stripped.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', x)
    return re.sub(r'\s+', ' ', letters_and_spaces).lower()

# Add the normalised name as its own column; the function is passed directly
# instead of being wrapped in a lambda.
data = data.with_columns(
    pl.col("merchant_name")
    .map_elements(clean_merchant_name, return_dtype=pl.Utf8)
    .alias("merchant_name_clean")
)

In [184]:
X = data['merchant_name_clean']
y = data['label']

In [189]:
from sklearn.model_selection import train_test_split

X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [200]:
# sklearn TFIDF tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Character-level TF-IDF over the cleaned merchant names.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',  # Character n-grams built inside word boundaries
    ngram_range=(2, 5),  # 2-5 character sequences
    min_df=4,  # drop n-grams seen in fewer than 4 documents
    max_df=0.95,  # drop n-grams seen in more than 95% of documents
    sublinear_tf=True,  # use 1 + log(tf) instead of the raw term frequency
    norm='l2',  
    use_idf=True, 
    smooth_idf=True, 
)
# Vocabulary and IDF weights are learned from the training split only.
X_train = vectorizer.fit_transform(X_train_raw)

In [198]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 19484934 stored elements and shape (350000, 119182)>

In [201]:
X_test = vectorizer.transform(X_test_raw)

In [202]:
X_test

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8325890 stored elements and shape (150000, 119182)>

In [82]:
# modeling with torch

In [22]:
import torch

In [23]:
# Prefer the GPU, but fall back to the CPU so the notebook still starts on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Every tensor created without an explicit device from here on is placed on `device`.
torch.set_default_device(device)

In [67]:
X_train.indptr
X_train.indices

array([22129, 42212,  2633, ..., 45516, 28812, 38533],
      shape=(3260473,), dtype=int32)

In [72]:
# Pass the matrix shape explicitly. Without `size`, torch infers it from the stored indices,
# which under-counts columns whenever the highest-numbered features have no non-zero entry.
X_train_csr_tensor = torch.sparse_csr_tensor(torch.from_numpy(X_train.indptr),
                                       torch.from_numpy(X_train.indices),
                                       torch.from_numpy(X_train.data),
                                       size = tuple(X_train.shape),
                                        dtype = torch.float32)

In [77]:
# Pass the matrix shape explicitly. Without `size`, torch infers it from the stored indices,
# so the test tensor could end up with a different column count than the fitted vocabulary.
X_test_csr_tensor = torch.sparse_csr_tensor(torch.from_numpy(X_test.indptr),
                                       torch.from_numpy(X_test.indices),
                                       torch.from_numpy(X_test.data),
                                       size = tuple(X_test.shape),
                                        dtype = torch.float32)

In [81]:
y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)

In [125]:
# baseline model, xgboost

In [230]:
X_train_raw

merchant_name,digit_ratio,num_blocks,known_offline_merchant
str,f32,i32,i32
"""TOKO PALL""",0.0,0,0
"""TRAVELOKA*1221658246 JAKARTA I…",0.322581,1,0
"""SEKIAN KOPI PORIS, CIPOND""",0.0,0,0
"""RM PADANG BASALERO""",0.0,0,0
"""WARUNG IJO BPK. WAHYU""",0.0,0,0
…,…,…,…
"""SHOP TKPD 6092057227 JAKARTA I…",0.322581,1,0
"""CAHAYA SIDRAP BIRD SHOP""",0.0,0,0
"""FLIPSIDE TEXNO""",0.0,0,0
"""MIE AYAM BAKSO MBAK YU WI""",0.0,0,0


In [26]:
# Hand-crafted numeric features of the training split, moved to `device`.
_basic_train = torch.tensor(
    X_train_raw[['digit_ratio', 'num_blocks',
                 #'location_is_known',
                 'known_offline_merchant']].to_numpy()
).to(device)
# Column-wise concatenation; further blocks (e.g. bert_feature) can be appended to the list.
X_train = torch.cat([_basic_train,
                     #bert_feature
                    ], dim=1)
y_train = torch.tensor(y_train)

In [27]:
X_train.shape

torch.Size([7000, 3])

In [28]:
# TRAINING

In [29]:
# `truncation=True` only takes effect with a maximum length, and this tokenizer declares none,
# so cap the sequence length at the number of position embeddings the model has.
test_bert_input = bert_tokenizer(list(X_test_raw['merchant_name']), return_tensors="pt", padding=True, truncation=True,
                                 max_length=bert_model.config.max_position_embeddings)
# Get embeddings (inference only, so no gradients are tracked)
with torch.no_grad():
    test_bert_output = bert_model(**test_bert_input)

In [30]:
test_bert_feature = test_bert_output.last_hidden_state[:, 0, :]

In [31]:
# Hand-crafted numeric features of the test split, moved to `device`.
_basic_test = torch.tensor(
    X_test_raw[['digit_ratio', 'num_blocks',
                #'location_is_known',
                'known_offline_merchant']].to_numpy()
).to(device)
# Column-wise concatenation; further blocks (e.g. test_bert_feature) can be appended to the list.
X_test = torch.cat([_basic_test,
                    #test_bert_feature
                   ], dim=1)
y_test = torch.tensor(y_test)

In [32]:
import xgboost as xgb
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=500,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',  # For faster training with sparse data
    device='cuda',
    random_state = 1,
    early_stopping_rounds = 50
)

In [None]:
# Train model
# NOTE(review): (X_test, y_test) is passed as the eval_set, and the classifier was built with
# early_stopping_rounds, so this split decides when boosting stops. The evaluation cell then
# scores the same split, so the reported metrics are likely optimistic. Consider carving a
# separate validation split out of the training data for early stopping.
model.fit(
    X_train, 
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=True,
)

In [34]:
y_test

tensor([0, 0, 1,  ..., 0, 0, 0], device='cuda:0')

In [35]:
# 6. Evaluate model
from sklearn.metrics import classification_report, accuracy_score
# Predict on the held-out split, then bring the labels to the CPU once and reuse them
# for both metrics.
y_pred = model.predict(X_test)
_y_test_cpu = y_test.to('cpu')
print("Accuracy:", accuracy_score(_y_test_cpu, y_pred))
print("\nClassification Report:")
print(classification_report(_y_test_cpu, y_pred))

Accuracy: 0.97

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2020
           1       0.96      0.94      0.95       980

    accuracy                           0.97      3000
   macro avg       0.97      0.96      0.97      3000
weighted avg       0.97      0.97      0.97      3000



## OFFLINE ONLY

In [96]:
# Fixed seed so the sampled rows, and every metric computed from them, are repeatable.
# NOTE(review): this reads the same CSV path that `data` is loaded from. Confirm whether these
# rows are meant to be independent of the data the model is trained on.
data_test_2_raw = pl.read_csv('data/merchant_locs_train.csv', separator=',', quote_char='"').sample(100000, seed=42)

In [97]:
data_test_2_raw = data_test_2_raw.filter(pl.col('merchant_location').is_null() & (pl.col('online_offline_flag') == 'OFFLINE'))

In [98]:
data_test_2_raw['online_offline_flag'].value_counts()

online_offline_flag,count
str,u32
"""OFFLINE""",2025


In [99]:
# Same hand-crafted features as the ones built for `data`, so the model sees identical definitions.
data_test_2 = data_test_2_raw.with_columns(
    # Share of characters that are digits. The `if x else 0.0` guard keeps an empty
    # name from raising ZeroDivisionError inside map_elements.
    pl.col("merchant_name").map_elements(lambda x: count_digit(x)/len(x) if x else 0.0, return_dtype = pl.Float32).alias("digit_ratio"),
    # Number of separate runs of consecutive digits.
    pl.col("merchant_name").map_elements(lambda x: len(re.findall(r'\d+',x)), return_dtype = pl.Int32).alias("num_blocks"),
    #pl.when(pl.col("merchant_location").is_null()).then(pl.lit(0)).otherwise(pl.lit(1)).alias("location_is_known"),
    # The pattern now includes the `7-(11|ELEVEN)` alternative used when this feature is built
    # for `data`; previously the two definitions of known_offline_merchant disagreed.
    pl.when(pl.col("merchant_name").str.contains(r"((\bAFM\b)|(ALFAMART))|((\bIDM\b|INDOMARET))|(FAMILY\s?MART)|(HHB\s[A-Z0-9]{3})|(7-(11|ELEVEN))")).then(pl.lit(1)).otherwise(pl.lit(0)).alias("known_offline_merchant")
)

In [100]:
data_test_2 = data_test_2.with_columns(
    pl.col("online_offline_flag").replace_strict(mapping).alias("label")
)

In [101]:
data_test_2

source,merchant_name,merchant_location,online_offline_flag,digit_ratio,num_blocks,known_offline_merchant,label
str,str,str,str,f32,i32,i32,i64
"""CREDIT_CARD""","""ALFAMART W172 MBL MEDAN ID""",,"""OFFLINE""",0.115385,1,1,0
"""CREDIT_CARD""","""ALFAMART 2ABN NGAGEL KAB PATI …",,"""OFFLINE""",0.03125,1,1,0
"""CREDIT_CARD""","""IDM TNQL BATAM ID""",,"""OFFLINE""",0.0,0,1,0
"""CREDIT_CARD""","""TOKO LIBERTY MINAHASA KAB.ID""",,"""OFFLINE""",0.0,0,0,0
"""CREDIT_CARD""","""ALFAMART AD61 MBL TANGERANG ID""",,"""OFFLINE""",0.066667,1,1,0
…,…,…,…,…,…,…,…
"""CREDIT_CARD""","""IDM TNTJ MANADO ID""",,"""OFFLINE""",0.0,0,1,0
"""CREDIT_CARD""","""IDM FWR9 - AMD SUKARAMI PALEMB…",,"""OFFLINE""",0.027778,1,1,0
"""CREDIT_CARD""","""1M05 ALFAMART VILLA J TANGERAN…",,"""OFFLINE""",0.081081,2,1,0
"""CREDIT_CARD""","""IDM TXMU-ZONA EROPA BEKASI ID""",,"""OFFLINE""",0.0,0,1,0


In [102]:
def create_bert_feature(data, batch_size=None):
    """Embed texts with the notebook's BERT model and return one vector per text.

    Parameters
    ----------
    data : iterable of str
        Texts to embed (e.g. a column of merchant names).
    batch_size : int or None
        Number of texts per forward pass. ``None`` (default) sends everything
        through the model in a single pass, as before.

    Returns
    -------
    torch.Tensor
        One row per input text, taken from position 0 of ``last_hidden_state``
        (the first token of each sequence).

    Raises
    ------
    ValueError
        If ``batch_size`` is given and smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer or None, got {batch_size!r}")

    texts = list(data)
    if not texts:
        # Nothing to tokenize: return an empty matrix with the model's embedding width.
        return torch.empty((0, bert_model.config.hidden_size), device=bert_model.device)

    step = len(texts) if batch_size is None else batch_size
    first_token_chunks = []
    for start in range(0, len(texts), step):
        # truncation=True needs an explicit max_length because this tokenizer declares none;
        # the encoded batch is moved to wherever the model lives.
        bert_input = bert_tokenizer(
            texts[start:start + step],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=bert_model.config.max_position_embeddings,
        ).to(bert_model.device)
        # Get embeddings
        with torch.no_grad():
            bert_output = bert_model(**bert_input)
        first_token_chunks.append(bert_output.last_hidden_state[:, 0, :])
    return torch.cat(first_token_chunks, dim=0)

In [103]:
data_test_2_bert = create_bert_feature(data_test_2['merchant_name'])

In [104]:
#data_test_2 = vectorizer.transform(data_test_2_raw['merchant_name_clean'])

In [105]:
# Numeric features followed by the BERT embedding, concatenated column-wise.
# 'location_is_known' stays excluded, as before.
# NOTE(review): this matrix includes the BERT block; confirm the `model` it is scored with
# was trained on the same columns.
_basic_test_2 = torch.tensor(
    data_test_2[['digit_ratio', 'num_blocks', 'known_offline_merchant']].to_numpy()
).to(device)
X_test_2 = torch.cat([_basic_test_2, data_test_2_bert], dim=1)
y_test_2 = torch.tensor(data_test_2['label'])

In [106]:
X_test_2.shape

torch.Size([2025, 315])

In [107]:
# 6. Evaluate model
from sklearn.metrics import classification_report, accuracy_score
y_pred = model.predict(X_test_2)
print("Accuracy:", accuracy_score(y_test_2.to('cpu'), y_pred))
print("\nClassification Report:")
print(classification_report(y_test_2.to('cpu'), y_pred))


NameError: name 'model' is not defined

In [220]:
data_test_2_raw = data_test_2_raw.with_columns(
    pl.lit(y_pred).alias('predicted_label')
)

In [204]:
data_test_2_raw.filter(pl.col('predicted_label') == 1).write_csv('wrong_prediction.csv')

# Track Feature Experiments

In [54]:
import mlflow
from mlflow.models import infer_signature
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score

In [20]:
# bert auto tokenizer
import transformers, tqdm
bert_tokenizer = transformers.AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
bert_model = transformers.AutoModel.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
bert_model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 312, padding_idx=0)
    (position_embeddings): Embedding(512, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-3): 4 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [22]:
def create_bert_feature(data, batch_size=None):
    """Embed texts with the notebook's BERT model and return one vector per text.

    Parameters
    ----------
    data : iterable of str
        Texts to embed (e.g. a column of merchant names).
    batch_size : int or None
        Number of texts per forward pass. ``None`` (default) sends everything
        through the model in a single pass, as before.

    Returns
    -------
    torch.Tensor
        One row per input text, taken from position 0 of ``last_hidden_state``
        (the first token of each sequence).

    Raises
    ------
    ValueError
        If ``batch_size`` is given and smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer or None, got {batch_size!r}")

    texts = list(data)
    if not texts:
        # Nothing to tokenize: return an empty matrix with the model's embedding width.
        return torch.empty((0, bert_model.config.hidden_size), device=bert_model.device)

    step = len(texts) if batch_size is None else batch_size
    first_token_chunks = []
    for start in range(0, len(texts), step):
        # truncation=True needs an explicit max_length because this tokenizer declares none;
        # the encoded batch is moved to wherever the model lives.
        bert_input = bert_tokenizer(
            texts[start:start + step],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=bert_model.config.max_position_embeddings,
        ).to(bert_model.device)
        # Get embeddings
        with torch.no_grad():
            bert_output = bert_model(**bert_input)
        first_token_chunks.append(bert_output.last_hidden_state[:, 0, :])
    return torch.cat(first_token_chunks, dim=0)

In [32]:
feature_sets = {
    'basic':(['digit_ratio', 'num_blocks', 'known_offline_merchant'], []),
    'basic_with_bert':(['digit_ratio', 'num_blocks', 'known_offline_merchant'], ['bert'])
}

In [64]:
# Point MLflow at the tracking server first: set_experiment() looks up or creates the
# experiment in whichever tracking store is active when it is called, so on a fresh kernel
# the original order would register the experiment in the default store instead.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
# Create (or select) the MLflow Experiment; the trailing semicolon hides the returned object.
mlflow.set_experiment("MLflow Feature Experiment");

2025/03/03 22:22:45 INFO mlflow.tracking.fluent: Experiment with name 'MLflow Feature Experiment' does not exist. Creating a new experiment.


In [114]:
def evaluate_feature_sets(X,y, feature_set_name, features):
    """Train an XGBoost classifier on one feature set and log the run to MLflow.

    Parameters
    ----------
    X : polars.DataFrame
        Must contain the columns named in ``features[0]`` and, when BERT
        embeddings are requested, a ``merchant_name`` column.
    y : sequence of int
        Binary labels aligned with the rows of ``X``.
    feature_set_name : str
        Used in the MLflow run name and in the report file name.
    features : tuple
        ``(column_names, extra_blocks)``; BERT embeddings are appended when
        ``extra_blocks == ['bert']``.

    Notes
    -----
    Relies on notebook globals: ``device``, ``data_test_2``,
    ``create_bert_feature``, ``train_test_split`` and the metric functions.
    Logs parameters, metrics, a CSV of rows from ``data_test_2`` predicted as
    class 1, and the fitted model. Returns nothing.
    """
    with mlflow.start_run(run_name=f'feature_experiment_{feature_set_name}'):
        # Log feature set metadata
        mlflow.log_param("feature_set", feature_set_name)
        mlflow.log_param("features", features)

        has_bert = features[1] == ['bert']

        X_basic = torch.tensor(X[features[0]].to_numpy()).to(device)
        if has_bert:
            # Embed the names of the frame that was passed in, not the notebook-global `data`,
            # so the embedding rows always line up with the rows of X.
            bert_feature = create_bert_feature(X['merchant_name'])
            X_features = torch.cat([X_basic, bert_feature], dim=1)
        else:
            X_features = X_basic
        y_tensor = torch.tensor(y)

        # Log the actual width of the design matrix rather than a hard-coded embedding size.
        mlflow.log_param("num_features", X_features.shape[1])

        X_train, X_test, y_train, y_test = train_test_split(X_features, y_tensor, test_size=0.3, random_state=1)
        params = {
                'objective':'binary:logistic',
                'n_estimators': 100,
                'learning_rate':0.1,
                'max_depth':5,
                'min_child_weight':3,
                'subsample':0.8,
                'colsample_bytree':0.8,
                'tree_method':'hist',
                'device':'cuda',
                'random_state' : 1,
                'early_stopping_rounds' :50
        }
        mlflow.log_params(params)

        model = xgb.XGBClassifier(**params)
        # NOTE(review): the split scored below also drives early stopping here, so the logged
        # metrics are likely optimistic; consider a separate validation split.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

        # Evaluate model on the held-out split
        y_test_cpu = y_test.to('cpu')
        y_pred = model.predict(X_test)
        # ROC AUC is a ranking metric, so feed it the positive-class probability
        # rather than the thresholded labels.
        y_score = model.predict_proba(X_test)[:, 1]
        # Log multiple metrics
        mlflow.log_metric("accuracy", accuracy_score(y_test_cpu, y_pred))
        mlflow.log_metric("f1_score", f1_score(y_test_cpu, y_pred))
        mlflow.log_metric("roc_auc", roc_auc_score(y_test_cpu, y_score))

        # Evaluate model to specific dataset (accuracy only)
        X_specific_basic = torch.tensor(data_test_2[features[0]].to_numpy()).to(device)
        if has_bert:
            data_test_bert = create_bert_feature(data_test_2['merchant_name'])
            X_test_specific = torch.cat([X_specific_basic, data_test_bert], dim=1)
        else:
            X_test_specific = X_specific_basic

        y_test_specific = torch.tensor(data_test_2['label'])
        y_pred_specific = model.predict(X_test_specific)
        mlflow.log_metric("accuracy_specific", accuracy_score(y_test_specific.to('cpu'), y_pred_specific))

        # Save report: names of the rows predicted as class 1. Filter while the
        # predicted_label column is still present, then keep only the name column.
        report_path = f"prediction_result_{feature_set_name}.csv"
        predictions_specific = data_test_2.with_columns(
            pl.lit(y_pred_specific).alias('predicted_label')
        )
        predictions_specific.filter(pl.col('predicted_label') == 1).select('merchant_name').write_csv(report_path)
        mlflow.log_artifact(report_path)

        # Log model
        # Infer the model signature
        signature = infer_signature(X_train, model.predict(X_train))

        mlflow.sklearn.log_model(model, "model",signature=signature)

In [115]:
X = data.drop('label')
y = data['label']

In [116]:
for feature_name, features in feature_sets.items():
    evaluate_feature_sets(X,y,feature_name, features)

[0]	validation_0-logloss:0.57302
[1]	validation_0-logloss:0.52129
[2]	validation_0-logloss:0.47845
[3]	validation_0-logloss:0.44330
[4]	validation_0-logloss:0.41373
[5]	validation_0-logloss:0.37492
[6]	validation_0-logloss:0.34252
[7]	validation_0-logloss:0.31576
[8]	validation_0-logloss:0.29426
[9]	validation_0-logloss:0.28023
[10]	validation_0-logloss:0.26812
[11]	validation_0-logloss:0.25210
[12]	validation_0-logloss:0.23631
[13]	validation_0-logloss:0.22844
[14]	validation_0-logloss:0.21756
[15]	validation_0-logloss:0.21135
[16]	validation_0-logloss:0.20031
[17]	validation_0-logloss:0.19069
[18]	validation_0-logloss:0.18290
[19]	validation_0-logloss:0.17661
[20]	validation_0-logloss:0.17143
[21]	validation_0-logloss:0.16530
[22]	validation_0-logloss:0.16103
[23]	validation_0-logloss:0.15720
[24]	validation_0-logloss:0.15290
[25]	validation_0-logloss:0.14986
[26]	validation_0-logloss:0.14617
[27]	validation_0-logloss:0.14267
[28]	validation_0-logloss:0.14004
[29]	validation_0-loglos



🏃 View run feature_experiment_basic at: http://127.0.0.1:5000/#/experiments/648015007643292645/runs/3db965d48a8a4ee58e8cd088e4984b6b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/648015007643292645
[0]	validation_0-logloss:0.55070
[1]	validation_0-logloss:0.48226
[2]	validation_0-logloss:0.42554
[3]	validation_0-logloss:0.37799
[4]	validation_0-logloss:0.33690
[5]	validation_0-logloss:0.30216
[6]	validation_0-logloss:0.27167
[7]	validation_0-logloss:0.24513
[8]	validation_0-logloss:0.22230
[9]	validation_0-logloss:0.20202
[10]	validation_0-logloss:0.18401
[11]	validation_0-logloss:0.16792
[12]	validation_0-logloss:0.15355
[13]	validation_0-logloss:0.14054
[14]	validation_0-logloss:0.12898
[15]	validation_0-logloss:0.11872
[16]	validation_0-logloss:0.10959
[17]	validation_0-logloss:0.10109
[18]	validation_0-logloss:0.09384
[19]	validation_0-logloss:0.08710
[20]	validation_0-logloss:0.08107
[21]	validation_0-logloss:0.07588
[22]	validation_0-logloss:0.07063
[23]	validation_0-



🏃 View run feature_experiment_basic_with_bert at: http://127.0.0.1:5000/#/experiments/648015007643292645/runs/f667ced415764497bde8b08f101e591b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/648015007643292645


In [37]:
data['label']

label
i64
0
1
0
0
1
…
0
0
0
1
