# Match Plants Classical Benchmark (Logistic Regression)

Baseline: use frozen ResNet18 embeddings (precomputed), build simple pair features,
and train a logistic regression classifier. No deep model training.


In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression

DATA_DIR = 'data'
TRAIN_CSV = os.path.join(DATA_DIR, 'train_data.csv')
EMB_PATH = os.path.join(DATA_DIR, 'embeddings_resnet18.npz')


In [2]:
train_df = pd.read_csv(TRAIN_CSV)
print('train', train_df.shape)
print(train_df['class'].value_counts())
train_df.head()


train (2400, 4)
class
0    1601
1     799
Name: count, dtype: int64


Unnamed: 0,Pair_Num,img_idx1,img_idx2,class
0,372,182,684,0
1,71,477,990,0
2,2561,769,240,1
3,1104,906,36,0
4,2149,123,344,1


In [3]:
emb = np.load(EMB_PATH)
img_ids = emb['img_ids']
embeddings = emb['embeddings']

id_to_idx = {int(i): idx for idx, i in enumerate(img_ids)}
unique_ids = np.unique(train_df[['img_idx1', 'img_idx2']].values)
missing = [int(i) for i in unique_ids if int(i) not in id_to_idx]
print('missing ids', missing[:5], 'count', len(missing))


missing ids [] count 0


In [4]:
def build_features(df):
    idx1 = np.array([id_to_idx[int(i)] for i in df['img_idx1']])
    idx2 = np.array([id_to_idx[int(i)] for i in df['img_idx2']])
    v1 = embeddings[idx1]
    v2 = embeddings[idx2]
    # Pair features: absolute diff + elementwise product
    x = np.concatenate([np.abs(v1 - v2), v1 * v2], axis=1)
    y = df['class'].values.astype(int)
    return x, y


In [5]:
train_pairs, val_pairs = train_test_split(
    train_df, test_size=0.2, stratify=train_df['class'], random_state=42
)
print('pair split', train_pairs.shape, val_pairs.shape)

x_train, y_train = build_features(train_pairs)
x_val, y_val = build_features(val_pairs)

clf = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='lbfgs'
)
clf.fit(x_train, y_train)

val_probs = clf.predict_proba(x_val)[:, 1]
best_f1 = 0.0
best_t = 0.5
for t in np.linspace(0.1, 0.9, 81):
    pred = (val_probs >= t).astype(int)
    f1 = f1_score(y_val, pred)
    if f1 > best_f1:
        best_f1 = f1
        best_t = float(t)

pred_best = (val_probs >= best_t).astype(int)
acc_best = accuracy_score(y_val, pred_best)
pred_05 = (val_probs >= 0.5).astype(int)
acc_05 = accuracy_score(y_val, pred_05)

metrics = {
    'n_train': int(len(train_pairs)),
    'n_val': int(len(val_pairs)),
    'pos_ratio_train': float(train_pairs['class'].mean()),
    'pos_ratio_val': float(val_pairs['class'].mean()),
    'best_f1': float(best_f1),
    'best_t': float(best_t),
    'acc_best_t': float(acc_best),
    'acc_at_0_5': float(acc_05),
}
metrics


pair split (1920, 4) (480, 4)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'n_train': 1920,
 'n_val': 480,
 'pos_ratio_train': 0.3328125,
 'pos_ratio_val': 0.3333333333333333,
 'best_f1': 0.631578947368421,
 'best_t': 0.43000000000000005,
 'acc_best_t': 0.7375,
 'acc_at_0_5': 0.7354166666666667}