In [1]:
from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import joblib
import optuna
import shutil
import glob
import json
import gc

warnings.filterwarnings("ignore")

In [10]:
class CFG:
    """Notebook-wide settings: input file locations, target column and CV setup."""

    # --- competition files ---
    train_path = "train.csv"
    test_path = "test.csv"
    sample_sub_path = "sample_submission.csv"

    # --- additional dataset loaded alongside the competition data ---
    original_path = "Fertilizer Prediction.csv"

    # --- modelling ---
    target = "Fertilizer Name"  # name of the label column
    n_folds = 5                 # number of stratified CV splits
    seed = 42                   # random seed shared by CV and feature scoring

DATA LOADING AND PREPROCESSING

In [11]:
# Load the competition data (indexed by `id`) and the additional "original" dataset.
train = pd.read_csv(CFG.train_path, index_col="id")
test = pd.read_csv(CFG.test_path, index_col="id")
original = pd.read_csv(CFG.original_path)

# Categorical features are every string column except the target.
cat_cols = [c for c in train.select_dtypes(include="object").columns if c != CFG.target]

# Give each categorical column ONE shared dtype across train / test / original.
# Inferring categories per frame (plain `.astype("category")`) yields different dtypes
# whenever a level is missing from one frame; concatenating such columns (as
# Trainer.fit_predict does with X_original) then silently falls back to object dtype.
# Sorting the union reproduces the ordering pandas infers when all frames share the
# same levels, so results are unchanged in that case.
for col in cat_cols:
    train_str = train[col].astype(str)
    test_str = test[col].astype(str)
    original_str = original[col].astype(str)

    levels = sorted(set(train_str.unique()) | set(test_str.unique()) | set(original_str.unique()))
    shared_dtype = pd.CategoricalDtype(categories=levels)

    train[col] = train_str.astype(shared_dtype)
    test[col] = test_str.astype(shared_dtype)
    original[col] = original_str.astype(shared_dtype)

# Encode the target as integer class ids; the encoder is fitted on the competition
# labels only, so `transform` raises if `original` contains a label unseen in `train`.
label_encoder = LabelEncoder()
train[CFG.target] = label_encoder.fit_transform(train[CFG.target])
original[CFG.target] = label_encoder.transform(original[CFG.target])

X = train.drop(CFG.target, axis=1)
y = train[CFG.target]
X_test = test

X_original = original.drop(CFG.target, axis=1)
y_original = original[CFG.target]

In [12]:
# Mutual information between each feature and the target on the competition data.
# mutual_info_classif validates X as a numeric array, so the string-valued categorical
# columns are replaced by their integer category codes first. With
# discrete_features=True the score does not depend on how the levels are labelled.
X_mi = X.copy()
for col in X_mi.select_dtypes(include="category").columns:
    X_mi[col] = X_mi[col].cat.codes

mi_scores = mutual_info_classif(X_mi, y, random_state=CFG.seed, discrete_features=True)

mutual_info = (
    pd.Series(mi_scores, index=X.columns)
    .sort_values(ascending=False)
    .to_frame("Mutual Information")
)
mutual_info.style.bar(subset=['Mutual Information'], cmap='RdYlGn')

Unnamed: 0,Mutual Information
Moisture,0.003308
Phosphorous,0.002984
Nitrogen,0.002373
Crop Type,0.002088
Potassium,0.001367
Humidity,0.001015
Temparature,0.000966
Soil Type,0.000617


In [13]:
# Mutual information between each feature and the target on the "original" dataset.
# mutual_info_classif validates X as a numeric array, so the string-valued categorical
# columns are replaced by their integer category codes first. With
# discrete_features=True the score does not depend on how the levels are labelled.
X_original_mi = X_original.copy()
for col in X_original_mi.select_dtypes(include="category").columns:
    X_original_mi[col] = X_original_mi[col].cat.codes

mi_scores = mutual_info_classif(X_original_mi, y_original, random_state=CFG.seed, discrete_features=True)

mutual_info = (
    pd.Series(mi_scores, index=X_original.columns)
    .sort_values(ascending=False)
    .to_frame("Mutual Information")
)
mutual_info.style.bar(subset=['Mutual Information'], cmap='RdYlGn')

Unnamed: 0,Mutual Information
Phosphorous,0.001362
Moisture,0.001355
Nitrogen,0.001068
Potassium,0.000626
Humidity,0.000494
Temparature,0.000432
Crop Type,0.00032
Soil Type,0.000107


In [None]:
def map3(y_true, y_pred_probs):
    """Mean Average Precision at 3 for single-label multiclass predictions.

    For every row the three highest-probability classes are taken (best first).
    A row scores 1, 1/2 or 1/3 when the true class is ranked 1st, 2nd or 3rd,
    and 0 when it is not among the top three. The result is the mean over rows.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class ids; they are compared against column indices of `y_pred_probs`.
    y_pred_probs : array-like of shape (n_samples, n_classes)
        Per-class scores; larger means more likely.

    Returns
    -------
    float
        Mean AP@3, or 0.0 when there are no rows.

    Raises
    ------
    ValueError
        If `y_pred_probs` is not 2-D or its row count differs from `len(y_true)`.
    """
    labels = np.asarray(y_true).reshape(-1)
    probs = np.asarray(y_pred_probs)

    # Fail loudly instead of silently scoring only the overlapping prefix.
    if probs.ndim != 2 or probs.shape[0] != labels.shape[0]:
        raise ValueError(
            f"y_pred_probs must be 2-D with one row per label; "
            f"got shape {probs.shape} for {labels.shape[0]} labels"
        )
    if labels.size == 0:
        return 0.0

    # Column indices of the (up to) three largest scores per row, best first.
    top3 = np.argsort(probs, axis=1)[:, -3:][:, ::-1]

    # The top-3 indices of a row are distinct, so at most one position matches.
    hits = top3 == labels[:, None]
    rank_weights = 1.0 / np.arange(1, top3.shape[1] + 1)

    return (hits * rank_weights).sum(axis=1).mean()

In [14]:
class Trainer:
    """Stratified K-fold cross-validation harness for a scikit-learn style classifier.

    A fresh clone of `model` is fitted on every fold, so the instance passed in is
    never fitted itself. Fold count and seed are read from `config.n_folds` and
    `config.seed`.
    """

    def __init__(self, model, config=CFG):
        self.model = model
        self.config = config

    def _make_splitter(self):
        """Return the shuffled, seeded StratifiedKFold used by both public methods."""
        return StratifiedKFold(
            n_splits=self.config.n_folds,
            random_state=self.config.seed,
            shuffle=True,
        )

    def fit_predict(self, X, y, X_test, X_original=None, y_original=None, fit_args=None):
        """Cross-validate the model and produce out-of-fold and test probabilities.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : pandas.Series
            Training labels, row-aligned with `X` by position.
        X_test : pandas.DataFrame
            Features to predict with every fold model.
        X_original, y_original : optional
            Extra labelled rows appended to the training part of every fold
            (never to the validation part). Used only when both are given.
        fit_args : dict, optional
            Extra keyword arguments for `model.fit`. When non-empty, the validation
            fold is also passed as `eval_set=[(X_val, y_val)]`.

        Returns
        -------
        tuple
            `(oof_pred_probs, test_pred_probs, scores)`: out-of-fold probabilities
            of shape (len(X), n_classes), fold-averaged test probabilities of shape
            (len(X_test), n_classes), and the list of per-fold MAP@3 scores.
        """
        print(f"Training {self.model.__class__.__name__}\n")

        # `None` default instead of a shared mutable `{}`.
        fit_args = fit_args or {}

        n_classes = y.nunique()
        scores = []
        oof_pred_probs = np.zeros((X.shape[0], n_classes))
        test_pred_probs = np.zeros((X_test.shape[0], n_classes))

        skf = self._make_splitter()
        for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y)):
            # The splitter yields POSITIONS, so select positionally on both X and y.
            # Plain `y[train_idx]` would be a label lookup on the Series index.
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            if X_original is not None and y_original is not None:
                X_train = pd.concat([X_train, X_original])
                y_train = pd.concat([y_train, y_original])

            model = clone(self.model)

            if fit_args:
                # The validation fold doubles as the evaluation set, so whatever
                # monitoring `fit_args` or the model configure sees the scored fold.
                model.fit(X_train, y_train, **fit_args, eval_set=[(X_val, y_val)])
            else:
                model.fit(X_train, y_train)

            y_pred_probs = model.predict_proba(X_val)
            oof_pred_probs[val_idx] = y_pred_probs

            # Average the test predictions of the fold models.
            temp_test_pred_probs = model.predict_proba(X_test)
            test_pred_probs += temp_test_pred_probs / self.config.n_folds

            score = map3(y_val, y_pred_probs)
            scores.append(score)

            # Release the per-fold objects before the next fold is built.
            del model, X_train, y_train, X_val, y_val, y_pred_probs
            gc.collect()

            if fit_args:
                print(f"\n--- Fold {fold_idx + 1} - MAP@3: {score:.6f}\n\n")
            else:
                print(f"--- Fold {fold_idx + 1} - MAP@3: {score:.6f}")

        overall_score = map3(y, oof_pred_probs)

        print(f"\n------ Overall MAP@3: {overall_score:.6f} | Average MAP@3: {np.mean(scores):.6f} ± {np.std(scores):.6f}")

        return oof_pred_probs, test_pred_probs, scores

    def tune(self, X, y):
        """Return the mean per-fold MAP@3 of the model under the same CV scheme.

        Unlike `fit_predict`, nothing is printed, no extra fit arguments are used
        and no predictions are kept.
        """
        scores = []

        skf = self._make_splitter()
        for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y)):
            # Positional selection, for the same reason as in `fit_predict`.
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = clone(self.model)
            model.fit(X_train, y_train)

            y_pred_probs = model.predict_proba(X_val)
            score = map3(y_val, y_pred_probs)
            scores.append(score)

            del model, X_train, y_train, X_val, y_val, y_pred_probs
            gc.collect()

        return np.mean(scores)

In [8]:
# Hyper-parameters for the LightGBM (gbdt) base model.
# NOTE(review): `subsample` is set without a `subsample_freq` — confirm against the
# LightGBM docs whether row subsampling is actually active with this configuration.
lgbm_params = dict(
    boosting_type="gbdt",
    n_estimators=10000,
    learning_rate=0.06408094783107429,
    num_leaves=169,
    max_depth=10,
    min_child_samples=19,
    subsample=0.6420340301820501,
    colsample_bytree=0.43403799235854973,
    reg_alpha=6.294093849568123,
    reg_lambda=5.5559072866866455,
    random_state=42,
    verbosity=-1,
)

# Hyper-parameters for a LightGBM GOSS variant.
# NOTE(review): no cell visible in this view builds a model from these — confirm
# whether the configuration is still needed. It also spells the logging switch
# `verbose`, whereas `lgbm_params` uses `verbosity`.
lgbm_goss_params = dict(
    boosting_type="goss",
    colsample_bytree=0.39736332491996407,
    learning_rate=0.008033740989500222,
    min_child_samples=29,
    min_child_weight=0.6732469853333759,
    n_estimators=10000,
    n_jobs=-1,
    num_leaves=89,
    random_state=42,
    reg_alpha=15.595856670965969,
    reg_lambda=51.43625034648377,
    subsample=0.07846482736630467,
    verbose=-1,
)

# Hyper-parameters for the XGBoost base model. `early_stopping_rounds` is set here
# on the estimator; the evaluation set itself is supplied at fit time by the Trainer.
xgb_params = dict(
    max_depth=12,
    colsample_bytree=0.467,
    subsample=0.86,
    n_estimators=10000,
    learning_rate=0.03,
    gamma=0.26,
    max_delta_step=4,
    reg_alpha=2.7,
    reg_lambda=1.4,
    early_stopping_rounds=100,
    objective='multi:softprob',
    random_state=13,
    enable_categorical=True,
)

In [9]:
# Per-model result stores, keyed by model name: CV fold scores, out-of-fold
# probabilities and test probabilities. Three separate dict objects.
scores, oof_pred_probs, test_pred_probs = {}, {}, {}

In [15]:
# Cross-validate the LightGBM (gbdt) base model, with the original dataset appended
# to every training fold, and file its results under one shared key.
lgbm_model = LGBMClassifier(**lgbm_params)
lgbm_trainer = Trainer(lgbm_model)

# Log the validation metric every 500 rounds and stop after 100 rounds without improvement.
fit_args = dict(
    callbacks=[
        log_evaluation(period=500),
        early_stopping(stopping_rounds=100),
    ]
)

lgbm_key = "LightGBM (gbdt)"
(
    oof_pred_probs[lgbm_key],
    test_pred_probs[lgbm_key],
    scores[lgbm_key],
) = lgbm_trainer.fit_predict(X, y, X_test, X_original, y_original, fit_args)

Training LGBMClassifier

Training until validation scores don't improve for 100 rounds
[500]	valid_0's multi_logloss: 1.8979
[1000]	valid_0's multi_logloss: 1.8964
Early stopping, best iteration is:
[916]	valid_0's multi_logloss: 1.89617

--- Fold 1 - MAP@3: 0.360647


Training until validation scores don't improve for 100 rounds
[500]	valid_0's multi_logloss: 1.89711
Early stopping, best iteration is:
[878]	valid_0's multi_logloss: 1.89455

--- Fold 2 - MAP@3: 0.360561


Training until validation scores don't improve for 100 rounds
[500]	valid_0's multi_logloss: 1.89629
[1000]	valid_0's multi_logloss: 1.89353
Early stopping, best iteration is:
[924]	valid_0's multi_logloss: 1.89343

--- Fold 3 - MAP@3: 0.363106


Training until validation scores don't improve for 100 rounds
[500]	valid_0's multi_logloss: 1.89793
[1000]	valid_0's multi_logloss: 1.89598
Early stopping, best iteration is:
[921]	valid_0's multi_logloss: 1.89581

--- Fold 4 - MAP@3: 0.360882


Training until validation sco

In [16]:
# Cross-validate the XGBoost base model, with the original dataset appended to every
# training fold, and file its results under one shared key.
xgb_model = XGBClassifier(**xgb_params)
xgb_trainer = Trainer(xgb_model)

# Forwarded to `fit`: print the evaluation metric every 500 rounds.
fit_args = dict(verbose=500)

xgb_key = "XGBoost"
(
    oof_pred_probs[xgb_key],
    test_pred_probs[xgb_key],
    scores[xgb_key],
) = xgb_trainer.fit_predict(X, y, X_test, X_original, y_original, fit_args)

Training XGBClassifier

[0]	validation_0-mlogloss:1.94563
[500]	validation_0-mlogloss:1.89872
[1000]	validation_0-mlogloss:1.89194
[1500]	validation_0-mlogloss:1.89108

--- Fold 1 - MAP@3: 0.365473


[0]	validation_0-mlogloss:1.94561
[500]	validation_0-mlogloss:1.89826
[1000]	validation_0-mlogloss:1.89089
[1500]	validation_0-mlogloss:1.88981
[1560]	validation_0-mlogloss:1.88984

--- Fold 2 - MAP@3: 0.365563


[0]	validation_0-mlogloss:1.94561
[500]	validation_0-mlogloss:1.89743
[1000]	validation_0-mlogloss:1.88975
[1500]	validation_0-mlogloss:1.88835
[1653]	validation_0-mlogloss:1.88842

--- Fold 3 - MAP@3: 0.367652


[0]	validation_0-mlogloss:1.94562
[500]	validation_0-mlogloss:1.89878
[1000]	validation_0-mlogloss:1.89187
[1500]	validation_0-mlogloss:1.89076
[1591]	validation_0-mlogloss:1.89083

--- Fold 4 - MAP@3: 0.365872


[0]	validation_0-mlogloss:1.94562
[500]	validation_0-mlogloss:1.89801
[1000]	validation_0-mlogloss:1.89081
[1500]	validation_0-mlogloss:1.88987
[1527]	validation

In [17]:
# Build the submission file from the XGBoost test predictions (single most likely
# class per row).
# NOTE(review): the CV metric in this notebook is MAP@3, while this file carries only
# one label per row — confirm that a top-1 submission is intended here.
submission_path = "submission2.csv"

final_test_preds = np.argmax(test_pred_probs["XGBoost"], axis=1)
final_test_preds_labels = label_encoder.inverse_transform(final_test_preds)
submission = pd.read_csv(CFG.sample_sub_path, index_col="id")
submission[CFG.target] = final_test_preds_labels
submission.to_csv(submission_path)

# Report the file that was actually written (the old message named "submission.csv").
print(f"✅ {submission_path} başarıyla kaydedildi!")

✅ submission.csv başarıyla kaydedildi!


In [18]:
# Stacking features: every base model's class probabilities placed side by side.
# NOTE: this rebinds `X` and `X_test`, so the raw feature frames are no longer
# reachable under those names after this cell has run.
oof_blocks = [*oof_pred_probs.values()]
test_blocks = [*test_pred_probs.values()]

X = pd.DataFrame(np.concatenate(oof_blocks, axis=1))
X_test = pd.DataFrame(np.concatenate(test_blocks, axis=1))

In [19]:
# Meta-model: L1-regularised logistic regression fitted on the stacked base-model
# probabilities. Only its test probabilities and fold scores are kept.
lr_params = dict(
    solver="liblinear",
    penalty="l1",
    C=0.8825262312223568,
    tol=0.00031213649178629937,
    fit_intercept=False,
)
lr_model = LogisticRegression(**lr_params)

lr_trainer = Trainer(lr_model)
lr_results = lr_trainer.fit_predict(X, y, X_test)
_, lr_test_pred_probs, scores["LogisticRegression"] = lr_results

Training LogisticRegression

--- Fold 1 - MAP@3: 0.365663
--- Fold 2 - MAP@3: 0.366039
--- Fold 3 - MAP@3: 0.367821
--- Fold 4 - MAP@3: 0.366324
--- Fold 5 - MAP@3: 0.365991

------ Overall MAP@3: 0.366368 | Average MAP@3: 0.366368 ± 0.000756


In [20]:
# For every test row, take the three most probable classes (best first), map the
# class ids back to label names and join them with spaces.
top3_class_ids = np.argsort(lr_test_pred_probs)[:, -3:][:, ::-1]
final_predictions = [
    " ".join(label_encoder.inverse_transform(class_ids))
    for class_ids in top3_class_ids
]

In [21]:
# Fill the sample submission with the top-3 label strings and save it under a file
# name that carries the meta-model's mean CV score.
sub = pd.read_csv(CFG.sample_sub_path).assign(**{CFG.target: final_predictions})
sub.to_csv(f"sub_logistic-regression_{np.mean(scores['LogisticRegression']):.6f}.csv", index=False)
sub.head()

Unnamed: 0,id,Fertilizer Name
0,750000,10-26-26 20-20 DAP
1,750001,17-17-17 10-26-26 20-20
2,750002,20-20 28-28 Urea
3,750003,14-35-14 17-17-17 DAP
4,750004,20-20 Urea 10-26-26
