In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import cohen_kappa_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import optuna
from optuna.samplers import TPESampler


In [94]:
# Read the three competition files in a fixed order: training rows, test rows,
# and the submission template that is filled in at the end of the notebook.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Independent copy of the test frame taken before any column is dropped or added.
testAns = test.copy()

In [83]:
# Remove the "id" column from both frames so it is not used as a feature.
train = train.drop("id", axis="columns")
test = test.drop("id", axis="columns")

In [84]:
# Shift the target down by 3 so the smallest label becomes 0; the ensemble
# cells add the 3 back before writing the submission.
# NOTE: running this cell twice shifts the labels twice.
train["quality"] = train["quality"].sub(3)

In [85]:
def add_columns(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    The input frame is not modified. New columns, in insertion order:
    ``total_acid``, ``acid/density``, ``alcohol_density``, ``sulphate/density``,
    ``sulphates/acid``, ``sulphates/chlorides``, ``sulphates*alcohol``,
    ``pH_round1``, ``log1p_residual_sugar``, ``citric_acid_per_alcohol``,
    ``citric_acid_type`` and ``alcohol_mean_groupby_pH``.

    Args:
        df: DataFrame holding the columns "fixed acidity", "volatile acidity",
            "citric acid", "residual sugar", "chlorides", "density", "pH",
            "sulphates" and "alcohol".

    Returns:
        A new DataFrame with the original columns followed by the features.
    """

    def _citric_bucket(value):
        # 0 -> exactly zero, 1 -> exactly 0.49, 2 -> every other value.
        if value == 0:
            return 0
        if value == 0.49:
            return 1
        return 2

    out = df.copy()

    sulphates = out["sulphates"]
    density = out["density"]
    alcohol = out["alcohol"]
    citric = out["citric acid"]

    # Acidity aggregates.
    acid_sum = out["fixed acidity"] + out["volatile acidity"] + citric
    out["total_acid"] = acid_sum
    out["acid/density"] = acid_sum / density

    # Interactions with density, sulphates and alcohol.
    out["alcohol_density"] = alcohol * density
    out["sulphate/density"] = sulphates / density
    out["sulphates/acid"] = sulphates / out["volatile acidity"]
    out["sulphates/chlorides"] = sulphates / out["chlorides"]
    out["sulphates*alcohol"] = sulphates * alcohol

    # pH binned to one decimal; reused below as the grouping key.
    out["pH_round1"] = out["pH"].round(decimals=1)
    out["log1p_residual_sugar"] = np.log1p(out["residual sugar"])

    # Ratio is forced to 0 wherever alcohol is exactly 0.
    out["citric_acid_per_alcohol"] = np.where(alcohol != 0, citric / alcohol, 0)
    out["citric_acid_type"] = citric.apply(_citric_bucket)

    # Mean alcohol of all rows of this frame that share the same pH bin.
    out["alcohol_mean_groupby_pH"] = out.groupby("pH_round1")["alcohol"].transform("mean")
    return out

In [86]:
# Append the engineered features to both frames (add_columns returns a copy).
train = add_columns(train)
test = add_columns(test)

# Inputs for KMeans: every numeric column except the target. A 'cluster'
# column left over from an earlier run of this cell is excluded as well, so
# re-running the cell always clusters on the same features instead of feeding
# the previous cluster labels back in.
numeric_cols = (
    train.drop(columns=['quality'])
    .select_dtypes(include=np.number)
    .columns
    .drop('cluster', errors='ignore')
)

# Fit the clusters on the training rows only, then label the test rows with
# the same fitted centroids.
kmeans = KMeans(n_clusters=3, random_state=42)
train['cluster'] = kmeans.fit_predict(train[numeric_cols])
test['cluster'] = kmeans.predict(test[numeric_cols])

# Feature matrix (including the new 'cluster' column) and target vector.
X = train.drop(columns=['quality'])
y = train['quality']

In [87]:
# Inputs for KMeans: numeric columns without the target. When this cell runs,
# `train` may already carry a 'cluster' column from a previous clustering; it
# is excluded so the new clusters are not fitted on the old cluster labels and
# the cell gives the same result however many times it is executed.
numeric_cols = (
    train.select_dtypes(include=np.number)
    .drop(columns=["quality"])
    .columns
    .drop("cluster", errors="ignore")
)

# Fit on the training rows, then assign test rows to the fitted centroids.
kmeans = KMeans(n_clusters=3, random_state=42)
train['cluster'] = kmeans.fit_predict(train[numeric_cols])
test['cluster'] = kmeans.predict(test[numeric_cols])

In [88]:
# Target vector and feature matrix (everything in `train` except the target).
y = train['quality']
X = train.drop(columns='quality')

# Hold out 20% of the rows for validation, keeping the class proportions of
# `y` in both parts; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [89]:
# Three untuned gradient-boosting baselines sharing the same seed.
xgb, lgb, cat = (
    XGBClassifier(random_state=42, eval_metric='mlogloss', use_label_encoder=False),
    LGBMClassifier(random_state=42),
    CatBoostClassifier(random_state=42, verbose=0),
)


In [90]:
# Fit every baseline on the training part of the split, in the same order as
# before (XGBoost, LightGBM, CatBoost).
for booster in (xgb, lgb):
    booster.fit(X_train, y_train)

# Kept as the last expression so the cell still displays the fitted CatBoost model.
cat.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000580 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3086
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 24
[LightGBM] [Info] Start training from score -5.403678
[LightGBM] [Info] Start training from score -3.572579
[LightGBM] [Info] Start training from score -0.832409
[LightGBM] [Info] Start training from score -0.923189
[LightGBM] [Info] Start training from score -2.107155
[LightGBM] [Info] Start training from score -4.298912


<catboost.core.CatBoostClassifier at 0x168558e1880>

In [91]:
def evaluate(model, name):
    """Score a fitted classifier on the validation split and predict the test set.

    Uses the notebook-level ``X_val``, ``y_val`` and ``test`` frames. Prints the
    quadratic-weighted Cohen's kappa of the model's validation predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Label used in the printed score line.

    Returns:
        1-D numpy array with the predicted label of every row of ``test``.
    """
    # Flatten to 1-D: CatBoost returns multiclass labels as an (n, 1) column,
    # whereas XGBoost and LightGBM return a flat (n,) array.
    val_preds = np.asarray(model.predict(X_val)).ravel()
    score = cohen_kappa_score(y_val, val_preds, weights='quadratic')
    print(f"{name} Quadratic Kappa Score: {score:.4f}")
    # Same flattening for the test predictions, so callers can combine the
    # outputs of different models element-wise without accidental broadcasting.
    return np.asarray(model.predict(test)).ravel()

In [92]:
# Print the validation score of each baseline and keep its test predictions.
# The generator is consumed left to right, so the scores print in this order.
preds_xgb, preds_lgb, preds_cat = (
    evaluate(fitted, label)
    for fitted, label in ((xgb, "XGBoost"), (lgb, "LightGBM"), (cat, "CatBoost"))
)


XGBoost Quadratic Kappa Score: 0.3190
LightGBM Quadratic Kappa Score: 0.3515
CatBoost Quadratic Kappa Score: 0.3393


In [93]:
# Flatten every prediction to 1-D before averaging: CatBoost returns multiclass
# labels as an (n, 1) column, and adding that to the (n,) arrays of the other
# models would broadcast into an (n, n) matrix instead of one value per row.
preds_flat = [np.asarray(p).ravel() for p in (preds_xgb, preds_lgb, preds_cat)]

# Average the three predicted labels, round to the nearest class, and add 3 to
# put the result back on the 3-8 scale.
final_preds = np.round((preds_flat[0] + preds_flat[1] + preds_flat[2]) / 3).astype(int) + 3


In [96]:
# Settings that are fixed rather than searched. They are applied both inside
# the objective and to the final model, because `study.best_params` only
# contains the values Optuna sampled.
LGB_FIXED_PARAMS = {
    # LightGBM ignores `subsample` unless bagging is enabled with a non-zero
    # `subsample_freq`; without it the tuned `subsample` value has no effect.
    "subsample_freq": 1,
    "random_state": 42,
}


def lgb_objective(trial):
    """Optuna objective for LightGBM.

    Samples a hyper-parameter set, fits an LGBMClassifier on the notebook-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Quadratic-weighted Cohen's kappa on the validation split.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 10, 100),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        **LGB_FIXED_PARAMS,
    }

    model = LGBMClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return cohen_kappa_score(y_val, preds, weights="quadratic")

study_lgb = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study_lgb.optimize(lgb_objective, n_trials=30)

# Refit the best configuration on all labelled rows, with the same fixed
# settings that were used while tuning, then predict the test set.
lgb_best = LGBMClassifier(**study_lgb.best_params, **LGB_FIXED_PARAMS)
lgb_best.fit(X, y)
preds_lgb = lgb_best.predict(test)


NameError: name 'optuna' is not defined

In [None]:
# Settings that are fixed rather than searched. They are applied both inside
# the objective and to the final model, because `study.best_params` only
# contains the values Optuna sampled.
CAT_FIXED_PARAMS = {
    "verbose": 0,
    "random_state": 42,
}


def cat_objective(trial):
    """Optuna objective for CatBoost.

    Samples a hyper-parameter set, fits a CatBoostClassifier on the
    notebook-level ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Quadratic-weighted Cohen's kappa on the validation split.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        **CAT_FIXED_PARAMS,
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return cohen_kappa_score(y_val, preds, weights="quadratic")

study_cat = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study_cat.optimize(cat_objective, n_trials=30)

# Refit the best configuration on all labelled rows, with the same fixed
# settings (including the seed) that were used while tuning.
cat_best = CatBoostClassifier(**study_cat.best_params, **CAT_FIXED_PARAMS)
cat_best.fit(X, y)

# CatBoost returns multiclass labels as an (n, 1) column; flatten to (n,) so
# the predictions can be averaged element-wise with the other models' output.
preds_cat = np.asarray(cat_best.predict(test)).ravel()


In [None]:
# Settings that are fixed rather than searched. They are applied both inside
# the objective and to the final model, because `study.best_params` only
# contains the values Optuna sampled.
XGB_FIXED_PARAMS = {
    "use_label_encoder": False,
    "eval_metric": "mlogloss",
    "random_state": 42,
}


def xgb_objective(trial):
    """Optuna objective for XGBoost.

    Samples a hyper-parameter set, fits an XGBClassifier on the notebook-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Quadratic-weighted Cohen's kappa on the validation split.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        **XGB_FIXED_PARAMS,
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return cohen_kappa_score(y_val, preds, weights="quadratic")

study_xgb = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study_xgb.optimize(xgb_objective, n_trials=30)

# Refit the best configuration on all labelled rows, with the same fixed
# settings that were used while tuning, then predict the test set.
xgb_best = XGBClassifier(**study_xgb.best_params, **XGB_FIXED_PARAMS)
xgb_best.fit(X, y)
preds_xgb = xgb_best.predict(test)


In [None]:
# Flatten every prediction to 1-D before averaging: CatBoost returns multiclass
# labels as an (n, 1) column, and adding that to the (n,) arrays of the other
# models would broadcast into an (n, n) matrix instead of one value per row.
preds_flat = [np.asarray(p).ravel() for p in (preds_xgb, preds_lgb, preds_cat)]

# Average the three predicted labels, round to the nearest class, and add 3 to
# put the result back on the 3-8 scale.
final_preds = np.round((preds_flat[0] + preds_flat[1] + preds_flat[2]) / 3).astype(int) + 3


In [95]:
# Put the ensemble predictions into the template's 'quality' column and write
# the result, without the row index, to submission.csv.
sample_submission = sample_submission.assign(quality=final_preds)
sample_submission.to_csv("submission.csv", index=False)