In [1]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib

# --- Data loading -------------------------------------------------------------
# Read both CSVs. Rows containing any missing value are removed from the
# training frame only; the test frame is kept exactly as read.
train_data = pd.read_csv("./input/wine_train.csv").dropna()
test_data = pd.read_csv("./input/test.csv")

# --- Features / target --------------------------------------------------------
# 'quality' is the regression target; every other column except 'id' is a feature.
y = train_data['quality']
X = train_data.drop(columns=['id', 'quality'])

# --- Hold-out split -----------------------------------------------------------
# 20% of the rows are set aside for validation; the fixed random_state keeps
# the partition identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Model definition ---------------------------------------------------------
# Hyperparameters are collected in one mapping and unpacked into the estimator.
# NOTE(review): no random_state is passed to the model (only the train/validation
# split is seeded) -- consider pinning one if subsampling options are enabled.
lgbm_params = {
    "boosting_type": "gbdt",  # gradient boosting decision trees
    "n_estimators": 100,      # number of boosting rounds (trees)
    "learning_rate": 0.1,     # step size per round
    "num_leaves": 31,         # controls tree complexity
    "max_depth": -1,          # -1 means no depth limit
    "n_jobs": -1,             # use all CPU cores
}
model = LGBMRegressor(**lgbm_params)

# --- Hold-out evaluation ------------------------------------------------------
# Fit on the training subset only, then score the held-out validation rows.
model.fit(X_train, y_train)
val_predictions = model.predict(X_val)

# RMSE is the square root of the mean squared error.
rmse = mean_squared_error(y_val, val_predictions) ** 0.5
print(f"Validation RMSE: {rmse:.4f}")

# --- Final fit and persistence ------------------------------------------------
# Refit the same estimator object on every training row (training + validation
# subsets together), then serialise it to disk with joblib.
model.fit(X, y)
model_filename = "wine_quality_model.pkl"
joblib.dump(model, model_filename)
print(f"Model saved as {model_filename}")

# Build the test feature matrix: drop the 'id' column, then select the columns
# in the same order as the training features `X`. This keeps a differently
# ordered test file from feeding values into the wrong feature; a training
# feature that is missing from the test file raises a KeyError here, and any
# column the model was not trained on is left out.
X_test = test_data.drop(columns=['id'])[X.columns]

# Make predictions on the test dataset
test_predictions = model.predict(X_test)

# Create submission DataFrame: one row per test id with its predicted quality
data = {
    "id": test_data["id"],
    "quality": test_predictions
}

submission = pd.DataFrame(data)

# Round every prediction to the nearest whole number and keep the column as
# float (e.g. 6.0). The vectorised Series.round() replaces the previous
# per-element format-to-string / parse-back-to-float round-trip.
submission["quality"] = submission["quality"].round().astype(float)

# Save the submission file (without the DataFrame index column)
submission.to_csv("submission.csv", index=False)
print("submission.csv file has been saved successfully!")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 966
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 11
[LightGBM] [Info] Start training from score 5.664333
Validation RMSE: 0.7305
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 984
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 11
[LightGBM] [Info] Start training from score 5.660533
Model saved as wine_quality_model.pkl
submission.csv file has been saved successfully!
