In [9]:
!pip install scikit-learn==1.5.2



In [10]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# import xgboost as xgb

# data = pd.read_csv('https://byui-cse.github.io/cse450-course/ice/wine/data/wine-training.csv')
# X = data.drop(columns=['wine'])
# y = data['wine']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# xgb_model = xgb.XGBClassifier(
#     objective='multi:softmax',
#     num_class=3,
#     n_estimators=500,
#     learning_rate=0.05,
#     max_depth=6,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     random_state=42
# )

# xgb_model.fit(X_train, y_train)

# train_score = xgb_model.score(X_train, y_train)
# test_score = xgb_model.score(X_test, y_test)

# print(f"Train Score: {train_score:.4f}")
# print(f"Test Score: {test_score:.4f}")

# missing_values = data.isnull().sum()

# wine_distribution = data['wine'].value_counts(normalize=True)

# missing_values, wine_distribution

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb

data = pd.read_csv('https://byui-cse.github.io/cse450-course/ice/wine/data/wine-training.csv')
X = data.drop(columns=['wine'])
y = data['wine']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

dtrain = xgb.DMatrix(X_train_balanced, label=y_train_balanced)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'eval_metric': 'mlogloss'
}

evals = [(dtrain, 'train'), (dtest, 'eval')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=300,
    evals=evals,
    early_stopping_rounds=20,
    verbose_eval=True
)

y_train_pred = model.predict(dtrain)
y_test_pred = model.predict(dtest)

train_score = accuracy_score(y_train_balanced, y_train_pred)
test_score = accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train_balanced, y_train_pred, average='weighted')
test_f1 = f1_score(y_test, y_test_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

print(f"Train Accuracy: {train_score:.4f}")
print(f"Test Accuracy: {test_score:.4f}")
print(f"Train F1 Score: {train_f1:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


[0]	train-mlogloss:1.03986	eval-mlogloss:1.04835
[1]	train-mlogloss:0.98210	eval-mlogloss:0.98818
[2]	train-mlogloss:0.92883	eval-mlogloss:0.94899
[3]	train-mlogloss:0.87974	eval-mlogloss:0.90293
[4]	train-mlogloss:0.83372	eval-mlogloss:0.85867
[5]	train-mlogloss:0.78994	eval-mlogloss:0.81580
[6]	train-mlogloss:0.74976	eval-mlogloss:0.77636
[7]	train-mlogloss:0.71277	eval-mlogloss:0.74347
[8]	train-mlogloss:0.67703	eval-mlogloss:0.70950
[9]	train-mlogloss:0.64517	eval-mlogloss:0.67966
[10]	train-mlogloss:0.61358	eval-mlogloss:0.65029
[11]	train-mlogloss:0.58424	eval-mlogloss:0.62528
[12]	train-mlogloss:0.55623	eval-mlogloss:0.59629
[13]	train-mlogloss:0.53114	eval-mlogloss:0.57299
[14]	train-mlogloss:0.50635	eval-mlogloss:0.54809
[15]	train-mlogloss:0.48273	eval-mlogloss:0.52492
[16]	train-mlogloss:0.46105	eval-mlogloss:0.50573
[17]	train-mlogloss:0.44069	eval-mlogloss:0.48713
[18]	train-mlogloss:0.42144	eval-mlogloss:0.46914
[19]	train-mlogloss:0.40258	eval-mlogloss:0.44887
[20]	train

In [12]:
import pandas as pd
import xgboost as xgb

new_data = pd.read_csv('https://byui-cse.github.io/cse450-course/ice/wine/data/wine-holdout.csv')

if 'wine' in new_data.columns:
    new_data = new_data.drop(columns=['wine'])

X_new = new_data
dholdout = xgb.DMatrix(new_data)

# Predict using the trained model
y_pred_holdout = model.predict(dholdout)

# Save predictions as a single-column CSV with header 'wine'
results = pd.DataFrame({'wine': y_pred_holdout.astype(int)})  # Convert to int for classification labels
results.to_csv("JasonRao-ice1-predictions.csv", index=False)

print("Predictions saved to JasonRao-ice1-predictions.csv")

Predictions saved to JasonRao-ice1-predictions.csv
