# Week 6 Assignment - Wine Dataset Classification

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

## Load Dataset and Split

In [2]:
# Fetch scikit-learn's bundled wine data and unpack features and labels.
wine = load_wine()
X, y = wine.data, wine.target

# Report the feature-matrix dimensions and the distinct class labels.
print("Dataset shape:", X.shape)
print("Number of classes:", np.unique(y).size)
print("Classes:", np.unique(y))

Dataset shape: (178, 13)
Number of classes: 3
Classes: [0 1 2]


In [3]:
# Hold out 30% of the samples for testing; random_state=1 fixes the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# Number of rows in each partition.
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))

Training set size: 124
Test set size: 54


## Questions 1 & 2: Logistic Regression

In [4]:
# Fit a logistic regression with the hyperparameters fixed by the assignment.
# NOTE(review): scikit-learn documents that the 'sag' solver converges quickly
# only when features share a similar scale. X_train is used unscaled here,
# which presumably explains the large iteration count; the parameters are kept
# as specified.
lr_model = LogisticRegression(
    solver='sag',
    C=0.5,
    max_iter=10000,
    random_state=1,
).fit(X_train, y_train)

# Predict the held-out samples and measure the fraction classified correctly.
y_pred_lr = lr_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

print(f"Accuracy on test dataset: {accuracy_lr:.3f}")
print(f"\nQuestion 1 Answer: {accuracy_lr:.3f}")

Accuracy on test dataset: 0.944

Question 1 Answer: 0.944


In [5]:
# n_iter_ is an array of iteration counts recorded by the fitted model;
# its first entry is reported as the answer.
iteration_counts = lr_model.n_iter_
n_iterations = iteration_counts[0]

print(
    f"Number of iterations to converge: {n_iterations}",
    f"\nQuestion 2 Answer: {n_iterations}",
    sep="\n",
)

Number of iterations to converge: 4750

Question 2 Answer: 4750


## Questions 3 & 4: Decision Tree with GridSearchCV

In [6]:
# Candidate hyperparameter values for the decision tree; the grid search
# evaluates every combination.
dt_param_grid = dict(
    criterion=['entropy', 'gini'],
    splitter=['random', 'best'],
    min_samples_leaf=[2, 4, 6, 8, 10],
    max_depth=[3, 4, 5, 6],
)

# Base estimator; random_state=1 keeps tree construction reproducible.
dt_model = DecisionTreeClassifier(random_state=1)

# 4-fold cross-validated search scored by accuracy, run in parallel (n_jobs=-1).
dt_grid_search = GridSearchCV(
    dt_model,
    dt_param_grid,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
)

# Run the search on the training partition only.
print("Training Decision Tree GridSearchCV...")
dt_grid_search.fit(X_train, y_train)
print("GridSearchCV training completed!")

Training Decision Tree GridSearchCV...
GridSearchCV training completed!


In [7]:
# Keep the estimator chosen by the grid search for later evaluation.
best_dt_model = dt_grid_search.best_estimator_

# Show the winning hyperparameter combination.
print("Best Parameters:", dt_grid_search.best_params_, sep="\n")

Best Parameters:
{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 2, 'splitter': 'best'}


In [8]:
# Score the tuned decision tree on the held-out test partition.
y_pred_dt = best_dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

print(
    f"Accuracy on test dataset: {accuracy_dt:.3f}",
    f"\nQuestion 3 Answer: {accuracy_dt:.3f}",
    sep="\n",
)

Accuracy on test dataset: 0.907

Question 3 Answer: 0.907


In [9]:
# Pull the selected tree depth out of the winning parameter set.
dt_best_params = dt_grid_search.best_params_
best_max_depth = dt_best_params['max_depth']

print(
    f"Best max_depth: {best_max_depth}",
    f"\nQuestion 4 Answer: {best_max_depth}",
    sep="\n",
)

Best max_depth: 4

Question 4 Answer: 4


## Questions 5 & 6: AdaBoost with GridSearchCV

In [10]:
# Candidate ensemble sizes and learning rates for AdaBoost.
ada_param_grid = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.5, 1, 2],
)

# Base estimator; random_state=1 keeps boosting reproducible.
ada_model = AdaBoostClassifier(random_state=1)

# 4-fold cross-validated search scored by accuracy, run in parallel (n_jobs=-1).
ada_grid_search = GridSearchCV(
    ada_model,
    ada_param_grid,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
)

# Run the search on the training partition only.
print("Training AdaBoost GridSearchCV...")
ada_grid_search.fit(X_train, y_train)
print("GridSearchCV training completed!")

Training AdaBoost GridSearchCV...
GridSearchCV training completed!


In [11]:
# Keep the estimator chosen by the grid search for later evaluation.
best_ada_model = ada_grid_search.best_estimator_

# Show the winning hyperparameter combination.
print("Best Parameters:", ada_grid_search.best_params_, sep="\n")

Best Parameters:
{'learning_rate': 0.5, 'n_estimators': 100}


In [12]:
# Score the tuned AdaBoost model on the held-out test partition.
y_pred_ada = best_ada_model.predict(X_test)
accuracy_ada = accuracy_score(y_test, y_pred_ada)

print(
    f"Accuracy on test dataset: {accuracy_ada:.3f}",
    f"\nQuestion 5 Answer: {accuracy_ada:.3f}",
    sep="\n",
)

Accuracy on test dataset: 0.963

Question 5 Answer: 0.963


In [13]:
# Pull the selected ensemble size out of the winning parameter set.
ada_best_params = ada_grid_search.best_params_
best_n_estimators = ada_best_params['n_estimators']

print(
    f"Best n_estimators: {best_n_estimators}",
    f"\nQuestion 6 Answer: {best_n_estimators}",
    sep="\n",
)

Best n_estimators: 100

Question 6 Answer: 100


## Question 7: Voting Classifier

In [14]:
# Three ensemble learners, each seeded with random_state=1 for reproducibility.
bagging_clf = BaggingClassifier(random_state=1)
rf_clf = RandomForestClassifier(random_state=1)
gb_clf = GradientBoostingClassifier(random_state=1)

# Combine them with hard voting, i.e. a vote over the predicted class labels.
voting_members = [
    ('bagging', bagging_clf),
    ('random_forest', rf_clf),
    ('gradient_boosting', gb_clf),
]
voting_clf = VotingClassifier(voting_members, voting='hard')

# Fit the combined classifier on the training partition.
print("Training Voting Classifier...")
voting_clf.fit(X_train, y_train)
print("Training completed!")

Training Voting Classifier...
Training completed!


In [15]:
# Score the voting ensemble on the held-out test partition.
y_pred_voting = voting_clf.predict(X_test)
accuracy_voting = accuracy_score(y_test, y_pred_voting)

print(
    f"Accuracy on test dataset: {accuracy_voting:.3f}",
    f"\nQuestion 7 Answer: {accuracy_voting:.3f}",
    sep="\n",
)

Accuracy on test dataset: 0.981

Question 7 Answer: 0.981


## Summary of All Answers

In [16]:
# Collect every answer computed in the earlier cells into one report and
# print it line by line between horizontal rules.
rule = "=" * 60
summary_lines = [
    rule,
    "SUMMARY OF ALL ANSWERS",
    rule,
    f"\nQuestion 1 - Logistic Regression Accuracy: {accuracy_lr:.3f}",
    f"Question 2 - Logistic Regression Iterations: {n_iterations}",
    f"\nQuestion 3 - Decision Tree Accuracy: {accuracy_dt:.3f}",
    f"Question 4 - Decision Tree Best max_depth: {best_max_depth}",
    f"\nQuestion 5 - AdaBoost Accuracy: {accuracy_ada:.3f}",
    f"Question 6 - AdaBoost Best n_estimators: {best_n_estimators}",
    f"\nQuestion 7 - Voting Classifier Accuracy: {accuracy_voting:.3f}",
    rule,
]
print(*summary_lines, sep="\n")

SUMMARY OF ALL ANSWERS

Question 1 - Logistic Regression Accuracy: 0.944
Question 2 - Logistic Regression Iterations: 4750

Question 3 - Decision Tree Accuracy: 0.907
Question 4 - Decision Tree Best max_depth: 4

Question 5 - AdaBoost Accuracy: 0.963
Question 6 - AdaBoost Best n_estimators: 100

Question 7 - Voting Classifier Accuracy: 0.981
