In [1]:
!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_selection import mutual_info_regression, SelectKBest, f_classif, f_regression, mutual_info_classif
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, scale
from sklearn.pipeline import Pipeline

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


Performing basic exploratory analysis and cleaning.

In [2]:
# fetch dataset
wine_quality = fetch_ucirepo(id=186)

In [3]:
# data (as pandas dataframes)
X = wine_quality.data.features
y = wine_quality.data.targets

In [4]:
X.head(10)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5


In [5]:
y.head(10)

Unnamed: 0,quality
0,5
1,5
2,5
3,6
4,5
5,5
6,5
7,7
8,7
9,5


In [6]:
# metadata
print(wine_quality.metadata)

{'uci_id': 186, 'name': 'Wine Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/186/wine+quality', 'data_url': 'https://archive.ics.uci.edu/static/public/186/data.csv', 'abstract': 'Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 4898, 'num_features': 11, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['quality'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Wed Nov 15 2023', 'dataset_doi': '10.24432/C56S3T', 'creators': ['Paulo Cortez', 'A. Cerdeira', 'F. Almeida', 'T. Matos', 'J. Reis'], 'intro_paper': {'ID': 252, 'type': 'NATIVE', 'title': 'Modeling wine preferences

In [7]:
# variable information
print(wine_quality.variables)

                    name     role         type  ...             description units missing_values
0          fixed_acidity  Feature   Continuous  ...                    None  None             no
1       volatile_acidity  Feature   Continuous  ...                    None  None             no
2            citric_acid  Feature   Continuous  ...                    None  None             no
3         residual_sugar  Feature   Continuous  ...                    None  None             no
4              chlorides  Feature   Continuous  ...                    None  None             no
5    free_sulfur_dioxide  Feature   Continuous  ...                    None  None             no
6   total_sulfur_dioxide  Feature   Continuous  ...                    None  None             no
7                density  Feature   Continuous  ...                    None  None             no
8                     pH  Feature   Continuous  ...                    None  None             no
9              sulphates  Feat

In [8]:
red = pd.read_csv('/content/winequality-red.csv')
red.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [9]:
white = pd.read_csv('/content/winequality-white.csv')
white.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
5,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,6
7,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
8,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
9,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.9938,3.22,0.45,11.0,6


Doing some data cleaning and preparation

In [10]:
cleanred = red.drop_duplicates()
cleanred.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5
10,6.7,0.58,0.08,1.8,0.097,15.0,65.0,0.9959,3.28,0.54,9.2,5


In [11]:
cleanwhite = white.drop_duplicates()
cleanwhite.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,6
9,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.9938,3.22,0.45,11.0,6
10,8.1,0.27,0.41,1.45,0.033,11.0,63.0,0.9908,2.99,0.56,12.0,5
11,8.6,0.23,0.4,4.2,0.035,17.0,109.0,0.9947,3.14,0.53,9.7,5
12,7.9,0.18,0.37,1.2,0.04,16.0,75.0,0.992,3.18,0.63,10.8,5
13,6.6,0.16,0.4,1.5,0.044,48.0,143.0,0.9912,3.54,0.52,12.4,7


In [12]:
cleanred.shape

(1359, 12)

In [13]:
cleanred.isnull().sum()

Unnamed: 0,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0
sulphates,0


In [14]:
cleanwhite.isnull().sum()

Unnamed: 0,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0
sulphates,0


Appending a categorical boolean so that the algorithm can learn to associate the descriptors with a type of wine.
Also joining the newly labeled tables together and so that the model can read all the data together and learn the differences in each type.

In [15]:
cleanred['wine_type'] = 1
cleanwhite['wine_type'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleanred['wine_type'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleanwhite['wine_type'] = 0


In [16]:
wines = pd.concat([cleanred, cleanwhite], ignore_index=True)
wines

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


In [17]:
winesx = wines.drop('wine_type', axis=1)
winesy = wines['wine_type']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(winesx, winesy, test_size=0.2, random_state=42)

In [19]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.9868421052631579
Precision: 0.9824561403508771
Recall: 0.9688581314878892


In [22]:
SelectKBest(f_regression(X_train, y_train), k=4)

In [23]:
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
pipe2 = make_pipeline(OneHotEncoder(categories='auto', max_categories=10, handle_unknown='ignore'), LogisticRegression(max_iter=1000))
pipe.fit(X_train, y_train)
pipe2.fit(X_train, y_train)

In [24]:
pipepred = pipe.predict(X_test)
pipepred2 = pipe2.predict(X_test)

In [25]:
accuracy = accuracy_score(y_test, pipepred)
precision = precision_score(y_test, pipepred)
recall = recall_score(y_test, pipepred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.9953007518796992
Precision: 0.9896551724137931
Recall: 0.9930795847750865


In [26]:
accuracy = accuracy_score(y_test, pipepred2)
precision = precision_score(y_test, pipepred2)
recall = recall_score(y_test, pipepred2)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.8392857142857143
Precision: 0.7234848484848485
Recall: 0.6608996539792388


I had a lot of trouble understanding and setting up models with feature selection, so I used Gemini to create a basic framework to use.

In [34]:
# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('feature_selection', SelectKBest()),  # Feature selection
    ('classifier', LogisticRegression(max_iter=1000)),  # Classifier
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'feature_selection__score_func': [f_classif, mutual_info_classif], # Selection methods
    'feature_selection__k': [3, 5, 7, 9],  # Number of features to select
}

# Create GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best pipeline and its performance
best_pipeline = grid_search.best_estimator_
best_score = grid_search.best_score_

print(f"Best Pipeline: {best_pipeline}")
print(f"Best Score: {best_score}")

Best Pipeline: Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selection', SelectKBest(k=9)),
                ('classifier', LogisticRegression(max_iter=1000))])
Best Score: 0.988498782259932


I want to try using a KFold in the pipeline

In [33]:
# Define the pipeline
pipeline2 = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('feature_selection', SelectKBest()),  # Feature selection
    ('classifier', LogisticRegression(max_iter=1000)),  # Classifier
])

# Define the parameter grid for Kfold
kf = KFold(n_splits=6)

# Perform cross-validation with KFold
scores = []
for train_index, test_index in kf.split(winesx):
    X_train, X_test = winesx.iloc[train_index], winesx.iloc[test_index]
    y_train, y_test = winesy.iloc[train_index], winesy.iloc[test_index]

    pipeline2.fit(X_train, y_train)
    y_pred = pipeline2.predict(X_test)

    scores.append(accuracy_score(y_test, y_pred))

average_accuracy = np.mean(scores)

print(f"Average Accuracy: {average_accuracy}")

Average Accuracy: 0.9875975867454231


I tried using 5, 6, 7, and 8 splits because I observed in previous testing that larger groups seemed to produce more solid results, but in this Kfold test, .9875 with 6 splits seems to be the highest I could get the accuracy to go.

I will complete the exercise with the first model.

In [105]:
def wine_quality(row, pipeline):
  pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('feature_selection', SelectKBest(k=9)),  # Feature selection
    ('classifier', LogisticRegression(max_iter=1000)),  # Classifier
])
  pipeline.fit(X_train, y_train)


  result = pipeline.predict(X_test.iloc[[row]])
  return result[0]




In [112]:
wine_quality(1, pipeline)

0