In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch the dataset with repository id 186 from the UCI ML repository.
# It is bound to `wine_dataset` (not `wine_quality`) so the name does not
# collide with the `wine_quality()` prediction function defined later in
# this notebook; re-running cells out of order then cannot swap one for the other.
wine_dataset = fetch_ucirepo(id=186)

# Data (as pandas dataframes)
X = wine_dataset.data.features
y = wine_dataset.data.targets

# Metadata
print(wine_dataset.metadata)

# Variable information
print(wine_dataset.variables)

In [None]:
# Place the feature columns and the target column side by side in one frame.
wine_df = pd.concat(objs=[X, y], axis="columns")

## Step 1: Data exploration

In [None]:
# Preview the first rows. `head` must be *called*; without the parentheses
# the cell only displays the bound-method repr instead of any data.
wine_df.head()

In [None]:
# Descriptive statistics for every column of the combined frame.
summary_stats = wine_df.describe()
summary_stats


In [None]:
# Number of missing values in each column.
missing_per_column = wine_df.isna().sum()
missing_per_column

### There are no nulls in the dataset, so we can move on to looking at the correlations between features in the dataset.

In [None]:
# Pairwise correlation between all columns (Pearson, the pandas default).
correlation_matrix = wine_df.corr(method="pearson")
correlation_matrix

In [None]:
# Draw the annotated correlation heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    data=correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Render the correlation DataFrame as one plain-text table and print it.
corr_as_text = correlation_matrix.to_string()
print(corr_as_text)

### The analysis reveals several important correlations within the wine dataset. There is a strong positive correlation between free sulfur dioxide and total sulfur dioxide (0.720934), suggesting that these two sulfur dioxide variables tend to increase together. Similarly, a significant positive correlation exists between residual sugar and density (0.552517), indicating that wines with higher residual sugar also tend to have higher density. On the other hand, a strong negative correlation is observed between alcohol and density (-0.686745), suggesting that as alcohol content increases, wine density tends to decrease. Additionally, there is a negative relationship between quality and volatile acidity (-0.265699), implying that wines with higher volatile acidity often have lower quality. Notably, alcohol content shows a positive correlation with quality (0.444319), indicating that higher alcohol levels are frequently associated with better wine quality. Conversely, fixed acidity, citric acid, chlorides, and residual sugar all exhibit weak correlations with quality, suggesting they may not be strong predictors of wine quality in this analysis.

#### The correlation analysis provides several implications for modeling a supervised learning algorithm in this wine dataset. Firstly, the strong correlations between certain features, such as free sulfur dioxide and total sulfur dioxide, suggest that these variables may provide redundant information, which could lead to multicollinearity issues in the model. This redundancy can be addressed by removing one of the correlated features or applying dimensionality reduction techniques to ensure a more stable model.

#### Secondly, the identified relationships between features and the target variable (quality) can guide feature selection. For instance, the positive correlation between alcohol content and quality indicates that including alcohol as a predictor could improve model performance. In contrast, features with weak correlations to quality, like fixed acidity and citric acid, may be candidates for exclusion, simplifying the model without sacrificing accuracy.

#### Additionally, the negative correlation between volatile acidity and quality suggests that it could be a crucial feature for predicting quality, prompting the need for further exploration of its impact. Overall, these insights will help in constructing a more effective supervised learning model by emphasizing relevant features, reducing complexity, and potentially improving predictive accuracy.

In [None]:
# Show the dtype of each column to confirm everything is numeric.
column_dtypes = wine_df.dtypes
print(column_dtypes)

#### Floats (numerical values) are suitable for training a supervised learning model, so no categorical encoding is needed.

In [None]:
# Separate the target variable (y) from the predictor columns (X).
y = wine_df['quality']                 # Target variable
X = wine_df.drop(columns=['quality'])  # Every column except 'quality'

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Step 2: Modelling, Preprocessing and Pipeline creation

In [None]:
# Both pipelines standardize the features first and then fit a classifier.
# The step names ('scaler', 'logreg', 'rf') are referenced elsewhere
# (e.g. the 'rf__' prefixes in the tuning grid), so they are kept as-is.

# Logistic Regression Pipeline
logistic_steps = [
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression()),
]
logistic_pipeline = Pipeline(steps=logistic_steps)

# Random Forest Pipeline (seeded so the fitted forest is reproducible)
rf_steps = [
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42)),
]
rf_pipeline = Pipeline(steps=rf_steps)

In [None]:
# Train the scaler + logistic regression pipeline on the training split.
logistic_pipeline.fit(X=X_train, y=y_train)

In [None]:
# Train the scaler + random forest pipeline on the training split.
rf_pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predicted quality labels for the held-out rows (logistic regression).
y_pred_log = logistic_pipeline.predict(X=X_test)

In [None]:
# Predicted quality labels for the held-out rows (random forest).
y_pred_rf = rf_pipeline.predict(X=X_test)

In [None]:
# Logistic Regression Evaluation: per-class report, confusion matrix, accuracy.
print("Logistic Regression Classification Report:")
log_report = classification_report(y_true=y_test, y_pred=y_pred_log)
print(log_report)

print("Confusion Matrix:")
log_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred_log)
print(log_confusion)

log_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_log)
print("Accuracy Score:", log_accuracy)

In [None]:
# Random Forest Evaluation: per-class report, confusion matrix, accuracy.
print("Random Forest Classification Report:")
rf_report = classification_report(y_true=y_test, y_pred=y_pred_rf)
print(rf_report)

print("Confusion Matrix:")
rf_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
print(rf_confusion)

rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Accuracy Score:", rf_accuracy)

## Step 3: Hyperparameter tuning

### Random forest seems to have had the best F1 score, so we are taking that model forward to the hyperparameter tuning step.

In [None]:
# Hyperparameter candidates for the tuning step. Each key is
# '<pipeline step name>__<parameter>', so these target the 'rf' step.
param_grid = dict(
    rf__n_estimators=[100, 200],       # Number of trees
    rf__max_depth=[None, 10, 20],      # Maximum depth of the trees
    rf__min_samples_split=[2, 5, 10],  # Minimum samples required to split an internal node
)

In [None]:
# Create the grid search with the pipeline.
#
# The plain 'f1' scorer uses binary averaging and cannot score a target with
# more than two classes; the cross-validation scores then fail (or come back
# as NaN) and the "best" parameters are meaningless. `quality` is assumed to be
# a multi-level score here (check with `y.unique()`), so a multiclass-capable
# F1 variant is used instead. 'f1_weighted' averages the per-class F1 scores
# weighted by class support and also works if the target happens to be binary.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=5,                    # 5-fold cross-validation
    scoring='f1_weighted',   # Metric to optimize (multiclass-safe F1)
    n_jobs=-1,               # Use all available cores
)

In [None]:
# Run the cross-validated search over every combination in the parameter grid.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Report the winning parameter combination and its cross-validation score.
tuned_params = grid_search.best_params_
tuned_cv_score = grid_search.best_score_
print("Best Parameters:", tuned_params)
print("Best Cross-Validation Score:", tuned_cv_score)

In [None]:
# Keep the selected hyperparameters under their own name and echo them.
best_params = grid_search.best_params_
print(
    "Best parameters found: ",
    best_params,
)

In [None]:
# Take the estimator built from the best parameter combination found by the
# search and score it on the held-out test split.
best_model = grid_search.best_estimator_
test_score = best_model.score(X=X_test, y=y_test)
print("Test score of the best model: ", test_score)

In [None]:
# Reassigning: replace the earlier random forest predictions with the
# predictions of the tuned model.
y_pred_rf = best_model.predict(X=X_test)

## Incorporating the model in a function

In [None]:
def wine_quality(row, best_model):
    """Predict the quality label for a single wine sample.

    Parameters
    ----------
    row : array-like
        One sample's feature values, in the same order as the columns the
        model was trained on. The full row of features must be supplied.
    best_model : fitted estimator
        Any object with a ``predict`` method (e.g. the tuned pipeline).

    Returns
    -------
    The first (and only) element of ``best_model.predict(...)`` for this row.

    Raises
    ------
    ValueError
        If the model exposes ``feature_names_in_`` and ``row`` does not
        contain exactly that many values.
    """
    # Estimators expect a 2D input, so reshape the row into (1, n_features).
    sample = np.asarray(row).reshape(1, -1)

    # A model fitted on a DataFrame records its column names. Passing a bare
    # array to such a model triggers scikit-learn's "X does not have valid
    # feature names" warning, so rebuild a one-row DataFrame with those names.
    feature_names = getattr(best_model, "feature_names_in_", None)
    if feature_names is not None:
        if sample.shape[1] != len(feature_names):
            raise ValueError(
                f"Expected {len(feature_names)} feature values, "
                f"got {sample.shape[1]}."
            )
        sample = pd.DataFrame(sample, columns=list(feature_names))

    # Use the model to predict wine quality
    prediction = best_model.predict(sample)

    # Return the predicted quality
    return prediction[0]





In [None]:
# One hand-written sample with 11 feature values
# (presumably in the same column order as X; confirm against X.columns).
sample_row = [
    7.4, 0.7, 0.0, 1.9,
    0.076, 11.0, 31.0, 0.9978,
    3.51, 0.56, 9.4,
]
quality_prediction = wine_quality(row=sample_row, best_model=best_model)


In [None]:
# Display the quality value predicted for `sample_row` by the tuned model.
quality_prediction