# Wine quality prediction

# Imports

In [None]:
import hopsworks
import pandas as pd
import missingno as msno
import numpy as np
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import seaborn as sns
sns.set_palette('husl')
import matplotlib.pyplot as plt
%matplotlib inline

import logging
# Configure the root logger so that messages at INFO level and above are emitted.
logging.basicConfig(level='INFO')

# Raise the threshold of the 'matplotlib' logger to WARNING so that its
# lower-severity messages are not shown alongside the notebook's own output.
mlogger = logging.getLogger('matplotlib')
mlogger.setLevel(logging.WARNING)

## Project setup

In [None]:
# Log in to Hopsworks and get a handle to the project's feature store; `fs` is
# used later to create the feature group.
# NOTE(review): presumably login() reads or prompts for an API key — confirm
# how credentials are supplied in your environment.
project = hopsworks.login()
fs = project.get_feature_store()

## Load data

In [None]:
# Read the wine CSV from disk into a DataFrame.
dataset_path = '../dataset/wine/wine.csv'
with open(dataset_path, 'r') as csv_file:
    wine_df = pd.read_csv(csv_file)

# Column names contain spaces; switch to underscores so they are easier to
# reference later on.
wine_df = wine_df.rename(columns=lambda name: name.replace(' ', '_'))
wine_df.head()

In [None]:
# Print the column dtypes and non-null counts; the latter reveal which columns
# contain missing values.
wine_df.info()


In [None]:
# Display summary statistics of the dataset's columns.
wine_df.describe()


## Preprocessing

### Encode categorical features

In [None]:
# Encode the categorical wine type as a binary value: red -> 0, white -> 1.
type_codes = {'red': 0, 'white': 1}
wine_df['type'] = wine_df['type'].map(type_codes)

### Fill missing values

In [None]:
# For every column, collect the row indexes whose value is missing.
missing_value_indexes = {
    column: wine_df.index[wine_df[column].isnull()].tolist()
    for column in wine_df.columns
}
print('indexes of missing values:')
missing_value_indexes

Replace each missing value with a random draw from a normal distribution parameterised by the mean and standard deviation of its feature, since those features are (more or less) normally distributed according to the histogram plots below.

In [None]:
# Impute missing values by sampling from a normal distribution parameterised by
# each feature's own mean and standard deviation (both are computed from the
# observed values only, because pandas skips NaN by default).
# Plain mean imputation is a common, simpler alternative.
#
# Only the continuous measurements are imputed: drawing Gaussian noise for the
# binary `type` encoding or for the `quality` label would fabricate invalid
# values, so any gaps in those two columns are deliberately left in place.
NON_IMPUTED_COLUMNS = {'type', 'quality'}

# A seeded generator makes the imputed values reproducible across notebook runs.
imputation_rng = np.random.default_rng(42)

for col in wine_df.columns:
    if col in NON_IMPUTED_COLUMNS:
        continue

    missing_mask = wine_df[col].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        # Nothing to fill in this column.
        continue

    mean_val = wine_df[col].mean()
    std_val = wine_df[col].std()

    # One vectorised draw per column instead of one .loc assignment per row.
    wine_df.loc[missing_mask, col] = imputation_rng.normal(mean_val, std_val, size=n_missing)

In [None]:
# Scan every column again for missing values and list the affected row indexes.
missing_value_indexes = {
    column: wine_df.index[wine_df[column].isnull()].tolist()
    for column in wine_df.columns
}
print('indexes of missing values:')
missing_value_indexes

### Balance the data

In [None]:
# Report the dataset shape and how many samples fall into each quality score.
target = 'quality'
original_shape = wine_df.shape
qualities = wine_df[target]

print(
    f'original dataset shape {original_shape}',
    '-------------------',
    f'Original dataset distribution {qualities.value_counts()}',
    sep='\n',
)



In [None]:
# Collapse the quality score into a binary label: 1 when the score is above 5,
# otherwise 0.
wine_df['quality'] = wine_df['quality'].gt(5).astype('int64')

### Remove duplicate rows

In [None]:
# Drop exact duplicate rows and report the row count before and after.
count_rows_before = len(wine_df)
wine_df = wine_df.drop_duplicates()
count_rows_after = len(wine_df)

print(
    f'Number of rows before removing duplicates {count_rows_before}',
    f'Number of rows after removing duplicates {count_rows_after}',
    sep='\n',
)

## Exploratory Data Analysis (EDA)


### Univariate analysis

In [None]:
# Draw a 50-bin histogram for each column to inspect the value distributions.
wine_df.hist(figsize=(15, 15), bins=50)

In [None]:
# Violin plot of the value range of each column, one subplot per column.
# The grid size is derived from the number of columns instead of being
# hard-coded, so the cell keeps working when columns are added or removed.
plot_columns = list(wine_df.columns)
n_grid_cols = 2
n_grid_rows = max(1, -(-len(plot_columns) // n_grid_cols))  # ceiling division

# squeeze=False guarantees a 2-D array of axes even for a single-row grid.
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 15), squeeze=False)

axes = axes.flatten()
for ax, col in zip(axes, plot_columns):
    sns.violinplot(x=col, data=wine_df, ax=ax)

# Hide leftover axes when the grid has more cells than there are columns.
for unused_ax in axes[len(plot_columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


### Bivariate analysis

In [None]:
# Bar plot of every feature against the quality label, plus the observed
# [min, max] range of each feature within each quality class.
label_column = 'quality'

# Plot positions are assigned from the list of feature columns only, so the
# layout does not depend on where the label column sits in the DataFrame.
feature_columns = [col for col in wine_df.columns if col != label_column]
quality_classes = wine_df[label_column].unique()

n_grid_cols = 3
n_grid_rows = max(1, -(-len(feature_columns) // n_grid_cols))  # ceiling division

# squeeze=False guarantees a 2-D array of axes even for a single-row grid.
fig, axs = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 15), squeeze=False)
flat_axes = axs.flatten()

# Maps '<feature>_<quality class>' -> [min, max] of that feature in that class.
feature_ranges = {}
for ax, col in zip(flat_axes, feature_columns):
    for quality_class in quality_classes:
        # Select the rows of this class once and reuse them for min and max.
        class_values = wine_df.loc[wine_df[label_column] == quality_class, col]
        feature_ranges[col + '_' + str(quality_class)] = [class_values.min(), class_values.max()]

    sns.barplot(x=label_column, y=col, data=wine_df, ax=ax)

# Hide leftover axes when the grid has more cells than there are features.
for unused_ax in flat_axes[len(feature_columns):]:
    unused_ax.set_visible(False)

print(feature_ranges)
plt.tight_layout()
plt.show()

### Multivariate analysis

In [None]:
# Pairwise correlation of all columns, rendered as an annotated heatmap.
correlation = wine_df.corr()
heatmap_ax = sns.heatmap(correlation, annot=True, cmap='BrBG')

# Enlarge the figure so the annotations stay readable.
heatmap_ax.figure.set_size_inches(14, 12)


From the correlation matrix we chose alcohol, density, chlorides and volatile acidity because they had a high correlation with quality. Type was dropped because it is highly correlated with the other features, so it does not bring any new information. Free sulfur dioxide and total sulfur dioxide are very highly correlated with each other, so the one with the lower correlation to quality (free sulfur dioxide) was dropped, while we kept total sulfur dioxide. pH, fixed acidity and residual sugar were dropped for similar reasons of having low predictive power. Citric acid was kept because it is not highly correlated with most of the other features being used, so it is possible that it brings some new information.

In [None]:
# Pairwise plots of all columns, coloured by the quality label and drawn with
# '+' markers.
g = sns.pairplot(wine_df, hue='quality', markers='+')
plt.show()

### Feature selection

In [None]:
# Features excluded from the model input.
columns_to_drop = [
    'type',
    'fixed_acidity',
    'residual_sugar',
    'free_sulfur_dioxide',
    'pH',
    'sulphates',
]

# Work on a reduced copy so the full DataFrame stays available.
wine_df1 = wine_df.drop(columns_to_drop, axis='columns')

wine_df1

In [None]:
# Dropping columns can make previously distinct rows identical, so remove
# duplicates again and report the row count before and after.
count_rows_before = len(wine_df1)
wine_df1 = wine_df1.drop_duplicates()
count_rows_after = len(wine_df1)

print(
    f'Number of rows before removing duplicates {count_rows_before}',
    f'Number of rows after removing duplicates {count_rows_after}',
    sep='\n',
)

In [None]:


# Train a random forest on the selected features and evaluate it on a held-out
# 20% test split.
target_column = 'quality'

# Separate the label series from the feature matrix.
features = wine_df1.copy()
target = features.pop(target_column)

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1,
)

# Fixed random_state keeps the split and the forest reproducible; the
# hyperparameters can be tuned here.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    random_state=1,
)
rf_classifier.fit(X_train, y_train)

# Score the model on the unseen test rows.
predicted_labels = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 breakdown.
report = classification_report(y_test, predicted_labels)
print("Classification Report:", report, sep="\n")

### Insert our Wine DataFrame into a FeatureGroup


In [None]:
# Every selected feature column is part of the feature group's primary key.
# NOTE(review): these are float-valued measurements, and rows that share all
# feature values but differ in `quality` would collide on this key — confirm
# that this is acceptable for the feature store.
feature_group_keys = [
    "volatile_acidity",
    "citric_acid",
    "chlorides",
    "total_sulfur_dioxide",
    "density",
    "alcohol",
]

# Fetch the feature group if it already exists, otherwise create it, then
# write the selected wine features into it.
wine_fg = fs.get_or_create_feature_group(
    name="wine_final",
    version=1,
    primary_key=feature_group_keys,
    description="wine dataset",
)
wine_fg.insert(wine_df1)

### Data Validation
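If you want, you can enable data validation for your feature group.
The commented-out code below is a template that is still written for the iris flower dataset: once enabled, it prevents data from being written to your feature group if you write values outside the expected ranges. Adapt the suite name, column names, ranges and feature group variable to the wine features before enabling it.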

In [None]:
# from great_expectations.core import ExpectationSuite, ExpectationConfiguration

# def expect(suite, column, min_val, max_val):
#     suite.add_expectation(
#     ExpectationConfiguration(
#         expectation_type="expect_column_values_to_be_between",
#         kwargs={
#             "column":column, 
#             "min_value":min_val,
#             "max_value":max_val,
#         }
#     )
# )

In [None]:
# suite = ExpectationSuite(expectation_suite_name="iris_dimensions")

# expect(suite, "sepal_length", 4.5, 8.0)
# expect(suite, "sepal_width", 2.1, 4.5)
# expect(suite, "petal_length", 1.2, 7)
# expect(suite, "petal_width", 0.2, 2.5)
# iris_fg.save_expectation_suite(expectation_suite=suite, validation_ingestion_policy="STRICT")    