# Wine quality prediction

# Imports

In [None]:
import seaborn as sns
sns.set_palette('husl')
import matplotlib.pyplot as plt
%matplotlib inline
import logging
logging.basicConfig(level='INFO')

mlogger = logging.getLogger('matplotlib')
mlogger.setLevel(logging.WARNING)

In [None]:
import hopsworks
import pandas as pd
import missingno as msno

## Project setup

In [None]:
# Log in to Hopsworks and obtain a handle to the project's feature store.
# `fs` is used further down to create the wine feature group.
project = hopsworks.login()
fs = project.get_feature_store()

## Load data

In [None]:
# Load the wine dataset; the path is relative to the notebook's working directory.
dataset_path = '../dataset/wine/wine.csv'
# Let pandas open the file itself: read_csv decodes as UTF-8 by default, whereas
# a bare open(path, 'r') falls back to the platform's locale encoding.
wine_df = pd.read_csv(dataset_path)
# Replace spaces in column names with underscores (literal, non-regex replace).
wine_df.columns = wine_df.columns.str.replace(' ', '_', regex=False)
wine_df.head()

In [None]:
# Print a summary of the freshly loaded DataFrame (DataFrame.info).
wine_df.info()


In [None]:
# Encode the wine type as an integer code: red -> 0, white -> 1.
# Values that are not keys of the mapping become NaN.
type_codes = {'red': 0, 'white': 1}
wine_df['type'] = wine_df['type'].map(type_codes)

In [None]:
# Summary statistics for the columns (DataFrame.describe).
wine_df.describe()


In [None]:
# NOTE: dropping 'type' is disabled (the line below is commented out), so the
# column stays in wine_df; it is later used in the feature group primary key.
#wine_df = wine_df.drop([ 'type'], axis=1)
wine_df.info()


In [None]:
# Binarise the target: quality scores above 5 become 1 (good); scores of 5 or
# below become 0 (bad).
wine_df['quality'] = (wine_df['quality'] > 5).astype('int64')


In [None]:
# Count how many rows fall into each quality class (0 and 1)
counts = wine_df['quality'].value_counts()

# Print the per-class counts
print(counts)


## Exploratory Data Analysis (EDA)


### Missing data

In [None]:
# Visualise where values are missing using missingno's matrix plot.
msno.matrix(wine_df)

In [None]:
# For every column, collect the row indexes at which the value is missing.
missing_value_indexes = {
    column: wine_df.index[wine_df[column].isnull()].tolist()
    for column in wine_df.columns
}
print('indexes of missing values:')
missing_value_indexes

In [None]:
import numpy as np

# Impute each missing value with a draw from a normal distribution whose mean
# and standard deviation are those of the column's observed values.
# (Plain mean imputation is a common alternative.)
#
# A seeded generator keeps the imputed values identical between notebook runs.
# NOTE(review): 'type' is integer-coded (0/1) earlier in the notebook; if it has
# missing entries they would receive non-integer draws here - confirm whether
# that column should be imputed differently.
rng = np.random.default_rng(42)
for col in wine_df.columns:
    missing_mask = wine_df[col].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        # Nothing to impute in this column.
        continue
    # Statistics are taken before filling, so they reflect observed values only.
    mean_val = wine_df[col].mean()
    std_val = wine_df[col].std()
    # Fill all missing rows of the column in one vectorised assignment.
    wine_df.loc[missing_mask, col] = rng.normal(mean_val, std_val, size=n_missing)

In [None]:
# Re-check: for every column, collect the row indexes that are still missing.
missing_value_indexes = {
    column: wine_df.index[wine_df[column].isnull()].tolist()
    for column in wine_df.columns
}
print('indexes of missing values:')
missing_value_indexes

### Univariate analysis

In [None]:
# Histogram of every column, 50 bins each, on a 15x15 inch figure.
wine_df.hist(figsize=(15, 15), bins=50)

In [None]:
# One seaborn violin plot per column, showing the range of values of each
# feature. The two-column grid is sized from the number of columns, so adding
# or removing columns does not overrun or under-fill the grid.
plot_columns = list(wine_df.columns)
n_grid_cols = 2
n_grid_rows = max(1, -(-len(plot_columns) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 15), squeeze=False)

axes = axes.flatten()
for ax, col in zip(axes, plot_columns):
    sns.violinplot(x=col, data=wine_df, ax=ax)

# Remove grid cells that did not receive a plot.
for unused_ax in axes[len(plot_columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


### Bivariate analysis

In [None]:
# Bar plot of every feature against the quality label, plus the [min, max] of
# each feature within each quality class (collected in feature_ranges).
#
# Subplots are indexed by position among the feature columns only, so the
# layout does not depend on where 'quality' sits in the DataFrame, and the
# three-column grid is sized from the number of features.
feature_cols = [col for col in wine_df.columns if col != 'quality']
quality_levels = wine_df['quality'].unique()

n_grid_cols = 3
n_grid_rows = max(1, -(-len(feature_cols) // n_grid_cols))  # ceiling division
fig, axs = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 15), squeeze=False)
flat_axes = axs.flatten()

feature_ranges = {}
for i, col in enumerate(feature_cols):
    print(i)
    for quality in quality_levels:
        # Select the feature values of this quality class once, then reuse them.
        class_values = wine_df.loc[wine_df['quality'] == quality, col]
        feature_ranges[col + '_' + str(quality)] = [class_values.min(), class_values.max()]

    sns.barplot(x='quality', y=col, data=wine_df, ax=flat_axes[i])

# Remove any grid cells left without a plot.
for unused_ax in flat_axes[len(feature_cols):]:
    fig.delaxes(unused_ax)

print(feature_ranges)
plt.tight_layout()
plt.show()

### Multivariate analysis

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
correlation = wine_df.corr()
heatmap_ax = sns.heatmap(correlation, annot=True, cmap='BrBG')
# Resize the figure that holds the heatmap to 14x12 inches.
heatmap_ax.figure.set_size_inches(14, 12)


In [None]:
# Pairwise plots of all columns, coloured by the quality label, using '+' markers.
g = sns.pairplot(wine_df, hue='quality', markers='+')
plt.show()

### Drop Features with lower than 0.1 correlation to quality according to Correlation Matrix


In [None]:
# Features removed from the dataset (per the heading above: correlation with
# quality below 0.1 in the correlation matrix).
columns_to_drop = [
    'fixed_acidity',
    'citric_acid',
    'residual_sugar',
    'total_sulfur_dioxide',
    'free_sulfur_dioxide',
    'pH',
    'sulphates',
]

# Rebind wine_df to the frame without those columns.
wine_df = wine_df.drop(columns=columns_to_drop)

In [None]:
# Display the DataFrame as it stands after the column drop.
wine_df

### Insert our Wine DataFrame into a FeatureGroup


In [None]:
# Create (or fetch) version 1 of the "wine" feature group and write wine_df to it.
# NOTE(review): the primary key is built from feature values themselves; rows
# that share all five values would share a key - confirm this is intended.
wine_primary_key = ["type", "volatile_acidity", "chlorides", "density", "alcohol"]
wine_fg = fs.get_or_create_feature_group(
    name="wine",
    version=1,
    primary_key=wine_primary_key,
    description="wine dataset",
)
wine_fg.insert(wine_df)

### Data Validation
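If you want, you can enable data validation for your feature group.
The code below will prevent data from being written to your feature group if you write values outside the expected ranges. Note that it is currently commented out and still uses the iris example's suite name and column names, so it must be adapted to the wine features before use.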

In [None]:
# from great_expectations.core import ExpectationSuite, ExpectationConfiguration

# def expect(suite, column, min_val, max_val):
#     suite.add_expectation(
#     ExpectationConfiguration(
#         expectation_type="expect_column_values_to_be_between",
#         kwargs={
#             "column":column, 
#             "min_value":min_val,
#             "max_value":max_val,
#         }
#     )
# )

In [None]:
# suite = ExpectationSuite(expectation_suite_name="iris_dimensions")

# expect(suite, "sepal_length", 4.5, 8.0)
# expect(suite, "sepal_width", 2.1, 4.5)
# expect(suite, "petal_length", 1.2, 7)
# expect(suite, "petal_width", 0.2, 2.5)
# iris_fg.save_expectation_suite(expectation_suite=suite, validation_ingestion_policy="STRICT")    