# wczytywanie danych

In [5]:
# import libraries
import numpy as np
import pandas as pd
import copy


# define classes
class Wine:
    def __init__(self, filename, shuffle=False, asTest=0.2, normalize=True):
        with open(filename, 'rb') as f:
            data_csv = pd.read_csv (f, sep=';')
            self.df = pd.DataFrame(data_csv)
            self.df['predicted_quality'] = 0

            self.columns = list(self.df.columns)

            if shuffle:
                self.df = self.df.sample(frac = 1)
            
            self.split_to_train_test(asTest)

            if normalize:
                self.normalize()

    def split_to_train_test(self, asTest):
        self.train = self.df.copy()
        self.test = self.df.copy()
        train_count = int(len(self.df) * (1-asTest))
        self.train = self.train.drop(list(range(train_count+1, len(self.train))))
        self.test = self.test.drop(list(range(0, train_count)))

    def normalize(self):
        self.train, min_max_from_train = Wine.normalize_dataframe(self.train)
        self.test, dont_care = Wine.normalize_dataframe(self.test, minMax=min_max_from_train)

    def normalize_dataframe(df, excludeCols=['quality', 'predicted_quality'], minMax=None):
        if(minMax is None):
            minMax = {}
            for col in list(df.columns):
                _min = min(df[col])
                _max = max(df[col])
                minMax[col] = {'min': _min, 'max': _max}

        for col in list(df.columns):
            if col not in excludeCols:
                _min = minMax[col]['min']
                _max = minMax[col]['max']
                df[col] = df[col].apply(lambda x: (x-_min)/(_max-_min))
        return df, minMax
        



In [6]:
# load data

red_wine = Wine(r'./data/csv/winequality-red.csv')
white_wine = Wine(r'./data/csv/winequality-white.csv')


if red_wine.columns == white_wine.columns:
    COLUMNS = red_wine.columns
    print(f'columns : {COLUMNS}')
else:
    print('error: different columns in files')

print(f'red wine train samples  = {len(red_wine.train)}')
print(f'red wine test samples   = {len(red_wine.test)}')

print(f'white wine train samples = {len(white_wine.train)}')
print(f'white wine test samples  = {len(white_wine.test)}')



columns : ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality', 'predicted_quality']
red wine train samples  = 1280
red wine test samples   = 320
white wine train samples = 3919
white wine test samples  = 980
