In [None]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

In [None]:
# Download the red-wine quality data set (semicolon-separated CSV)
csv_url = (
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
)

data = pd.read_csv(csv_url, sep=';')

## Simple inspection

In [None]:
# Quick look at the data with pandas' built-in describe()
data.describe()

In [None]:
# Build the Pandas Profiling report object for the wine data frame
profile = ProfileReport(data, title="Example of summarization of wine data")


In [None]:
# Uncomment the next line to display the profile report inline in the notebook
# profile.to_notebook_iframe()

## Unit tests

### Basic examples - function tests

We will learn how unit tests work using a simple function. First, we will define a function `square`, which returns the square of a number. Then, we will test it by writing assertions (checks against the known correct answers) in a test function.

In [None]:
import pytest

# ipytest must be installed to be able to run the tests inside the notebook
import ipytest
# Apply ipytest's automatic configuration for notebook use
ipytest.autoconfig()

In [None]:
# A simple function: calculate square of a number
def square(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared

In [None]:
%%run_pytest[clean]

# Unit test for `square`
# The cases cover a positive number, zero and a negative number
def test_square():
    """Compare `square` against hand-computed results for a few inputs."""
    expected_by_input = [(2, 4), (0, 0), (-2, 4)]
    for value, expected in expected_by_input:
        assert square(value) == expected


Now make the test fail on purpose (for example, by changing one of the expected values) to make sure you understand how it works.

### Basic examples - data tests

As we did for the function, we can also write assertions for the data. In the following example we will define a data frame on the fly and test it for null values. Because column `b` contains a missing value, this test is expected to fail.

In [None]:
%%run_pytest[clean]

def test_column_is_null():
    df = pd.DataFrame(data = [(1, 0), (2, None)],
                      columns = ['a', 'b'])
    
    assert np.all(pd.notna(df))

## Test the wine data

Previously, we generated the data frame inside the test function. If we want to run multiple tests on the same data frame, it is better to pass it to each test function as an argument (as usual in programming). To do that in testing, we need to define the data as **fixtures**. They look like ordinary function definitions, preceded by the decorator `@pytest.fixture`.

### Raw data tests

In [None]:
# Define fixtures
@pytest.fixture
def input_schema():
    """Expected value range and type for the wine data columns.

    Returns a dict keyed by column name; each value is a dict with the
    keys 'min', 'max' and 'type'.
    """
    # One row per column: (name, min, max, type)
    column_specs = [
        ('fixed acidity', 1.0, 17.0, float),
        ('volatile acidity', 0.0, 2.0, float),
        ('citric acid', 0.0, 2.0, float),
        ('residual sugar', 0.5, 17.0, float),
        ('chlorides', 0.0, 17.0, float),
        ('free sulfur dioxide', 0.0, 80.0, float),
        ('total sulfur dioxide', 0.0, 300.0, float),
        ('density', 0.8, 1.1, float),
        ('pH', 1.0, 10.0, float),
        ('sulphates', 0.0, 2.0, float),
        ('alcohol', 7.0, 17.0, float),
        ('quality', 1, 10, int),
    ]
    return {
        name: {'min': lower, 'max': upper, 'type': kind}
        for name, lower, upper, kind in column_specs
    }


# Fixture that fetches the raw data
@pytest.fixture
def input_data():
    """Download the red-wine quality CSV and return it as a data frame."""
    csv_url = (
        'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    )
    # The file is semicolon-separated
    return pd.read_csv(csv_url, sep=';')

Write the following tests:
- is the number of columns in the data frame the same as in schema definition?
- are the values within defined ranges?
- are the types of the columns correct?

In [None]:
%%run_pytest[clean]

def test_number_of_columns(input_data, input_schema):
    
    # assert that the column number is the same as the length of the schema


def test_input_data_ranges(input_data, input_schema):
    
    # find min and max value for each column
    # read min and max value for each column from schema

    # for min value of the column: assert that it's always greater or equal than the min from the schema
    # for max value of the column: assert that it's always lesser or equal than the max from the schema
        
        
def test_input_types(input_data, input_schema):
    
    # find the type of each column in the df
    # read the type for each column from schema

    # assert that the type of the column is the same as defined in the schema
    

    

### Feature engineering tests

**NOTE:** A data transformer should be fitted only on the training dataset. You fit the transformer on the training dataset and then apply it to both the training and the test datasets. Since we are only illustrating how unit testing works, we will do it on the whole dataset.

In [None]:
from sklearn.preprocessing import StandardScaler
from numpy import mean, std

In [None]:
# Let's standardize a single column ('alcohol')...

# create the standard scaler
scaler = StandardScaler()
# fit the scaler on the 'alcohol' column and transform it in one step
scaled = scaler.fit_transform(data[['alcohol']])
print(scaled)

In [None]:
# And check the stats of the scaled values (numpy's mean and std)...
print('mean:', mean(scaled))
print('std:', std(scaled))

In [None]:
@pytest.fixture
def scaled_alcohol():
    """Download the wine data and return its 'alcohol' column after StandardScaler."""
    csv_url = (
        'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    )
    wine = pd.read_csv(csv_url, sep=';')

    # Fit the scaler on the single column and transform it in one step
    alcohol = wine[['alcohol']]
    return StandardScaler().fit_transform(alcohol)

In [None]:
%%run_pytest[clean]
# Test: is mean around zero and std around one?

def test_scaled_mean_zero(scaled_alcohol):
    """Check that the scaled values have mean close to 0 and std close to 1.

    Parameters
    ----------
    scaled_alcohol : array-like
        The scaled values, as provided by the `scaled_alcohol` fixture.
    """
    mean_val = mean(scaled_alcohol)
    std_val = std(scaled_alcohol)

    # pytest.approx wraps the *expected* value: its tolerance is derived from
    # the wrapped number. A relative tolerance is meaningless around 0.0, so
    # the absolute tolerance is spelled out for the mean.
    assert mean_val == pytest.approx(0.0, abs=1e-12)
    assert std_val == pytest.approx(1.0)


## Additional exercises:

- implement and test `MinMaxScaler`
- test for null values in 'quality'
- repeat the tests on synthetic data