# Environment Setup

In [1]:
# Essential modules for data manipulation
import pandas as pd
import numpy as np

# Custom modules to assist with common data exploration and preparation tasks
import src.data.explore as dataexp

# Modules to preprocess data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Modules to persist classes
import joblib

# Modules to create pipelines and transformers
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# 4. Preprocessing Pipeline

## 4.1 Collect Final Unprocessed Data

In [2]:
# Read the interim beer review CSV through the project's explore helper.
# The helper's result is unpacked into two names; only `df` is used by the
# cells shown in this notebook.
(df, dfNone) = dataexp.read_csv_data(
    'beer_reviews.csv',
    None,
    '../data/interim/',
)
df.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_style
0,Vecchio Birraio,2.0,2.5,1.5,1.5,5.0,65
1,Vecchio Birraio,2.5,3.0,3.0,3.0,6.2,51
2,Vecchio Birraio,2.5,3.0,3.0,3.0,6.5,59
3,Vecchio Birraio,3.0,3.5,2.5,3.0,5.0,61
4,Caldera Brewing Company,4.5,4.0,4.0,4.5,7.7,9


## 4.2 Create Preprocessing Data pipeline

### 4.2.1 Setup Pipeline to encode brewery_name

In [3]:
# Load the previously persisted brewery_name encoder and wrap it in a
# single-step pipeline that is applied to the brewery_name column.
# NOTE(review): joblib.load unpickles the file, so this artifact must come
# from a trusted source.
bn_cols = ['brewery_name']
brewery_name_encoder = joblib.load('../models/brewery_name_encoder.joblib')
bn_pipe = Pipeline(steps=[('bnencode', brewery_name_encoder)])

### 4.2.2 Setup Pipeline for review rating columns (currently passed through unchanged)

In [4]:
# Review rating columns are forwarded unchanged: the pipeline's only step is
# 'passthrough'. An OrdinalEncoder + MinMaxScaler variant was sketched here
# earlier and is currently disabled.
rev_cols = [
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
]
rev_pipe = Pipeline(steps=[('revpass', 'passthrough')])

### 4.2.3 Setup Pipeline to impute the most frequent value for beer_abv (scaling currently disabled)

In [6]:
# Fill missing beer_abv values using the 'most_frequent' imputation strategy.
# A MinMaxScaler step was considered for this column and is currently disabled.
abv_cols = ['beer_abv']
abv_pipe = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent'))]
)

### 4.2.4 Setup Pipeline to encode review month

In [7]:
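# Set up one-hot-encoded columns and a pipeline to perform one-hot encoding.
# NOTE: this step is currently disabled (the code below is commented out);
# OneHotEncoder is not among the imports at the top of this notebook.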
#ohe_cols = ['review_month']
#ohe_pipe = Pipeline(
#    steps=[
#        ('one_hot_encoder', OneHotEncoder(sparse=False, drop='first'))
#    ]
#)

### 4.2.5 Define Transformer for specific pipelines and columns

In [10]:
# Combine the per-column pipelines into one transformer. Each entry is
# (name, pipeline, columns); the transformed output shown further down
# contains only the columns listed here, in this order.
preprocessor = ColumnTransformer([
    ('bn_trans', bn_pipe, bn_cols),
    ('rev_trans', rev_pipe, rev_cols),
    ('abv_trans', abv_pipe, abv_cols),
])

### 4.2.6 Fit the preprocessor to the data

In [11]:
# Fit the column transformer on the full frame and transform it in one call;
# the result is kept as an array for inspection below.
# NOTE(review): fit_transform presumably re-fits the brewery_name encoder
# loaded from disk on `df` instead of reusing its persisted fit — confirm
# this is intended.
npdata = preprocessor.fit_transform(df)

### 4.2.7 Review Transformed data

In [12]:
# Convert the array returned by the preprocessor into a DataFrame and preview it.
# The column labels are built from the same lists that were handed to the
# ColumnTransformer, in the same order (brewery name, review ratings, abv),
# so the labels cannot drift out of sync with the transformers.
df_npdata = pd.DataFrame(npdata, columns=bn_cols + rev_cols + abv_cols)
df_npdata.head(10)

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_abv
0,5438.0,2.0,2.5,1.5,1.5,5.0
1,5438.0,2.5,3.0,3.0,3.0,6.2
2,5438.0,2.5,3.0,3.0,3.0,6.5
3,5438.0,3.0,3.5,2.5,3.0,5.0
4,1480.0,4.5,4.0,4.0,4.5,7.7
5,1480.0,3.5,3.5,3.0,3.5,4.7
6,1480.0,3.5,3.5,4.0,4.0,4.7
7,1480.0,2.5,3.5,2.0,3.5,4.7
8,1480.0,3.0,3.5,3.5,4.0,4.7
9,1480.0,3.5,5.0,4.0,4.0,4.7


In [13]:
# Summarise each transformed column (count, mean, spread, min/max) to check
# the value ranges produced by the preprocessor.
df_npdata.describe()

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_abv
count,779578.0,779578.0,779578.0,779578.0,779578.0,779578.0
mean,2924.711331,3.571197,3.719639,3.577883,3.597017,6.637261
std,1691.664689,0.761809,0.681744,0.755965,0.807797,2.253341
min,0.0,1.0,0.0,1.0,1.0,0.01
25%,1471.0,3.0,3.5,3.0,3.0,5.0
50%,2935.0,3.5,4.0,3.5,3.5,5.9
75%,4489.0,4.0,4.0,4.0,4.0,8.0
max,5741.0,5.0,5.0,5.0,5.0,57.7


### 4.2.8 Save the preprocessor for use in the API

In [24]:
# Persist the fitted preprocessor at maximum compression so it can be loaded
# outside this notebook.
joblib.dump(
    value=preprocessor,
    filename='../models/preproc_beer_type_prediction.joblib',
    compress=9,
)

['../models/preproc_beer_type_prediction.joblib']