In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split

from src.data.sets import save_sets

# Introduction

In [3]:
# Locate the project root as the directory that contains the .env file.
# raise_error_if_not_found=True makes a missing .env fail loudly: without it
# find_dotenv() returns '' and Path('').parent resolves to the current working
# directory, so every path below would silently point at the wrong place.
project_dir = Path(find_dotenv(raise_error_if_not_found=True)).parent

# Standard data and report locations, all derived from the project root.
data_dir = project_dir / 'data'
raw_data_dir = data_dir / 'raw'
interim_data_dir = data_dir / 'interim'
processed_data_dir = data_dir / 'processed'
report_dir = project_dir / 'reports'

## Load data


In [4]:
# Read the raw beer-review CSV and print its column dtypes and non-null counts.
path = raw_data_dir.joinpath('beer_reviews.csv')
df = pd.read_csv(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [5]:
# Display the raw frame; pandas truncates the rendering to the leading and
# trailing rows.
df

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586609,14359,The Defiant Brewing Company,1162684892,5.0,4.0,3.5,maddogruss,Pumpkin Ale,4.0,4.0,The Horseman's Ale,5.2,33061
1586610,14359,The Defiant Brewing Company,1161048566,4.0,5.0,2.5,yelterdow,Pumpkin Ale,2.0,4.0,The Horseman's Ale,5.2,33061
1586611,14359,The Defiant Brewing Company,1160702513,4.5,3.5,3.0,TongoRad,Pumpkin Ale,3.5,4.0,The Horseman's Ale,5.2,33061
1586612,14359,The Defiant Brewing Company,1160023044,4.0,4.5,4.5,dherling,Pumpkin Ale,4.5,4.5,The Horseman's Ale,5.2,33061


# Profile report

# Subset data

The target variable is `beer_style`.

The required parameters for the API are:
* `brewery_name`
* `review_aroma`
* `review_appearance`
* `review_palate`
* `review_taste`

The model is therefore trained only on these five columns, with `beer_style` as the label.

In [6]:
# Label column followed by the feature columns the API requires.
target = 'beer_style'
col_list = [target] + [
    'brewery_name',
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste'
]
# Take an explicit copy so df_subset is an independent frame rather than a
# slice of df; otherwise later in-place operations on it emit pandas'
# SettingWithCopyWarning (as seen in the profiling cell's recorded output).
df_subset = df[col_list].copy()

In [7]:
# Summary statistics for the numeric rating columns of the subset.
# NOTE(review): the recorded output shows review_appearance with a minimum of
# 0.0 while the other ratings start at 1.0 — worth checking for invalid scores.
df_subset.describe()

Unnamed: 0,review_aroma,review_appearance,review_palate,review_taste
count,1586614.0,1586614.0,1586614.0,1586614.0
mean,3.735636,3.841642,3.743701,3.79286
std,0.6976167,0.6160928,0.6822184,0.7319696
min,1.0,0.0,1.0,1.0
25%,3.5,3.5,3.5,3.5
50%,4.0,4.0,4.0,4.0
75%,4.0,4.0,4.0,4.5
max,5.0,5.0,5.0,5.0


In [8]:
# Column dtypes and non-null counts for the subset.
# NOTE(review): the recorded output shows brewery_name with fewer non-null
# values than rows (1586599 of 1586614) — missing names need handling downstream.
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 6 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   beer_style         1586614 non-null  object 
 1   brewery_name       1586599 non-null  object 
 2   review_aroma       1586614 non-null  float64
 3   review_appearance  1586614 non-null  float64
 4   review_palate      1586614 non-null  float64
 5   review_taste       1586614 non-null  float64
dtypes: float64(4), object(2)
memory usage: 72.6+ MB


In [9]:
# Persist the subset as both CSV and Parquet under data/processed.
path = processed_data_dir / 'subset'
# Create the output directory if it is missing so the writes below do not fail
# on a fresh checkout where data/processed has not been created yet.
path.parent.mkdir(parents=True, exist_ok=True)
df_subset.to_csv(path.with_suffix('.csv'), index=False)
df_subset.to_parquet(path.with_suffix('.parquet'), index=False)

# Profile report (subset)

In [12]:
# Build an explorative profile of the modelling subset and export it as HTML
# into the reports directory.
path = report_dir / 'pr_beer_reviews_raw_subset'
profile_report = ProfileReport(df_subset, explorative=True)
profile_report.to_file(path.with_suffix('.html'))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=20.0, style=ProgressStyle(descrip…

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)





HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




# Split data

In [10]:
# Separate the label column from the features without modifying df_subset.
X = df_subset.drop(columns=target)
y = df_subset[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Hand the four splits to the project helper, targeting the processed-data directory.
save_sets(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    path=processed_data_dir,
)