## Data Cleaning for reviews.csv

In [1]:
# Dependencies and packages
%reload_ext lab_black

import os
import pandas as pd
import numpy as np
import math as math

In [2]:
# Build the path to the raw reviews export from its components so that
# os.path.join actually does the joining (a single pre-joined string makes the
# call a no-op), then load the file into a DataFrame.
csv_path = os.path.join("..", "data", "csv", "reviews.csv")
df_reviews = pd.read_csv(csv_path)

In [3]:
# Preview the first rows of the raw reviews frame
df_reviews.head()

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score
0,271781,bluejacket74,2017-03-17,"750 ml bottle, 2016 vintage, bottle #304 of...",4.0,4.0,4.0,4.25,4.0,4.03
1,125646,_dirty_,2017-12-21,,4.5,4.5,4.5,4.5,4.5,4.5
2,125646,CJDUBYA,2017-12-21,,4.75,4.75,4.75,4.75,4.75,4.75
3,125646,GratefulBeerGuy,2017-12-20,0% 16 oz can. Funny story: As I finally wal...,4.75,4.75,4.5,4.5,4.5,4.58
4,125646,LukeGude,2017-12-20,Classic TH NEIPA. Overflowing head and bouq...,4.25,4.5,4.25,4.25,4.25,4.31


In [4]:
# Report the (rows, columns) shape of the raw reviews frame before any cleaning
print(df_reviews.shape)

(9073128, 10)


In [5]:
# Count the reviews per beer_id; the length of the resulting Series is the
# number of unique beers in reviews.csv
df_reviews["beer_id"].value_counts()

11757     17160
2093      15947
7971      14927
1093      14915
29619     14292
          ...  
203777        1
126326        1
109950        1
282050        1
8188          1
Name: beer_id, Length: 309542, dtype: int64

In [6]:
# Count the reviews per username; the length of the resulting Series is the
# number of unique reviewers
df_reviews["username"].value_counts()

Sammy            13798
kylehay2004      12221
acurtis          12016
StonedTrippin    11859
jaydoc           11800
                 ...  
Adolfou812           1
Apoda_01             1
BryceCamp18          1
JDM1856              1
jonesra              1
Name: username, Length: 164934, dtype: int64

In [7]:
# For every column, tally how many entries are missing (True) versus
# present (False) in the raw reviews frame
df_reviews.apply(lambda column: column.isna().value_counts())

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score
False,9073128.0,9069313,9073128.0,9073128.0,5283110,5283110,5283110,5283110,5283110,9073128.0
True,,3815,,,3790018,3790018,3790018,3790018,3790018,


In [23]:
# Work on a copy so the raw df_reviews frame stays untouched by the cleaning steps
df_clean = df_reviews.copy()

In [24]:
# Keep only the beers that have more than MIN_REVIEWS_PER_BEER reviews
# (i.e. 21 or more); rarely-reviewed beers are dropped.
MIN_REVIEWS_PER_BEER = 20
reviews_per_beer = df_clean.groupby("beer_id")["beer_id"].transform("count")
df_clean = df_clean[reviews_per_beer > MIN_REVIEWS_PER_BEER].copy()
df_clean.head()

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score
1,125646,_dirty_,2017-12-21,,4.5,4.5,4.5,4.5,4.5,4.5
2,125646,CJDUBYA,2017-12-21,,4.75,4.75,4.75,4.75,4.75,4.75
3,125646,GratefulBeerGuy,2017-12-20,0% 16 oz can. Funny story: As I finally wal...,4.75,4.75,4.5,4.5,4.5,4.58
4,125646,LukeGude,2017-12-20,Classic TH NEIPA. Overflowing head and bouq...,4.25,4.5,4.25,4.25,4.25,4.31
5,125646,jneff33,2017-12-20,,4.5,4.75,5.0,5.0,5.0,4.91


In [25]:
# The algorithm below reduces the memory size of a dataframe
def _is_whole_number(values):
    """Return True when every entry of ``values`` is finite with no fractional part."""
    if values.dtype.kind in "biu":  # bool / signed / unsigned are integral by definition
        return True
    if not np.isfinite(values).all():
        return False
    # Element-wise comparison: a sum of differences would let +0.5 and -0.5 cancel.
    return bool((values == np.trunc(values)).all())


def _smallest_int_dtype(mn, mx):
    """Return the narrowest NumPy integer dtype that holds [mn, mx], or None if none does."""
    # Plain Python scalars compare exactly against the (Python int) iinfo bounds.
    lo = mn.item() if hasattr(mn, "item") else mn
    hi = mx.item() if hasattr(mx, "item") else mx
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        # Bounds are inclusive: e.g. a maximum of exactly 255 still fits uint8.
        if bounds.min <= lo and hi <= bounds.max:
            return candidate
    return None  # NaN range (empty column) or values outside every integer dtype


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes, in place.

    For every NumPy-backed bool/integer/float column:

    * if the column contains non-finite values, its name is recorded and NaN
      entries are replaced by the sentinel ``column.min() - 1``;
    * if every value is a whole number, the column is cast to the narrowest
      unsigned (no negatives) or signed integer dtype covering its range,
      measured *after* the sentinel has been written;
    * otherwise the column is cast to ``float32``.

    All other columns (strings, datetimes, categoricals, extension dtypes) are
    left untouched. Progress and before/after memory usage are printed.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. Its columns are reassigned, so the caller's object is
        modified.

    Returns
    -------
    tuple
        ``(props, NAlist)``: the same frame object, and the list of column
        names that contained non-finite values.
    """
    start_mem_usg = props.memory_usage().sum() / 1024 ** 2
    print("Memory usage of properties dataframe is :", start_mem_usg, " MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        col_dtype = props[col].dtype
        # Only plain numeric columns can be downcast; skip everything else.
        if not (isinstance(col_dtype, np.dtype) and col_dtype.kind in "biuf"):
            continue

        # Print current column type
        print("******************************")
        print("Column: ", col)
        print("dtype before: ", col_dtype)

        # Integer does not support NA, therefore, NA needs to be filled
        if not np.isfinite(props[col]).all():
            NAlist.append(col)
            # Assign back instead of a chained inplace fillna, which does not
            # update the frame when pandas copy-on-write is active.
            props[col] = props[col].fillna(props[col].min() - 1)

        # Measure the range after filling so the sentinel (min - 1) is part of
        # it; otherwise a column with min 0 would be cast to an unsigned type
        # that cannot represent the -1 sentinel.
        mx = props[col].max()
        mn = props[col].min()

        if _is_whole_number(props[col]):
            new_dtype = _smallest_int_dtype(mn, mx)
            if new_dtype is None and props[col].dtype.kind == "f":
                # Whole-valued floats that fit no integer dtype still shrink to float32.
                new_dtype = np.float32
        else:
            # Make float datatypes 32 bit
            new_dtype = np.float32

        if new_dtype is not None:
            props[col] = props[col].astype(new_dtype)

        # Print new column type
        print("dtype after: ", props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024 ** 2
    print("Memory usage is: ", mem_usg, " MB")
    print("This is ", 100 * mem_usg / start_mem_usg, "% of the initial size")
    return props, NAlist

In [26]:
# Downcast the numeric columns of df_clean. reduce_mem_usage returns the very
# frame it was given, so `props` and `df_clean` are the same object afterwards.
props, NAlist = reduce_mem_usage(df_clean)

# Report which columns had their missing values replaced by a sentinel.
separator = "_________________"
report_lines = (
    separator,
    "",
    "Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ",
    separator,
    "",
)
for report_line in report_lines:
    print(report_line)
print(NAlist)

Memory usage of properties dataframe is : 675.1208877563477  MB
******************************
Column:  beer_id
dtype before:  int64
dtype after:  uint32
******************************
******************************
Column:  look
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  smell
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  taste
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  feel
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  overall
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  score
dtype before:  float64
dtype after:  float32
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  460.30969619750977  MB
This 

In [27]:
# Take an independent copy of the downcast frame so that later changes to
# df_clean do not touch props
df_clean = props.copy()

In [28]:
# Display the downcast frame (rendered as a truncated head/tail view)
df_clean

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score
1,125646,_dirty_,2017-12-21,,4.50,4.50,4.50,4.50,4.50,4.50
2,125646,CJDUBYA,2017-12-21,,4.75,4.75,4.75,4.75,4.75,4.75
3,125646,GratefulBeerGuy,2017-12-20,0% 16 oz can. Funny story: As I finally wal...,4.75,4.75,4.50,4.50,4.50,4.58
4,125646,LukeGude,2017-12-20,Classic TH NEIPA. Overflowing head and bouq...,4.25,4.50,4.25,4.25,4.25,4.31
5,125646,jneff33,2017-12-20,,4.50,4.75,5.00,5.00,5.00,4.91
...,...,...,...,...,...,...,...,...,...,...
9073123,104824,CTJman,2014-06-08,,0.00,0.00,0.00,0.00,0.00,4.00
9073124,104824,IMSPEAKNOENGLISH,2014-06-06,,0.00,0.00,0.00,0.00,0.00,5.00
9073125,104824,twizzard,2014-06-05,,0.00,0.00,0.00,0.00,0.00,5.00
9073126,104824,bootdown21,2014-06-04,,0.00,0.00,0.00,0.00,0.00,4.50


In [29]:
# Discard the review date and the per-aspect ratings
unused_columns = ["date", "look", "smell", "taste", "feel", "overall"]
df_clean = df_clean.drop(columns=unused_columns)

In [30]:
# Discard the free-text review body as well
df_clean = df_clean.drop(columns=["text"])

In [31]:
# Display the trimmed frame (rendered as a truncated head/tail view)
df_clean

Unnamed: 0,beer_id,username,score
1,125646,_dirty_,4.50
2,125646,CJDUBYA,4.75
3,125646,GratefulBeerGuy,4.58
4,125646,LukeGude,4.31
5,125646,jneff33,4.91
...,...,...,...
9073123,104824,CTJman,4.00
9073124,104824,IMSPEAKNOENGLISH,5.00
9073125,104824,twizzard,5.00
9073126,104824,bootdown21,4.50


In [33]:
# Persist the cleaned reviews. The path is built from its components so that
# os.path.join actually does the joining (a single pre-joined string makes the
# call a no-op).
# NOTE(review): index=False is not passed, so the DataFrame index is written as
# an extra leading column - confirm that downstream readers expect it.
df_clean.to_csv(os.path.join("..", "data", "csv", "clean_reviews.csv"))

In [32]:
# Count the reviews per beer_id after filtering; the length of the resulting
# Series is the number of beers that were kept
df_clean["beer_id"].value_counts()

11757     17160
2093      15947
7971      14927
1093      14915
29619     14292
          ...  
42801        21
46036        21
19232        21
360409       21
244820       21
Name: beer_id, Length: 41463, dtype: int64

In [34]:
# After cleaning, 41463 unique beers remain (down from 309542 before cleaning).

In [35]:
# Summarise the index, column dtypes and memory footprint of the cleaned frame
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8044495 entries, 1 to 9073127
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   beer_id   uint32 
 1   username  object 
 2   score     float32
dtypes: float32(1), object(1), uint32(1)
memory usage: 184.1+ MB


In [36]:
# For every remaining column, tally how many entries are missing (True)
# versus present (False) in the cleaned frame
df_clean.apply(lambda column: column.isna().value_counts())

Unnamed: 0,beer_id,username,score
False,8044495.0,8040830,8044495.0
True,,3665,
