# Proyecto de limpieza de datos con dask

In [58]:
import dask.dataframe as dd
from dask.distributed import Client
import pandas as pd

In [59]:
# Creamos el cliente para el cluster
client = Client(n_workers = 2, threads_per_worker=1, memory_limit='1GB')
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 41861 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:41861/status,

0,1
Dashboard: http://127.0.0.1:41861/status,Workers: 2
Total threads: 2,Total memory: 1.86 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:44923,Workers: 2
Dashboard: http://127.0.0.1:41861/status,Total threads: 2
Started: Just now,Total memory: 1.86 GiB

0,1
Comm: tcp://127.0.0.1:38971,Total threads: 1
Dashboard: http://127.0.0.1:37239/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:37243,
Local directory: /tmp/dask-scratch-space/worker-53td4x8v,Local directory: /tmp/dask-scratch-space/worker-53td4x8v

0,1
Comm: tcp://127.0.0.1:44757,Total threads: 1
Dashboard: http://127.0.0.1:35509/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:45933,
Local directory: /tmp/dask-scratch-space/worker-oea0wh35,Local directory: /tmp/dask-scratch-space/worker-oea0wh35


In [70]:
beers = dd.read_csv('data/beer_small.csv', blocksize=20e5)
beers.compute()

Unnamed: 0.1,Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,784200,952,Great Dane Pub & Brewing Company (Downtown),1136269921,4.5,4.0,4.0,dirtylou,American IPA,4.0,4.0,Texas Speedbump IPA,,11846
1,1305265,29,Anheuser-Busch,1234830966,4.5,4.0,3.0,talkinghatrack,Light Lager,3.0,4.0,Bud Light Lime,4.2,41821
2,1526298,45,Brooklyn Brewery,1078599557,4.5,4.0,4.0,PopeJonPaul,Scotch Ale / Wee Heavy,4.0,4.5,Brooklyn Heavy Scotch Ale,7.5,16355
3,450647,590,New Glarus Brewing Company,1288790879,4.5,4.5,4.5,sweemzander,American Wild Ale,4.5,4.0,R&D Bourbon Barrel Kriek,5.5,60588
4,1223094,4,Allagash Brewing Company,1295320417,4.5,4.5,4.0,Jmoore50,American Wild Ale,4.0,4.0,Allagash Victor Francenstein,9.7,56665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15868,1291002,2378,"Kennebec Brewing Company, Inc.",1181265447,2.5,3.0,4.0,BuckSpin,American Stout,2.5,3.0,Gurglin' Sturgeon Stout,4.0,6071
15869,1388483,292,"Kirin Brewery Company, Limited",1202955854,4.5,3.5,3.5,saztheorybook,Happoshu,3.0,4.0,Sparkling Hop,5.0,40030
15870,1535177,1534,Brouwerij Het Anker,1267764634,4.5,4.0,4.0,SShelly,Belgian Strong Pale Ale,4.5,4.5,Cuvée Van De Keizer Rood (Red),10.0,42538
15871,1164804,10099,Dogfish Head Brewery,1322275887,4.5,4.5,4.0,therica,Russian Imperial Stout,4.5,4.5,Miles Davis' Bitches Brew,9.0,59151


In [61]:
print(beers.dtypes)

Unnamed: 0                      int64
brewery_id                      int64
brewery_name          string[pyarrow]
review_time                     int64
review_overall                float64
review_aroma                  float64
review_appearance             float64
review_profilename    string[pyarrow]
beer_style            string[pyarrow]
review_palate                 float64
review_taste                  float64
beer_name             string[pyarrow]
beer_abv                      float64
beer_beerid                     int64
dtype: object


In [62]:
def check_negative(column_name):
    negatives = beers[column_name] < 0
    return negatives.sum().compute()

def check_nan(column_name):
    nan = beers[column_name].isna().sum().compute()
    return nan

for col in beers.columns:
    if check_nan(col) != 0:
        print(f"La columna: {col} tiene valores nulos")
    elif beers[col].dtype == 'int64' or beers[col].dtype == 'float64':
        if check_negative(col) != 0:
            print(f"La columna: {col} tiene valores negativos")

La columna: brewery_name tiene valores nulos
La columna: review_profilename tiene valores nulos
La columna: beer_abv tiene valores nulos


In [63]:
print(beers.dtypes)

Unnamed: 0                      int64
brewery_id                      int64
brewery_name          string[pyarrow]
review_time                     int64
review_overall                float64
review_aroma                  float64
review_appearance             float64
review_profilename    string[pyarrow]
beer_style            string[pyarrow]
review_palate                 float64
review_taste                  float64
beer_name             string[pyarrow]
beer_abv                      float64
beer_beerid                     int64
dtype: object


In [64]:
def shorten_int_float(df):
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        # Como en el analisis the arriba detectamos que no hay columnas con valores negativos entonces procedemos mas facilmente
        dtype = df[col].dtype
        if dtype == 'int64':
            max_val = df[col].max().compute()
            if max_val < 2**8:
                df[col] = df[col].astype('uint8')
            elif max_val < 2**16:
                df[col] = df[col].astype('uint16')
            elif max_val < 2**32:
                df[col] = df[col].astype('uint32')

        elif dtype == 'float64':
            max_val = df[col].max().compute()
            if max_val < 2**16:
                df[col] = df[col].astype('float16')
            elif max_val < 2**32:
                df[col] = df[col].astype('float32')

    return df

beers = shorten_int_float(beers)

In [65]:
def convert_review_time(ddf):

    # Se hace copia de los metadatos del ddf (esto es el esquema de las columnas)
    meta = ddf._meta.copy()

    # Cambiamos el tipo de dato en los metadatos para despues especificar en la lambda function
    meta['review_time'] = pd.to_datetime(meta['review_time'], unit='s')

    # Hacemos una lambda function sobre cada particion del ddf el cual assigna a una columna un tipo de dato diferente pero especificamos los metadatos.
    # Especificamos los metadatos para que a la hora de hacer el cambio de columna los otros tipos de datos no se cambien solos dada la inferencia de tipo de dato de dask.
    return ddf.map_partitions(lambda df: df.assign(review_time=dd.to_datetime(df['review_time'], unit='s')), meta=meta)

beers = convert_review_time(beers)

In [66]:
print(beers.dtypes)

Unnamed: 0                     uint32
brewery_id                     uint16
brewery_name          string[pyarrow]
review_time            datetime64[ns]
review_overall                float16
review_aroma                  float16
review_appearance             float16
review_profilename    string[pyarrow]
beer_style            string[pyarrow]
review_palate                 float16
review_taste                  float16
beer_name             string[pyarrow]
beer_abv                      float16
beer_beerid                    uint32
dtype: object


In [67]:
'''
en el output de arriba donde checamos las columnas obtuvimos lo siguiente:
La columna: brewery_name tiene valores nulos
La columna: review_profilename tiene valores nulos
La columna: beer_abv tiene valores nulos
'''
beers['brewery_name'] = beers['brewery_name'].fillna('N/A')
beers['review_profilename'] = beers['review_profilename'].fillna('N/A')
beers['beer_abv'] = beers['beer_abv'].fillna(-1)

beers.compute()

print(beers.isna().sum().compute())


Unnamed: 0            0
brewery_id            0
brewery_name          0
review_time           0
review_overall        0
review_aroma          0
review_appearance     0
review_profilename    0
beer_style            0
review_palate         0
review_taste          0
beer_name             0
beer_abv              0
beer_beerid           0
dtype: int64


In [68]:
beers.compute()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0.1,Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,784200,952,Great Dane Pub & Brewing Company (Downtown),2006-01-03 06:32:01,4.5,4.0,4.0,dirtylou,American IPA,4.0,4.0,Texas Speedbump IPA,-1.000000,11846
1,1305265,29,Anheuser-Busch,2009-02-17 00:36:06,4.5,4.0,3.0,talkinghatrack,Light Lager,3.0,4.0,Bud Light Lime,4.199219,41821
2,1526298,45,Brooklyn Brewery,2004-03-06 18:59:17,4.5,4.0,4.0,PopeJonPaul,Scotch Ale / Wee Heavy,4.0,4.5,Brooklyn Heavy Scotch Ale,7.500000,16355
3,450647,590,New Glarus Brewing Company,2010-11-03 13:27:59,4.5,4.5,4.5,sweemzander,American Wild Ale,4.5,4.0,R&D Bourbon Barrel Kriek,5.500000,60588
4,1223094,4,Allagash Brewing Company,2011-01-18 03:13:37,4.5,4.5,4.0,Jmoore50,American Wild Ale,4.0,4.0,Allagash Victor Francenstein,9.703125,56665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15868,1291002,2378,"Kennebec Brewing Company, Inc.",2007-06-08 01:17:27,2.5,3.0,4.0,BuckSpin,American Stout,2.5,3.0,Gurglin' Sturgeon Stout,4.000000,6071
15869,1388483,292,"Kirin Brewery Company, Limited",2008-02-14 02:24:14,4.5,3.5,3.5,saztheorybook,Happoshu,3.0,4.0,Sparkling Hop,5.000000,40030
15870,1535177,1534,Brouwerij Het Anker,2010-03-05 04:50:34,4.5,4.0,4.0,SShelly,Belgian Strong Pale Ale,4.5,4.5,Cuvée Van De Keizer Rood (Red),10.000000,42538
15871,1164804,10099,Dogfish Head Brewery,2011-11-26 02:51:27,4.5,4.5,4.0,therica,Russian Imperial Stout,4.5,4.5,Miles Davis' Bitches Brew,9.000000,59151


# Limpieza de datos usando unicamente pandas

In [71]:
beers = pd.read_csv('data/beer_small.csv')
beers

Unnamed: 0.1,Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,784200,952,Great Dane Pub & Brewing Company (Downtown),1136269921,4.5,4.0,4.0,dirtylou,American IPA,4.0,4.0,Texas Speedbump IPA,,11846
1,1305265,29,Anheuser-Busch,1234830966,4.5,4.0,3.0,talkinghatrack,Light Lager,3.0,4.0,Bud Light Lime,4.2,41821
2,1526298,45,Brooklyn Brewery,1078599557,4.5,4.0,4.0,PopeJonPaul,Scotch Ale / Wee Heavy,4.0,4.5,Brooklyn Heavy Scotch Ale,7.5,16355
3,450647,590,New Glarus Brewing Company,1288790879,4.5,4.5,4.5,sweemzander,American Wild Ale,4.5,4.0,R&D Bourbon Barrel Kriek,5.5,60588
4,1223094,4,Allagash Brewing Company,1295320417,4.5,4.5,4.0,Jmoore50,American Wild Ale,4.0,4.0,Allagash Victor Francenstein,9.7,56665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158656,1291002,2378,"Kennebec Brewing Company, Inc.",1181265447,2.5,3.0,4.0,BuckSpin,American Stout,2.5,3.0,Gurglin' Sturgeon Stout,4.0,6071
158657,1388483,292,"Kirin Brewery Company, Limited",1202955854,4.5,3.5,3.5,saztheorybook,Happoshu,3.0,4.0,Sparkling Hop,5.0,40030
158658,1535177,1534,Brouwerij Het Anker,1267764634,4.5,4.0,4.0,SShelly,Belgian Strong Pale Ale,4.5,4.5,Cuvée Van De Keizer Rood (Red),10.0,42538
158659,1164804,10099,Dogfish Head Brewery,1322275887,4.5,4.5,4.0,therica,Russian Imperial Stout,4.5,4.5,Miles Davis' Bitches Brew,9.0,59151


In [73]:
def shorten_int_float(df):
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        # Como en el analisis the arriba detectamos que no hay columnas con valores negativos entonces procedemos mas facilmente
        dtype = df[col].dtype
        if dtype == 'int64':
            max_val = df[col].max()
            if max_val < 2**8:
                df[col] = df[col].astype('uint8')
            elif max_val < 2**16:
                df[col] = df[col].astype('uint16')
            elif max_val < 2**32:
                df[col] = df[col].astype('uint32')

        elif dtype == 'float64':
            max_val = df[col].max()
            if max_val < 2**16:
                df[col] = df[col].astype('float16')
            elif max_val < 2**32:
                df[col] = df[col].astype('float32')

    return df

beers = shorten_int_float(beers)

In [76]:
beers['review_time'] = pd.to_datetime(beers['review_time'], unit='s')
beers['brewery_name'] = beers['brewery_name'].fillna('N/A')
beers['review_profilename'] = beers['review_profilename'].fillna('N/A')
beers['beer_abv'] = beers['beer_abv'].fillna(-1)
print(beers.isna().sum())
beers.dtypes

Unnamed: 0            0
brewery_id            0
brewery_name          0
review_time           0
review_overall        0
review_aroma          0
review_appearance     0
review_profilename    0
beer_style            0
review_palate         0
review_taste          0
beer_name             0
beer_abv              0
beer_beerid           0
dtype: int64


Unnamed: 0                    uint32
brewery_id                    uint16
brewery_name                  object
review_time           datetime64[ns]
review_overall               float16
review_aroma                 float16
review_appearance            float16
review_profilename            object
beer_style                    object
review_palate                float16
review_taste                 float16
beer_name                     object
beer_abv                     float16
beer_beerid                   uint32
dtype: object