### Columnas de Baños

In [1541]:
import pandas as pd
import numpy as np
import re 

In [1542]:
# Path of the listings CSV, relative to the notebook's working directory.
data_location = "./properati.csv"
# Load the whole file into a DataFrame (sep="," is spelled out; it equals the pandas default).
data = pd.read_csv(data_location, sep=",")
# Print the dtype of every column, then display the first three rows.
print(data.dtypes)
data.head(3)

Unnamed: 0                      int64
operation                      object
property_type                  object
place_name                     object
place_with_parent_names        object
country_name                   object
state_name                     object
geonames_id                   float64
lat-lon                        object
lat                           float64
lon                           float64
price                         float64
currency                       object
price_aprox_local_currency    float64
price_aprox_usd               float64
surface_total_in_m2           float64
surface_covered_in_m2         float64
price_usd_per_m2              float64
price_per_m2                  float64
floor                         float64
rooms                         float64
expenses                      float64
properati_url                  object
description                    object
title                          object
image_thumbnail                object
dtype: object

Unnamed: 0.1,Unnamed: 0,operation,property_type,place_name,place_with_parent_names,country_name,state_name,geonames_id,lat-lon,lat,...,surface_covered_in_m2,price_usd_per_m2,price_per_m2,floor,rooms,expenses,properati_url,description,title,image_thumbnail
0,0,sell,PH,Mataderos,|Argentina|Capital Federal|Mataderos|,Argentina,Capital Federal,3430787.0,"-34.6618237,-58.5088387",-34.661824,...,40.0,1127.272727,1550.0,,,,http://www.properati.com.ar/15bo8_venta_ph_mat...,"2 AMBIENTES TIPO CASA PLANTA BAJA POR PASILLO,...",2 AMB TIPO CASA SIN EXPENSAS EN PB,https://thumbs4.properati.com/8/BluUYiHJLhgIIK...
1,1,sell,apartment,La Plata,|Argentina|Bs.As. G.B.A. Zona Sur|La Plata|,Argentina,Bs.As. G.B.A. Zona Sur,3432039.0,"-34.9038831,-57.9643295",-34.903883,...,,,,,,,http://www.properati.com.ar/15bob_venta_depart...,Venta de departamento en décimo piso al frente...,VENTA Depto 2 dorm. a estrenar 7 e/ 36 y 37 ...,https://thumbs4.properati.com/7/ikpVBu2ztHA7jv...
2,2,sell,apartment,Mataderos,|Argentina|Capital Federal|Mataderos|,Argentina,Capital Federal,3430787.0,"-34.6522615,-58.5229825",-34.652262,...,55.0,1309.090909,1309.090909,,,,http://www.properati.com.ar/15bod_venta_depart...,2 AMBIENTES 3ER PISO LATERAL LIVING COMEDOR AM...,2 AMB 3ER PISO CON ASCENSOR APTO CREDITO,https://thumbs4.properati.com/5/SXKr34F_IwG3W_...


In [1543]:
data.shape

(121220, 26)

In [1544]:
# Create a new, empty 'baños' column that will hold the number of bathrooms of each property.
data['baños'] = np.nan

In [1545]:
# Check how many values in 'baños' are null.
data['baños'].isnull().value_counts()

True    121220
Name: baños, dtype: int64

In [1546]:
# Build lower-cased working copies of 'description' and 'title'.
# astype(str) turns a missing value into the literal string 'nan', so these copies hold no nulls.
data['description_copia'] = data['description'].astype(str).str.lower()
data['title_copia']= data['title'].astype(str).str.lower()

In [1547]:
# Check how many values in 'description_copia' are null.
data['description_copia'].isnull().value_counts()

False    121220
Name: description_copia, dtype: int64

In [1548]:
# Check how many values in 'title_copia' are null.
data['title_copia'].isnull().value_counts()

False    121220
Name: title_copia, dtype: int64

#### Búsqueda de baños con expresiones regulares: 

In [1549]:
# Helper to see how many null and non-null values a column has.
def cuenta_valores(columna, df=None):
    """Count null versus non-null entries of one column.

    Parameters
    ----------
    columna : label of the column to inspect.
    df : DataFrame to inspect. When omitted, the notebook-level ``data``
        frame is used (looked up at call time), as before.

    Returns
    -------
    pandas.Series produced by ``isnull().value_counts()``: the index label
    True holds the number of nulls and False the number of non-nulls; a
    label with no rows is absent.
    """
    if df is None:
        df = data
    return df[columna].isnull().value_counts()

In [1550]:
# Function tabla_pivot_baños: pivot table of the values assigned so far to 'baños', split by property type.
def tabla_pivot_baños():
    """Count the rows of the notebook-level ``data`` per 'baños' range and property type.

    'baños' is cut into left-closed intervals ([0, 1), [1, 2), ...); the
    intervals become the rows of the result and the 'property_type' values
    its columns. Each cell is the count of non-null 'baños' values.
    """
    limites = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 30, 40, 50, 1000, 2000]
    rangos = pd.cut(data.baños, bins=limites, right=False)
    return data.pivot_table(
        'baños',
        index=[rangos],
        columns='property_type',
        aggfunc={'baños': 'count'},
    )

In [1551]:
# Digits followed by "baño(s)" in the description, e.g. " 2 baños".
# Raw string: "\s" and "\d" are invalid escapes in a plain literal (SyntaxWarning on
# recent Python). The regex text itself is unchanged.
patron = r"\s(?P<numero>\d[\d]?)\s+baño[s]?"
busca = re.compile(patron)

# One match object per row, or None when nothing matches / the value is not a string.
# isinstance() replaces the `x is np.NaN` test: that alias was removed in NumPy 2.0.
description_series_match = data.description_copia.apply(lambda x: busca.search(x) if isinstance(x, str) else None)

# Fill only rows that are still empty and whose description matched.
mask_description_notnull = (data['baños'].isnull()) & (description_series_match.notnull())
data.loc[mask_description_notnull, 'baños'] = description_series_match[mask_description_notnull].apply(lambda x: x.group('numero')).astype(float)

In [1552]:
cuenta_valores('baños')

True     111067
False     10153
Name: baños, dtype: int64

In [1554]:
# Blank both text copies of the rows just filled; an empty string cannot match any later pattern.
data.loc[mask_description_notnull, ['description_copia', 'title_copia']] = ""

In [1555]:
# Digits followed by "baã±o(s)". "ã±" is what "ñ" becomes when its UTF-8 bytes are decoded
# as Latin-1 and then lower-cased; presumably some listings arrive mis-encoded — confirm
# against the raw data.
# Raw string: "\s" and "\d" are invalid escapes in a plain literal; the regex is unchanged.
patron = r"\s(?P<numero>\d[\d]?)\s+baã±o[s]?"
busca = re.compile(patron)

# Match object per row, or None. isinstance() replaces `x is np.NaN` (removed in NumPy 2.0).
description_series_match = data.description_copia.apply(lambda x: busca.search(x) if isinstance(x, str) else None)

# Fill only rows that are still empty and whose description matched.
mask_description_notnull = (data['baños'].isnull()) & (description_series_match.notnull())
data.loc[mask_description_notnull, 'baños'] = description_series_match[mask_description_notnull].apply(lambda x: x.group('numero')).astype(float)

In [1556]:
cuenta_valores('baños')

True     110519
False     10701
Name: baños, dtype: int64

In [1558]:
# Blank both text copies of the rows just filled; an empty string cannot match any later pattern.
data.loc[mask_description_notnull, ['description_copia', 'title_copia']] = ""

In [1559]:
# Digits followed by "bano(s)" (spelled without the tilde).
# Raw string: "\s" and "\d" are invalid escapes in a plain literal; the regex is unchanged.
patron = r"\s(?P<numero>\d[\d]?)\s+bano[s]?"
busca = re.compile(patron)

# Match object per row, or None. isinstance() replaces `x is np.NaN` (removed in NumPy 2.0).
description_series_match = data.description_copia.apply(lambda x: busca.search(x) if isinstance(x, str) else None)

# Fill only rows that are still empty and whose description matched.
mask_description_notnull = (data['baños'].isnull()) & (description_series_match.notnull())
data.loc[mask_description_notnull, 'baños'] = description_series_match[mask_description_notnull].apply(lambda x: x.group('numero')).astype(float)

In [1560]:
cuenta_valores('baños')

True     110317
False     10903
Name: baños, dtype: int64

In [1562]:
# Blank both text copies of the rows just filled; an empty string cannot match any later pattern.
data.loc[mask_description_notnull, ['description_copia', 'title_copia']] = ""

In [1563]:
# "baño(s)" followed by a colon and digits, e.g. " baños: 2".
# Raw string: "\s" and "\d" are invalid escapes in a plain literal; the regex is unchanged.
patron = r"\sbaño[s]?\s*:\s*(?P<numero>\d[\d]?)"
busca = re.compile(patron)

# Match object per row, or None. isinstance() replaces `x is np.NaN` (removed in NumPy 2.0).
description_series_match = data.description_copia.apply(lambda x: busca.search(x) if isinstance(x, str) else None)

# Fill only rows that are still empty and whose description matched.
mask_description_notnull = (data['baños'].isnull()) & (description_series_match.notnull())
data.loc[mask_description_notnull, 'baños'] = description_series_match[mask_description_notnull].apply(lambda x: x.group('numero')).astype(float)

In [1564]:
cuenta_valores('baños')

True     110131
False     11089
Name: baños, dtype: int64

In [1566]:
# Blank both text copies of the rows just filled; an empty string cannot match any later pattern.
data.loc[mask_description_notnull, ['description_copia', 'title_copia']] = ""

In [1567]:
# "baã±o(s)" followed by a colon and digits. "ã±" is what "ñ" becomes when its UTF-8 bytes
# are decoded as Latin-1 and then lower-cased; presumably some listings arrive mis-encoded —
# confirm against the raw data.
# Raw string: "\s" and "\d" are invalid escapes in a plain literal; the regex is unchanged.
patron = r"\sbaã±o[s]?\s*:\s*(?P<numero>\d[\d]?)"
busca = re.compile(patron)

# Match object per row, or None. isinstance() replaces `x is np.NaN` (removed in NumPy 2.0).
description_series_match = data.description_copia.apply(lambda x: busca.search(x) if isinstance(x, str) else None)

# Fill only rows that are still empty and whose description matched.
mask_description_notnull = (data['baños'].isnull()) & (description_series_match.notnull())
data.loc[mask_description_notnull, 'baños'] = description_series_match[mask_description_notnull].apply(lambda x: x.group('numero')).astype(float)

In [1568]:
cuenta_valores('baños')

True     110125
False     11095
Name: baños, dtype: int64

In [1570]:
# Blank both text copies of the rows just filled; an empty string cannot match any later pattern.
data.loc[mask_description_notnull, ['description_copia', 'title_copia']] = ""

In [1571]:
# "bano(s)" (no tilde) followed by a colon and digits, e.g. " banos: 2".
# Raw string: "\s" and "\d" are invalid escapes in a plain literal; the regex is unchanged.
patron = r"\sbano[s]?\s*:\s*(?P<numero>\d[\d]?)"
busca = re.compile(patron)

# Match object per row, or None. isinstance() replaces `x is np.NaN` (removed in NumPy 2.0).
description_series_match = data.description_copia.apply(lambda x: busca.search(x) if isinstance(x, str) else None)

# Fill only rows that are still empty and whose description matched.
mask_description_notnull = (data['baños'].isnull()) & (description_series_match.notnull())
data.loc[mask_description_notnull, 'baños'] = description_series_match[mask_description_notnull].apply(lambda x: x.group('numero')).astype(float)

In [1572]:
cuenta_valores('baños')

True     110123
False     11097
Name: baños, dtype: int64

In [1574]:
# Blank both text copies of the rows just filled; an empty string cannot match any later pattern.
data.loc[mask_description_notnull, ['description_copia', 'title_copia']] = ""

In [1575]:
# Spelled-out quantity ("un" ... "veinte") followed by "baño(s)" in the description.
patron = r"\s(un|dos|tres|cuatro|cinco|seis|siete|ocho|nueve|diez|once|doce|trece|catorce|quince|dieciseis|diecisiete|dieciocho|diecinueve|veinte)[\s]*baño[s]?"
busca = re.compile(patron)

# Match object per row, or None. isinstance() replaces `x is np.NaN` (removed in NumPy 2.0).
description_series_match = data.description_copia.apply(lambda x: busca.search(x) if isinstance(x, str) else None)
# Rows that are still empty and whose description matched.
mask_description_notnull = (data['baños'].isnull()) & (description_series_match.notnull())
# Group 1 is the number word that was matched.
encontradas = description_series_match[mask_description_notnull].apply(lambda x: x.group(1))

# Number word -> numeric value. The key for 15 used to be misspelled 'quice', so a matched
# "quince" mapped to NaN; it now agrees with the regex alternatives.
num_pisos_mapper = {
    'un': 1, 'dos': 2, 'tres': 3, 'cuatro': 4, 'cinco': 5, 'seis': 6,
    'siete': 7, 'ocho': 8, 'nueve': 9, 'diez': 10,
    'once': 11, 'doce': 12, 'trece': 13, 'catorce': 14, 'quince': 15, 'dieciseis': 16,
    'diecisiete': 17, 'dieciocho': 18, 'diecinueve': 19,
    'veinte': 20,
}
encontradas1 = encontradas.map(num_pisos_mapper)

In [1576]:
# Write the mapped numeric values into 'baños' for the rows selected by the mask.
data.loc[mask_description_notnull,'baños']= encontradas1.astype(float)

In [1577]:
cuenta_valores('baños')

True     100426
False     20794
Name: baños, dtype: int64

In [1579]:
# Blank both text copies of the rows selected by the mask; an empty string cannot match any later pattern.
data.loc[mask_description_notnull, ['description_copia', 'title_copia']] = ""

In [1580]:
# Spelled-out quantity followed by "baã±o(s)". "ã±" is what "ñ" becomes when its UTF-8 bytes
# are decoded as Latin-1 and then lower-cased; presumably some listings arrive mis-encoded —
# confirm against the raw data.
patron = r"\s(un|dos|tres|cuatro|cinco|seis|siete|ocho|nueve|diez|once|doce|trece|catorce|quince|dieciseis|diecisiete|dieciocho|diecinueve|veinte)[\s]*baã±o[s]?"
busca = re.compile(patron)

# Match object per row, or None. isinstance() replaces `x is np.NaN` (removed in NumPy 2.0).
description_series_match = data.description_copia.apply(lambda x: busca.search(x) if isinstance(x, str) else None)
# Rows that are still empty and whose description matched.
mask_description_notnull = (data['baños'].isnull()) & (description_series_match.notnull())
# Group 1 is the number word that was matched.
encontradas = description_series_match[mask_description_notnull].apply(lambda x: x.group(1))

# Number word -> numeric value. The key for 15 used to be misspelled 'quice', so a matched
# "quince" mapped to NaN; it now agrees with the regex alternatives.
num_pisos_mapper = {
    'un': 1, 'dos': 2, 'tres': 3, 'cuatro': 4, 'cinco': 5, 'seis': 6,
    'siete': 7, 'ocho': 8, 'nueve': 9, 'diez': 10,
    'once': 11, 'doce': 12, 'trece': 13, 'catorce': 14, 'quince': 15, 'dieciseis': 16,
    'diecisiete': 17, 'dieciocho': 18, 'diecinueve': 19,
    'veinte': 20,
}
encontradas1 = encontradas.map(num_pisos_mapper)

In [1581]:
# Write the mapped numeric values into 'baños' for the rows selected by the mask.
data.loc[mask_description_notnull,'baños']= encontradas1.astype(float)

In [1582]:
cuenta_valores('baños')

True     99711
False    21509
Name: baños, dtype: int64

In [1584]:
# Blank both text copies of the rows selected by the mask; an empty string cannot match any later pattern.
data.loc[mask_description_notnull, ['description_copia', 'title_copia']] = ""

In [1585]:
# Spelled-out quantity ("un" ... "veinte") followed by "bano(s)" (no tilde).
patron = r"\s(un|dos|tres|cuatro|cinco|seis|siete|ocho|nueve|diez|once|doce|trece|catorce|quince|dieciseis|diecisiete|dieciocho|diecinueve|veinte)[\s]*bano[s]?"
busca = re.compile(patron)

# Match object per row, or None. isinstance() replaces `x is np.NaN` (removed in NumPy 2.0).
description_series_match = data.description_copia.apply(lambda x: busca.search(x) if isinstance(x, str) else None)
# Rows that are still empty and whose description matched.
mask_description_notnull = (data['baños'].isnull()) & (description_series_match.notnull())
# Group 1 is the number word that was matched.
encontradas = description_series_match[mask_description_notnull].apply(lambda x: x.group(1))

# Number word -> numeric value. The key for 15 used to be misspelled 'quice', so a matched
# "quince" mapped to NaN; it now agrees with the regex alternatives.
num_pisos_mapper = {
    'un': 1, 'dos': 2, 'tres': 3, 'cuatro': 4, 'cinco': 5, 'seis': 6,
    'siete': 7, 'ocho': 8, 'nueve': 9, 'diez': 10,
    'once': 11, 'doce': 12, 'trece': 13, 'catorce': 14, 'quince': 15, 'dieciseis': 16,
    'diecisiete': 17, 'dieciocho': 18, 'diecinueve': 19,
    'veinte': 20,
}
encontradas1 = encontradas.map(num_pisos_mapper)

In [1586]:
# Write the mapped numeric values into 'baños' for the rows selected by the mask.
data.loc[mask_description_notnull,'baños']= encontradas1.astype(float)

In [1587]:
cuenta_valores('baños')

True     99557
False    21663
Name: baños, dtype: int64

In [1589]:
# Blank both text copies of the rows selected by the mask; an empty string cannot match any later pattern.
data.loc[mask_description_notnull, ['description_copia', 'title_copia']] = ""

In [1590]:
# For rows still empty, use the number of times "baño" appears in the description.
# NOTE(review): this counts mentions, which is only a proxy for the number of bathrooms.
patron = "baño"
# Vectorised count; .str.count takes a regex, so the literal is escaped. This also removes
# the `x is np.NaN` test, whose alias no longer exists in NumPy 2.0.
description_series_match = data.description_copia.str.count(re.escape(patron))
mascarabanos = (data['baños'].isnull())&(description_series_match > 0)
data.loc[mascarabanos,'baños'] = description_series_match[mascarabanos]

In [1591]:
cuenta_valores('baños')

False    80849
True     40371
Name: baños, dtype: int64

In [1593]:
# Blank both text copies of the rows just filled; an empty string contains none of the later search terms.
data.loc[mascarabanos, ['description_copia', 'title_copia']] = ""

In [1594]:
# Same count for "baã±o": "ã±" is what "ñ" becomes when its UTF-8 bytes are decoded as
# Latin-1 and then lower-cased; presumably some listings arrive mis-encoded — confirm.
# NOTE(review): this counts mentions, which is only a proxy for the number of bathrooms.
patron = "baã±o"
# Vectorised count; .str.count takes a regex, so the literal is escaped. This also removes
# the `x is np.NaN` test, whose alias no longer exists in NumPy 2.0.
description_series_match = data.description_copia.str.count(re.escape(patron))
mascarabanos = (data['baños'].isnull())&(description_series_match > 0)
data.loc[mascarabanos,'baños'] = description_series_match[mascarabanos]

In [1595]:
cuenta_valores('baños')

False    82726
True     38494
Name: baños, dtype: int64

In [1597]:
# Blank both text copies of the rows just filled; an empty string contains none of the later search terms.
data.loc[mascarabanos, ['description_copia', 'title_copia']] = ""

In [1598]:
# Same count for " bano " (no tilde); the surrounding spaces are part of the search term.
# NOTE(review): this counts mentions, which is only a proxy for the number of bathrooms.
patron = " bano "
# Vectorised count; .str.count takes a regex, so the literal is escaped. This also removes
# the `x is np.NaN` test, whose alias no longer exists in NumPy 2.0.
description_series_match = data.description_copia.str.count(re.escape(patron))
mascarabanos = (data['baños'].isnull())&(description_series_match > 0)
data.loc[mascarabanos,'baños'] = description_series_match[mascarabanos]

In [1599]:
cuenta_valores('baños')

False    83584
True     37636
Name: baños, dtype: int64

In [1601]:
# Blank both text copies of the rows just filled; an empty string contains none of the later search terms.
data.loc[mascarabanos, ['description_copia', 'title_copia']] = ""

In [1602]:
# For rows still empty, use the number of times "sanitario" appears in the description.
# NOTE(review): this counts mentions, which is only a proxy for the number of bathrooms.
patron = "sanitario"
# Vectorised count; .str.count takes a regex, so the literal is escaped. This also removes
# the `x is np.NaN` test, whose alias no longer exists in NumPy 2.0.
description_series_match = data.description_copia.str.count(re.escape(patron))
mascarabanos = (data['baños'].isnull())&(description_series_match > 0)
data.loc[mascarabanos,'baños'] = description_series_match[mascarabanos]

In [1603]:
cuenta_valores('baños')

False    84290
True     36930
Name: baños, dtype: int64

In [1605]:
# Blank both text copies of the rows just filled; an empty string contains none of the later search terms.
data.loc[mascarabanos, ['description_copia', 'title_copia']] = ""

In [1606]:
# For rows still empty, use the number of times "toilette" appears in the description.
# NOTE(review): this counts mentions, which is only a proxy for the number of bathrooms.
patron = "toilette"
# Vectorised count; .str.count takes a regex, so the literal is escaped. This also removes
# the `x is np.NaN` test, whose alias no longer exists in NumPy 2.0.
description_series_match = data.description_copia.str.count(re.escape(patron))
mascarabanos = (data['baños'].isnull())&(description_series_match > 0)
data.loc[mascarabanos,'baños'] = description_series_match[mascarabanos]

In [1607]:
cuenta_valores('baños')

False    86744
True     34476
Name: baños, dtype: int64

In [1609]:
# Blank both text copies of the rows just filled; an empty string contains none of the later search terms.
data.loc[mascarabanos, ['description_copia', 'title_copia']] = ""

In [1610]:
# For rows still empty, use the number of times "inodoro" appears in the description.
# NOTE(review): this counts mentions, which is only a proxy for the number of bathrooms.
patron = "inodoro"
# Vectorised count; .str.count takes a regex, so the literal is escaped. This also removes
# the `x is np.NaN` test, whose alias no longer exists in NumPy 2.0.
description_series_match = data.description_copia.str.count(re.escape(patron))
mascarabanos = (data['baños'].isnull())&(description_series_match > 0)
data.loc[mascarabanos,'baños'] = description_series_match[mascarabanos]

In [1611]:
cuenta_valores('baños')

False    86769
True     34451
Name: baños, dtype: int64

In [1613]:
# Blank both text copies of the rows just filled; an empty string cannot match the title searches that follow.
data.loc[mascarabanos, ['description_copia', 'title_copia']] = ""

#### Ahora busco con las mismas regex pero con la columna 'title_copia'

In [1614]:
# Digits followed by "baño(s)", this time searched in the lower-cased title.
# Raw string: "\s" and "\d" are invalid escapes in a plain literal; the regex is unchanged.
patron = r"\s(?P<numero>\d[\d]?)\s+baño[s]?"
busca = re.compile(patron)

# Match object per row, or None. isinstance() replaces `x is np.NaN` (removed in NumPy 2.0).
# The variable names are kept from the description cells although the source is 'title_copia'.
description_series_match = data.title_copia.apply(lambda x: busca.search(x) if isinstance(x, str) else None)

# Fill only rows that are still empty and whose title matched.
mask_description_notnull = (data['baños'].isnull()) & (description_series_match.notnull())
data.loc[mask_description_notnull, 'baños'] = description_series_match[mask_description_notnull].apply(lambda x: x.group('numero')).astype(float)

In [1615]:
cuenta_valores('baños')

False    86803
True     34417
Name: baños, dtype: int64

In [1616]:
# Blank both text copies of the rows just filled; an empty string cannot match the next search.
data.loc[mask_description_notnull, ['description_copia', 'title_copia']] = ""

In [1617]:
# For rows still empty, use the number of times "baño" appears in the lower-cased title.
# NOTE(review): this counts mentions, which is only a proxy for the number of bathrooms.
patron = "baño"
# Vectorised count; .str.count takes a regex, so the literal is escaped. This also removes
# the `x is np.NaN` test, whose alias no longer exists in NumPy 2.0.
description_series_match = data.title_copia.str.count(re.escape(patron))
mascarabanos = (data['baños'].isnull())&(description_series_match > 0)
data.loc[mascarabanos,'baños'] = description_series_match[mascarabanos]

In [1618]:
cuenta_valores('baños')

False    86834
True     34386
Name: baños, dtype: int64

In [1619]:
# Blank both text copies of the rows just filled.
data.loc[mascarabanos, ['description_copia', 'title_copia']] = ""

#### Imputación de NULLS

In [1628]:
# Rows whose 'baños' is still null and whose property type is 'PH'.
mask1 = (data['baños'].isnull()) & (data['property_type'] == 'PH')

In [1638]:
# Impute 1 for the rows selected by mask1.
data.loc[mask1, 'baños'] = 1

In [1640]:
# Rows whose 'baños' is still null and whose property type is 'apartment'.
mask2 = (data['baños'].isnull()) & (data['property_type'] == 'apartment')

In [1641]:
# Impute 1 for the rows selected by mask2.
data.loc[mask2, 'baños'] = 1

In [1642]:
# Rows whose 'baños' is still null and whose property type is 'house'.
mask3 = (data['baños'].isnull()) & (data['property_type'] == 'house')

In [1643]:
# Impute 1 for the rows selected by mask3.
data.loc[mask3, 'baños'] = 1

In [1644]:
# Rows whose 'baños' is still null and whose property type is 'store'.
mask4 = (data['baños'].isnull()) & (data['property_type'] == 'store')

In [1645]:
# Impute 1 for the rows selected by mask4.
data.loc[mask4, 'baños'] = 1

In [1648]:
tabla_pivot_baños()

property_type,PH,apartment,house,store
baños,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[0, 1)",0,0,0,0
"[1, 2)",4607,58585,24290,3321
"[2, 3)",948,9803,10724,629
"[3, 4)",147,1890,3506,111
"[4, 5)",39,506,1133,47
"[5, 6)",10,176,377,15
"[6, 7)",0,80,122,7
"[7, 8)",0,4,60,1
"[8, 9)",0,5,14,2
"[9, 15)",0,8,19,3
