In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show every column when a DataFrame is displayed (e.g. with .head()), so no column is elided.
# The equivalent option for rows is kept below but disabled.
# source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
# source: https://gist.github.com/rozanecm/38f2901c592bdffc40726cb0473318cf
# Function which plays a beep of given duration and frequency.
# Useful for when executing things that need a while to finish, to get notified.
import os
def beep(duration = 0.6, freq = 200):
    """Play a sine tone by shelling out to the external `play` command.

    duration -- length of the tone in seconds.
    freq     -- frequency of the tone in Hz.
    """
    tone_args = (duration, freq)
    command = 'play --no-show-progress --null --channels 1 synth %s sine %f' % tone_args
    os.system(command)

In [4]:
# Load the training set with compact dtypes.
# lat/lng are read as float32 instead of float16: half precision only resolves
# 2**-6 degrees for magnitudes in [16, 32) and 2**-4 degrees in [64, 128), which
# is far too coarse for coordinates.
# idzona is read as float32 as well: float16 represents integers exactly only up
# to 2048 and overflows to inf above 65504 (presumably idzona is an identifier —
# confirm its value range).
train = pd.read_csv(
    '../data/train.csv',
    dtype={
        'tipodepropiedad': 'category',
        'ciudad': 'category',
        'provincia': 'category',
        'id': 'int32',
        'antiguedad': 'float16',
        'habitaciones': 'float16',
        'garages': 'float16',
        'banos': 'float16',
        'metroscubiertos': 'float16',
        'metrostotales': 'float16',
        'idzona': 'float32',
        'lat': 'float32',
        'lng': 'float32',
        'gimnasio': 'bool',
        'usosmultiples': 'bool',
        'piscina': 'bool',
        'escuelascercanas': 'bool',
        'centroscomercialescercanos': 'bool',
    },
    parse_dates=['fecha'],
)

In [5]:
# Load the test set with compact dtypes.
# lat/lng are read as float32 instead of float16: half precision only resolves
# 2**-6 degrees for magnitudes in [16, 32) and 2**-4 degrees in [64, 128), which
# is far too coarse for coordinates.
# idzona is read as float32 as well: float16 represents integers exactly only up
# to 2048 and overflows to inf above 65504 (presumably idzona is an identifier —
# confirm its value range).
test = pd.read_csv(
    '../data/test.csv',
    dtype={
        'tipodepropiedad': 'category',
        'ciudad': 'category',
        'provincia': 'category',
        'id': 'int32',
        'antiguedad': 'float16',
        'habitaciones': 'float16',
        'garages': 'float16',
        'banos': 'float16',
        'metroscubiertos': 'float16',
        'metrostotales': 'float16',
        'idzona': 'float32',
        'lat': 'float32',
        'lng': 'float32',
        'gimnasio': 'bool',
        'usosmultiples': 'bool',
        'piscina': 'bool',
        'escuelascercanas': 'bool',
        'centroscomercialescercanos': 'bool',
    },
    parse_dates=['fecha'],
)

# Agregamos features creados a partir de la descripción

In [6]:
# Precomputed description-derived features for the training set.
train_desc = pd.read_csv('../data/train_desc_features.csv')

In [7]:
# Precomputed description-derived features for the test set.
test_desc = pd.read_csv('../data/test_desc_features.csv')

In [8]:
# Precomputed "simple text features" file for the training set.
train_new_feat = pd.read_csv('../data/train_simple_text_features.csv')

In [9]:
# Precomputed "simple text features" file for the test set.
test_new_feat = pd.read_csv('../data/test_simple_text_features.csv')

In [10]:
#train_kmeans_tfidf = pd.read_csv('../data/train_kmeans_tfidf_cluster.csv')

In [11]:
#test_kmeans_tfidf = pd.read_csv('../data/test_kmeans_tfidf_cluster.csv')

In [12]:
#train_kmeans_glove = pd.read_csv('../data/train_kmeans_glove_cluster.csv')

In [13]:
#test_kmeans_glove = pd.read_csv('../data/test_kmeans_glove_cluster.csv')

In [14]:
# Append the description features as new columns. Rows are paired by index label,
# not by 'id' — assumes train_desc has the same rows in the same order as train (TODO confirm).
train = pd.concat([train, train_desc], axis=1)

In [15]:
# Append the description features as new columns. Rows are paired by index label,
# not by 'id' — assumes test_desc has the same rows in the same order as test (TODO confirm).
test = pd.concat([test, test_desc], axis=1)

In [16]:
# Append the simple text features as new columns. Rows are paired by index label —
# assumes train_new_feat follows the same row order as train (TODO confirm).
train = pd.concat([train, train_new_feat], axis=1)

In [17]:
# Append the simple text features as new columns. Rows are paired by index label —
# assumes test_new_feat follows the same row order as test (TODO confirm).
test = pd.concat([test, test_new_feat], axis=1)

In [18]:
#train = pd.concat([train, train_kmeans_tfidf], axis=1)

In [19]:
#test = pd.concat([test, test_kmeans_tfidf], axis=1)

In [20]:
#train = pd.concat([train, train_kmeans_glove], axis=1)

In [21]:
#test = pd.concat([test, test_kmeans_glove], axis=1)

# Corrección de los datos faltantes y/o NaNs

In [22]:
# Auxiliary tables used below to fill in missing values:
# - rescued_coords: extra lat/lng values, merged into train on 'id'.
# - imputations/*.csv: one table per numeric column; each one is merged on 'id'
#   by fillna_with_models_predictions to fill the NaNs of that column.
rescued_coords = pd.read_csv('../data/rescueLatLongs.csv')
rescued_antiguedad = pd.read_csv('../data/imputations/antiguedad.csv')
rescued_banos = pd.read_csv('../data/imputations/banos.csv')
rescued_garages = pd.read_csv('../data/imputations/garages.csv')
rescued_habitaciones = pd.read_csv('../data/imputations/habitaciones.csv')
rescued_metroscubiertos = pd.read_csv('../data/imputations/metroscubiertos.csv')
rescued_metrostotales = pd.read_csv('../data/imputations/metrostotales.csv')

In [23]:
# Merge in the extra coordinates obtained in TP1 and use them only where the
# original value is missing.
train = train.merge(rescued_coords.drop('Unnamed: 0', axis=1), how='left', on='id')
# After the merge the original columns carry the _x suffix and the rescued ones _y.
# Series.fillna with another Series coalesces both columns without a slow row-wise apply.
train['lat_x'] = train['lat_x'].fillna(train['lat_y'])
train['lng_x'] = train['lng_x'].fillna(train['lng_y'])
train = train.drop(['lat_y', 'lng_y'], axis=1)
train = train.rename(columns={'lat_x': 'lat', 'lng_x': 'lon'})

# For consistency, so that both datasets use the same column names.
test = test.rename(columns={'lng': 'lon'})

In [24]:
# Out-of-range coordinates become NaN, which is better than dropping the whole row.
# A point is out of range when its latitude falls outside 14..33 or its longitude
# outside 86..118. The previous masks ('> 14 OR < 33', '> 86 OR < 118') were true
# for every non-missing value and therefore wiped all coordinates.
# NOTE(review): the longitude bounds are written as magnitudes, so |lon| is compared
# in order to accept either sign convention — confirm which one the data uses.
lat_out_of_range = (train['lat'] < 14) | (train['lat'] > 33)
lon_out_of_range = (train['lon'].abs() < 86) | (train['lon'].abs() > 118)
train.loc[lat_out_of_range | lon_out_of_range, ['lat', 'lon']] = np.nan

In [25]:
# Infinite values carry no meaning for these features, so treat them as missing (NaN).
train.replace([np.inf, -np.inf], np.nan, inplace=True)

In [26]:
# Out-of-range coordinates of the test set become NaN.
# The mask is built from test itself: the previous version indexed test with a mask
# computed on train, and its conditions ('> 14 OR < 33', '> 86 OR < 118') were true
# for every non-missing value.
# NOTE(review): the longitude bounds are written as magnitudes, so |lon| is compared
# in order to accept either sign convention — confirm which one the data uses.
test_lat_out_of_range = (test['lat'] < 14) | (test['lat'] > 33)
test_lon_out_of_range = (test['lon'].abs() < 86) | (test['lon'].abs() > 118)
test.loc[test_lat_out_of_range | test_lon_out_of_range, ['lat', 'lon']] = np.nan

In [27]:
# Treat +/-inf in the test set as missing values (NaN).
test.replace([np.inf, -np.inf], np.nan, inplace=True)

In [28]:
def fillna_with_models_predictions(df, predictions_df, col_name):
    indicadora_name = "tiene_" + col_name
    df[indicadora_name] = df[col_name].notna()
    
    df = df.merge(predictions_df, how='left', on='id')
    original_col = col_name + "_x"
    filler_col = col_name + "_y"
    df[col_name] = df.apply(lambda x: x[filler_col] if pd.isna(x[original_col]) else x[original_col], axis=1)
    df.drop([original_col,filler_col], axis=1, inplace=True)
    
    return df

In [29]:
def fill_na_values(df):
    """Fill the NaNs of every imputable column of `df` and return the result.

    Each column is filled, in the order listed, from its table of predictions
    via fillna_with_models_predictions.
    """
    imputations = (
        (rescued_antiguedad, 'antiguedad'),
        (rescued_banos, 'banos'),
        (rescued_garages, 'garages'),
        (rescued_habitaciones, 'habitaciones'),
        (rescued_metroscubiertos, 'metroscubiertos'),
        (rescued_metrostotales, 'metrostotales'),
    )
    for predictions, column in imputations:
        df = fillna_with_models_predictions(df, predictions, column)
    return df

In [30]:
from multiprocessing import  Pool

def parallelize_dataframe(df, func, n_cores):
    """Apply `func` to row chunks of `df` in parallel and concatenate the results.

    df      -- DataFrame to process.
    func    -- callable that receives one chunk (a DataFrame) and returns a DataFrame;
               it is sent to the workers through Pool.map.
    n_cores -- number of chunks and of worker processes.

    Returns the per-chunk results concatenated in chunk order.
    """
    # Split by row position. Splitting the positions (instead of the frame itself)
    # yields the same chunk sizes as np.array_split(df, n_cores) without relying on
    # numpy's handling of DataFrames.
    positions = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[chunk_positions] for chunk_positions in positions]

    pool = Pool(n_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()
    return pd.concat(results)

In [31]:
# Impute the missing values of both datasets; each frame is split into 8 chunks
# that are processed by 8 worker processes.
train = parallelize_dataframe(train, fill_na_values, 8)
test = parallelize_dataframe(test, fill_na_values, 8)

# Agregado de features

In [32]:
def contar_amenities(row):
    """Return how many of the five amenity flags are set in `row`.

    row -- mapping-like object (e.g. a DataFrame row) with the boolean entries
           'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas' and
           'centroscomercialescercanos'.
    """
    amenities = ('gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos')
    # sum() starts from the integer 0, so the flags are added as integers. Adding
    # numpy booleans to each other is a logical OR and would never count past True.
    return sum(row[amenity] for amenity in amenities)

# Number of amenities per listing, computed row by row.
train['cant_amenities'] = train.apply(contar_amenities, axis=1)
test['cant_amenities'] = test.apply(contar_amenities, axis=1)

In [33]:
#import itertools
#def two_set_bools(df, bool_features):
#    for combination in itertools.combinations(bool_features,2):
#        if combination[0] != combination[1]:
#            new_feature_name = combination[0] + "_AND_" + combination[1]
#            df[new_feature_name] = df[combination[0]] & df[combination[1]]
            
#            new_feature_name = combination[0] + "_OR_" + combination[1]
#            df[new_feature_name] = df[combination[0]] | df[combination[1]]
            
#            new_feature_name = combination[0] + "_XOR_" + combination[1]
#            df[new_feature_name] = df[combination[0]] ^ df[combination[1]]

#two_set_bools(train, ['gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos'])
#two_set_bools(test, ['gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas', 'centroscomercialescercanos'])

In [34]:
# Share of the total surface that is covered (covered area divided by total area).
train['ratio_cubiertos_totales'] = train['metroscubiertos'].div(train['metrostotales'])
test['ratio_cubiertos_totales'] = test['metroscubiertos'].div(test['metrostotales'])

In [35]:
#def feature_fechas(df):
    # Para entender lo de los senos y cosenos: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/
#    df['year'] = df['fecha'].dt.year
#    df['month'] = df['fecha'].dt.month
#    df['day'] = df['fecha'].dt.day
#    df['sin_month'] = np.sin(2*np.pi*df['month']/12)
#    df['cos_month'] = np.cos(2*np.pi*df['month']/12)
    # tomo cant. de dias en mes: 31 en todos los casos. Para esto deberia servir bastante bien igual.
#    df['sin_day'] = np.sin(2*np.pi*df['day']/31)
#    df['cos_day'] = np.cos(2*np.pi*df['day']/31)
    
#    df['dayofweek'] = df['fecha'].dt.dayofweek
#    df['sin_dayofweek'] = np.sin(2*np.pi*df['dayofweek']/7)
#    df['cos_dayofweek'] = np.cos(2*np.pi*df['dayofweek']/7)
    
#    df['dayofyear'] = df['fecha'].dt.dayofyear
#    df['sin_dayofyear'] = np.sin(2*np.pi*df['dayofyear']/365)
#    df['cos_dayofyear'] = np.cos(2*np.pi*df['dayofyear']/365)
    
    
    #     df['days_in_month'] = df['fecha'].dt.days_in_month
#    df['daysinmonth'] = df['fecha'].dt.daysinmonth
#    df['is_leap_year'] = df['fecha'].dt.is_leap_year
#    df['is_month_end'] = df['fecha'].dt.is_month_end
#    df['is_month_start'] = df['fecha'].dt.is_month_start
#    df['is_quarter_end'] = df['fecha'].dt.is_quarter_end
#    df['is_quarter_start'] = df['fecha'].dt.is_quarter_start
#    df['is_year_end'] = df['fecha'].dt.is_year_end
#    df['is_year_start'] = df['fecha'].dt.is_year_start
    
#     df['week'] = df['fecha'].dt.week
#    df['weekofyear'] = df['fecha'].dt.weekofyear
#    df['sin_weekofyear'] = np.sin(2*np.pi*df['weekofyear']/53)
#    df['cos_weekofyear'] = np.cos(2*np.pi*df['weekofyear']/53)
    
    # no necesito mas las cols. originales de month y day.
#    df.drop(['month','day','dayofweek','dayofyear','weekofyear'], axis=1, inplace=True)

#feature_fechas(train)
#feature_fechas(test)

In [36]:
# source: https://stackoverflow.com/a/2979208
import math

def entropy(string):
    "Calculates the Shannon entropy (base 2) of a string"
    total = len(string)

    # occurrences of each distinct character, in order of first appearance
    occurrences = {}
    for char in string:
        occurrences[char] = occurrences.get(char, 0) + 1

    # accumulate p * log2(p) over the distinct characters
    accumulated = 0
    for count in occurrences.values():
        p = float(count) / total
        accumulated += p * math.log(p) / math.log(2.0)

    return - accumulated

In [37]:
import re

# Characters counted as punctuation marks in a title: . , ! ? ¿ ¡ -
# The previous pattern "['.', ',', ...]" was one character class that also matched
# quotes and spaces, and len(re.split(...)) returned the number of matches plus one.
punctuation_pattern = re.compile(r"[.,!?¿¡-]")

# The same title features are computed for both datasets.
for frame in (train, test):
    # Assign the filled column back: a chained frame['titulo'].fillna(..., inplace=True)
    # acts on a temporary object under pandas Copy-on-Write and leaves the NaNs in place.
    frame['titulo'] = frame['titulo'].fillna("")

    # Number of '<' characters, used as the HTML tag count.
    frame['titulo_cant_html_tags'] = frame['titulo'].apply(lambda x: len(x.split('<'))-1)

    frame['titulo_cant_palabras'] = frame['titulo'].apply(lambda x: len(x.split()))

    frame['titulo_cant_palabras_unicas'] = frame['titulo'].apply(lambda x: len(set(x.split())))

    # Concept taken from https://docs.featuretools.com/generated/nlp_primitives.DiversityScore.html#nlp_primitives.DiversityScore
    # Quoting the official docs:
    #         Given a list of strings, calculates the total number of unique words divided by the total number of words
    #         in order to give the text a score from 0-1 that indicates how unique the words used in it are.
    frame['titulo_diversity_score'] = frame['titulo_cant_palabras_unicas']/frame['titulo_cant_palabras']

    frame['titulo_cant_caracteres'] = frame['titulo'].apply(len)

    frame['titulo_cant_signos_puntuacion'] = frame['titulo'].apply(lambda x: len(punctuation_pattern.findall(x)))

    frame['titulo_entropy'] = frame['titulo'].apply(entropy)

    # NaN when the title has no words.
    frame['titulo_mean_word_length'] = frame['titulo'].apply(lambda x: np.mean([len(word) for word in x.split()] if x.split() else np.nan))

In [38]:
# The same description features are computed for both datasets.
for frame in (train, test):
    # Assign the filled column back: a chained frame['descripcion'].fillna(..., inplace=True)
    # acts on a temporary object under pandas Copy-on-Write and leaves the NaNs in place.
    frame['descripcion'] = frame['descripcion'].fillna("")

    # Number of '<' characters, used as the HTML tag count.
    frame['descripcion_cant_html_tags'] = frame['descripcion'].apply(lambda x: len(x.split('<'))-1)

    frame['descripcion_cant_palabras_unicas'] = frame['descripcion'].apply(lambda x: len(set(x.split())))

    frame['descripcion_entropy'] = frame['descripcion'].apply(entropy)

#train['descripcion_mean_word_length'] = train['descripcion'].apply(lambda x: np.mean([len(word) for word in x.split()] if x.split() else np.nan))
#test['descripcion_mean_word_length'] = test['descripcion'].apply(lambda x: np.mean([len(word) for word in x.split()] if x.split() else np.nan)) 

In [39]:
palabras_avenida = ['avenida', 'av']
# Match the indicator words only as whole words, ignoring case. A plain substring
# test for 'av' also flagged every address that merely contains those letters
# (e.g. "Bravo", "Octavio").
patron_avenida = r'\b(?:' + '|'.join(palabras_avenida) + r')\b'
# Missing addresses are replaced by 'no info', which never matches.
train['es_avenida'] = train['direccion'].fillna('no info').str.contains(patron_avenida, case=False, regex=True)
test['es_avenida'] = test['direccion'].fillna('no info').str.contains(patron_avenida, case=False, regex=True)

In [40]:
def feature_fechas(df):
    """Add date-derived features to `df` in place, from its datetime column 'fecha'.

    Adds 'year' plus the cyclical encodings 'sin_month', 'cos_month', 'sin_day'
    and 'cos_day'. Returns None.
    """
    # Background on the sine/cosine encoding: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/
    fechas = df['fecha'].dt
    df['year'] = fechas.year
    df['month'] = fechas.month
    df['day'] = fechas.day

    # Every month is treated as having 31 days; good enough for this purpose.
    for unit, period in (('month', 12), ('day', 31)):
        angle = 2*np.pi*df[unit]/period
        df['sin_' + unit] = np.sin(angle)
        df['cos_' + unit] = np.cos(angle)

    # The raw month and day columns are no longer needed.
    df.drop(columns=['month', 'day'], inplace=True)
    
# Add the date features to both datasets (each frame is modified in place).
for dataset in (train, test):
    feature_fechas(dataset)

# *Fin del agregado de features*

In [41]:
# Persist the enriched training set; the index is omitted and the header row is written.
train.to_csv('../data/train_master.csv',index=False,header=True)

In [42]:
# Persist the enriched test set; the index is omitted and the header row is written.
test.to_csv('../data/test_master.csv',index=False,header=True)