In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# This will allow you to see all column names & rows when you are doing .head(). None of the column name will be truncated.
# source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
# source: https://gist.github.com/rozanecm/38f2901c592bdffc40726cb0473318cf
# Function which plays a beep of given duration and frequency.
# Useful for when executing things that need a while to finish, to get notified.
import os
def beep(duration = 1, freq = 1500):
    """ play tone of duration in seconds and freq in Hz. """
    os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq))

In [4]:
train = pd.read_csv('../data/train.csv', dtype={'tipodepropiedad':'category', 'ciudad':'category', 'provincia':'category', 'id':'int32', 'antiguedad':'float16', 'habitaciones':'float16', 'garages':'float16', 'banos':'float16', 'metroscubiertos':'float16', 'metrostotales':'float16', 'idzona':'float16', 'lat':'float16', 'lng':'float16', 'gimnasio':'bool', 'usosmultiples':'bool', 'piscina':'bool', 'escuelascercanas':'bool', 'centroscomercialescercanos':'bool'}, parse_dates=['fecha'])
test = pd.read_csv('../data/test.csv', dtype={'tipodepropiedad':'category', 'ciudad':'category', 'provincia':'category', 'id':'int32', 'antiguedad':'float16', 'habitaciones':'float16', 'garages':'float16', 'banos':'float16', 'metroscubiertos':'float16', 'metrostotales':'float16', 'idzona':'float16', 'lat':'float16', 'lng':'float16', 'gimnasio':'bool', 'usosmultiples':'bool', 'piscina':'bool', 'escuelascercanas':'bool', 'centroscomercialescercanos':'bool'}, parse_dates=['fecha'])
sample_submission = pd.read_csv('../data/ejemploRespuesta.csv')

In [5]:
rescued_coords = pd.read_csv('../data/rescueLatLongs.csv')
precios_en_dolares = pd.read_csv('../data/precios_en_dolares.csv')

In [6]:
# mergeamos con coords. extra obtenidas en tp1.
train = train.merge(rescued_coords.drop('Unnamed: 0', axis=1), how='left', on='id')
train['lat_x'] = train.apply(lambda x: x['lat_y'] if pd.isna(x['lat_x']) else x['lat_x'], axis=1)
train['lng_x'] = train.apply(lambda x: x['lng_y'] if pd.isna(x['lng_x']) else x['lng_x'], axis=1)
train.drop(['lat_y','lng_y'], axis=1, inplace=True)
train.rename(columns={'lat_x':'lat','lng_x':'lon'}, inplace=True)

# por consistencia, para que ambos datasets tengan mismos nombres
test.rename(columns={'lng':'lon'}, inplace=True)
train.head()

Unnamed: 0,id,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,lat,lon,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
0,254099,depto. tipo a-402,"depto. interior de 80.15m2, consta de sala com...",Apartamento,Avenida Division del Norte 2005,Benito Juárez,Distrito Federal,,2.0,1.0,2.0,80.0,80.0,23536.0,31.73333,-106.48333,2015-08-23,False,False,False,False,False,2273000.0
1,53461,condominio horizontal en venta,"<p>entre sonora y guerrero, atr&aacute;s del h...",Casa en condominio,AV. MEXICO,La Magdalena Contreras,Distrito Federal,10.0,3.0,2.0,2.0,268.0,180.0,24512.0,19.3125,-99.25,2013-06-28,False,False,False,True,True,3600000.0
2,247984,casa en venta urbi 3 recamaras tonala,descripcion \nla mejor ubicacion residencial e...,Casa,Urbi Tonala,Tonalá,Jalisco,5.0,3.0,2.0,2.0,144.0,166.0,48544.0,,,2015-10-17,False,False,False,False,False,1200000.0
3,209067,casa sola en toluca zinacantepec con credito i...,casa en privada con caseta de vigilancia casas...,Casa,IGNACIO MANUEL ALTAMIRANO 128,Zinacantepec,Edo. de México,1.0,2.0,1.0,1.0,63.0,67.0,53664.0,19.296875,-99.6875,2012-03-09,False,False,False,True,True,650000.0
4,185997,paseos del sol,bonito departamento en excelentes condiciones ...,Apartamento,PASEOS DEL SOL,Zapopan,Jalisco,10.0,2.0,1.0,1.0,95.0,95.0,47840.0,20.72356,-103.38479,2016-06-07,False,False,False,False,False,1150000.0


In [7]:
# inf. values don't make sense. I think it's preferable to treat them as nans directly.
train.replace([np.inf, -np.inf], np.nan, inplace=True)
test.replace([np.inf, -np.inf], np.nan, inplace=True)

In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240000 entries, 0 to 239999
Data columns (total 23 columns):
id                            240000 non-null int64
titulo                        234613 non-null object
descripcion                   238381 non-null object
tipodepropiedad               239954 non-null category
direccion                     186928 non-null object
ciudad                        239628 non-null category
provincia                     239845 non-null category
antiguedad                    196445 non-null float16
habitaciones                  217529 non-null float16
garages                       202235 non-null float16
banos                         213779 non-null float16
metroscubiertos               222600 non-null float16
metrostotales                 188533 non-null float16
idzona                        118808 non-null float16
lat                           138398 non-null float64
lon                           138398 non-null float64
fecha                      

# Agregamos features que ya hemos creado para analisis de tp1

In [8]:
def contar_amenities(row):
    return row['gimnasio'] + row['usosmultiples'] + row['piscina'] + row['escuelascercanas'] + row['centroscomercialescercanos']

In [9]:
train['cant_amenities'] = train.apply(lambda x: contar_amenities(x), axis=1)
test['cant_amenities'] = test.apply(lambda x: contar_amenities(x), axis=1)

In [10]:
def feature_fechas(df):
    # Para entender lo de los senos y cosenos: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/
    df['year'] = df['fecha'].dt.year
    df['month'] = df['fecha'].dt.month
    df['day'] = df['fecha'].dt.day
    df['sin_month'] = np.sin(2*np.pi*df['month']/12)
    df['cos_month'] = np.cos(2*np.pi*df['month']/12)
    # tomo cant. de dias en mes: 31 en todos los casos. Para esto deberia servir bastante bien igual.
    df['sin_day'] = np.sin(2*np.pi*df['day']/31)
    df['cos_day'] = np.cos(2*np.pi*df['day']/31)
    
    # no necesito mas las cols. originales de month y day.
    df.drop(['month','day'], axis=1, inplace=True)
    
feature_fechas(train)
feature_fechas(test)

In [11]:
palabras_avenida = ['avenida', 'av']
train['es_avenida'] = train['direccion'].fillna('no info').apply(lambda x: any(avenida_indicator in x.lower() for avenida_indicator in palabras_avenida))
test['es_avenida'] = test['direccion'].fillna('no info').apply(lambda x: any(avenida_indicator in x.lower() for avenida_indicator in palabras_avenida))

In [12]:
def recuperar_cant_amb(x):
    "Recupera el dato amb en x (puede ser la descripcion o el titulo)"
    try:
        x = x.lower()
        if x.rfind("ambientes") != -1 or x.rfind("amb") != -1:
            pos = x.rfind("amb")
            try:
                return (np.float64(x[pos -2]))
            except ValueError:
                if "dos amb" in x or "2 amb" in x:
                    return 2.0
                elif "tres amb" in x or "3 amb" in x:
                    return 3.0
                elif "cuatro amb" in x or "4 amb" in x:
                    return 4.0
                elif "cinco amb" in x or "5 amb" in x:
                    return 5.0
                elif "seis amb" in x or "6 amb" in x:
                    return 6.0
                return 0
    except AttributeError:
        return None

In [13]:
#Recuperamos el dato de habitaciones en el titulo y la descripcion
train.loc[train["habitaciones"].isnull(),'habitaciones'] = train["descripcion"].apply(recuperar_cant_amb)
train.loc[train["habitaciones"].isnull(),'habitaciones'] = train["titulo"].apply(recuperar_cant_amb)

In [14]:
#####Recategorizacion

tipo_vivienda = ["apartamento", "casa en condominio", "casa",
                 "villa", "duplex", "departamento compartido", "hospedaje", "quinta vacacional"]
tipo_comercial = ["terreno comercial", "local comercial", "oficina comercial", 
                  "local en centro comercial", "bodega comercial", "inmuebles productivos urbanos"]

tipo_inversion = ["casa uso de suelo", "terreno", "edificio", "huerta", "lote", "garage", "otros", "nave industrial",  "rancho"]
undefined = ["terreno", "edificio", "casa uso de suelo", "otros", "nave industrial", "rancho", "nan", 
             "huerta", "lote", "garage"]  

In [15]:
def get_ancester_category(tipo_de_propiedad):
    if (tipo_de_propiedad in tipo_inversion): return "inversion"

    if (tipo_de_propiedad in tipo_comercial): return "comercial"

    if (tipo_de_propiedad in tipo_vivienda): return "vivienda"

    return "undefined"

In [16]:
train["categoria_padre"] = train["tipodepropiedad"].apply(lambda x: get_ancester_category(x))

In [17]:
train.drop_duplicates(["tipodepropiedad","direccion","provincia","ciudad"], keep='last', inplace= True)

# * Fin agregado de features de tp1 *

El sample submission no tiene header. **Ojo con eso al guardar la submission.** Hagamos la funcion para guardar submissions ahora, para evitar problemas a futuro y despreocuparnos.

In [18]:
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
import time
def _get_filename(my_name, timestamp):
    return "../predictions/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    f = open("../predictions/" + authors_name + ".txt","a")
    f.write(timestamp + ": " + submission_description + '\n')
    f.close()

def save_submission(submission_df, authors_name="lucioll", description = "no description.", index=False, header=True):
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=index, header=header)
    _save_description(authors_name, timestamp, description)

In [19]:
# Define a seed, so all algorithms that accept a seed, take the same, for consistency reasons,
# so everything can be replicated without problems random state
seed=42

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train.drop('precio', axis=1), train['precio'], test_size=0.33, random_state=seed)

***

# Approach n....

## Entrenamiento local

In [23]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia', 'categoria_padre']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


steps.append(("knn-kd-tree", KNeighborsRegressor(n_neighbors=20,algorithm="kd_tree", p = 1, n_jobs=-1)))

my_pipe = Pipeline(steps, verbose=True)

# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total= 1.2min
[Pipeline] ....... (step 2 of 2) Processing knn-kd-tree, total=  48.9s
737661.7190366162


In [47]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia', 'categoria_padre']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


steps.append(("knn-kd-tree", KNeighborsRegressor(n_neighbors=20,algorithm="kd_tree", p = 1, n_jobs=-1)))

my_pipe = Pipeline(steps, verbose=True)

# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  33.2s
[Pipeline] ....... (step 2 of 2) Processing knn-kd-tree, total=   7.9s
748873.5102935112


In [48]:
beep()

## Entrenamiento con todos los datos para obtener predicciones a subir

In [20]:
my_pipe.fit(train.drop(['precio'], axis=1).replace({True:1,False:0}), train['precio'])

# prediciendo valores posta...
predictions = my_pipe.predict(test.replace({True:1,False:0}))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  56.3s
[Pipeline] ... (step 2 of 2) Processing knn-brute-force, total= 1.9min


ValueError: Found input variables with inconsistent numbers of samples: [79200, 60000]

In [21]:
df = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [22]:
description = "KNN ball_tree with n=4 and p=1"
save_submission(df, description=description)

In [20]:
beep()

## Obtengamos predicciones para todas las propiedades en nuestro train set

In [21]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "lucioll_approach_1"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio'],axis=1).iloc[train_index], train.drop(['precio'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio'][train_index], train['precio'][test_index]
    
    my_pipe.fit(X_train2.replace({True:1,False:0}), y_train2)
    y_scores = my_pipe.predict(X_test2.replace({True:1,False:0}))
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/on_train_data/" + approach_numer + ".csv", index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  36.9s
[Pipeline] ... (step 2 of 2) Processing knn-brute-force, total=   0.0s
828032.356075
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  39.9s
[Pipeline] ... (step 2 of 2) Processing knn-brute-force, total=   0.0s
835511.888059375
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  59.9s
[Pipeline] ... (step 2 of 2) Processing knn-brute-force, total=   0.0s
825146.995178125


In [26]:
X_train.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lon', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'cant_amenities', 'year', 'sin_month',
       'cos_month', 'sin_day', 'cos_day', 'es_avenida'],
      dtype='object')

# Aplico Grid Search

In [50]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
k_range = list(range(1, 21))
print(k_range)

param_dist = {"n_neighbors": k_range,
              "weights" : ['uniform', 'distance'],
              "metric": ['minkowski']
             }

small_size_cat_columns = ['tipodepropiedad','provincia', 'categoria_padre']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))

grid = GridSearchCV(knn, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="neg_mean_absolute_error", verbose=5)
steps.append(("grid-search", grid))

my_pipeline2 = Pipeline(steps, verbose=True)

my_pipeline2.fit(train.drop(['precio'], axis=1).replace({True:1,False:0}), train['precio'])
grid.best_estimator_

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  42.0s
Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 77.7min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 1098.1min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 2869.8min finished


[Pipeline] ..... (step 2 of 2) Processing grid-search, total=1188.7min


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=16, p=2,
                    weights='distance')

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
k_range = [1, 2, 4, 6, 8, 10, 12, 14 ,16, 18, 20]
print(k_range)

param_dist = {"n_neighbors": k_range,
              "weights" : ['uniform', 'distance'],
              "metric": ['minkowski']
             }

small_size_cat_columns = ['tipodepropiedad','provincia', 'categoria_padre']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", MinMaxScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))

grid = GridSearchCV(knn, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="neg_mean_absolute_error", verbose=5)
steps.append(("grid-search", grid))

my_pipeline2 = Pipeline(steps, verbose=True)

my_pipeline2.fit(train.drop(['precio'], axis=1).replace({True:1,False:0}), train['precio'])
grid.best_estimator_

[1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  45.4s
Fitting 3 folds for each of 22 candidates, totalling 66 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 31.7min


KeyboardInterrupt: 

In [53]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia', 'categoria_padre']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


steps.append(("Best knn ", KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=16, p=1.7,
                    weights='distance')))

my_pipe = Pipeline(steps, verbose=True)

# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  43.9s
[Pipeline] ......... (step 2 of 2) Processing Best knn , total=   9.7s
784687.2314277309


In [36]:
def fractional_dist(x,y):
    return np.sum(((x-y)**0.7)**(1/0.7))

In [27]:
metrics = ["braycurtis", "canberra", "chebyshev", "correlation", "dice", "hamming", 
           "jaccard", "kulsinski", "mahalanobis", "minkowski", "rogerstanimoto", 
           "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule"]

In [41]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia', 'categoria_padre']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


steps.append(("Best knn ", KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='canberra',
                    metric_params=None, n_jobs=-1, n_neighbors=6,
                    weights='distance')))

my_pipe = Pipeline(steps, verbose=True)

# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

  


[Pipeline] ......... (step 1 of 2) Processing col_trans, total= 3.7min


  
  
  


[Pipeline] ......... (step 2 of 2) Processing Best knn , total=  25.2s


  
  
  
  
  
  
  
  


900408.6760940655


In [20]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia', 'categoria_padre']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


steps.append(("knn with braycurtis", KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='braycurtis',
                    metric_params=None, n_jobs=-1, n_neighbors=12,
                    weights='distance')))

my_pipe = Pipeline(steps, verbose=True)

# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  48.6s
[Pipeline] . (step 2 of 2) Processing knn with canberra, total=   9.3s
743110.1724742119


In [21]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia', 'categoria_padre']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


steps.append(("knn with braycurtis", KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='braycurtis',
                    metric_params=None, n_jobs=-1, n_neighbors=16,
                    weights='distance')))

my_pipe = Pipeline(steps, verbose=True)

# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  49.1s
[Pipeline]  (step 2 of 2) Processing knn with braycurtis, total=   8.2s
740497.2572507106


In [22]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia', 'categoria_padre']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


steps.append(("knn with braycurtis", KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='braycurtis',
                    metric_params=None, n_jobs=-1, n_neighbors=20,
                    weights='distance')))

my_pipe = Pipeline(steps, verbose=True)

# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  45.1s
[Pipeline]  (step 2 of 2) Processing knn with braycurtis, total=   8.9s
739920.9036943609


In [23]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia', 'categoria_padre']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


steps.append(("knn with braycurtis", KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='braycurtis',
                    metric_params=None, n_jobs=-1, n_neighbors=24,
                    weights='distance')))

my_pipe = Pipeline(steps, verbose=True)

# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  45.2s
[Pipeline]  (step 2 of 2) Processing knn with braycurtis, total=   9.2s
741718.1871170142


In [28]:
from numpy import linalg as LA

def _validate_vector(u, dtype=None):
    # XXX Is order='c' really necessary?
    u = np.asarray(u, dtype=dtype, order='c').squeeze()
    # Ensure values such as u=1 and u=[1] still return 1-D arrays.
    u = np.atleast_1d(u)
    if u.ndim > 1:
        raise ValueError("Input vector should be 1-D.")
    return u

def fractional(u, v, p=0.7, w=None):
    """
    Compute the Minkowski distance between two 1-D arrays.
    The Minkowski distance between 1-D arrays `u` and `v`,
    is defined as
    .. math::
       {||u-v||}_p = (\\sum{|u_i - v_i|^p})^{1/p}.
       \\left(\\sum{w_i(|(u_i - v_i)|^p)}\\right)^{1/p}.
    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    p : int
        The order of the norm of the difference :math:`{||u-v||}_p`.
    w : (N,) array_like, optional
        The weights for each value in `u` and `v`. Default is None,
        which gives each value a weight of 1.0
    Returns
    -------
    minkowski : double
        The Minkowski distance between vectors `u` and `v`.
    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 1)
    2.0
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 2)
    1.4142135623730951
    >>> distance.minkowski([1, 0, 0], [0, 1, 0], 3)
    1.2599210498948732
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 1)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 2)
    1.0
    >>> distance.minkowski([1, 1, 0], [0, 1, 0], 3)
    1.0
    """
    u = _validate_vector(u)
    v = _validate_vector(v)

    u_v = u - v
    if w is not None:
        w = _validate_weights(w)
        if p == 1:
            root_w = w
        if p == 2:
            # better precision and speed
            root_w = np.sqrt(w)
        else:
            root_w = np.power(w, 1/p)
        u_v = root_w * u_v
    dist = LA.norm(u_v, ord=p)
    return dist

In [29]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia', 'categoria_padre']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


steps.append(("knn with braycurtis", KNeighborsRegressor(algorithm='auto', leaf_size=30, metric=fractional,
                    metric_params=None, n_jobs=-1, n_neighbors=8,
                    weights='distance')))

my_pipe = Pipeline(steps, verbose=True)

# .replace is introduced because algorithms need numbers; booleans don't make it.
my_pipe.fit(X_train.replace({True:1,False:0}), y_train)

y_scores = my_pipe.predict(X_test.replace({True:1,False:0}))

from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  28.4s
[Pipeline]  (step 2 of 2) Processing knn with braycurtis, total=  29.0s


KeyboardInterrupt: 

***