In [1]:
import pandas as pd
import numpy as np

In [2]:
# This will allow you to see all column names & rows when you are doing .head(). None of the column name will be truncated.
# source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
# source: https://gist.github.com/rozanecm/38f2901c592bdffc40726cb0473318cf
# Function which plays a beep of given duration and frequency.
# Useful for when executing things that need a while to finish, to get notified.
import os
def beep(duration = 0.6, freq = 200):
    """ play tone of duration in seconds and freq in Hz. """
    os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq))

In [4]:
train = pd.read_csv('../data/train_master.csv', dtype={'tipodepropiedad':'category', 'ciudad':'category', 'provincia':'category', 'id':'int32', 'antiguedad':'float16', 'habitaciones':'float16', 'garages':'float16', 'banos':'float16', 'metroscubiertos':'float16', 'metrostotales':'float16', 'idzona':'float16', 'lat':'float16', 'lng':'float16', 'gimnasio':'bool', 'usosmultiples':'bool', 'piscina':'bool', 'escuelascercanas':'bool', 'centroscomercialescercanos':'bool'}, parse_dates=['fecha'])
test = pd.read_csv('../data/test_master.csv', dtype={'tipodepropiedad':'category', 'ciudad':'category', 'provincia':'category', 'id':'int32', 'antiguedad':'float16', 'habitaciones':'float16', 'garages':'float16', 'banos':'float16', 'metroscubiertos':'float16', 'metrostotales':'float16', 'idzona':'float16', 'lat':'float16', 'lng':'float16', 'gimnasio':'bool', 'usosmultiples':'bool', 'piscina':'bool', 'escuelascercanas':'bool', 'centroscomercialescercanos':'bool'}, parse_dates=['fecha'])

In [5]:
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
import time
def _get_filename(my_name, timestamp):
    return "../predictions/last_pred/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    f = open("../predictions/last_pred/" + authors_name + ".txt","a")
    f.write(timestamp + ": " + submission_description + '\n')
    f.close()

def save_submission(submission_df, authors_name="fcozza", description = "no description.", index=False, header=True):
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=index, header=header)
    _save_description(authors_name, timestamp, description)

In [6]:
seed = 42

# target 1 - rf

In [11]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
cat_columns = ['tipodepropiedad','ciudad','provincia']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day','cant_comodidades_en_desc',
    'cant_amenities','cant_lugares_cerca','cant_areas_entretenimiento_cerca',
    'cant_areas_verdes','cant_areas_dedicadas','cant_palabras_positivas','stopwords_count',
    'punctuations_count','vocab_size','characters_count','top10_trigram_occ','top10_bigram_occ',
    'numerics_count','avg_word','diversity_score','top_used_words_count','least_used_words_count',
    'words_start_count','words_end_count','sentiment','ratio_cubiertos_totales','titulo_cant_html_tags',
    'titulo_cant_palabras','titulo_cant_palabras_unicas','titulo_diversity_score','titulo_cant_caracteres',
    'titulo_cant_signos_puntuacion','titulo_entropy','titulo_mean_word_length','descripcion_cant_html_tags',
    'descripcion_cant_palabras_unicas','descripcion_entropy']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida',
               'planta_alta','planta_baja','tiene_bodega','oficina','cerca_o_en_esquina','cerca_o_en_avenida',
               'comercial','tiene_servicio','edificio','casa','parte_de_lote','calle_cerrada',
               'indica_frente_y_fondo','usa_easybroker','tiene_seguridad','tiene_antiguedad','tiene_banos',
               'tiene_garages','tiene_habitaciones','tiene_metroscubiertos','tiene_metrostotales']

text_columns = ['titulo'
#                 ,'descripcion'
#                 ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.metrics import mean_absolute_error

transformers = []

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))

from sklearn.ensemble import RandomForestRegressor
steps.append(("rfr", RandomForestRegressor(n_estimators=100,
    n_jobs=-1,
    random_state=seed)))

my_pipe = Pipeline(steps, verbose=True)

In [12]:
my_pipe.fit(train.drop(['precio'], axis=1).replace({True:1,False:0}), train['precio'])

# prediciendo valores posta...
predictions = my_pipe.predict(test.replace({True:1,False:0}))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   1.6s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 2.1min


In [13]:
df = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [14]:
description = "target 1"
save_submission(df, description=description)

In [15]:
beep()

In [16]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

approach_numer = "fcozza_target_1"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio'],axis=1).iloc[train_index], train.drop(['precio'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio'][train_index], train['precio'][test_index]
    
    my_pipe.fit(X_train2.replace({True:1,False:0}), y_train2)
    y_scores = my_pipe.predict(X_test2.replace({True:1,False:0}))
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], 'approach_1':y_scores}))

df.to_csv('../predictions/last_train_data/' + approach_numer,index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   1.4s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 1.2min
743609.9634729946
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   1.4s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 1.2min
745952.5537785584
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   1.4s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 1.2min
739375.8240193245


In [17]:
beep()

# target 2 - rf + one hot + svd

In [7]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day','cant_comodidades_en_desc',
    'cant_amenities','cant_lugares_cerca','cant_areas_entretenimiento_cerca',
    'cant_areas_verdes','cant_areas_dedicadas','cant_palabras_positivas','stopwords_count',
    'punctuations_count','vocab_size','characters_count','top10_trigram_occ','top10_bigram_occ',
    'numerics_count','avg_word','diversity_score','top_used_words_count','least_used_words_count',
    'words_start_count','words_end_count','sentiment','ratio_cubiertos_totales','titulo_cant_html_tags',
    'titulo_cant_palabras','titulo_cant_palabras_unicas','titulo_diversity_score','titulo_cant_caracteres',
    'titulo_cant_signos_puntuacion','titulo_entropy','titulo_mean_word_length','descripcion_cant_html_tags',
    'descripcion_cant_palabras_unicas','descripcion_entropy']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida',
               'planta_alta','planta_baja','tiene_bodega','oficina','cerca_o_en_esquina','cerca_o_en_avenida',
               'comercial','tiene_servicio','edificio','casa','parte_de_lote','calle_cerrada',
               'indica_frente_y_fondo','usa_easybroker','tiene_seguridad','tiene_antiguedad','tiene_banos',
               'tiene_garages','tiene_habitaciones','tiene_metroscubiertos','tiene_metrostotales']

text_columns = ['titulo'
#                 ,'descripcion'
#                 ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import mean_absolute_error

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))

from sklearn.ensemble import RandomForestRegressor
steps.append(("rfr", RandomForestRegressor(n_estimators=100,
    n_jobs=-1,
    random_state=seed)))

my_pipe = Pipeline(steps, verbose=True)

In [8]:
my_pipe.fit(train.drop(['precio'], axis=1).replace({True:1,False:0}), train['precio'])

# prediciendo valores posta...
predictions = my_pipe.predict(test.replace({True:1,False:0}))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   4.1s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 2.7min


In [9]:
df = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [10]:
description = "target 2"
save_submission(df, description=description)

In [11]:
beep()

In [12]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

approach_numer = "fcozza_target_2"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio'],axis=1).iloc[train_index], train.drop(['precio'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio'][train_index], train['precio'][test_index]
    
    my_pipe.fit(X_train2.replace({True:1,False:0}), y_train2)
    y_scores = my_pipe.predict(X_test2.replace({True:1,False:0}))
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], 'approach_1':y_scores}))

df.to_csv('../predictions/last_train_data/' + approach_numer,index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   2.8s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 1.7min
602745.5106908402
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   2.7s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 1.7min
607534.2621182624
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   2.7s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 1.7min
600904.0241585325


In [13]:
beep()

# target 3 - rf + one hashing vectorizer + svd

In [8]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day','cant_comodidades_en_desc',
    'cant_amenities','cant_lugares_cerca','cant_areas_entretenimiento_cerca',
    'cant_areas_verdes','cant_areas_dedicadas','cant_palabras_positivas','stopwords_count',
    'punctuations_count','vocab_size','characters_count','top10_trigram_occ','top10_bigram_occ',
    'numerics_count','avg_word','diversity_score','top_used_words_count','least_used_words_count',
    'words_start_count','words_end_count','sentiment','ratio_cubiertos_totales','titulo_cant_html_tags',
    'titulo_cant_palabras','titulo_cant_palabras_unicas','titulo_diversity_score','titulo_cant_caracteres',
    'titulo_cant_signos_puntuacion','titulo_entropy','titulo_mean_word_length','descripcion_cant_html_tags',
    'descripcion_cant_palabras_unicas','descripcion_entropy']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida',
               'planta_alta','planta_baja','tiene_bodega','oficina','cerca_o_en_esquina','cerca_o_en_avenida',
               'comercial','tiene_servicio','edificio','casa','parte_de_lote','calle_cerrada',
               'indica_frente_y_fondo','usa_easybroker','tiene_seguridad','tiene_antiguedad','tiene_banos',
               'tiene_garages','tiene_habitaciones','tiene_metroscubiertos','tiene_metrostotales']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import mean_absolute_error

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', 
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))

from sklearn.ensemble import RandomForestRegressor
steps.append(("rfr", RandomForestRegressor(n_estimators=100,
    n_jobs=-1,
    random_state=seed)))

my_pipe = Pipeline(steps, verbose=True)

In [10]:
my_pipe.fit(train.drop(['precio'], axis=1).replace({True:1,False:0}), train['precio'])

# prediciendo valores posta...
predictions = my_pipe.predict(test.replace({True:1,False:0}))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  28.3s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 6.9min


In [11]:
df = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [12]:
description = "target 3"
save_submission(df, description=description)

In [13]:
beep()

In [9]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "fcozza_target_3"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio'],axis=1).iloc[train_index], train.drop(['precio'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio'][train_index], train['precio'][test_index]
    
    my_pipe.fit(X_train2.replace({True:1,False:0}), y_train2)
    y_scores = my_pipe.predict(X_test2.replace({True:1,False:0}))
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], 'approach_1':y_scores}))

df.to_csv('../predictions/last_train_data/' + approach_numer,index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  23.4s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 4.3min
611115.8825614405
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  23.6s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 4.2min
617234.5271664321
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  23.6s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 4.2min
610302.0091033245


In [10]:
beep()

# target 4 - rf + stopwords

In [10]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day','cant_comodidades_en_desc',
    'cant_amenities','cant_lugares_cerca','cant_areas_entretenimiento_cerca',
    'cant_areas_verdes','cant_areas_dedicadas','cant_palabras_positivas','stopwords_count',
    'punctuations_count','vocab_size','characters_count','top10_trigram_occ','top10_bigram_occ',
    'numerics_count','avg_word','diversity_score','top_used_words_count','least_used_words_count',
    'words_start_count','words_end_count','sentiment','ratio_cubiertos_totales','titulo_cant_html_tags',
    'titulo_cant_palabras','titulo_cant_palabras_unicas','titulo_diversity_score','titulo_cant_caracteres',
    'titulo_cant_signos_puntuacion','titulo_entropy','titulo_mean_word_length','descripcion_cant_html_tags',
    'descripcion_cant_palabras_unicas','descripcion_entropy']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida',
               'planta_alta','planta_baja','tiene_bodega','oficina','cerca_o_en_esquina','cerca_o_en_avenida',
               'comercial','tiene_servicio','edificio','casa','parte_de_lote','calle_cerrada',
               'indica_frente_y_fondo','usa_easybroker','tiene_seguridad','tiene_antiguedad','tiene_banos',
               'tiene_garages','tiene_habitaciones','tiene_metroscubiertos','tiene_metrostotales']

text_columns = ['titulo'
                ,'descripcion'
                ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from stop_words import get_stop_words
from sklearn.metrics import mean_absolute_error

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.
# for col in text_columns_titulo:
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
#                             ("text_imputer", SimpleImputer(strategy='constant', fill_value="")),
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii', stop_words=get_stop_words('spanish'),
#                                                                      ngram_range=(2,5)
                                                                    )),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
    #                         se podria agregar una svd.... o alguna proyeccion... 
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))

from sklearn.ensemble import RandomForestRegressor
steps.append(("rfr", RandomForestRegressor(n_estimators=100,
    n_jobs=-1,
    random_state=seed)))

my_pipe = Pipeline(steps, verbose=True)

In [11]:
my_pipe.fit(train.drop(['precio'], axis=1).replace({True:1,False:0}), train['precio'])

# prediciendo valores posta...
predictions = my_pipe.predict(test.replace({True:1,False:0}))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  27.5s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 6.8min


In [12]:
df = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [13]:
description = "target 4"
save_submission(df, description=description)

In [14]:
beep()

In [11]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "fcozza_target_4"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio'],axis=1).iloc[train_index], train.drop(['precio'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio'][train_index], train['precio'][test_index]
    
    my_pipe.fit(X_train2.replace({True:1,False:0}), y_train2)
    y_scores = my_pipe.predict(X_test2.replace({True:1,False:0}))
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], 'approach_1':y_scores}))

df.to_csv('../predictions/last_train_data/' + approach_numer,index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  23.2s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 4.2min
603622.4021449108
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  23.4s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 4.2min
610608.4406444031
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  23.8s
[Pipeline] ............... (step 2 of 2) Processing rfr, total= 4.2min
602515.9190960078


In [12]:
beep()

# target 5 -lightgbm

In [8]:
import lightgbm as lgb
gbm = lgb.LGBMRegressor(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1,
                        n_estimators=100, subsample_for_bin=200000, objective=None,
                        class_weight=None, min_split_gain=0.0, min_child_weight=0.001,
                        min_child_samples=20, subsample=1.0, subsample_freq=0,
                        colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0,
                        random_state=seed, n_jobs=-1, silent=True, importance_type='split')

In [9]:
gbm.fit(train.drop(['id','fecha','titulo', 'descripcion', 'direccion','precio'],axis=1), train['precio'], sample_weight=None, init_score=None, eval_set=None, eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=None,
            verbose=False, feature_name='auto', categorical_feature=['tipodepropiedad', 'ciudad','provincia'], callbacks=None)


# prediciendo valores posta...
predictions = gbm.predict(test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), num_iteration=gbm.best_iteration_)

New categorical_feature is ['ciudad', 'provincia', 'tipodepropiedad']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [10]:
df = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [11]:
description = "target 5"
save_submission(df, description=description)

In [12]:
beep()

In [13]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "fcozza_target_5"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio'],axis=1).iloc[train_index], train.drop(['precio'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio'][train_index], train['precio'][test_index]
    
    gbm.fit(X_train2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y_train2)
    y_scores = gbm.predict(X_test2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1))
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/last_train_data/" + approach_numer, index=False, header=True)

588652.1770681143
593555.9972213265
591447.8585886128


In [14]:
beep()

# target 6 - lightgbm with grid search

In [16]:
gbm_optimized = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=75,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=200, n_jobs=-1, num_leaves=1200, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [17]:
gbm_optimized.fit(train.drop(['id','fecha','titulo', 'descripcion', 'direccion','precio'],axis=1), train['precio'], sample_weight=None, init_score=None, eval_set=None, eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=None,
            verbose=False, feature_name='auto', categorical_feature=['tipodepropiedad', 'ciudad','provincia'], callbacks=None)


# prediciendo valores posta...
predictions = gbm_optimized.predict(test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), num_iteration=gbm_optimized.best_iteration_)

New categorical_feature is ['ciudad', 'provincia', 'tipodepropiedad']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [18]:
df = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [19]:
description = "target 6"
save_submission(df, description=description)

In [20]:
beep()

In [21]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "fcozza_target_6"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio'],axis=1).iloc[train_index], train.drop(['precio'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio'][train_index], train['precio'][test_index]
    
    gbm.fit(X_train2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y_train2)
    y_scores = gbm.predict(X_test2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1))
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/last_train_data/" + approach_numer, index=False, header=True)

588652.1770681143
593555.9972213265
591447.8585886128


In [22]:
beep()

# target 8 - lightgbm grid search + feat eng

In [23]:
gbm_optimized = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=75,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=300, n_jobs=-1, num_leaves=1200, objective=None,
              random_state=seed, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [24]:
gbm_optimized.fit(train.drop(['id','fecha','titulo', 'descripcion', 'direccion','precio'],axis=1), train['precio'], sample_weight=None, init_score=None, eval_set=None, eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=None,
            verbose=False, feature_name='auto', categorical_feature=['tipodepropiedad', 'ciudad','provincia'], callbacks=None)


# prediciendo valores posta...
predictions = gbm_optimized.predict(test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), num_iteration=gbm_optimized.best_iteration_)

New categorical_feature is ['ciudad', 'provincia', 'tipodepropiedad']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [25]:
df = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [26]:
description = "target 7"
save_submission(df, description=description)

In [27]:
beep()

In [28]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "fcozza_target_7"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio'],axis=1).iloc[train_index], train.drop(['precio'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio'][train_index], train['precio'][test_index]
    
    gbm.fit(X_train2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y_train2)
    y_scores = gbm.predict(X_test2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1))
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/last_train_data/" + approach_numer, index=False, header=True)

588652.1770681143
593555.9972213265
591447.8585886128


In [29]:
beep()

# target 12 - lightgbm log precio

In [7]:
train['precio_log'] = np.log(train['precio'])

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train.drop(['precio', 'precio_log'], axis=1), train['precio_log'], test_size=0.33, random_state=seed)

In [9]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

gbm_optimized = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=75,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=200, n_jobs=-1, num_leaves=1200, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [10]:
gbm_optimized.fit(train.drop(['id','fecha','titulo', 'descripcion', 'direccion','precio','precio_log'],axis=1), train['precio_log'], sample_weight=None, init_score=None, eval_set=None, eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=None,
            verbose=False, feature_name='auto', categorical_feature=['tipodepropiedad', 'ciudad','provincia'], callbacks=None)


# prediciendo valores posta...
predictions = gbm_optimized.predict(test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), num_iteration=gbm_optimized.best_iteration_)

New categorical_feature is ['ciudad', 'provincia', 'tipodepropiedad']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [11]:
exp_predictions = np.exp(predictions)

In [12]:
df = pd.DataFrame(data={'id':test['id'], 'target':exp_predictions})

In [13]:
description = "target 8"
save_submission(df, description=description)

In [14]:
beep()

In [15]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "fcozza_target_8"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio','precio_log'],axis=1).iloc[train_index], train.drop(['precio','precio_log'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio_log'][train_index], train['precio_log'][test_index]
    
    gbm_optimized.fit(X_train2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y_train2)
    y_scores = gbm_optimized.predict(X_test2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1))
    
    print(mean_absolute_error(y_test2, y_scores))
    print(mean_absolute_error(np.exp(y_test2), np.exp(y_scores)))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:np.exp(y_scores)}))

df.to_csv("../predictions/last_train_data/" + approach_numer, index=False, header=True)

0.21309995483007793
525440.1788438802
0.21405842690778873
531720.4468441766
0.21340811161957143
525368.0651654683


In [16]:
beep()

# target 13 - lightgbm log precio y skewed features

In [7]:
from scipy.stats import skew, skewtest

In [8]:
train['precio_log'] = np.log(train['precio'])

In [9]:
def var_to_log(df, attribute_name, treshold=0.5):
    original_skew = skew(df[attribute_name])
    print("skew as is:", skew(df[attribute_name]))
    if abs(original_skew) < treshold:
        print("No action needed.")
    else:
        df[attribute_name] = df[attribute_name].astype('float64')
        df[attribute_name] = df[attribute_name].fillna(df[attribute_name].mean())
        df.loc[:,attribute_name] = np.log1p(df[attribute_name])
        print("skew logged variable:", skew(df[attribute_name]))

In [10]:
for column in train.select_dtypes(include=[np.number]).drop(['precio','precio_log'],axis=1).columns:
    print("processing", column)
    var_to_log(train, column)
    var_to_log(test, column)
    print("")

processing id
skew as is: 0.0011793797535118043
No action needed.
skew as is: -0.004728242943954671
No action needed.

processing idzona
skew as is: nan
skew logged variable: -4.363605150634373
skew as is: nan
skew logged variable: -4.361812715085782

processing lat
skew as is: nan
skew logged variable: nan
skew as is: nan
skew logged variable: nan

processing lon
skew as is: nan
skew logged variable: nan
skew as is: nan
skew logged variable: nan

processing cant_comodidades_en_desc
skew as is: 0.3203184338289262
No action needed.
skew as is: 0.3437700969660058
No action needed.

processing cant_palabras_positivas
skew as is: 0.8450344935580002
skew logged variable: -0.1109111394348578
skew as is: 0.8377301106860434
skew logged variable: -0.11771546615615593

processing cant_areas_dedicadas
skew as is: 0.1495034193923016
No action needed.
skew as is: 0.14971863295484558
No action needed.

processing cant_areas_verdes
skew as is: 1.1204940975046116
skew logged variable: 0.36039703584814

  result = getattr(ufunc, method)(*inputs, **kwargs)


skew logged variable: 0.36171784299692944

processing cant_areas_entretenimiento_cerca
skew as is: 3.012321645202613
skew logged variable: 2.6317511461851337
skew as is: 3.027727681462814
skew logged variable: 2.626777024366695

processing cant_lugares_cerca
skew as is: 2.3572148515091556
skew logged variable: 1.4532223772986501
skew as is: 2.3686111788255064
skew logged variable: 1.4498184957330698

processing stopwords_count
skew as is: 4.078255760033841
skew logged variable: -0.46169959032628877
skew as is: 3.9831152764002224
skew logged variable: -0.4528969774267983

processing punctuations_count
skew as is: 34.22697275045236
skew logged variable: -0.0838840608742315
skew as is: 204.23250811658554
skew logged variable: -0.06391437768999186

processing vocab_size
skew as is: 8.71289796022834
skew logged variable: -0.7329696705887189
skew as is: 32.524792328567756
skew logged variable: -0.7266017472333228

processing characters_count
skew as is: 17.484879580158903
skew logged variabl

  result = getattr(ufunc, method)(*inputs, **kwargs)
  a_zero_mean = a - np.expand_dims(np.mean(a, axis), axis)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  s *= a_zero_mean


skew as is: 0.3984375
No action needed.
skew as is: 0.3984375
No action needed.

processing garages
skew as is: -0.34033203125
No action needed.
skew as is: -0.334716796875
No action needed.

processing habitaciones
skew as is: 1.91796875
skew logged variable: 0.23057799946024607
skew as is: 1.9658203125
skew logged variable: 0.25954977968052134

processing metroscubiertos
skew as is: nan


  s = a_zero_mean**2
  s = s**2
  ret = umr_sum(arr, axis, dtype, out, keepdims)


skew logged variable: -0.23724252162629508
skew as is: nan
skew logged variable: -0.2306564648088782

processing metrostotales
skew as is: nan
skew logged variable: -0.1995989553526971
skew as is: nan
skew logged variable: -0.19429339826260794

processing cant_amenities
skew as is: 0.8957617677434134
skew logged variable: 0.3085083320508332
skew as is: 0.8860637249233118
skew logged variable: 0.3016688029773044

processing ratio_cubiertos_totales
skew as is: 10.720785073600561
skew logged variable: 1.0752118201951253
skew as is: 11.967061907854456
skew logged variable: 1.1077890959412882

processing titulo_cant_html_tags
skew as is: 148.89299674995277
skew logged variable: 126.96764643012544
skew as is: 244.05764834500965
skew logged variable: 229.7406926077282

processing titulo_cant_palabras
skew as is: 0.4322613811550008
No action needed.
skew as is: 0.3890690915954755
No action needed.

processing titulo_cant_palabras_unicas
skew as is: 0.38944633730612577
No action needed.
skew as

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train.drop(['precio', 'precio_log'], axis=1), train['precio_log'], test_size=0.33, random_state=seed)

In [13]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

gbm_optimized = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=75,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=200, n_jobs=-1, num_leaves=1200, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [14]:
gbm_optimized.fit(train.drop(['id','fecha','titulo', 'descripcion', 'direccion','precio','precio_log'],axis=1), train['precio_log'], sample_weight=None, init_score=None, eval_set=None, eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=None,
            verbose=False, feature_name='auto', categorical_feature=['tipodepropiedad', 'ciudad','provincia'], callbacks=None)


# prediciendo valores posta...
predictions = gbm_optimized.predict(test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), num_iteration=gbm_optimized.best_iteration_)

New categorical_feature is ['ciudad', 'provincia', 'tipodepropiedad']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [15]:
exp_predictions = np.exp(predictions)

In [16]:
df = pd.DataFrame(data={'id':test['id'], 'target':exp_predictions})

In [17]:
description = "target 9"
save_submission(df, description=description)

In [18]:
beep()

In [19]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "fcozza_target_9"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = train.drop(['precio','precio_log'],axis=1).iloc[train_index], train.drop(['precio','precio_log'],axis=1).iloc[test_index]
    y_train2, y_test2 = train['precio_log'][train_index], train['precio_log'][test_index]
    
    gbm_optimized.fit(X_train2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y_train2)
    y_scores = gbm_optimized.predict(X_test2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1))
    
    print(mean_absolute_error(y_test2, y_scores))
    print(mean_absolute_error(np.exp(y_test2), np.exp(y_scores)))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:np.exp(y_scores)}))

df.to_csv("../predictions/last_train_data/" + approach_numer, index=False, header=True)

0.21378962889639574
526812.0978341543
0.2142419894711451
532482.6769932512
0.21400335999655526
527176.0745130337


In [20]:
beep()

# target 15 - xgboost tunned 

In [7]:
X = train.drop('precio', axis=1) #set de datos
y = train['precio'] #target

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [10]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor(learning_rate =0.1, n_estimators=200, max_depth=10,
                             min_child_weight=6, gamma=0, subsample=0.95, colsample_bytree=0.7,
                             reg_alpha=1.5,
                             objective= 'reg:squarederror', nthread=-1, scale_pos_weight=1,seed=27)

In [11]:
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
small_size_cat_columns = ['tipodepropiedad','provincia']
large_size_cat_columns = ['ciudad']

num_columns = [
#     'id',
    "antiguedad","habitaciones",'garages',
    'banos','metroscubiertos', 'metrostotales','idzona',
    'lat', 'lon', 'cant_amenities',
    'year','sin_month','cos_month', 'sin_day', 'cos_day','cant_comodidades_en_desc',
    'cant_amenities','cant_lugares_cerca','cant_areas_entretenimiento_cerca',
    'cant_areas_verdes','cant_areas_dedicadas','cant_palabras_positivas','stopwords_count',
    'punctuations_count','vocab_size','characters_count','top10_trigram_occ','top10_bigram_occ',
    'numerics_count','avg_word','diversity_score','top_used_words_count','least_used_words_count',
    'words_start_count','words_end_count','sentiment','ratio_cubiertos_totales','titulo_cant_html_tags',
    'titulo_cant_palabras','titulo_cant_palabras_unicas','titulo_diversity_score','titulo_cant_caracteres',
    'titulo_cant_signos_puntuacion','titulo_entropy','titulo_mean_word_length','descripcion_cant_html_tags',
    'descripcion_cant_palabras_unicas','descripcion_entropy']

bool_columns = ['gimnasio','usosmultiples','piscina','escuelascercanas','centroscomercialescercanos','es_avenida',
               'planta_alta','planta_baja','tiene_bodega','oficina','cerca_o_en_esquina','cerca_o_en_avenida',
               'comercial','tiene_servicio','edificio','casa','parte_de_lote','calle_cerrada',
               'indica_frente_y_fondo','usa_easybroker','tiene_seguridad','tiene_antiguedad','tiene_banos',
               'tiene_garages','tiene_habitaciones','tiene_metroscubiertos','tiene_metrostotales']

text_columns = ['titulo'
#                 ,'descripcion'
#                 ,'direccion'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import mean_absolute_error

transformers = []

transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))
steps.append(("xgboost_best_params", xgb_model))

my_pipe = Pipeline(steps, verbose=True)

In [12]:
my_pipe.fit(X.replace({True:1,False:0}), y)

# prediciendo valores posta...
predictions = my_pipe.predict(test.replace({True:1,False:0}))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   3.7s


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[Pipeline]  (step 2 of 2) Processing xgboost_best_params, total= 1.2min


In [15]:
df = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [16]:
description = "target 10"
save_submission(df, description=description)

In [17]:
beep()

In [18]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "fcozza_target_10"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = X.iloc[train_index], X.iloc[test_index]
    y_train2, y_test2 = y[train_index], y[test_index]
    
    my_pipe.fit(X_train2.replace({True:1,False:0}), y_train2)
    y_scores = my_pipe.predict(X_test2.replace({True:1,False:0}))
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/last_train_data/" + approach_numer, index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   2.8s
[Pipeline]  (step 2 of 2) Processing xgboost_best_params, total=  39.9s
562524.5181035156
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   2.7s


  if getattr(data, 'base', None) is not None and \


[Pipeline]  (step 2 of 2) Processing xgboost_best_params, total=  39.8s
567984.5908488312
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=   2.7s


  if getattr(data, 'base', None) is not None and \


[Pipeline]  (step 2 of 2) Processing xgboost_best_params, total=  39.3s
562207.1580576171


In [19]:
beep()

# target 18 - lightgbm

In [7]:
train['precio_log'] = np.log(train['precio'])

In [8]:
X = train.drop(['precio', 'precio_log'], axis=1) #set de datos
y = train['precio_log'] #target

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [10]:
import lightgbm as lgb

gbm_optimized = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                                  importance_type='split', learning_rate=0.05, max_depth=75,
                                  min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                                  n_estimators=200, n_jobs=-1, num_leaves=1200, objective=None,
                                  random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
                                  subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [11]:
%%time
gbm_optimized.fit(X.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y, sample_weight=None, init_score=None, eval_set=None, eval_names=None,
            eval_sample_weight=None, eval_init_score=None, eval_metric='mae', early_stopping_rounds=None,
            verbose=False, feature_name='auto', categorical_feature=['tipodepropiedad', 'ciudad','provincia'], callbacks=None)


# prediciendo valores posta...
predictions = gbm_optimized.predict(test.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), num_iteration=gbm_optimized.best_iteration_)

New categorical_feature is ['ciudad', 'provincia', 'tipodepropiedad']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


CPU times: user 4min 33s, sys: 1.33 s, total: 4min 35s
Wall time: 36.9 s


In [12]:
exp_predictions = np.exp(predictions)

In [13]:
df = pd.DataFrame(data={'id':test['id'], 'target':exp_predictions})

In [14]:
description = "target 11"
save_submission(df, description=description)

In [15]:
beep()

In [16]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "fcozza_target_11"

for train_index, test_index in kf.split(train):
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = X.iloc[train_index], X.iloc[test_index]
    y_train2, y_test2 = y[train_index], y[test_index]
    
    gbm_optimized.fit(X_train2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1), y_train2)
    y_scores = gbm_optimized.predict(X_test2.drop(['id','fecha','titulo', 'descripcion', 'direccion'],axis=1))
    
    print(mean_absolute_error(y_test2, y_scores))
    print(mean_absolute_error(np.exp(y_test2), np.exp(y_scores)))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:np.exp(y_scores)}))

df.to_csv("../predictions/last_train_data/" + approach_numer, index=False, header=True)

0.21309995483007793
525440.1788438802
0.21405842690778873
531720.4468441766
0.21340811161957143
525368.0651654683


In [17]:
beep()