### This model adds two more categorical features on top of the previous model, and used another huggingface embeddings for the text feature. Also fine tuned the XGBoosting parameters.

In [153]:
import numpy as np
import pandas as pd

In [154]:
df = pd.DataFrame(pd.read_pickle('~/data/train.pickle'))
df.head()

Unnamed: 0,price,title,loc_string,loc,features,type,subtype,selltype,desc
0,320.000 €,Piso Tallers. Piso con 2 habitaciones con asce...,Barcelona - Sant Antoni,,"[85 m2, 2 hab., 1 baño, 3.647 €/m2]",FLAT,FLAT,SECOND_HAND,Piso en última planta a reformar en calle Tall...
1,335.000 €,Piso C/ de valència. Piso reformado en venta d...,Barcelona - Dreta de l´Eixample,,"[65 m2, 2 hab., 1 baño, 5.000 €/m2]",FLAT,FLAT,SECOND_HAND,"Ubicado en la zona del Camp de l’Arpa, cerca d..."
2,330.000 €,Piso en Dreta de l´Eixample. Acogedor piso al ...,Barcelona - Dreta de l´Eixample,,"[77 m2, 2 hab., 1 baño, 4.286 €/m2]",FLAT,FLAT,SECOND_HAND,"En pleno centro de Barcelona, justo al lado de..."
3,435.000 €,"Piso Barcelona - corts catalanes. Soleado, cén...",Barcelona - Sant Antoni,,"[96 m2, 3 hab., 2 baños, 4.531 €/m2]",FLAT,FLAT,SECOND_HAND,"Vivienda espaciosa en Sant Antoni, cerca de Pl..."
4,410.000 €,"Piso en Carrer de sardenya 271. Alto, reformad...",Barcelona - Sagrada Família,Carrer de Sardenya 271,"[84 m2, 2 hab., 1 baño, 4.881 €/m2]",FLAT,FLAT,SECOND_HAND,"En el corazón de Barcelona, en una hermosa fin..."


#### Expanding the features column, and converting the results to numeric

In [155]:
def parse_features(features_list):
    features_dict = {'size': np.nan, 'rooms': np.nan, 'bathrooms': np.nan, 'ppm2': np.nan}
    
    for feature in features_list:
        if 'm2' in feature and not '€/m2' in feature:
            features_dict['size'] = feature
        elif 'hab.' in feature:
            features_dict['rooms'] = feature
        elif 'baño' in feature or 'baños' in feature:
            features_dict['bathrooms'] = feature
        elif '€/m2' in feature:
            features_dict['ppm2'] = feature
            
    return pd.Series(features_dict)

In [156]:
features_df = df['features'].apply(parse_features)
df = df.drop('features', axis=1).join(features_df)

In [157]:
df['price'] = df['price'].str.replace('€', '').str.strip()
df['price'] = df['price'].astype(str).str.replace('.', '').astype(float)
df['bathrooms'] = df['bathrooms'].str.extract('(\d+)').astype(float)
df['rooms'] = df['rooms'].str.extract('(\d+)').astype(float)
df['size'] = df['size'].str.replace('m2', '').str.strip().astype(float)
df['ppm2'] = df['ppm2'].str.replace('€/m2', '').str.strip().astype(float) * 1000

In [158]:
df.head()

Unnamed: 0,price,title,loc_string,loc,type,subtype,selltype,desc,size,rooms,bathrooms,ppm2
0,320000.0,Piso Tallers. Piso con 2 habitaciones con asce...,Barcelona - Sant Antoni,,FLAT,FLAT,SECOND_HAND,Piso en última planta a reformar en calle Tall...,85.0,2.0,1.0,3647.0
1,335000.0,Piso C/ de valència. Piso reformado en venta d...,Barcelona - Dreta de l´Eixample,,FLAT,FLAT,SECOND_HAND,"Ubicado en la zona del Camp de l’Arpa, cerca d...",65.0,2.0,1.0,5000.0
2,330000.0,Piso en Dreta de l´Eixample. Acogedor piso al ...,Barcelona - Dreta de l´Eixample,,FLAT,FLAT,SECOND_HAND,"En pleno centro de Barcelona, justo al lado de...",77.0,2.0,1.0,4286.0
3,435000.0,"Piso Barcelona - corts catalanes. Soleado, cén...",Barcelona - Sant Antoni,,FLAT,FLAT,SECOND_HAND,"Vivienda espaciosa en Sant Antoni, cerca de Pl...",96.0,3.0,2.0,4531.0
4,410000.0,"Piso en Carrer de sardenya 271. Alto, reformad...",Barcelona - Sagrada Família,Carrer de Sardenya 271,FLAT,FLAT,SECOND_HAND,"En el corazón de Barcelona, en una hermosa fin...",84.0,2.0,1.0,4881.0


In [159]:
#df.to_csv('cleaned_listings.csv', index=False)

#### Using a BERT Multilingual model to generate embeddings from the housing descriptions

In [160]:
from transformers import AutoModel, AutoTokenizer

model_name = 'bert-base-multilingual-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [161]:
import torch

In [162]:
descriptions = df['desc'].tolist()
inputs = tokenizer(descriptions, return_tensors="pt", padding=True, truncation=True, max_length=512)

In [163]:
from tqdm.auto import tqdm

def generate_embeddings(model, tokenizer, descriptions, batch_size=10):
    model.eval()
    embeddings = []
    
    progress_bar = tqdm(range(0, len(descriptions), batch_size))
    
    for i in progress_bar:
        batch = descriptions[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs.to(model.device))
        batch_embeddings = outputs.last_hidden_state.mean(dim=1)
        embeddings.append(batch_embeddings.cpu().numpy())
        
    embeddings = np.vstack(embeddings)
    return embeddings

embeddings = generate_embeddings(model, tokenizer, descriptions, batch_size=10)


  0%|          | 0/87 [00:00<?, ?it/s]

#### Label encoding the categorical variables `loc_string` and `type`

In [164]:
from sklearn.preprocessing import LabelEncoder

def label_encoding(x: np.array,
                   add_unknown: bool = True,
                   unknown_value: str = 'Unknown'):
    """Encode a categorical column.

    Args:
        x (np.array): An array to encode.
        add_unknown (bool): If True, add an 'unknown' label.
        unknown_value (str): The value to use in the 'unknown' label.

    Returns:
        A LabelEncoder.
    """
    le = LabelEncoder()
    uniq = np.unique(x)
    if add_unknown:
        uniq = np.concatenate((np.array([unknown_value]), uniq))
    le.fit(uniq)
    return le

def transform_column(le: LabelEncoder,
                     index,
                     x_train: np.array,
                     # x_val: np.array,
                     x_test: np.array,
                     # x_train_val: np.array,
                     unknown_value: str = 'Unknown') -> None:
    classes = le.classes_

    x_train[index] = le.transform(x_train[index])

    # val = [x if x in classes else unknown_value for x in x_val[index]]
    # x_val[index] = le.transform(val)

    test = [x if x in classes else unknown_value for x in x_test[index]]
    x_test[index] = le.transform(test)
    
    return x_train, x_test

def transform_data(x_train: pd.DataFrame,
                   x_test: pd.DataFrame) -> tuple:
    
    x_train_e = x_train.copy()
    # x_val_e = x_val.copy()
    x_test_e = x_test.copy()
    # x_train_val_e = x_train_val.copy()
    
    for col in ['loc_string', 'type']:
        le = label_encoding(x_train_e[col].values, add_unknown = True)
        transform_column(le, col, x_train_e, x_test_e)
        
    return x_train_e, x_test_e

In [165]:
df_test = pd.DataFrame(pd.read_pickle('~/data/test_kaggle.pickle'))
features_test = df_test['features'].apply(parse_features)
df_test = df_test.drop('features', axis=1).join(features_test)
df_test['bathrooms'] = df_test['bathrooms'].str.extract('(\d+)').astype(float)
df_test['rooms'] = df_test['rooms'].str.extract('(\d+)').astype(float)
df_test['size'] = df_test['size'].str.replace('m2', '').str.strip().astype(float)
df_test.head()

Unnamed: 0,title,loc_string,loc,description,type,subtype,selltype,desc,id,size,rooms,bathrooms,ppm2
0,Piso Carrer de llull. Piso con 4 habitaciones ...,Barcelona - El Parc i la Llacuna del Poblenou,,Contactar con Camila 7. 3. La Casa Agency Estu...,FLAT,FLAT,SECOND_HAND,Contactar con Camila 7. 3.\n\nLa Casa Agency E...,0,87.0,4.0,1.0,
1,Piso Diagonal. Luminoso piso de 4 habitaciones...,Barcelona - Poblenou,,¡Un gran piso a reformar es una gran oportunid...,FLAT,FLAT,SECOND_HAND,¡Un gran piso a reformar es una gran oportunid...,1,78.0,4.0,1.0,
2,Piso Carrer del consell de cent. Piso amueblad...,Barcelona - L´Antiga Esquerra de l´Eixample,,"AUREA INMOBILIARIA PRESENTA, ACOGEDOR APARTAME...",FLAT,FLAT,SECOND_HAND,"AUREA INMOBILIARIA PRESENTA, ACOGEDOR APARTAME...",2,65.0,1.0,1.0,
3,Piso Castanys. Carrer castanys,Barcelona - Poblenou,,"Piso en pleno centro de Poblenou, techos altos...",FLAT,FLAT,SECOND_HAND,"Piso en pleno centro de Poblenou, techos altos...",3,88.0,3.0,1.0,
4,Piso Carrer de casanova. Piso con 2 habitacion...,Barcelona - Sant Antoni,,Punt Zona Franca presenta esta fantástica vivi...,FLAT,FLAT,SECOND_HAND,Punt Zona Franca presenta esta fantástica vivi...,4,82.0,2.0,1.0,


In [166]:
le_ls = label_encoding(df.loc_string.values)
le_t = label_encoding(df.type.values)

In [167]:
train_e, test_e= transform_data(df, df_test)
train_e.head()

Unnamed: 0,price,title,loc_string,loc,type,subtype,selltype,desc,size,rooms,bathrooms,ppm2
0,320000.0,Piso Tallers. Piso con 2 habitaciones con asce...,17,,2,FLAT,SECOND_HAND,Piso en última planta a reformar en calle Tall...,85.0,2.0,1.0,3647.0
1,335000.0,Piso C/ de valència. Piso reformado en venta d...,2,,2,FLAT,SECOND_HAND,"Ubicado en la zona del Camp de l’Arpa, cerca d...",65.0,2.0,1.0,5000.0
2,330000.0,Piso en Dreta de l´Eixample. Acogedor piso al ...,2,,2,FLAT,SECOND_HAND,"En pleno centro de Barcelona, justo al lado de...",77.0,2.0,1.0,4286.0
3,435000.0,"Piso Barcelona - corts catalanes. Soleado, cén...",17,,2,FLAT,SECOND_HAND,"Vivienda espaciosa en Sant Antoni, cerca de Pl...",96.0,3.0,2.0,4531.0
4,410000.0,"Piso en Carrer de sardenya 271. Alto, reformad...",16,Carrer de Sardenya 271,2,FLAT,SECOND_HAND,"En el corazón de Barcelona, en una hermosa fin...",84.0,2.0,1.0,4881.0


#### Combining embeddings with original features, scaling

In [170]:
numerical_features = df[['size', 'rooms', 'bathrooms']]
categorical_features = train_e[['loc_string', 'type']]

embeddings_df = pd.DataFrame(embeddings)

combined_features = pd.concat([numerical_features.reset_index(drop=True), 
                               categorical_features.reset_index(drop=True),
                               embeddings_df], axis=1)

In [171]:
from sklearn.model_selection import train_test_split

y = df['price']
X = combined_features

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2024)

In [172]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train[['size', 'rooms', 'bathrooms']] = scaler.fit_transform(X_train[['size', 'rooms', 'bathrooms']])
X_val[['size', 'rooms', 'bathrooms']] = scaler.transform(X_val[['size', 'rooms', 'bathrooms']])

#### Train model

In [173]:
import warnings
warnings.filterwarnings("ignore")

In [60]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [200, 300],
    'colsample_bytree': [0.3, 0.7]
}

xgb_reg = xgb.XGBRegressor(objective ='reg:squarederror', alpha = 10)

grid_search = GridSearchCV(estimator = xgb_reg, param_grid = params, scoring='neg_mean_squared_error', n_jobs=-1, cv=5, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

best_xgb_reg = grid_search.best_estimator_

Fitting 5 folds for each of 36 candidates, totalling 180 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alread

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}


In [61]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred_val = best_xgb_reg.predict(X_val)

print("R^2:", r2_score(y_val, y_pred_val))

R^2: 0.46961951389629686


#### Initialize best model

In [62]:
best_params = {
    'colsample_bytree': 0.7,
    'learning_rate': 0.1,
    'max_depth': 3,
    'n_estimators': 300
}

final_model = xgb.XGBRegressor(objective ='reg:squarederror', 
                               colsample_bytree = best_params['colsample_bytree'],
                               learning_rate = best_params['learning_rate'],
                               max_depth = best_params['max_depth'],
                               n_estimators = best_params['n_estimators'])

In [63]:
final_model.fit(combined_features, y)

#### Test

In [53]:
# df_test = pd.DataFrame(pd.read_pickle('~/data/test_kaggle.pickle'))

In [54]:
# features_test = df_test['features'].apply(parse_features)
# df_test = df_test.drop('features', axis=1).join(features_test)

  features_test = df_test['features'].apply(parse_features)


In [55]:
# df_test['bathrooms'] = df_test['bathrooms'].str.extract('(\d+)').astype(float)
# df_test['rooms'] = df_test['rooms'].str.extract('(\d+)').astype(float)
# df_test['size'] = df_test['size'].str.replace('m2', '').str.strip().astype(float)

In [56]:
# df_test.head()

Unnamed: 0,title,loc_string,loc,description,type,subtype,selltype,desc,id,size,rooms,bathrooms,ppm2
0,Piso Carrer de llull. Piso con 4 habitaciones ...,Barcelona - El Parc i la Llacuna del Poblenou,,Contactar con Camila 7. 3. La Casa Agency Estu...,FLAT,FLAT,SECOND_HAND,Contactar con Camila 7. 3.\n\nLa Casa Agency E...,0,87.0,4.0,1.0,
1,Piso Diagonal. Luminoso piso de 4 habitaciones...,Barcelona - Poblenou,,¡Un gran piso a reformar es una gran oportunid...,FLAT,FLAT,SECOND_HAND,¡Un gran piso a reformar es una gran oportunid...,1,78.0,4.0,1.0,
2,Piso Carrer del consell de cent. Piso amueblad...,Barcelona - L´Antiga Esquerra de l´Eixample,,"AUREA INMOBILIARIA PRESENTA, ACOGEDOR APARTAME...",FLAT,FLAT,SECOND_HAND,"AUREA INMOBILIARIA PRESENTA, ACOGEDOR APARTAME...",2,65.0,1.0,1.0,
3,Piso Castanys. Carrer castanys,Barcelona - Poblenou,,"Piso en pleno centro de Poblenou, techos altos...",FLAT,FLAT,SECOND_HAND,"Piso en pleno centro de Poblenou, techos altos...",3,88.0,3.0,1.0,
4,Piso Carrer de casanova. Piso con 2 habitacion...,Barcelona - Sant Antoni,,Punt Zona Franca presenta esta fantástica vivi...,FLAT,FLAT,SECOND_HAND,Punt Zona Franca presenta esta fantástica vivi...,4,82.0,2.0,1.0,


In [64]:
descriptions = df_test['desc'].tolist()
inputs = tokenizer(descriptions, return_tensors="pt", padding=True, truncation=True, max_length=512)

In [65]:
embeddings = generate_embeddings(model, tokenizer, descriptions, batch_size=10)

  0%|          | 0/14 [00:00<?, ?it/s]

In [66]:
#df_test[['size', 'rooms', 'bathrooms']] = scaler.transform(df_test[['size', 'rooms', 'bathrooms']])
numerical_features = df_test[['size', 'rooms', 'bathrooms']]
# categorical_features = test_e[['loc_string', 'type']]

embeddings_df = pd.DataFrame(embeddings)

combined_features = pd.concat([numerical_features.reset_index(drop=True), 
                               # categorical_features.reset_index(drop=True),
                               embeddings_df], axis=1)

In [67]:
y_pred = final_model.predict(combined_features)

In [68]:
y_pred /= 1000

In [69]:
output = pd.DataFrame()
output['id'] = range(len(y_pred))
output['price'] = y_pred

output

Unnamed: 0,id,price
0,0,367.794006
1,1,357.350739
2,2,282.487305
3,3,360.169037
4,4,362.950928
...,...,...
127,127,377.091095
128,128,305.776886
129,129,283.816925
130,130,333.421265


In [174]:
# output.to_csv('solution.csv', index=False)