In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Load the post-processed training data (feature matrix and target).
# NOTE(review): despite its name, X_EXPANDED_TEST is read from the training
# pickle "X_EXPANDED.pkl" and is the frame every filter below is fitted on.
X_EXPANDED_TEST = pd.read_pickle("data/postproccesed/X_EXPANDED.pkl")
y_train = pd.read_pickle("data/postproccesed/y.pkl")

In [3]:
# Shape (rows, columns) of the loaded training matrix.
X_EXPANDED_TEST.shape

(20974, 6835)

In [None]:
from sklearn.feature_selection import VarianceThreshold



In [None]:
# Remove features with very low variance (threshold 0.05); the filter is
# fitted on the training matrix only.
lowVarianceFilter = VarianceThreshold(threshold=0.05)
lowVarianceFilter.fit(X_EXPANDED_TEST)

In [6]:
# Apply the fitted variance filter to the training matrix.
X_wo_low_variance = lowVarianceFilter.transform(X_EXPANDED_TEST)

In [7]:
# Shape after the low-variance filter.
X_wo_low_variance.shape

(20974, 2569)

In [8]:
# Rebuild a DataFrame from the filter output, restoring the names of the
# surviving features and the row index of the training matrix.
X_wo_low_variancelow_df = pd.DataFrame(
    X_wo_low_variance,
    index=X_EXPANDED_TEST.index,
    columns=lowVarianceFilter.get_feature_names_out(),
)

In [None]:
# Drop repeated column names, keeping the first occurrence of each.
train_repeated_name_mask = X_wo_low_variancelow_df.columns.duplicated()
X_wo_low_variancelow_df_2 = X_wo_low_variancelow_df.loc[:, ~train_repeated_name_mask].copy()

In [10]:
# Shape after dropping repeated column names.
X_wo_low_variancelow_df_2.shape

(20974, 2471)

In [11]:
from feature_engine.selection import DropCorrelatedFeatures, DropDuplicateFeatures


In [None]:
# Remove duplicated features: fit the selector on the training frame and
# apply it to that same frame in one step.
filter_duplicates = DropDuplicateFeatures()
X_wo_duplicates = filter_duplicates.fit_transform(X_wo_low_variancelow_df_2)

In [13]:
# Shape after removing duplicated features.
X_wo_duplicates.shape

(20974, 1526)

In [None]:
# Remove highly correlated features (selector configured with threshold 0.9).
correlated_filter = DropCorrelatedFeatures(threshold=0.9)

In [15]:
# Fit the correlation filter on the training frame and apply it in one step.
X_wo_high_correlations = correlated_filter.fit_transform(X_wo_duplicates)

In [16]:
# Shape after removing highly correlated features.
X_wo_high_correlations.shape

(20974, 530)

In [None]:
# Apply a more advanced filter: ProbeFeatureSelection.
from  feature_engine.selection import ProbeFeatureSelection
from sklearn.linear_model import LinearRegression

# Configured with a LinearRegression estimator and 3 probe features drawn
# from a normal distribution.
# NOTE(review): the scoring is a percentage error, while y_train looks like a
# scaled target (model outputs are inverse-transformed in unscale); values
# near zero can distort that metric — confirm it is the intended one.
advanced_filtered = ProbeFeatureSelection(estimator=LinearRegression(),
                                          scoring='neg_mean_absolute_percentage_error',
                                          n_probes=3,
                                          distribution="normal",)

In [18]:
# Fit the probe-based selector against the training target and keep only the
# features it selects.
final_X = advanced_filtered.fit_transform(X_wo_high_correlations, y_train)

In [20]:
# Shape of the final filtered training matrix.
final_X.shape

(20974, 527)

In [None]:
# Save the filtered training matrix in case we want to process it further
# later on.
# The output directory is created first: to_pickle does not create missing
# parent folders, so this cell would otherwise fail on a fresh checkout.
FILTERED_TRAIN_PATH = Path("data/filtered/X_filter_1.pkl")
FILTERED_TRAIN_PATH.parent.mkdir(parents=True, exist_ok=True)
final_X.to_pickle(FILTERED_TRAIN_PATH)

In [22]:
# Train our first model on the filtered training features.

from sklearn.ensemble import RandomForestRegressor

# A fixed seed makes the fitted forest, and every score and prediction derived
# from it, reproducible across kernel restarts; without it each run builds a
# different forest.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(final_X, y_train)

In [24]:
# Score of the forest on the same data it was fitted on (for a scikit-learn
# regressor, score() is R^2); this is not a held-out estimate.
rf_model.score(final_X,y_train)

0.8302118474915707

In [25]:
# Raw model output for the first two training rows (before unscale is applied).
rf_model.predict(final_X.iloc[:2])

array([ 0.87802475, -0.0790222 ])

In [None]:
# Next we load the test data and apply to it every preprocessing and
# selection step that was applied to the training data.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# artifacts produced by this project.

import pickle
with open('data/postproccesed/preproccesing_price_Transformer.pkl', 'rb') as f:
    price_transformer = pickle.load(f)

In [27]:
# Load the pickled price scaler; unscale() reads it through the name
# scalar_price.
with open('data/postproccesed/preproccesing_scaler_price.pkl', 'rb') as f:
    scalar_price = pickle.load(f)

In [28]:

def unscale(model, data_points, scaler=None, transformer=None):
    """Predict with ``model`` and map the predictions back through the price pipeline.

    The predictions are reshaped into a single column, passed through the
    scaler's ``inverse_transform`` and then through the transformer's
    ``inverse_transform``, in that order.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(data_points)``.
    data_points : array-like or DataFrame
        Rows to predict on; forwarded unchanged to ``model.predict``.
    scaler : object, optional
        Object exposing ``inverse_transform``. When omitted, the notebook-level
        ``scalar_price`` is used, as before.
    transformer : object, optional
        Object exposing ``inverse_transform``. When omitted, the notebook-level
        ``price_transformer`` is used, as before.

    Returns
    -------
    Whatever ``transformer.inverse_transform`` returns for an ``(n, 1)`` input.
    """
    # Resolve the defaults lazily so the function still works when the
    # notebook globals exist, but can also be called with explicit objects.
    if scaler is None:
        scaler = scalar_price
    if transformer is None:
        transformer = price_transformer

    # np.asarray lets models whose predict() returns a plain list work too.
    prediction = np.asarray(model.predict(data_points))
    unscaled = scaler.inverse_transform(prediction.reshape(-1, 1))
    unnormed = transformer.inverse_transform(unscaled.reshape(-1, 1))
    return unnormed

In [29]:
# Predictions for the first two training rows, mapped back through the
# inverse price scaler and transformer.
unscale(rf_model, final_X.iloc[:2])



array([[12000000.      ],
       [ 6499178.617316]])

In [30]:
# Load the preprocessed test set (the previous comment said "training set",
# but this file is the test data).
test_data_df = pd.read_csv("data/preprocessed/test_data.csv")

In [31]:
# Shape (rows, columns) of the raw test frame.
test_data_df.shape

(8989, 41)

In [32]:
# Treat the value 9 as missing everywhere in the frame.
# NOTE(review): this replacement is not restricted to particular columns, so a
# 9 in any numeric column also becomes NaN — confirm it mirrors the training
# preprocessing.
test_data_df = test_data_df.replace({9: np.nan})

# Recode every column from position 5 onward into string categories.
flag_columns = test_data_df.columns[5:]
test_data_df[flag_columns] = test_data_df[flag_columns].replace(
    {0: 'NO', 1: 'SI', np.nan: 'NO_DISPONIBLE'}
)

In [33]:
# Preview the first rows after recoding.
test_data_df.head()

Unnamed: 0,Price,city,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,...,LiftAvailable,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Stadium
0,2872000,Kolkata,883,Narendrapur,2.0,SI,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,...,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE
1,8400000,Bangalore,1400,Uttarahalli Main Road,3.0,NO,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,...,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE
2,2300000,Kolkata,1050,Garia,3.0,SI,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,...,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE
3,13000000,Delhi,1200,Azad Apartments,3.0,SI,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,...,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE
4,9324000,Bangalore,1335,Banashankari,2.0,NO,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,...,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE,NO_DISPONIBLE


In [34]:
# Load the pickled imputer that is applied to 'No. of Bedrooms' below.
with open('data/postproccesed/preproccesing_beds_imputer.pkl', 'rb') as f:
    beds_imputer = pickle.load(f)

In [83]:
# Overwrite 'No. of Bedrooms' with the imputer's output (transform only; the
# imputer is not re-fitted on the test data).
test_data_df['No. of Bedrooms'] = beds_imputer.transform(test_data_df[['No. of Bedrooms']])

In [36]:
# Load the pickled encoder used for the categorical columns.
with open('data/postproccesed/preproccesing_ohEncoder.pkl', 'rb') as f:
    oh_encoder = pickle.load(f)

In [37]:
# Categorical inputs for the encoder: "city" plus every column from position 5
# onward.
cat_columns = ["city", *test_data_df.columns[5:].to_list()]

# Encode them with the loaded encoder (transform only) and wrap the result in
# a DataFrame that carries the encoder's output names and the test row index.
cat_oh_data = oh_encoder.transform(test_data_df[cat_columns])
df_cat_oh_data = pd.DataFrame(
    cat_oh_data,
    index=test_data_df.index,
    columns=oh_encoder.get_feature_names_out(),
)

In [38]:
# Load the pickled encoder that is applied to the 'Location' column below.
with open('data/postproccesed/preproccesing_gapEncoder.pkl', 'rb') as f:
    gap_encoder = pickle.load(f)

In [39]:
# Encode 'Location' with the loaded encoder (transform only).
cat_location_data = gap_encoder.transform(test_data_df['Location'])

In [40]:
# Load the pickled transformer that is applied to 'Area' below.
with open('data/postproccesed/preproccesing_area_Transformer.pkl', 'rb') as f:
    area_transformer = pickle.load(f)

In [41]:
# Load the pickled transformer that is applied to 'No. of Bedrooms' below.
with open('data/postproccesed/preproccesing_beds_Transformer.pkl', 'rb') as f:
    bed_transformer = pickle.load(f)

In [42]:
# Load the pickled transformer that is applied to 'Price' below.
# NOTE(review): this re-reads the same file that was already loaded into
# price_transformer earlier in the notebook.
with open('data/postproccesed/preproccesing_price_Transformer.pkl', 'rb') as f:
    price_transformer = pickle.load(f)

In [45]:
# Load the pickled scaler that is applied to the transformed area below.
with open('data/postproccesed/preproccesing_scaler_area.pkl', 'rb') as f:
    area_scaler = pickle.load(f)

In [46]:
# Load the pickled scaler that is applied to the transformed bedrooms below.
with open('data/postproccesed/preproccesing_scaler_beds.pkl', 'rb') as f:
    beds_scaler = pickle.load(f)

In [47]:
# Load the pickled scaler that is applied to the transformed price below.
# NOTE(review): the same file was loaded earlier under the name scalar_price;
# both names are in use in this notebook.
with open('data/postproccesed/preproccesing_scaler_price.pkl', 'rb') as f:
   price_scaler = pickle.load(f)

In [48]:
# Apply the loaded transformers to price, area and bedrooms (transform only,
# nothing is re-fitted on the test data).
price_normal = price_transformer.transform(test_data_df[['Price']])
area_normal = area_transformer.transform(test_data_df[['Area']])
beds_normal = bed_transformer.transform(test_data_df[['No. of Bedrooms']])

In [49]:
# Scale the transformed columns with the loaded scalers (transform only).
price_normal_scaled = price_scaler.transform(price_normal)
area_normal_scaled = area_scaler.transform(area_normal)
beds_normal_scaled = beds_scaler.transform(beds_normal)



In [50]:
# Gather the three transformed-and-scaled columns into one frame that shares
# the test set's row index.
scaled_columns = {
    'price_normal_scaled': price_normal_scaled.flatten(),
    'area_normal_scaled': area_normal_scaled.flatten(),
    'beds_normal_scaled': beds_normal_scaled.flatten(),
}
df_normal_scaled = pd.DataFrame(data=scaled_columns, index=test_data_df.index)

In [51]:
# Load the pickled feature-crossing object applied to the combined features
# below.
with open('data/postproccesed/preproccesing_polyfeatures.pkl', 'rb') as f:
   polyfeatures = pickle.load(f)

In [52]:
# Inputs for feature crossing: the encoded categorical columns followed by the
# scaled area and bedrooms columns (price is left out).
features_to_cross = pd.concat([df_cat_oh_data, df_normal_scaled[['area_normal_scaled', 'beds_normal_scaled']]], axis=1)

In [53]:
# Build the crossed features with the loaded object (transform only).
crossed_features = polyfeatures.transform(features_to_cross)

In [55]:
# Wrap the crossed features in a DataFrame with the generated feature names
# and the test row index.
df_crossed_features = pd.DataFrame(data=crossed_features, columns=polyfeatures.get_feature_names_out(), index=test_data_df.index)

In [56]:
# Assemble the expanded test matrix column-wise: scaled numeric columns,
# encoded categorical columns, encoded location and crossed features.
# NOTE(review): the column order presumably has to match the training
# X_EXPANDED the filters were fitted on — verify against the training notebook.
X_EXPANDED = pd.concat([df_normal_scaled[['area_normal_scaled', 'beds_normal_scaled']],
                        df_cat_oh_data,
                        cat_location_data,
                        df_crossed_features], axis=1)

In [80]:
# Before applying the filters fitted on training, save the expanded test
# matrix alongside the other post-processed files.
X_EXPANDED.to_pickle("data/postproccesed/X_EXPANDED_TEST.pkl")


In [57]:
# Apply the variance filter fitted on the training data to the test matrix.
test_X_wo_low_variance = lowVarianceFilter.transform(X_EXPANDED)

In [59]:
# Rebuild a DataFrame from the filter output, with the names of the surviving
# features and the row index of X_EXPANDED.
test_X_wo_low_variancelow_df = pd.DataFrame(
    test_X_wo_low_variance,
    index=X_EXPANDED.index,
    columns=lowVarianceFilter.get_feature_names_out(),
)

In [61]:
# Drop repeated column names, keeping the first occurrence of each.
test_repeated_name_mask = test_X_wo_low_variancelow_df.columns.duplicated()
test_X_wo_low_variancelow_df_2 = test_X_wo_low_variancelow_df.loc[:, ~test_repeated_name_mask].copy()

In [62]:
# Apply the duplicate-feature selector fitted on training (transform only).
test_X_wo_duplicates = filter_duplicates.transform(test_X_wo_low_variancelow_df_2)

In [63]:
# Apply the correlation filter fitted on training (transform only).
test_X_wo_high_correlations = correlated_filter.transform(test_X_wo_duplicates)

In [71]:
# Apply the probe-based selector fitted on training (transform only).
test_final_X = advanced_filtered.transform(test_X_wo_high_correlations)

In [82]:
# Save the filtered test matrix in data/filtered, as was done for training.
test_final_X.to_pickle("data/filtered/X_test_filter_1.pkl")

In [72]:
# Shape of the filtered test matrix.
test_final_X.shape

(8989, 527)

In [81]:
# The test target is the 'Price' column as read from the test frame (it is not
# passed through the price transformer or scaler); save it for later use.
y_test = test_data_df['Price']
y_test.to_pickle("data/postproccesed/y_test.pkl")

In [68]:
# Shape of the test target.
y_test.shape

(8989,)

In [77]:
# Predict on the filtered test features and map the outputs back through the
# inverse price scaler and transformer.
test_predictions = unscale(rf_model,test_final_X)



In [78]:
from sklearn.metrics import ( root_mean_squared_error, 
                             mean_absolute_error, 
                             mean_absolute_percentage_error )



In [89]:
# Evaluate on the test set: y_true is the 'Price' column and y_pred is the
# output of unscale(), so both are on the same side of the price pipeline.
y_true = y_test.values
y_pred = test_predictions
rmse = root_mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
# scikit-learn returns MAPE as a fraction, not a percentage.
mape = mean_absolute_percentage_error(y_true, y_pred)
print(f"rmse: {rmse}")
print(f"mae: {mae}")
print(f"mape: {mape}")

rmse: 25871954.64783803
mae: 6480808.740744277
mape: 0.42968989021414233
