# HOUSE PRICE PREDICTION

## Librerías a usar

Importo las librerías que se usarán en la solución del reto.

In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_log_error,  make_scorer

def _rmsle(y_true, y_pred, sample_weight=None):
    """Return the root mean squared logarithmic error of the predictions.

    Computed as the square root of ``mean_squared_log_error`` instead of
    passing ``squared=False``, because that keyword was deprecated in
    scikit-learn 1.4 and removed in 1.6. This form works on every version.
    """
    return np.sqrt(
        mean_squared_log_error(y_true, y_pred, sample_weight=sample_weight)
    )


# greater_is_better=False makes the scorer return the negated RMSLE, so that
# model-selection tools (which maximize the score) pick the lowest error.
RMSLE = make_scorer(_rmsle, greater_is_better=False)

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

## Extracción de los DataFrames

In [None]:
# Load the raw training and test tables from CSV files in the working directory.
df_train = pd.read_csv("house_train_raw.csv")
df_test = pd.read_csv("houses_test_raw.csv")
# Last expression of the cell: displays the training table in the notebook.
df_train

## Transformación de los datos

Creé funciones para eliminar las features que contienen la mayor cantidad de valores perdidos,<br>
así como también una función de preprocesamiento que recibe los datos en crudo y<br>
los devuelve estandarizados y en valores numéricos.

In [None]:
def percentage_of_missing_values(df_train, df_test, top_n=4):
    """Return the feature columns with the highest share of missing values.

    The train and test features are stacked (without the 'Id' column and the
    'SalePrice' target) and the percentage of null entries is computed per
    column over the combined rows.

    Args:
        df_train: Training frame; must contain 'Id' and 'SalePrice'.
        df_test: Test frame; must contain 'Id'.
        top_n: Number of columns to return. Defaults to 4, the value that
            was previously hard-coded.

    Returns:
        A pandas Series indexed by column name holding the missing-value
        percentage (0-100), sorted in descending order and truncated to the
        first ``top_n`` entries.
    """
    target = 'SalePrice'
    # Remove the ID and label columns before measuring missingness.
    features = pd.concat(
        (df_train.drop(columns=['Id', target]),
         df_test.drop(columns=['Id']))
    )
    # Mean of the boolean null mask is the fraction of missing rows.
    missing_pct = features.isnull().mean() * 100
    return missing_pct.sort_values(ascending=False).head(top_n)

# Rank the sparsest columns once, on the untouched frames, and drop that same
# list from both tables. Recomputing the ranking after df_train has already
# been mutated would make the second drop depend on the state left by the first.
columns_to_drop = list(percentage_of_missing_values(df_train, df_test).index)
df_train.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)

In [None]:
def preprocess(df_train, df_test):
    """Turn the raw train and test frames into numeric, model-ready tables.

    The train and test features are stacked so that both share the same
    scaling and the same one-hot columns, and are then split apart again.

    Args:
        df_train: Raw training frame; must contain 'Id' and 'SalePrice'.
        df_test: Raw test frame; must contain 'Id'.

    Returns:
        A tuple ``(df_train_new, df_test_new)``. ``df_train_new`` holds the
        processed features of the training rows plus the 'SalePrice' column;
        ``df_test_new`` holds the processed features of the test rows only.
    """
    target = 'SalePrice'
    n_train = df_train.shape[0]

    # Remove the ID and label columns
    features = pd.concat(
        (df_train.drop(columns=['Id', target]),
         df_test.drop(columns=['Id']))
    )

    # Standardize numerical columns (skipped when there are none, since the
    # scaler cannot be fitted on zero columns).
    numeric_features = features.dtypes[features.dtypes != 'object'].index
    if len(numeric_features) > 0:
        standar_scaler = StandardScaler()
        features[numeric_features] = standar_scaler.fit_transform(
            features[numeric_features]
        )

        # Replace NaN numerical features by 0; after standardization 0 is the
        # column mean, so this amounts to mean imputation.
        features[numeric_features] = features[numeric_features].fillna(0)

    # Replace discrete features by one-hot encoding; dummy_na=True adds an
    # indicator column for missing categories.
    features = pd.get_dummies(features, dummy_na=True)

    # Split back by position: the first n_train rows are the training rows,
    # everything after them is the test set. (Slicing from df_test.shape[0]
    # was wrong whenever the two frames have a different number of rows.)
    df_train_new = features.iloc[:n_train].copy()
    # Positional assignment avoids any dependence on index alignment.
    df_train_new[target] = df_train[target].to_numpy()
    df_test_new = features.iloc[n_train:].copy()

    return df_train_new, df_test_new

In [None]:
# Build the processed training table (features + SalePrice) and test table.
data_train, data_test = preprocess(df_train, df_test)
# Preview the first rows of the processed training table.
data_train.head()

Separo los datos en inputs y output

In [None]:
# Target vector first, then the feature matrix (everything except the target).
y = data_train["SalePrice"]
X = data_train.drop(columns=["SalePrice"])

In [None]:
# Hold out 20% of the labelled rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

## RANDOM FOREST REGRESSOR

Como modelo para dar solución a este reto elegí RandomForestRegressor de scikit-learn,<br>
ya que, dada la naturaleza de nuestros datos, me pareció la mejor elección sobre una<br>
regresión lineal u otros métodos.

In [None]:
# Baseline random forest with scikit-learn's default hyperparameters.
RF = RandomForestRegressor()

In [None]:
# Fit the baseline forest on the training split only.
RF.fit(X_train, y_train)

In [None]:
# Report the estimator's default score on the training and held-out splits;
# a large gap between the two indicates overfitting.
print('Training score for Random Forest Regressor is',RF.score(X_train,y_train))
print('Testing score for Random Forest Regressor is',RF.score(X_test,y_test))

In [None]:
# RMSLE of the baseline forest on the held-out split. The square root is taken
# explicitly because the `squared=` keyword of mean_squared_log_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
RMSLE__ = np.sqrt(mean_squared_log_error(y_test, RF.predict(X_test)))
RMSLE__

## Hyperparameter Tuning -- Random Forest Regressor

In [None]:
# Candidate values sampled by the randomized hyperparameter search.
random_params = dict(
    n_estimators=[100, 200, 300, 400, 500, 600],
    max_features=[1.0, 'sqrt', 'log2', None],
    max_depth=[10, 15, 20, 25, 60, 70, 80, 90, 100],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

In [None]:
# Randomized search: 15 sampled configurations, each evaluated with a
# shuffled 5-fold split and ranked by the RMSLE scorer.
hyper_tuning = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions=random_params,
    n_iter=15,
    scoring=RMSLE,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    verbose=5,
)

In [None]:
# Run the search on the training split; the held-out split stays untouched.
hyper_tuning.fit(X_train,y_train)

In [None]:
# Best mean cross-validated score found. The scorer was built with
# greater_is_better=False, so this value is the negated RMSLE.
hyper_tuning.best_score_

Selección de los mejores parámetros de nuestro modelo para su posterior entrenamiento

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
hyper_tuning.best_params_

In [None]:
# Forest configured with fixed hyperparameter values.
# NOTE(review): presumably copied from a previous run's best_params_; the
# search is unseeded, so a fresh run may select different values.
RF_hyper_tuning = RandomForestRegressor(
    n_estimators=600,
    max_depth=15,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=False,
)

In [None]:
# Fit the tuned forest on the training split.
RF_hyper_tuning.fit(X_train,y_train)

In [None]:
# Report the tuned estimator's default score on the training and held-out
# splits, for comparison with the baseline forest.
print('Training score for Random Forest Regressor Hyper Parametric Tuning is ',RF_hyper_tuning.score(X_train,y_train))
print('Testing score for Random Forest Regressor Hyper Parametric Tuning is ',RF_hyper_tuning.score(X_test,y_test))

### Esta es la métrica solicitada en el repositorio

In [None]:
# RMSLE of the tuned forest on the held-out split. The square root is taken
# explicitly because the `squared=` keyword of mean_squared_log_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_log_error(y_test, RF_hyper_tuning.predict(X_test)))

### CROSS VALIDATION

Se realizará un proceso de validación cruzada para evaluar el modelo sobre diferentes formas de agrupar nuestros datos.

In [None]:
# 5-fold cross-validation of the tuned forest over the full labelled set.
# One score per fold; the scorer yields negated RMSLE values.
RF_score = cross_val_score(
    RF_hyper_tuning,
    X,
    y,
    scoring=RMSLE,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)

El valor final que se obtiene como promedio de la validación cruzada realizada para nuestro dataset es 0.1477

In [None]:
# Fold scores are negated by the scorer; take absolute values before
# averaging to report a positive RMSLE.
print('RMSLE: ',np.abs(RF_score).mean())

### SAVING THE MODEL

Guardo el modelo para que próximamente se pueda usar sin necesidad de volver a entrenarlo

In [None]:
from joblib import dump, load
# Persist the fitted tuned forest to disk so it can be reloaded without
# retraining. NOTE: joblib files are pickle-based; only load files you trust.
dump(RF_hyper_tuning, 'RF_hyper_tuning.joblib') 

## SAVING RESULTS IN A CSV

Por último, genero y almaceno las predicciones de mi modelo sobre los datos de prueba proporcionados en el repositorio

In [None]:
# Predict on the processed test table with the tuned forest.
pred_test = RF_hyper_tuning.predict(data_test)
# Last expression of the cell: displays the prediction array in the notebook.
pred_test

In [None]:
# Wrap the predictions in a single-column table named "pred".
pred_test_df = pd.DataFrame({"pred": pred_test})
pred_test_df

In [None]:
# Write the predictions to CSV without the row index column.
pred_test_df.to_csv("pred_test.csv", index=False)