# Importation des modules utiles à la construction de notre modèle de machine learning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
import os
from sklearn import linear_model
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Lecture de la donnée

In [None]:
# Load the cleaned dataset from the notebook's input directory into a DataFrame.
data = pd.read_csv('../input/cleaned-data/cleaned_data.csv')

In [None]:
# Display only the object-dtype columns (cell output, nothing is assigned).
data.select_dtypes('object')

In [None]:
# Display the (rows, columns) shape of the dataset.
data.shape

In [None]:
# Remove the columns that are not used as model features.
data = data.drop(
    columns=[
        'transactiondate',
        'hashottuborspa',
        'propertycountylandusecode',
        'propertyzoningdesc',
        'fireplaceflag',
        'taxdelinquencyflag',
    ]
)

## Choix des variables explicatives et de la variable cible

In [None]:
# Target: the log error, as a float32 NumPy array.
y = data["logerror"].values.astype(np.float32)

# Features: everything except the identifier, the target and the
# transaction month/year columns.
X = data.drop(
    columns=['parcelid', 'logerror', 'transaction_month', 'transaction_year']
)

# Seeded split with scikit-learn's default proportions.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

In [None]:
def modelizing_GridSearchCV(target_value, evaluated_values, defined_parameters, model_name):
    """Tune an estimator with an exhaustive grid search on a training split.

    The data is split with ``train_test_split`` (default proportions,
    ``random_state=1``) and the search is fitted on the training part only,
    so the returned validation part has not been seen by the fitted search.

    Args:
        target_value: Target values (y).
        evaluated_values: Feature matrix (X).
        defined_parameters: Parameter grid handed to ``GridSearchCV``.
        model_name: Estimator instance to tune (an object, not a string).

    Returns:
        Tuple ``(grid, train_X, val_X, train_y, val_y)`` where ``grid`` is
        the fitted ``GridSearchCV`` object.
    """
    train_X, val_X, train_y, val_y = train_test_split(
        evaluated_values, target_value, random_state=1
    )
    grid = GridSearchCV(model_name, defined_parameters)
    # Fit on the training split only: fitting on the full data would leak the
    # validation rows into the search and bias any score computed on them.
    grid.fit(train_X, train_y)
    return grid, train_X, val_X, train_y, val_y

# Aperçu d'un modèle : régression linéaire

## 1. Régression linéaire classique

In [None]:
def modelizing_linear_regression(target_value, evaluated_values, train_size=0.8, random_state=None):
    """Fit an ordinary least-squares regression on a train/test split.

    Args:
        target_value: Target values (y).
        evaluated_values: Feature matrix (X).
        train_size: Proportion of rows used for training (default 0.8, the
            value that was previously hard-coded).
        random_state: Optional seed for the split. ``None`` (default) keeps
            the previous unseeded behaviour; pass an int for a reproducible
            split.

    Returns:
        Tuple ``(regr, xtrain, ytrain, xtest, ytest)`` where ``regr`` is the
        ``LinearRegression`` fitted on the training split.
    """
    xtrain, xtest, ytrain, ytest = train_test_split(
        evaluated_values,
        target_value,
        train_size=train_size,
        random_state=random_state,
    )
    regr = linear_model.LinearRegression()
    regr.fit(xtrain, ytrain)
    return regr, xtrain, ytrain, xtest, ytest

# Fit the baseline linear regression on the notebook-level y and X, keeping
# the fitted model together with the train/test split it was built from.
defined_regr, x_train, y_train, x_test, y_test = modelizing_linear_regression(y, X)

In [None]:
def get_model_metrics(model, X, y, b1=True, b0=True, title=None):
    """
        Fit a model on an 80/20 train/test split (random_state=1) and
        print its coefficients and scores.

        params:
            model: unfitted estimator with its parameters set
                (ie: Lasso(alpha=x)); it is fitted in place
            X(DataFrame): DataFrame subset with the selected features
            y(Series): variable to predict
            b1(bool): print one coefficient (β1) per feature column
            b0(bool): print the intercept (β0)
            title(str): optional heading printed before the metrics

        returns:
            None; prints β1, β0, then R2 and RMSE for both sets
    """
    # Split the data
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=1
    )
    model.fit(x_train, y_train)
    if title:
        print(f"{title} : ")
    if b1:
        # Print the β1 of each feature. The column names come from the local
        # training split (the former `X_train` name was undefined and raised
        # NameError whenever b1 was True).
        for idx, name in enumerate(x_train.columns):
            print(f"β1 de {name} : {round(model.coef_[idx], 3)}")
    if b0:
        print(f"β0 (intercept_) : {round(model.intercept_, 3)}\n")

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Order expected by get_r2_rmse: (true, predicted) for train, then test.
    y_list = [y_train, y_train_pred, y_test, y_test_pred]
    get_r2_rmse(y_list)


def get_r2_rmse(y_list):
    """
        Print R2 and RMSE for the training and testing sets.

        param:
            y_list = [
                y_train,
                y_train_pred,
                y_test,
                y_test_pred
            ]

        returns:
            None; one formatted line is printed per set, then a blank line
    """
    set_names = ["Training", "Testing "]
    for position, set_name in enumerate(set_names):
        # y_list alternates true values and predictions for each set.
        y_true = y_list[2 * position]
        y_pred = y_list[2 * position + 1]
        r2 = round(r2_score(y_true, y_pred), 3)
        # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and
        # removed in recent scikit-learn releases. For a single target (the
        # case in this notebook) the value is identical.
        rmse = round(np.sqrt(mean_squared_error(y_true, y_pred)), 3)
        print(
            "{} set : R2 = {}, RMSE = {}".format(set_name, r2, rmse)
        )
    print("\n")

In [None]:
# Predictions of the fitted linear regression on its own train and test splits.
ytrain_pred = defined_regr.predict(x_train)
ytest_pred = defined_regr.predict(x_test)

In [None]:
# Report R2 (rounded to 3 decimals) for the training and test splits.
print ("Pour les données d'entrainement le R2 vaut {} alors que pour les données de test, il est de {}" 
      .format(round(r2_score(y_train, ytrain_pred),3), round(r2_score(y_test, ytest_pred),3)))

In [None]:
# Report RMSE (rounded to 3 decimals) for the training and test splits.
# RMSE is computed as sqrt(MSE) because the `squared=False` keyword of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
print ("Pour les données d'entrainement le RMSE vaut {} alors que pour les données de test, il est de {}"
      .format(round(np.sqrt(mean_squared_error(y_train, ytrain_pred)),3),
              round(np.sqrt(mean_squared_error(y_test, ytest_pred)),3)))

In [None]:
# Mean absolute error of the linear regression: training split, then test split.
print(mean_absolute_error(y_train, ytrain_pred))
print(mean_absolute_error(y_test, ytest_pred))

## 2. Régression Ridge

### GridSearchCV

In [None]:
# Grid-search Ridge over alpha in 0..9 and normalize in {True, False}.
# This rebinds the notebook-level train_X, val_X, train_y and val_y.
# NOTE(review): the `normalize` estimator parameter was removed from the linear
# models in newer scikit-learn releases; confirm the version this runs on.
ridge_grid, train_X, val_X, train_y, val_y = modelizing_GridSearchCV(y, X, {'alpha':np.arange(0, 10, 1).tolist(), 'normalize':[True,False]}, linear_model.Ridge())

In [None]:
# Show the best Ridge estimator found by the grid search.
print(ridge_grid.best_estimator_)

In [None]:
# Print train/test R2 and RMSE for a Ridge model with alpha=9.
# NOTE(review): alpha is hard-coded, presumably copied from the best estimator
# printed for the grid search; verify it still matches after a re-run.
get_model_metrics(linear_model.Ridge(alpha=9), X, y, b1=False, b0=False)

In [None]:
# Mean absolute error of the tuned Ridge search on the validation split
# returned by modelizing_GridSearchCV.
ytest_pred_ridge = ridge_grid.predict(val_X)
print(mean_absolute_error(val_y, ytest_pred_ridge))

### RandomizedSearchCV

In [None]:
def modelizing_RandomizedSearchCV(target_value, evaluated_values, defined_parameters, model_name, n_iter=10, cv=5, random_state=None):
    """Tune an estimator with a randomized search on an 80% training split.

    Args:
        target_value: Target values (y).
        evaluated_values: Feature matrix (X).
        defined_parameters: Parameter distributions or lists handed to
            ``RandomizedSearchCV``.
        model_name: Estimator instance to tune (an object, not a string).
        n_iter: Number of parameter settings sampled.
        cv: Cross-validation setting forwarded to the search.
        random_state: Seed used for both the train/test split and the
            parameter sampling, so that a seeded call is fully reproducible.
            ``None`` leaves both unseeded.

    Returns:
        Tuple ``(regr, xtrain, ytrain, xtest, ytest)`` where ``regr`` is the
        fitted ``RandomizedSearchCV`` object.
    """
    # Split the arguments that were passed in; the previous code split the
    # notebook-level `X` and ignored `evaluated_values` for the split.
    xtrain, xtest, ytrain, ytest = train_test_split(
        evaluated_values, target_value, train_size=0.8, random_state=random_state
    )
    regr = RandomizedSearchCV(
        model_name,
        defined_parameters,
        n_iter=n_iter,
        cv=cv,
        random_state=random_state,
    )
    # Fit on the training split only so the returned test split stays unseen
    # and can be used for an unbiased evaluation.
    regr.fit(xtrain, ytrain)
    return regr, xtrain, ytrain, xtest, ytest

In [None]:
# Randomized search for Ridge: alpha drawn from uniform(loc=0, scale=10),
# normalize in {True, False}, sampling seeded with random_state=1.
# NOTE(review): the `normalize` estimator parameter was removed from the linear
# models in newer scikit-learn releases; confirm the version this runs on.
ridge_randomized, xtrain, ytrain, xtest, ytest = modelizing_RandomizedSearchCV(y, X, {'alpha':uniform(loc=0, scale=10), 'normalize':[True, False]}, linear_model.Ridge(), random_state=1)

In [None]:
# Show the best Ridge estimator found by the randomized search.
print(ridge_randomized.best_estimator_)

In [None]:
# Print train/test R2 and RMSE for a Ridge model with a fixed alpha.
# NOTE(review): the alpha literal is presumably copied from the best estimator
# printed for the randomized search; verify it still matches after a re-run.
get_model_metrics(linear_model.Ridge(alpha=9.325573593386588), X, y, b1=False, b0=False)

In [None]:
# Mean absolute error of the randomized Ridge search on the test split
# returned by modelizing_RandomizedSearchCV.
ytest_pred_ridge_randomized = ridge_randomized.predict(xtest)
print(mean_absolute_error(ytest, ytest_pred_ridge_randomized))

## 3. Régression Lasso

### GridSearchCV

In [None]:
# Grid-search Lasso over alpha in 100..109 and normalize in {True, False}.
# This rebinds the notebook-level train_X, val_X, train_y and val_y.
# NOTE(review): the `normalize` estimator parameter was removed from the linear
# models in newer scikit-learn releases; confirm the version this runs on.
lasso_grid, train_X, val_X, train_y, val_y = modelizing_GridSearchCV(y, X, {'alpha':np.arange(100, 110, 1).tolist(), 'normalize':[True,False]}, linear_model.Lasso())

In [None]:
# Show the best Lasso estimator found by the grid search.
print(lasso_grid.best_estimator_)

In [None]:
# Print train/test R2 and RMSE for a Lasso model with alpha=100.
# This cell belongs to the Lasso section and follows a Lasso grid search, so
# the evaluated estimator must be Lasso (it previously built a Ridge model).
# NOTE(review): alpha is hard-coded, presumably copied from the best estimator
# printed for the grid search; verify it still matches after a re-run.
get_model_metrics(linear_model.Lasso(alpha=100), X, y, b1=False, b0=False)

In [None]:
# Mean absolute error of the tuned Lasso search on the validation split
# returned by modelizing_GridSearchCV.
ytest_pred_lasso_grid = lasso_grid.predict(val_X)
print(mean_absolute_error(val_y, ytest_pred_lasso_grid))

### RandomizedSearchCV

In [None]:
# Randomized search for Lasso: alpha drawn from uniform(loc=100, scale=300),
# normalize in {True, False}, sampling seeded with random_state=1.
# This rebinds the notebook-level xtrain, ytrain, xtest and ytest.
# NOTE(review): the `normalize` estimator parameter was removed from the linear
# models in newer scikit-learn releases; confirm the version this runs on.
lasso_randomized, xtrain, ytrain, xtest, ytest = modelizing_RandomizedSearchCV(y, X, {'alpha':uniform(loc=100, scale=300), 'normalize':[True, False]}, linear_model.Lasso(), random_state=1)

In [None]:
# Show the best Lasso estimator found by the randomized search.
print(lasso_randomized.best_estimator_)

In [None]:
# Print train/test R2 and RMSE for a Lasso model with a fixed alpha.
# NOTE(review): the alpha literal is presumably copied from the best estimator
# printed for the randomized search; verify it still matches after a re-run.
get_model_metrics(linear_model.Lasso(alpha=225.1066014107722), X, y, b1=False, b0=False)

In [None]:
# Mean absolute error of the randomized Lasso search on the test split
# returned by modelizing_RandomizedSearchCV.
ytest_pred_lasso_randomized = lasso_randomized.predict(xtest)
print(mean_absolute_error(ytest, ytest_pred_lasso_randomized))

# Sample submission

In [None]:
# Load the sample submission file and preview its first rows.
df_samples = pd.read_csv('../input/zillow-prize-1/sample_submission.csv')
df_samples.head()

In [None]:
# Load the 2016 properties table, indexed by parcelid. low_memory=False makes
# pandas read the file in one pass instead of chunk-wise dtype inference.
prop_2016_df = pd.read_csv("../input/zillow-prize-1/properties_2016.csv", index_col='parcelid', low_memory=False)

In [None]:
# Duplicate the ParcelId column under the lower-case name used by the
# properties table so the two frames share a join key.
df_samples['parcelid'] = df_samples['ParcelId']
# Left join: keep every submission row and attach its property attributes.
sub = df_samples.merge(prop_2016_df, on='parcelid', how='left')

In [None]:
# Build the prediction features from `sub` (the submission rows merged with
# their property attributes) rather than from `prop_2016_df` directly.
# The predictions are written back into `sub` row by row, so the features
# must follow the row order of `sub`; the properties table is only aligned
# with it if both files happen to list the parcels in the same order.
X_test = sub[X.columns].copy()

In [None]:
# Numeric feature columns: only the 64-bit integer and float dtypes qualify.
numerical_cols = [
    cname for cname in train_X.columns
    if train_X[cname].dtype in ['int64', 'float64']
]

# Categorical feature columns: object-dtype columns with fewer than 10
# distinct values (an arbitrary but convenient cardinality cut-off).
categorical_cols = [
    cname for cname in train_X.columns
    if train_X[cname].dtype == "object" and train_X[cname].nunique() < 10
]

# Missing numeric values are filled using the imputer's constant strategy
# (fill value left at its default).
numerical_transformer = SimpleImputer(strategy='constant')

# Missing categorical values take the most frequent category, then the column
# is one-hot encoded; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own preprocessing.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full pipeline: preprocessing followed by the randomized Ridge search object,
# which is fitted again as the final step when the pipeline is fitted.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ridge_randomized),
])

# Fit preprocessing and model on the current notebook-level training split.
clf.fit(xtrain, ytrain)

In [None]:
# Predict the target for every row of the submission feature table.
preds_full = clf.predict(X_test)

In [None]:
# The same predictions are submitted for every requested period.
for period in ('201610', '201611', '201612', '201710', '201711', '201712'):
    sub[period] = preds_full

In [None]:
# Preview the merged frame with its prediction columns.
sub.head()

In [None]:
# Keep only the identifier and the six prediction columns, in this order.
sub = sub[['ParcelId', '201610', '201611', '201612', '201710', '201711', '201712']]
sub.head()

In [None]:
# Write the submission file without the index, floats formatted to 4 decimals.
sub.to_csv('./my_submission.csv', index=False, float_format='%.4f')