In [None]:
import numpy as np
import pandas as pd

import sklearn

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
sklearn.__version__

### Load the dataset

In [None]:
# Read the labelled training file; the path is relative to the notebook's
# working directory.
data = pd.read_csv(r'../input/house-prices-advanced-regression-techniques/train.csv')
data.head()

In [None]:
# Remove irrelevant variables: "Id" is only a row identifier.
# errors="ignore" keeps this cell idempotent, so re-running it after "Id" has
# already been dropped does not raise a KeyError.
data = data.drop(columns="Id", errors="ignore")
data.head()

In [None]:
# create the test set
# Hold out 20% of the rows (test_size=0.2); random_state=42 fixes the shuffle
# so the same split is produced on every run.
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
train_data.info()

The dataset contains a mixture of categorical and numerical columns, `dtypes: float64(3), int64(34), object(43)`. There are continuous, nominal, and ordinal data types.
It also has missing data, so different groups of columns need different preprocessing; we combine the per-group transformations with a `ColumnTransformer`.  

For continuous columns, use `SimpleImputer` with `strategy='mean'` to handle missing values, then apply `StandardScaler` to normalize data.

For ordinal and nominal columns, use `SimpleImputer` with `strategy='most_frequent'` to handle missing values, then use `OrdinalEncoder` (ordinal columns) and `OneHotEncoder` (nominal columns) to convert categorical values to numerical values.

In [None]:
train_data.shape

### Explore the training set to gain insights

In [None]:
# Compute the standard (Pearson) correlation coefficient between the numeric
# features and the target.
# Work on a copy so exploration never touches the training split itself.
housing = train_data.copy()

# Restrict to numeric columns explicitly: the frame also holds object-dtype
# (string) columns, and DataFrame.corr() raises on those in pandas >= 2.0
# instead of silently skipping them.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of the target and five selected features.
attributes = ["SalePrice","OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF"]
# The trailing semicolon suppresses the textual repr of the returned axes.
scatter_matrix(housing[attributes], figsize=(12, 12));

- There is a positive correlation between `SalePrice` and `OverallQual`, `GrLivArea`, `GarageCars`, and `GarageArea`.

### Select one machine learning model, train, optimise

In [None]:
# separate the predictors and the labels
# X_train is every column except the target; drop() returns a new frame, so
# train_data itself still contains "SalePrice".
X_train = train_data.drop("SalePrice", axis=1)
y_train = train_data["SalePrice"].copy()  # save the labels

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
X_train.dtypes

In [None]:
X_train.shape

### Identify all ordinal_columns: all quality related

In [None]:
X_train['OverallQual'].value_counts()

In [None]:
X_train['ExterQual'].value_counts()

In [None]:
X_train['BsmtFinType1'].value_counts()

In [None]:
X_train['BsmtFinType2'].value_counts()

In [None]:
X_train['HeatingQC'].value_counts()

In [None]:
X_train['LowQualFinSF'].value_counts()

In [None]:
X_train['KitchenQual'].value_counts()

In [None]:
X_train['FireplaceQu'].value_counts()

In [None]:
X_train['PoolQC'].value_counts()

In [None]:
X_train['Fence'].value_counts()

In [None]:
X_train['GarageQual'].value_counts()

In [None]:
def getOrdinalPip(order):
    """Build an impute -> ordinal-encode -> scale pipeline tolerant of unseen categories.

    Parameters
    ----------
    order : list of lists
        Passed to ``OrdinalEncoder(categories=...)``: one ordered list of
        category labels per column the pipeline will receive.

    Returns
    -------
    Pipeline
        An unfitted pipeline. Missing values are filled with the most frequent
        label, labels are mapped to their position in ``order``, and any label
        not listed in ``order`` is encoded as -1 before standard scaling.

    Notes
    -----
    ``Pipeline``, ``SimpleImputer``, ``OrdinalEncoder`` and ``StandardScaler``
    are looked up when the function is called, so they must have been imported
    by then.
    """
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(
            categories=order,
            handle_unknown='use_encoded_value',  # New in version 0.24
            unknown_value=-1,
        )),
        ('scaler', StandardScaler()),
    ]
    return Pipeline(steps)

In [None]:
# Quality-rated columns that are encoded ordinally rather than one-hot.
ordinal_columns = [
    'HeatingQC',
    'GarageQual',
    'FireplaceQu',
    'KitchenQual',
    'ExterQual',
]


def drop_ordinal(df):
    """Return a new frame equal to ``df`` minus every column in ``ordinal_columns``.

    ``df`` itself is not modified. A ``KeyError`` is raised by pandas if any of
    the listed columns is absent from ``df``.
    """
    return df.drop(ordinal_columns, axis=1)

In [None]:
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer

# a function for getting all categorical_columns
def get_categorical_columns(df):
    """List the object-dtype column names of ``df`` that are not ordinal columns.

    The ordinal columns are removed first via ``drop_ordinal``; the remaining
    columns are then filtered with a ``dtype_include=object`` column selector.
    """
    non_ordinal = drop_ordinal(df)
    pick_object_columns = selector(dtype_include=object)
    return pick_object_columns(non_ordinal)

In [None]:
# a function for getting all numerical_columns
def get_numerical_columns(df):
    """List the names of every column in ``df`` whose dtype is not ``object``."""
    pick_non_object_columns = selector(dtype_exclude=object)
    return pick_non_object_columns(df)

In [None]:
get_numerical_columns(X_train)

In [None]:
get_categorical_columns(X_train)

In [None]:
def get_ordinal_pipeline(order):
    """Build a strict impute -> ordinal-encode -> scale pipeline.

    Parameters
    ----------
    order : list of lists
        Passed to ``OrdinalEncoder(categories=...)``: one ordered list of
        category labels per column the pipeline will receive.

    Returns
    -------
    Pipeline
        An unfitted pipeline. Missing values are replaced by the most frequent
        label, labels are mapped to their position in ``order``, and the codes
        are standard-scaled. With ``handle_unknown='error'`` the encoder raises
        on any label that is not listed in ``order``.
    """
    imputer = SimpleImputer(strategy='most_frequent')
    encoder = OrdinalEncoder(
        categories=order,
        handle_unknown='error',
        unknown_value=None,
    )
    scaler = StandardScaler()
    return Pipeline([
        ('imputer', imputer),
        ('encoder', encoder),
        ('scaler', scaler),
    ])
    

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# a function that builds and fits the preprocessing transformer
def my_transformation(df):
    """Build the column-wise preprocessor for the housing features and fit it on ``df``.

    Column groups and their treatment:

    * non-object columns (``get_numerical_columns``): mean imputation, then
      standard scaling;
    * object columns that are not ordinal (``get_categorical_columns``):
      most-frequent imputation, then one-hot encoding that ignores categories
      unseen during fit;
    * the five quality columns: most-frequent imputation, ordinal encoding on
      the scale ``Po < Fa < TA < Gd < Ex``, then standard scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictor frame (target column already removed). It is copied, so the
        caller's frame is not touched.

    Returns
    -------
    ColumnTransformer
        The transformer, already fitted on ``df``.
    """
    df = df.copy()

    numerical_columns = get_numerical_columns(df)
    nominal_columns = get_categorical_columns(df)

    # Every quality column is encoded on the same five-level scale.
    # 'KitchenQual' and 'ExterQual' previously used a four-level scale without
    # 'Po'; because the encoder uses handle_unknown='error', a 'Po' rating in
    # data passed to transform() would have raised. Adding the extra lowest
    # level only shifts the integer codes by one, which the StandardScaler that
    # follows removes, so the transformed training values are unchanged.
    quality_order = [['Po', 'Fa', 'TA', 'Gd', 'Ex']]

    # (transformer name, columns) pairs, in the original output order. These
    # columns must stay in sync with the module-level ``ordinal_columns`` list
    # that get_categorical_columns() excludes from the nominal group.
    ordinal_groups = [
        ('ordinal_transformer', ['GarageQual']),
        ('ordinal_transformer1', ['FireplaceQu']),
        ('ordinal_transformer2', ['HeatingQC']),
        ('ordinal_transformer3', ['KitchenQual']),
        ('ordinal_transformer4', ['ExterQual']),
    ]

    numerical_pipeline = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                                   ('scaler', StandardScaler())])
    nominal_pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                                 ('encoder', OneHotEncoder(handle_unknown='ignore'))])

    transformers = [
        ('numerical_transformer', numerical_pipeline, numerical_columns),
        ('nominal_transformer', nominal_pipeline, nominal_columns),
    ]
    transformers.extend(
        (name, get_ordinal_pipeline(quality_order), columns)
        for name, columns in ordinal_groups
    )

    preprocessor = ColumnTransformer(transformers)
    preprocessor.fit(df)

    return preprocessor

In [None]:
# Fit the preprocessor on the training predictors, then transform those same
# predictors into the matrix the models are trained on.
preprocessor= my_transformation(X_train)
X_train_prepared = preprocessor.transform(X_train)
X_train_prepared.shape

In [None]:
from sklearn.model_selection import GridSearchCV

# a function for tuning the model with hyper-parameter
def tune_model(model, param_grid, X_train_prepared):
    """Grid-search ``model`` over ``param_grid`` and return the best estimator.

    The search uses 5-fold cross-validation scored by negative mean squared
    error. Labels are taken from the notebook-level ``y_train`` variable.

    Parameters
    ----------
    model : estimator
        The estimator to tune.
    param_grid : dict
        Grid handed to ``GridSearchCV``.
    X_train_prepared : array-like
        Preprocessed training features.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search (also printed).
    """
    search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        return_train_score=True,
    )
    search.fit(X_train_prepared, y_train)
    best = search.best_estimator_
    print('grid_search.best_estimator_: ', best)
    return best

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

def showPerformance(clf):
    """Print training and cross-validated error figures for a fitted estimator.

    Uses the notebook-level ``X_train_prepared`` and ``y_train``. Three lines
    are printed: the RMSE of ``clf`` on the training data, the mean and
    standard deviation of the 3-fold cross-validated RMSE, and the value of
    ``clf.score`` on the training data.

    Parameters
    ----------
    clf : estimator
        An already-fitted estimator exposing ``predict`` and ``score``.
    """
    y_train_pred = clf.predict(X_train_prepared)

    print("RMSE train: ", np.sqrt(mean_squared_error(y_train, y_train_pred)))

    # Cross-validate the estimator that was passed in. This previously used the
    # global `lin_reg`, so every model reported the linear regression's
    # validation score.
    scores = cross_val_score(clf, X_train_prepared, y_train, scoring="neg_mean_squared_error", cv=3)
    # The scorer returns negated MSE values; flip the sign before the root.
    rmse_scores = np.sqrt(-scores)
    print("Validation score RMSE Mean:", rmse_scores.mean(), "; Standard deviation:", rmse_scores.std())
    print("Training set score: {:.2f}".format(clf.score(X_train_prepared, y_train)))

#### Train a Linear Regression model

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: fit a plain linear regression on the prepared training data
# and report its scores.
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, y_train);
showPerformance(lin_reg)

#### Use RidgeCV

In [None]:
from sklearn.linear_model import RidgeCV
# RidgeCV picks the regularisation strength from the listed alphas using
# 5-fold cross-validation (cv=5).
ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10], cv=5).fit(X_train_prepared, y_train)

# alpha_ is the strength that was selected.
print("alpha = ", ridge.alpha_)
showPerformance(ridge)

#### Use LassoCV

In [None]:
from sklearn.linear_model import LassoCV
# Same alpha grid and 5-fold CV as the ridge model; max_iter is raised to
# 10000 iterations for the solver.
lasso = LassoCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10], max_iter=10000, cv=5).fit(X_train_prepared, y_train)

In [None]:
print("alpha = ", lasso.alpha_)
# Count the coefficients that are not exactly zero, i.e. the features the
# fitted lasso model actually uses.
print("Number of features used:", np.sum(lasso.coef_ != 0))
showPerformance(lasso)

#### Use ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet

# max_iter must be an integer: the float literal 1e7 is rejected by the
# parameter validation in recent scikit-learn releases.
elastic = ElasticNet(max_iter=10_000_000)
elastic.fit(X_train_prepared, y_train)

showPerformance(elastic)

In [None]:
# Hyper-parameter grid for the ElasticNet search.
# l1_ratio is listed explicitly instead of np.arange(0.40, 1.00, 0.10): a
# fractional step accumulates floating-point error (e.g. 0.7000000000000001)
# and makes the number of generated values fragile.
param_grid = {
    'alpha': [0.1, 1, 10, 0.01],
    'l1_ratio': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'tol': [0.0001, 0.001],
}

final_model_elastic = tune_model(elastic, param_grid, X_train_prepared)


In [None]:
showPerformance(final_model_elastic)

#### Use VotingRegressor

In [None]:
from sklearn.ensemble import VotingRegressor

# Combine the ridge and lasso models; weights=[1, 2] gives the lasso
# prediction twice the weight of the ridge prediction.
er = VotingRegressor([('ridge', ridge), ('lasso', lasso)], weights=[1,2])
er.fit(X_train_prepared, y_train)

showPerformance(er)

### Test model performance on test data

In [None]:
# Split the held-out rows into predictors and labels, mirroring the
# X_train / y_train split.
X_test = test_data.drop("SalePrice", axis=1)
y_test = test_data["SalePrice"].copy()
X_test.shape

In [None]:
# Reuse the already-fitted preprocessor: only transform() is called here, so
# nothing is re-fitted on the held-out rows.
X_test_prepared = preprocessor.transform(X_test) 
X_test_prepared.shape

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_percentage_error # New in version 0.24
# show the model performance on test data
def perfor_test(model):
    """Print MAE, median absolute error and MAPE of ``model`` on the held-out data.

    Predictions are made on the notebook-level ``X_test_prepared`` and compared
    with ``y_test``. MAPE is printed as a percentage.
    """
    predictions = model.predict(X_test_prepared)

    mae = mean_absolute_error(y_test, predictions)
    print(f"Mean absolute error (MAE): {mae:.4f} $")

    medae = median_absolute_error(y_test, predictions)
    print(f"Median absolute error (MedAE): {medae:.4f} $")

    mape_percent = mean_absolute_percentage_error(y_test, predictions) * 100
    print(f"Mean absolute percentage error (MAPE): {mape_percent:.4f} %")

In [None]:
perfor_test(lin_reg)

In [None]:
perfor_test(lasso)

In [None]:
perfor_test(ridge)

In [None]:
perfor_test(er)

In [None]:
# NOTE(review): this scores the untuned `elastic`; the grid-searched
# `final_model_elastic` is never evaluated on the test data in this notebook.
# Confirm which of the two was intended here.
perfor_test(elastic)

In [None]:
import seaborn as sns
# plot the regression
def plot_reg(model, title=None):
    """Scatter-plot predicted against true prices on the held-out data.

    Predictions are made on the notebook-level ``X_test_prepared`` and plotted
    against ``y_test``, together with a "Perfect fit" diagonal.

    Parameters
    ----------
    model : estimator
        A fitted estimator exposing ``predict``.
    title : str, optional
        Plot title. When omitted, the title states whether ``model`` is a
        ``TransformedTargetRegressor`` ("with" target transformation) or not
        ("without"), so the caption matches the model being shown.
    """
    y_test_predicted = model.predict(X_test_prepared)
    predicted_actual = {"True values ($)": y_test, "Predicted values ($)": y_test_predicted}
    predicted_actual = pd.DataFrame(predicted_actual)

    if title is None:
        # Imported locally because this function is defined (and first called)
        # before the notebook imports TransformedTargetRegressor.
        from sklearn.compose import TransformedTargetRegressor
        wording = "with" if isinstance(model, TransformedTargetRegressor) else "without"
        title = f"Regression using a model {wording} \ntarget transformation"

    sns.scatterplot(data=predicted_actual,
                    x="True values ($)", y="Predicted values ($)",
                    color="black", alpha=0.5)
    plt.axline((0, 0), slope=1, label="Perfect fit")
    plt.axis('square')
    # Without a legend the "Perfect fit" label on the diagonal is never shown.
    plt.legend()
    plt.title(title)

In [None]:
plot_reg(er)

- The model tends to under-estimate the price of the house.

#### Apply a target transformation

In [None]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import TransformedTargetRegressor

def target_transform(model):
    """Wrap ``model`` so it is trained on a quantile-transformed target.

    The target is mapped to a normal output distribution with a
    ``QuantileTransformer`` (900 quantiles) inside a
    ``TransformedTargetRegressor``, which is then fitted on the notebook-level
    ``X_train_prepared`` and ``y_train``.

    Returns
    -------
    TransformedTargetRegressor
        The fitted wrapper.
    """
    to_normal = QuantileTransformer(n_quantiles=900, output_distribution="normal")
    wrapped = TransformedTargetRegressor(regressor=model, transformer=to_normal)
    wrapped.fit(X_train_prepared, y_train)
    return wrapped

In [None]:
perfor_test(target_transform(lasso))

In [None]:
perfor_test(target_transform(er))

In [None]:
plot_reg(target_transform(lasso))

### Output predictions

In [None]:
# Load the file to predict on.
test = pd.read_csv(r'../input/house-prices-advanced-regression-techniques/test.csv')
# Keep the identifiers for the output file, then drop the column so the frame
# has the same Id-less layout as the training predictors.
ID = test["Id"]
test = test.drop("Id", axis=1)
test.shape

In [None]:
ID.shape

In [None]:
# Transform with the already-fitted preprocessor; nothing is re-fitted here.
test_prepared = preprocessor.transform(test) 
test_prepared.shape

In [None]:
# Predict with the voting ensemble trained on the transformed target.
prediction = target_transform(er).predict(test_prepared)

# DataFrame.to_csv() returns None when it is given a path, so the frame gets
# its own name instead of overwriting `prediction` with None.
submission = pd.DataFrame(data={"Id": ID, "SalePrice": prediction})
submission.to_csv('prediction.csv', index=False)

In [None]:
# Read back the file saved as 'prediction.csv' in the working directory.
# The previous path ('../input/prediction/prediction.csv') pointed at a
# different location from where the notebook writes its output.
res = pd.read_csv('prediction.csv')
res.head()