## Import Libraries

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from statistics import mean


## Read Data

In [None]:
# Load the training set, using the "Id" column as the DataFrame index.
Train_data = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/train.csv",
    index_col="Id",
)
Train_data

In [None]:
Train_data.info()

### From info(), we can see that we have null values in the data

In [None]:
# Number of rows holding at least one null: all rows minus the fully populated ones.
null_rows = len(Train_data) - len(Train_data.dropna(axis=0))
null_rows

In [None]:
# Load the test set, using the "Id" column as the DataFrame index.
Test_data = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/test.csv",
    index_col="Id",
)
Test_data

In [None]:
Test_data.info()

In [None]:
# Stack the train and test rows so both receive identical preprocessing,
# then remove the target column so only features remain.
All_Data = pd.concat([Train_data, Test_data]).drop(columns='SalePrice')
All_Data

In [None]:
All_Data.info()

## Preprocessing Data

## • Handling Missing Values

### We can see that the number of rows that contain null values in the train data equals the total number of samples, so we can't drop the rows that contain nulls.

### We will keep columns that have null values <= 5% and discard any column that has null values > 5%

In [None]:
# Tolerated percentage of missing values per column, and the matching
# minimum number of non-null entries a column needs in order to be kept.
percent = 5
min_count = int(((100 - percent) / 100) * len(All_Data))
print("Columns that have No Null values less than ", min_count," will drop it")
print("--"*25 )
# Preview only: the filtered frame is inspected, not assigned back.
All_Data.dropna(axis=1, thresh=min_count).info()

In [None]:
# Columns with more nulls than the tolerated amount are dropped; columns with
# a smaller, non-zero number of nulls are imputed instead.

# largest number of nulls a column may contain and still be kept
checkCond_Null = All_Data.shape[0] - min_count

# null count per column, computed once
null_counts = All_Data.isnull().sum()

# both lists keep the original column order
features_to_drop = [
    col for col in All_Data.columns if null_counts[col] > checkCond_Null
]
features_to_impute = [
    col for col in All_Data.columns if 0 < null_counts[col] <= checkCond_Null
]

print("- We have ",len(features_to_impute),"features have small missing values in it. These columns are :\n\n", features_to_impute)
print('\n','--'*30,'\n')
print("- We have ",len(features_to_drop),"features have alot of missing values in it. These columns are :\n\n",features_to_drop)

In [None]:
All_Data.shape

In [None]:
# Remove the columns whose missing ratio is above 5%.
All_Data = All_Data.drop(columns=features_to_drop)
All_Data.shape

In [None]:
# One histogram per column that is about to be imputed.
for column_name in features_to_impute:
    plt.figure(figsize=(10, 8))
    All_Data[column_name].hist()
    plt.title(column_name)
    plt.show()

In [None]:
# impute missing data 
def Imput_Missing_Value(Data, features_to_impute):
    """Fill the missing values of the given columns in place.

    Categorical columns (object, string or pandas ``category`` dtype) are
    filled with their most frequent value; every other column is filled with
    its mean.

    Args:
        Data: DataFrame to impute. It is modified in place.
        features_to_impute: iterable of column names to fill.

    Returns:
        The same ``Data`` object that was passed in.
    """
    for column in features_to_impute:
        series = Data[column]
        # Comparing the dtype against 'object' alone misses string and
        # categorical dtypes, which would then be sent to .mean() and fail.
        is_categorical = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
            or isinstance(series.dtype, pd.CategoricalDtype)
        )
        if not is_categorical:
            # numerical feature: impute with the mean
            Data[column] = series.fillna(series.mean())
            continue

        # categorical feature: impute with the most frequent value
        most_frequent = series.mode()
        if most_frequent.empty:
            # the column holds nothing but nulls, so there is no value to fill with
            continue
        Data[column] = series.fillna(most_frequent.iloc[0])
    return Data



In [None]:
# Fill the columns listed in features_to_impute, then print the per-column
# non-null counts again.
All_Data = Imput_Missing_Value(All_Data, features_to_impute)
All_Data.info()

In [None]:
#Check if there are null values or not 
All_Data.isnull().sum().max()
# equal 0 means there is no missing values in the data :)

In [None]:
# Correlation of the numeric training columns with each other and the target.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on
# string columns in pandas >= 2.0, while older versions silently skipped them.
corr_Matrix = Train_data.select_dtypes(include="number").corr()
# strongest positive correlation with the sale price first
corr_Matrix['SalePrice'].sort_values(ascending=False)

In [None]:
# Draw the correlation matrix (corr_Matrix) as a heatmap and save it to disk.
sns.set(rc={'figure.figsize': (15, 8)})
heatmap_axes = sns.heatmap(corr_Matrix, cmap="Greens")
heatmap_axes.set_title("The Correlation between the features")
heatmap_axes.get_figure().savefig("./corrMAt.jpg")

In [None]:
Train_data.head()

## • Encoding categorical features

In [None]:
# Features that will be one-hot encoded.
Columns_Enc_OneHot = [
    'MSZoning',
    'LandContour',
    'LotConfig',
    'LandSlope',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'MasVnrType',
    'Heating',
]

# Features that will be ordinal encoded.
Columns_Enc_Ordinal = [
    'Street',
    'LotShape',
    'Utilities',
    'ExterCond',
    'Foundation',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'HeatingQC',
    'CentralAir',
    'Electrical',
    'KitchenQual',
    'PavedDrive',
    'SaleCondition',
    'SaleType',
]

In [None]:
All_Data

### - OneHotEncoding

In [None]:
# One-hot encode the selected features of the combined (train + test) data
from sklearn.preprocessing import OneHotEncoder

# The encoder keeps its default sparse output and the result is densified
# explicitly. This avoids the `sparse` constructor argument, which newer
# scikit-learn releases renamed to `sparse_output` and later removed.
one_hot_encoder = OneHotEncoder()
housing_caterogy_onehot_encoded = pd.DataFrame(
    one_hot_encoder.fit_transform(All_Data[Columns_Enc_OneHot]).toarray(),
    columns=one_hot_encoder.get_feature_names_out(Columns_Enc_OneHot),
    # reuse the source index so the encoded rows line up with All_Data when
    # the two frames are concatenated, whatever the Id values are
    index=All_Data.index,
)
housing_caterogy_onehot_encoded

In [None]:
# The original categorical columns are superseded by their encoding, so remove them.
All_Data = All_Data.drop(columns=Columns_Enc_OneHot)
All_Data

In [None]:
# Append the one-hot encoded columns to the feature frame (aligned on the index).
All_Data = pd.concat(
    [All_Data, housing_caterogy_onehot_encoded],
    axis="columns",
)
All_Data

### - OrdinalEncoding

In [None]:
# Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
# Assign the encoder's raw array so the values are written positionally.
# Wrapping it in a fresh DataFrame (default 0..n-1 index) made pandas align on
# index labels against All_Data's Id-based index, which shifted every encoded
# value by one row and left NaN in the last row.
All_Data[Columns_Enc_Ordinal] = ordinal_encoder.fit_transform(All_Data[Columns_Enc_Ordinal])
All_Data

In [None]:
All_Data['Utilities']

In [None]:
# Drop the features regarded as redundant.
features_throw = [
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'Exterior2nd',
    'ExterQual',
    'Functional',
]
All_Data = All_Data.drop(columns=features_throw)
All_Data

In [None]:
# Safety net: replace any null that is still present with 0.
All_Data = All_Data.fillna(0)

In [None]:
All_Data

In [None]:
# Check type of all features 
All_Data.info(1)

In [None]:
# Number of rows that came from the training file.
train_samples = Train_data.shape[0]
train_samples

In [None]:
# Split the combined frame back by position: the first `train_samples` rows
# are the training rows, the remainder are the test rows.
Train_Data_new = All_Data.iloc[:train_samples]
Train_Data_new

Test_Data_new = All_Data.iloc[train_samples:]
Test_Data_new

In [None]:
Train_Data_new

In [None]:
#Check if there are null values or not 
Test_Data_new.isnull().sum().max()

In [None]:
# Target column as a one-column frame, re-indexed 1..n so that it lines up
# with Train_Data_new when the two are concatenated.
Train_data_Y = Train_data[['SalePrice']].reset_index(drop=True)
Train_data_Y.index = np.arange(1, len(Train_Data_new) + 1)
Train_data_Y

In [None]:
# Attach the target column to the preprocessed training features.
Train_Data_new = pd.concat(
    [Train_Data_new, Train_data_Y],
    axis="columns",
)
Train_Data_new

In [None]:
# Correlation of every preprocessed training column with the sale price,
# strongest positive correlation first.
corr_Matrix = Train_Data_new.corr()
saleprice_corr = corr_Matrix['SalePrice']
saleprice_corr.sort_values(ascending=False)

## Split Data to train and validation

In [None]:
# Hold out 20% of the labelled rows for validation; the seed is fixed so the
# split is reproducible.
Train, Test = train_test_split(
    Train_Data_new,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Separate the training split into its feature matrix and its target.
Train_x = Train.drop(columns=['SalePrice'])
Train_y = Train['SalePrice']

In [None]:
# Separate the validation split into its feature matrix and its target.
Test_x = Test.drop(columns=['SalePrice'])
Test_y = Test['SalePrice']

## Build ML Models

In [None]:
# Candidate hyperparameter values tried by each model loop.
parameters_values = [
    0.0001, 0.0003,
    0.001, 0.003,
    0.01, 0.03,
    0.1, 0.3,
    1.0, 3.0,
]

## • Linear Regression

In [None]:
# LinearRegression Model
# Fit ordinary least squares on the training split.
model = LinearRegression()
model.fit(Train_x, Train_y)

# R-squared measured on the training split itself
r_sq = model.score(Train_x, Train_y)
print("SCORE :",r_sq)

# Validation error: RMSE between the logs of the true and predicted prices.
# NOTE(review): np.log yields NaN for a non-positive prediction.
pred = model.predict(Test_x)
MSE = mean_squared_error(np.log(Test_y), np.log(pred))
RMSE = math.sqrt(MSE)
print("RMSE :",RMSE)

## • Lasso Regression

In [None]:
# LASSO Model: one standardised Lasso fit per candidate alpha
RMSE_Lasso = []
for alpha in parameters_values:
    # Scaling is done by a pipeline step rather than the estimator's own
    # `normalize` parameter, which is being removed.
    lasso_model = make_pipeline(StandardScaler(), linear_model.Lasso(alpha=alpha))
    lasso_model.fit(Train_x, Train_y)

    # R-squared on the training split
    train_score = lasso_model.score(Train_x, Train_y)
    print("SCORE =",train_score,"@ hyperparameter =",alpha)

    # RMSE between the logs of the true and predicted validation prices
    lasso_pred = lasso_model.predict(Test_x)
    MSE = mean_squared_error(np.log(Test_y), np.log(lasso_pred))
    RMSE_Lasso.append(math.sqrt(MSE))
print("RMSE :",RMSE_Lasso)

In [None]:
# Average of the Lasso validation RMSE over all tried hyperparameter values.
RMSE_Lasso_AVG = mean(RMSE_Lasso)
RMSE_Lasso_AVG

In [None]:
# Lasso validation RMSE plotted against the hyperparameter value.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(parameters_values, RMSE_Lasso)
ax.set_xlabel("Hyperparameter")
ax.set_ylabel("RMSE_Lasso")
ax.set_title("RMSE_Lasso vs. Hyperparamter ")
plt.show()

## • Ridge Regression

In [None]:
# Ridge Model: one standardised Ridge fit per candidate alpha
RMSE_Ridge = []
for alpha in parameters_values:
    Ridge_model = make_pipeline(StandardScaler(), linear_model.Ridge(alpha=alpha))
    Ridge_model.fit(Train_x, Train_y)

    # R-squared on the training split
    train_score = Ridge_model.score(Train_x, Train_y)
    print("SCORE :",train_score,"@ hyperparameter =",alpha)

    # RMSE between the logs of the true and predicted validation prices
    Ridge_pred = Ridge_model.predict(Test_x)
    MSE = mean_squared_error(np.log(Test_y), np.log(Ridge_pred))
    RMSE_Ridge.append(math.sqrt(MSE))
print("RMSE :",RMSE_Ridge)

In [None]:
# Average of the Ridge validation RMSE over all tried hyperparameter values.
RMSE_Ridge_AVG = mean(RMSE_Ridge)
RMSE_Ridge_AVG

In [None]:
# Ridge validation RMSE plotted against the hyperparameter value.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(parameters_values, RMSE_Ridge)
ax.set_xlabel("Hyperparameter")
ax.set_ylabel("RMSE_Ridge")
ax.set_title("RMSE_Ridge vs. Hyperparamter ")
plt.show()

## • ElasticNet

In [None]:
# ElasticNet Model: one standardised ElasticNet fit (l1_ratio fixed at 0.1)
# per candidate alpha
RMSE_ElasticNet = []
for alpha in parameters_values:
    ElasticNet_model = make_pipeline(
        StandardScaler(),
        linear_model.ElasticNet(alpha=alpha, l1_ratio=0.1),
    )
    ElasticNet_model.fit(Train_x, Train_y)

    # R-squared on the training split
    train_score = ElasticNet_model.score(Train_x, Train_y)
    print("SCORE :",train_score)

    # RMSE between the logs of the true and predicted validation prices
    ElasticNet_pred = ElasticNet_model.predict(Test_x)
    MSE = mean_squared_error(np.log(Test_y), np.log(ElasticNet_pred))
    RMSE_ElasticNet.append(math.sqrt(MSE))
print("RMSE :",RMSE_ElasticNet)

In [None]:
# Average of the ElasticNet validation RMSE over all tried hyperparameter values.
RMSE_ElasticNet_AVG = mean(RMSE_ElasticNet)
RMSE_ElasticNet_AVG

In [None]:
# ElasticNet validation RMSE plotted against the hyperparameter value.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(parameters_values, RMSE_ElasticNet)
ax.set_xlabel("Hyperparameter")
ax.set_ylabel("RMSE_ElasticNet")
ax.set_title("RMSE_ElasticNet vs. Hyperparamter ")
plt.show()

## • XGBOOST 

In [None]:
import xgboost

# XGBoost Model: one 10-tree regressor per candidate `alpha` value
RMSE_XGBoost = []
for i in parameters_values:
    # 'reg:squarederror' is the current name of the squared-error objective;
    # the former spelling 'reg:linear' is a deprecated alias of it.
    XGBoost_model = make_pipeline(
        StandardScaler(),
        xgboost.XGBRegressor(alpha=i, objective='reg:squarederror', n_estimators=10, seed=123),
    )
    # Fit the model
    XGBoost_model.fit(Train_x, Train_y)
    # R-squared on the training split
    print("SCORE :",XGBoost_model.score(Train_x, Train_y))
    # RMSE between the logs of the true and predicted validation prices
    XGBoost_pred = XGBoost_model.predict(Test_x)
    MSE = mean_squared_error(np.log(Test_y), np.log(XGBoost_pred))
    RMSE_XGBoost.append(math.sqrt(MSE))
print("RMSE :",RMSE_XGBoost)



In [None]:
# Average of the XGBoost validation RMSE over all tried hyperparameter values.
RMSE_XGBoost_AVG = mean(RMSE_XGBoost)
RMSE_XGBoost_AVG

## • Stochastic Gradient Descent Regressor (SGDRegressor)

In [None]:
# SGDRegressor Model: one standardised SGD fit per candidate alpha
RMSE_SGD = []
for alpha in parameters_values:
    SGD_model = make_pipeline(
        StandardScaler(),
        SGDRegressor(alpha=alpha, max_iter=1000, tol=1e-3),
    )
    SGD_model.fit(Train_x, Train_y)

    # R-squared on the training split
    train_score = SGD_model.score(Train_x, Train_y)
    print("SCORE :",train_score)

    # RMSE between the logs of the true and predicted validation prices
    SGD_pred = SGD_model.predict(Test_x)
    MSE = mean_squared_error(np.log(Test_y), np.log(SGD_pred))
    RMSE_SGD.append(math.sqrt(MSE))
print("RMSE :",RMSE_SGD)

In [None]:
# Average of the SGD validation RMSE over all tried hyperparameter values.
RMSE_SGD_AVG = mean(RMSE_SGD)
RMSE_SGD_AVG

In [None]:
# SGD validation RMSE plotted against the hyperparameter value.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(parameters_values, RMSE_SGD)
ax.set_xlabel("Hyperparameter")
ax.set_ylabel("RMSE_SGD")
ax.set_title("RMSE_SGD vs. Hyperparamter ")
plt.show()

## Predict Test Data

In [None]:
# Predict sale prices for the preprocessed test rows.
# NOTE(review): XGBoost_model is whatever the XGBoost loop assigned last, i.e.
# the pipeline fitted with the final entry of parameters_values, which is not
# necessarily the one with the lowest validation RMSE. Confirm this is intended.
predicted_prices = XGBoost_model.predict(Test_Data_new)
print("Predicted Prices :",predicted_prices)


In [None]:
# Pair each test-row Id with its predicted price and write the submission file.
my_submission = pd.DataFrame(
    {
        'Id': Test_Data_new.index,
        'SalePrice': predicted_prices,
    }
)
# any filename would do; "submission.csv" is the one chosen here
my_submission.to_csv('submission.csv', index=False)