## Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import os
import plotly.express as px
import plotly.graph_objects as go


In [None]:
# Load the competition's train and test splits.
# NOTE(review): the relative "../input/..." paths presumably follow the Kaggle
# kernel layout; adjust them when running the notebook elsewhere.
train  = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

## Data Exploration

In [None]:
# Column dtypes and non-null counts of the training split.
train.info()

In [None]:
# Peek at the first rows of the test split.
test.head()

In [None]:
# One histogram (50 bins) per numeric training column on a single large canvas.
# The trailing semicolon suppresses the array-of-Axes repr in the cell output.
train.select_dtypes(include=np.number).hist(
    bins=50,
    figsize=(30, 20),
    color='orange',
);

## Features Selection

*  ## Correlation with the target

In [None]:
# Pairwise correlations between all numeric training columns.
corr_matrix = train.select_dtypes(include=np.number).corr()

# Correlation of every numeric column with the target, strongest positive first.
corr = corr_matrix["SalePrice"].sort_values(ascending=False)
print(corr)

# Candidate columns for removal: |correlation with SalePrice| below 0.4,
# excluding 'Id' and 'MSSubClass'. np.setdiff1d returns them sorted and unique.
# The actual drop from train/test is intentionally disabled; the list is only
# computed here.
weakly_correlated = (corr.abs() < 0.4).to_numpy()
indexNames = np.setdiff1d(corr.index[weakly_correlated].to_numpy(), ['Id', 'MSSubClass'])

In [None]:
# Hide the upper triangle (diagonal included) so every feature pair is shown once.
# The mask is built from the square correlation matrix's own shape (the original
# used the 1-D `corr` Series and only became square through broadcasting), and
# with the builtin `bool`, because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
corr_matrix = corr_matrix.mask(mask)

# Interactive heatmap with the coefficient printed in each visible cell.
fig = px.imshow(corr_matrix, text_auto=True)
fig.layout.height = 1000
fig.layout.width = 1000
fig.show()

## Combine train and test data

> For the purpose of filling null values

In [None]:
# Set the target and the test ids aside, then stack the train and test features
# so the cleaning steps that follow are applied to both splits at once.
y_target = train['SalePrice']
test_ids = test.Id
train_v0 = train.drop(['Id', 'SalePrice'], axis=1)
test_v0 = test.drop('Id', axis=1)

# ignore_index=True gives the stacked frame a unique 0..n-1 index. Without it
# the test rows reuse the training row labels, which makes label-based lookups
# and index-aligned assignments on data_v0 ambiguous. Row order is unchanged:
# training rows first, test rows after.
data_v0 = pd.concat([train_v0, test_v0], axis=0, ignore_index=True)
data_v0

In [None]:
# Dtypes and non-null counts of the combined train + test feature frame.
data_v0.info()

> View the target variable, SalePrice

In [None]:
# Untransformed SalePrice values taken from the training split.
y_target

## Data Cleaning

> **MSSubClass** is a categorical column represented as numeric

In [None]:
# MSSubClass is a category stored as a number, so cast it to strings; this keeps
# it out of the numeric transforms and lets it be one-hot encoded later.
data_v0 = data_v0.astype({'MSSubClass': str})
data_v0['MSSubClass'].dtype

* ## Fill missing Categorical values #

> There are three steps:
1. If a missing value stands for a real but unrecorded category, fill it with the column's **mode()**
2. If a missing value means the feature is absent, fill it with the string **None**
3. Replace ordinal features with proper ordinal values

In [None]:
# Categorical columns whose gaps are filled with the most frequent category.
cat_mode_cols = [
    'MasVnrType',
    'MSZoning',
    'Functional',
    'Utilities',
    'Exterior2nd',
    'KitchenQual',
    'Electrical',
    'Exterior1st',
    'SaleType',
]
for col in cat_mode_cols:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the column selection: the in-place form acts on an intermediate object,
    # emits a FutureWarning on pandas 2.x and leaves data_v0 unchanged once
    # Copy-on-Write is enabled.
    data_v0[col] = data_v0[col].fillna(data_v0[col].mode()[0])

# Categorical columns whose gaps are filled with the explicit category 'None'.
# ('FireplaceQu' was listed twice in the original list; the duplicate is removed.)
cat_None_cols = [
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'GarageType',
    'PoolQC',
    'Fence',
    'MiscFeature',
]
for col in cat_None_cols:
    data_v0[col] = data_v0[col].fillna('None')

In [None]:
# Rating scales shared by several columns; 'None' (feature absent) ranks below
# the worst real rating wherever it is a possible value.
quality_scale = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
quality_scale_with_none = {'None': 0, **quality_scale}
basement_finish_scale = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}

# Column name -> {category label -> ordinal rank}.
ordinal_cols = {
    'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'GarageQual': quality_scale_with_none,
    'GarageCond': quality_scale_with_none,
    'BsmtQual': quality_scale_with_none,
    'BsmtCond': quality_scale_with_none,
    'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'BsmtFinType1': basement_finish_scale,
    'BsmtFinType2': basement_finish_scale,
    'ExterQual': quality_scale,
    'ExterCond': quality_scale,
    'HeatingQC': quality_scale,
    'KitchenQual': quality_scale,
    'Electrical': {'Mix': 1, 'FuseP': 2, 'FuseF': 3, 'FuseA': 4, 'SBrkr': 5},
    'CentralAir': {'N': 0, 'Y': 1},
    'PavedDrive': {'N': 1, 'P': 2, 'Y': 3},
}

data_v0 = data_v0.replace(ordinal_cols)

# Make the object -> numeric conversion explicit. DataFrame.replace used to
# downcast the replaced columns silently; that behaviour is deprecated in
# pandas 2.2, and without it the columns would stay object dtype and be one-hot
# encoded later instead of being treated as numbers. On older pandas this is a
# no-op.
ordinal_names = list(ordinal_cols)
data_v0[ordinal_names] = data_v0[ordinal_names].infer_objects()

Helper function that displays a table of missing values

In [None]:
def display_missing(train, cols):
    """Summarise missing values per column.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to inspect (any frame, not only the training split).
    cols : int
        Number of rows of the summary to return, i.e. how many of the
        columns with the most missing values to show.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, ordered by missing count (descending), with
        '# Missing Values' (count) and '% Total Missing' (share of rows in
        percent, rounded to one decimal).
    """
    n_rows = len(train)
    missing_count = train.isna().sum().sort_values(ascending=False)
    missing_pct = missing_count.div(n_rows).mul(100).sort_values(ascending=False).round(1)
    summary = pd.concat(
        [missing_count, missing_pct],
        axis=1,
        keys=['# Missing Values', '% Total Missing'],
    )
    return summary.head(cols)
# mis_val_table_rename_columns = mis_val_table.rename(columns = {0:"# Missing Values", 1:"% Total Missing"})
# mis_val_sort = mis_val_table_rename_columns[mis_val_table_rename_columns.iloc[:,:] != 0].sort_values(
#         "% Total Missing", ascending=False).round(1)
#mis_val_sort.head(22)
#display_missing(train,22)

In [None]:
# Check which object-dtype columns still contain missing values (top 22 rows of the summary).
display_missing(data_v0.select_dtypes('object'),22)

## Feature and target transformation

In [None]:
# Work on a copy so the cleaned frame data_v0 stays available unchanged.
data_v1 = data_v0.copy()

Kurtosis is a measure of whether the data are heavy-tailed or light-tailed relative to a normal distribution. That is, data sets with high kurtosis tend to have heavy tails, or outliers. **Data sets with low kurtosis tend to have light tails, or lack of outliers.**

* ## Feature transformation 

In [None]:
# LotArea before and after a log1p transform, side by side, each with a
# fitted normal curve for reference.
dist_fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(20, 10))

sns.distplot(data_v1['LotArea'], kde=True, fit=norm, ax=ax_raw)
ax_raw.set_title("Without Log Transform")

sns.distplot(np.log1p(data_v1['LotArea']), kde=True, fit=norm, color='darkblue', ax=ax_log)
ax_log.set_xlabel("Log LotArea ")
ax_log.set_title("With Log Transform")

plt.show()

In [None]:
# GrLivArea before and after a log1p transform, side by side, each with a
# fitted normal curve for reference.
dist_fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(20, 10))

sns.distplot(data_v1['GrLivArea'], kde=True, fit=norm, ax=ax_raw)
ax_raw.set_title("Without Log Transform")

sns.distplot(np.log1p(data_v1['GrLivArea']), kde=True, fit=norm, color='darkblue', ax=ax_log)
ax_log.set_xlabel("Log GrLivArea ")
ax_log.set_title("With Log Transform")

plt.show()

In [None]:
# TotalBsmtSF before and after a log1p transform, side by side, each with a
# fitted normal curve for reference.
dist_fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(20, 10))

sns.distplot(data_v1['TotalBsmtSF'], kde=True, fit=norm, ax=ax_raw)
ax_raw.set_title("Without Log Transform")

sns.distplot(np.log1p(data_v1['TotalBsmtSF']), kde=True, fit=norm, color='darkblue', ax=ax_log)
ax_log.set_xlabel("Log TotalBsmtSF ")
ax_log.set_title("With Log Transform")

plt.show()

In [None]:
def display_skew_kurt(df, cols):
    """Tabulate skewness and kurtosis for the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse; non-numeric columns are ignored.
    cols : int
        Accepted for symmetry with ``display_missing`` but not used: the
        full table is always returned.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column, ordered by 'Skew' (descending), with the
        columns 'Skew', 'Absolute Skew', 'Kurtosis' and the boolean 'Skewed',
        which is True when the absolute skew is at least 0.5.
    """
    numeric = df.select_dtypes(np.number)
    skewness = numeric.skew()
    table = pd.concat(
        [skewness, skewness.abs(), numeric.kurt()],
        axis=1,
        keys=['Skew', 'Absolute Skew', 'Kurtosis'],
    )
    table = table.sort_values("Skew", ascending=False)
    # A column counts as skewed once |skew| reaches 0.5.
    table['Skewed'] = table['Absolute Skew'].ge(0.5)
    return table


In [None]:
# Skew/kurtosis table for every numeric column of data_v1 (the second argument is not used by the helper).
skew_kurt_df = display_skew_kurt(data_v1,20)

In [None]:
# Apply log1p to every column flagged as skewed (|skew| >= 0.5, either sign).
# This overwrites data_v1 in place, so re-running the cell transforms the
# columns a second time.
skewed_columns = skew_kurt_df.loc[skew_kurt_df["Skewed"] == True].index
for col in skewed_columns.values:
    data_v1[col] = np.log1p(data_v1[col])

* ## Cosine transformation 

Since sales in summer are always higher than in winter, we give a higher weight to June and the months around it, and a lower weight to January and December.

[View in space](https://www.desmos.com/calculator)

In [None]:
# Seasonal encoding of the sale month: -cos(pi/6 * month) has a 12-month period,
# peaks at +1 in June (month 6) and bottoms out at -1 in December, with January
# close to the minimum.
# The original expression, -np.cos(5.236) * MoSold, took the cosine of a constant
# and so only rescaled the month linearly; 5.236 looks like a misplaced-decimal
# version of pi/6 ~= 0.5236.
# Assumes MoSold still holds the raw month number 1..12 (i.e. it was not
# log-transformed by the skew step above) - TODO confirm.
data_v1['MoSold'] = -np.cos(np.pi / 6 * data_v1['MoSold'])

* ## Target transformation 

In [None]:
# SalePrice before and after a log transform, side by side, each with a
# fitted normal curve for reference.
dist_fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(20, 10))

sns.distplot(y_target, kde=True, fit=norm, ax=ax_raw)
ax_raw.set_title("Without Log Transform")

sns.distplot(np.log(y_target), kde=True, fit=norm, color='darkblue', ax=ax_log)
ax_log.set_xlabel("Log SalePrice")
ax_log.set_title("With Log Transform")

plt.show()

In [None]:
# Natural log of the target.
# NOTE(review): the modelling cells visible in this notebook split and fit on the
# raw `y_target` and take logs only when scoring; `y_target_log` is not
# referenced by them - confirm whether that is intended.
y_target_log = np.log(y_target)

## Encoding Categorical columns
* One hot Encoding using get dummies

In [None]:
# Work on a copy so the transformed frame data_v1 stays available unchanged.
data_v2 = data_v1.copy()

In [None]:
# One-hot encode the remaining non-numeric columns; numeric columns pass through unchanged.
data_v2 = pd.get_dummies(data_v2)
data_v2

## Scaling

In [None]:
# Copy of the encoded frame; the next cell reads its column names when rebuilding the scaled frame.
data_v3= data_v2.copy()

In [None]:
# Rescale every feature to [0, 1]; the scaler is fitted on the combined
# train + test rows. The scaled values are wrapped back into a DataFrame that
# reuses the existing column names.
scaler = MinMaxScaler()
data_v3 = pd.DataFrame(
    data=scaler.fit_transform(data_v2),
    columns=data_v3.columns,
)
data_v3.head()

In [None]:
# Fill the remaining missing values from the 5 nearest rows, then wrap the
# result back into a DataFrame with the same column names.
imputer = KNNImputer(n_neighbors=5)
data_v3 = pd.DataFrame(
    data=imputer.fit_transform(data_v3),
    columns=data_v3.columns,
)

In [None]:
# Per-column flag: does any missing value remain after imputation?
data_v3.isna().any()

In [None]:
# Per-column count of missing values remaining after imputation.
data_v3.isna().sum()

## Split the data

In [None]:
# Snapshot of the scaled and imputed frame.
# NOTE(review): the cells visible below keep using data_v3, not data_v4 - confirm whether this copy is needed.
data_v4 = data_v3.copy()

In [None]:
# Undo the earlier concatenation by position: the first len(train_v0) rows are
# the training houses, the remaining rows are the test houses. Slicing by
# position does not depend on the index labels of either frame, unlike the
# previous label-based slice on train_v0.index.max().
n_train = len(train_v0)
train_final = data_v3.iloc[:n_train].copy()
test_final = data_v3.iloc[n_train:].reset_index(drop=True).copy()

# Sanity checks: the features must line up row-for-row with the target and the ids.
assert len(train_final) == len(y_target), "train features and target differ in length"
assert len(test_final) == len(test_ids), "test features and test ids differ in length"

## Model Comparisons

In [None]:
# 80/20 train/validation split with a fixed seed; the target used here is the raw (not log-transformed) SalePrice.
X_train, X_val, y_train, y_val =train_test_split(train_final, y_target ,train_size=0.8, test_size=0.2,random_state=0)

* ## Random Forest Model 

In [None]:
# random_state pins the bootstrap and feature sampling so a re-run reproduces
# the same model and score.
forest_model = RandomForestRegressor(n_estimators=500, max_depth=10, random_state=0)
forest_model.fit(X_train, y_train)
predicted_random_forest = forest_model.predict(X_val)

# R^2 on the training data (previously computed but never shown).
print('Train R^2:', forest_model.score(X_train, y_train))

# RMSE between the logs of the true and predicted prices on the validation split.
# The root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print('RMSE:', np.sqrt(mean_squared_error(np.log(y_val), np.log(predicted_random_forest))))

In [None]:
# Random forest predictions for the competition test rows.
forest_preds = forest_model.predict(test_final)

* ## Gradient Boosting Regressor 

In [None]:
# random_state pins the estimator's internal randomness so a re-run reproduces
# the same model and score.
GBR_model = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1000, max_depth=4, random_state=0)
GBR_model.fit(X_train, y_train)
predicted_GBR = GBR_model.predict(X_val)

# R^2 on the training data (previously computed but never shown).
print('Train R^2:', GBR_model.score(X_train, y_train))

# RMSE between the logs of the true and predicted prices on the validation split.
# The root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print('RMSE:', np.sqrt(mean_squared_error(np.log(y_val), np.log(predicted_GBR))))

In [None]:
# Gradient boosting predictions for the competition test rows; these are the values submitted below.
GBR_preds = GBR_model.predict(test_final)

## Submit

In [None]:
# Two-column submission frame: the test ids next to the gradient boosting
# predictions, matched row by row.
submission = test_ids.to_frame().assign(SalePrice=GBR_preds)
submission

In [None]:
# Write the submission file to the working directory, with a header row and without the index column.
submission.to_csv('./submission.csv', index=False, header=True)