In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpb
import pandas as pd
import numpy as np
from scipy.stats import skew
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
%matplotlib inline
# Apply seaborn's default theme to the plots drawn in the cells below.
sns.set_theme()

In [None]:
# Load the competition's training and test tables from the Kaggle input mount.
train = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
)
test = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Heatmap of the boolean null mask of `train`: one row per record, one column
# per feature, so columns dominated by missing values stand out visually.
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(train.isnull(), ax=ax)

In [None]:
# Report (rows, columns) for both raw tables.
for label, frame in (("Train: ", train), ("Test: ", test)):
    print(label, frame.shape)

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))
# Null count per column, keeping only columns with at least one missing value,
# ordered from fewest to most missing.
missing = (
    train.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values()
)
missing.plot.bar(ax=ax)

Alley, FireplaceQu, PoolQC, Fence, and MiscFeature are some of the features with the most missing values. Hence, it is worthwhile to remove these features from the training and test data.

In [None]:
# Columns removed from both tables; declared once so train and test cannot drift apart.
sparse_columns = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
for frame in (train, test):
    frame.drop(columns=sparse_columns, inplace=True)

## Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns only. `train` still holds
# string-typed (object) columns at this point, and recent pandas versions raise
# on them in DataFrame.corr() instead of silently skipping them, so the numeric
# subset is selected explicitly (works on every pandas version).
corr = train.select_dtypes(include=np.number).corr()
fig, ax = plt.subplots(figsize=(14, 8))
# Draw onto the axes created above rather than relying on pyplot's current axes.
sns.heatmap(corr, ax=ax)

We can see that many features are highly correlated. These features could be removed to speed up training when using a computationally expensive model such as XGBoost, but for now we will keep them.

# Data Preprocessing

In [None]:
target = ['SalePrice']
# Candidate predictors: everything except the row identifier and the target.
predictors = train.drop(columns=['Id', 'SalePrice'])
# Split the predictors by dtype; categorical names come first in the combined list.
cat_features = list(predictors.select_dtypes(include='object').columns)
num_features = list(predictors.select_dtypes(include=np.number).columns)
all_features = [*cat_features, *num_features]

We have created the target and feature variables, and stored the categorical and numerical feature names in separate lists.

### Log Transform the data

In [None]:
# Widen the default figure so the two histograms sit side by side.
mpb.rcParams['figure.figsize'] = (15.0, 6.0)
# Raw sale price next to its log1p transform, to compare the two distributions.
sale_price = train["SalePrice"]
prices = pd.DataFrame({
    "price": sale_price,
    "log(price + 1)": np.log1p(sale_price),
})
prices.hist()

In [None]:
# Replace the target with its natural log, in place.
# NOTE: this overwrites the raw prices, so re-running this cell applies the log
# a second time. Anything fit on this column predicts log-prices; apply np.exp
# to get back to the original price scale.
train.SalePrice = np.log(train.SalePrice)

In [None]:
# Categorical branch: map each category to an ordinal code.
cat_encoder = ce.OrdinalEncoder(
    return_df=True,
    handle_unknown='value',
    handle_missing='value',
)
cat_tfms = Pipeline(steps=[('cat_ordenc', cat_encoder)])

# Numerical branch: fill NaNs with the column median.
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
num_tfms = Pipeline(steps=[('num_imputer', median_imputer)])

# Route each column group through its branch; the output keeps categorical
# columns first, then numerical ones. Any other column is passed through as-is.
features = ColumnTransformer(
    transformers=[
        ('cat_tfms', cat_tfms, cat_features),
        ('num_tfms', num_tfms, num_features),
    ],
    remainder='passthrough',
)

In [None]:
# Separate predictors and (log-transformed) target, then hold out a validation split.
X = train[all_features]
y = train.SalePrice
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=42)

# Fit the preprocessing (ordinal codes, imputation medians) on the training
# split ONLY, then reuse that fitted state everywhere else. Calling
# fit_transform on the validation split would refit the encoder, giving the
# same category a different integer code than the model was trained on, and
# would leave the transformer in a validation-fitted state for the test set.
X_train_tf = pd.DataFrame(features.fit_transform(X_train), columns=all_features)
X_test_tf = pd.DataFrame(features.transform(X_test), columns=all_features)

# Competition test set, pushed through the same train-fitted transformer.
test_tf = pd.DataFrame(features.transform(test[all_features]), columns=all_features)

# Per-feature lookup from the original category value to its encoded value,
# built by pairing raw and transformed training rows positionally.
enc_map = {
    feat: dict(zip(X_train[feat], X_train_tf[feat]))
    for feat in cat_features
}

In [None]:
# Report (rows, columns) of the preprocessed training split and test set.
for label, frame in (("X_train shape: ", X_train_tf), ("test shape:", test_tf)):
    print(label, frame.shape)

# Model Training

In [None]:
# Hyperparameters for the random-forest regressor, gathered in one place so
# they are easy to tweak; random_state fixes the seed for reproducibility.
rf_params = dict(
    n_estimators=50,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features=.7,
    max_samples=None,
    n_jobs=-1,
    random_state=42,
)
rf = RandomForestRegressor(**rf_params)

In [None]:
# Fit the forest on the preprocessed training split.
rf.fit(X_train_tf, y_train)

In [None]:
# Predict on the preprocessed hold-out split for evaluation.
y_preds = rf.predict(X_test_tf)

In [None]:
# Root-mean-squared error on the hold-out split. Both y_test and y_preds are
# log-prices, so this is an RMSE in log space.
# The square root is taken explicitly instead of passing `squared=False`,
# which newer scikit-learn releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
rmse

# Submission

In [None]:
# Predict on the preprocessed competition test set. The model was fit on
# np.log(SalePrice), so these predictions are on the log scale.
preds = rf.predict(test_tf)

In [None]:
# The model was trained on np.log(SalePrice), so `preds` holds log-prices.
# Invert the transform with np.exp before writing the file; casting the raw
# log values to int would collapse every prediction to a tiny integer.
sale_price_preds = np.rint(np.exp(preds)).astype(int)

submission = pd.DataFrame({
    'Id': np.asarray(test.Id),
    'SalePrice': sale_price_preds
})
submission.to_csv('my_submission.csv', index=False)