# Part 1: Data Preprocessing

Dataset: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=train.csv

## Importing the libraries and dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the competition's training split (train.csv) into a DataFrame.
# The relative path follows the Kaggle `../input/<competition>/` layout.
dataset = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
# Show up to 200 rows when a DataFrame/Series is displayed.
# The fully qualified key is required: the bare 'max_rows' pattern is ambiguous
# on recent pandas (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 200)

In [None]:
# Show up to 200 columns when a DataFrame is displayed.
# The fully qualified key is required: the bare 'max_columns' pattern is ambiguous
# on recent pandas (it also matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', 200)

## Data exploration

In [None]:
dataset.head()

In [None]:
dataset.shape

In [None]:
dataset.columns

In [None]:
dataset.info()

In [None]:
# statistical summary
dataset.describe()

In [None]:
# Numerical columns
dataset.select_dtypes(include=['int64', 'float64']).columns

In [None]:
len(dataset.select_dtypes(include=['int64', 'float64']).columns)

In [None]:
# categorical columns
dataset.select_dtypes(include='object').columns

In [None]:
len(dataset.select_dtypes(include='object').columns)

## Dealing with null values

In [None]:
dataset.isnull().values.any()

In [None]:
dataset.isnull().values.sum()

In [None]:
dataset.isnull().sum()

In [None]:
# columns with null values
dataset.columns[dataset.isnull().any()]

In [None]:
len(dataset.columns[dataset.isnull().any()])

In [None]:
# Visualise missing values: the heatmap is drawn over the boolean null mask,
# so every (row, column) entry is coloured by whether it is null.
plt.figure(figsize=(16, 9))
sns.heatmap(data=dataset.isna())
plt.show()

In [None]:
dataset.shape

In [None]:
# Percentage of missing entries per column:
# (missing values in the column / number of rows) * 100
null_percent = dataset.isnull().sum().div(len(dataset)).mul(100)

In [None]:
null_percent

In [None]:
# Drop candidates: labels of the columns whose share of nulls exceeds 50%.
cols_to_drop = null_percent.index[null_percent > 50]

In [None]:
cols_to_drop

In [None]:
# Remove the sparsely populated columns.
# NOTE(review): the list is hard-coded; presumably it mirrors `cols_to_drop`
# as displayed in the previous cell - confirm if the data or pandas version changes.
dataset = dataset.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)

In [None]:
dataset.shape

In [None]:
# columns with null values
dataset.columns[dataset.isnull().any()]

In [None]:
len(dataset.columns[dataset.isnull().any()])

**Fill missing values in numerical columns with the column mean**

In [None]:
# Numerical Columns
# 'LotFrontage', 'MasVnrArea', 'GarageYrBlt'

In [None]:
# Replace the gaps in each numerical column with that column's own mean.
for col in ('LotFrontage', 'MasVnrArea', 'GarageYrBlt'):
    column_mean = dataset[col].mean()
    dataset[col] = dataset[col].fillna(column_mean)

In [None]:
len(dataset.columns[dataset.isnull().any()])

**Fill missing values in categorical columns with the column mode**

In [None]:
dataset.select_dtypes(include='object').columns

In [None]:
dataset.columns[dataset.isnull().any()]

In [None]:
len(dataset.columns[dataset.isnull().any()])

In [None]:
# Replace the gaps in each categorical column with that column's most frequent
# value (the first entry returned by Series.mode()).
for col in (
    'MasVnrType',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Electrical',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
):
    most_frequent = dataset[col].mode()[0]
    dataset[col] = dataset[col].fillna(most_frequent)

In [None]:
len(dataset.columns[dataset.isnull().any()])

In [None]:
dataset.isnull().values.any()

## Distplot

In [None]:
# Distribution of the target variable, labelled with its skewness.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# replacement that draws the same density histogram plus KDE curve.
plt.figure(figsize=(16,9))
bar = sns.histplot(dataset['SalePrice'], kde=True, stat='density')
bar.legend(["Skewness: {:.2f}".format(dataset['SalePrice'].skew())])
plt.show()

## Correlation matrix

In [None]:
# Feature-only copy of the data (target column removed) for the correlation plot.
dataset_2 = dataset.drop('SalePrice', axis=1)

In [None]:
dataset_2.shape

In [None]:
# Bar chart of each numeric feature's correlation with SalePrice.
# Numeric columns are selected explicitly: dataset_2 still contains object
# columns here, and recent pandas no longer drops them silently in corrwith().
dataset_2.select_dtypes(include='number').corrwith(dataset['SalePrice']).plot.bar(
    figsize=(16,9), title='Correlated with SalePrice', grid=True
)

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# object columns in recent pandas instead of silently ignoring them.
plt.figure(figsize=(25, 25))
ax = sns.heatmap(data=dataset.select_dtypes(include='number').corr(), cmap='coolwarm', annot=True, linewidths=2)

In [None]:
# Correlation matrix of the numeric columns (object columns are excluded
# explicitly; recent pandas raises on them instead of dropping them).
high_corr = dataset.select_dtypes(include='number').corr()

In [None]:
# Features whose absolute correlation with SalePrice exceeds 0.5
# (SalePrice itself is included, since its self-correlation is 1).
high_corr_features = high_corr.index[high_corr['SalePrice'].abs() > 0.5]

In [None]:
high_corr_features

In [None]:
len(high_corr_features)

In [None]:
# Annotated correlation heatmap restricted to the strongly correlated features.
plt.figure(figsize=(16, 9))
ax = sns.heatmap(
    dataset[high_corr_features].corr(),
    annot=True,
    cmap='coolwarm',
    linewidths=2,
)

## Dealing with the categorical values

In [None]:
dataset.shape

In [None]:
dataset.columns

In [None]:
# categorical columns
dataset.select_dtypes(include='object').columns

In [None]:
len(dataset.select_dtypes(include='object').columns)

In [None]:
# One-hot encode every categorical column; drop_first=True omits the first
# level of each column so the dummy columns are not perfectly collinear.
dataset = pd.get_dummies(dataset, drop_first=True)

In [None]:
dataset.shape

In [None]:
dataset.columns

In [None]:
# categorical columns
dataset.select_dtypes(include='object').columns

In [None]:
len(dataset.select_dtypes(include='object').columns)

## Splitting the dataset

In [None]:
# independent variables / matrix of features (everything except the target)
x = dataset.drop('SalePrice', axis=1)

In [None]:
# dependent variable / target: the sale price of each house
y = dataset.loc[:, 'SalePrice']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
x_train.shape

In [None]:
y_train.shape

In [None]:
x_test.shape

In [None]:
y_test.shape

## Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows do not influence the statistics.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
x_train

In [None]:
x_test

# Part 2: Building the model

## 1) Multiple linear regression

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least-squares regression on the scaled training features.
regressor_mlr = LinearRegression()
regressor_mlr.fit(X=x_train, y=y_train)

In [None]:
y_pred = regressor_mlr.predict(x_test)

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2_score(y_test, y_pred)

## 2) Random forest regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with library-default hyperparameters (no seed is set, so
# results vary from run to run).
regressor_rf = RandomForestRegressor()
regressor_rf.fit(X=x_train, y=y_train)

In [None]:
y_pred = regressor_rf.predict(x_test)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

## 3) XGBoost regression

In [None]:
# Gradient-boosted trees. XGBRegressor is the boosting estimator; the
# previously used XGBRFRegressor is XGBoost's random-forest variant, which does
# not match this "XGBoost regression" section and duplicates the forest above.
from xgboost import XGBRegressor
regressor_xgb = XGBRegressor()
regressor_xgb.fit(x_train, y_train)

In [None]:
y_pred = regressor_xgb.predict(x_test)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

# Part 3: Hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the random-forest hyperparameter search.
# max_features uses 1.0 (all features) instead of 'auto': for regressors 'auto'
# meant "all features", and that alias was removed from newer scikit-learn
# releases, whereas a float fraction is accepted by old and new versions alike.
parameters = {
    'n_estimators':[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    'max_depth':[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'min_samples_split':[2, 5, 10],
    'min_samples_leaf':[1, 2, 4],
    'max_features':[1.0, 'sqrt'],
    'bootstrap':[True, False]
}

In [None]:
parameters

In [None]:
# Randomised search over `parameters`: 50 sampled configurations, each
# evaluated with 5-fold cross-validation; the seed fixes which ones are sampled.
random_cv = RandomizedSearchCV(
    estimator=regressor_rf,
    param_distributions=parameters,
    n_iter=50,
    cv=5,
    random_state=0,
    n_jobs=-1,
    verbose=2,
)

In [None]:
random_cv.fit(x_train, y_train)

In [None]:
random_cv.best_estimator_

In [None]:
random_cv.best_params_

# Part 4: Final model (Random forest regressor)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Final random forest using the hyperparameters selected by the search.
# Only the tuned, non-default arguments are passed: the pasted estimator repr
# also carried criterion='mse' and min_impurity_split=None, which newer
# scikit-learn releases reject; every other argument in it was a library default.
regressor = RandomForestRegressor(
    n_estimators=1200,
    max_depth=50,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
)
regressor.fit(x_train, y_train)

In [None]:
y_pred = regressor.predict(x_test)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

# Part 5: Prediction on test data

## Getting columns list

In [None]:
# Read the competition's test split (test.csv) from the same Kaggle input folder
# as the training data.
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
test.shape

In [None]:
# Raw columns kept from the test file. The list omits 'Alley', 'PoolQC',
# 'Fence' and 'MiscFeature' (dropped from the training data earlier) and the
# target 'SalePrice'.
columns_list = ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition']

In [None]:
print(len(columns_list))

In [None]:
# Keep only the selected columns. The explicit copy makes `test` an independent
# frame, so the column assignments in the imputation cells below do not trigger
# pandas' SettingWithCopyWarning.
test = test[columns_list].copy()

## Dealing with NULL values

In [None]:
# Labels of the test columns that contain at least one missing value.
null_columns = test.columns[test.isna().any(axis=0)]

In [None]:
len(null_columns)

In [None]:
test.isnull().sum() / test.shape[0] * 100

In [None]:
# Percentage of missing entries per test column.
null_p = test.isnull().sum().div(len(test)).mul(100)

In [None]:
null_p.sort_values(ascending=False)

In [None]:
test[null_columns].head(10)

In [None]:
# Object-dtype (categorical) columns among those with missing values.
cat_null_columns = test[null_columns].select_dtypes(include=['object']).columns

In [None]:
# Remaining (non-object) columns among those with missing values.
num_null_columns = test[null_columns].select_dtypes(exclude=['object']).columns

In [None]:
print(len(cat_null_columns), len(num_null_columns))

### Mean for numerical columns

In [None]:
num_null_columns

In [None]:
# Fill each numerical gap with the mean of that column, computed on the test
# data itself.
for col in num_null_columns:
    column_mean = test[col].mean()
    test[col] = test[col].fillna(column_mean)

In [None]:
len(test.columns[test.isnull().any()])

### Mode for categorical columns

In [None]:
# Fill each categorical gap with the column's most frequent value, computed on
# the test data itself (first entry returned by Series.mode()).
for col in cat_null_columns:
    most_frequent = test[col].mode()[0]
    test[col] = test[col].fillna(most_frequent)

In [None]:
len(test.columns[test.isnull().any()])

## Dealing with categorical values

In [None]:
# One-hot encode the test data with the same settings used for the training
# data (first level of each categorical column omitted).
test_df = pd.get_dummies(test, drop_first=True)

In [None]:
test_df.shape

In [None]:
test_df.head()

### Split

In [None]:
# independent variables / matrix of features (everything except the target)
x = dataset.drop('SalePrice', axis=1)

In [None]:
# dependent variable / target: the sale price of each house
y = dataset.loc[:, 'SalePrice']

In [None]:
# Columns of the encoded test data; the training features are restricted to
# this set below so that both frames share the same columns in the same order.
final_columns = test_df.columns

In [None]:
# Align the training features to the encoded test columns (same set and order).
# reindex gives the same result as x[final_columns] when every column exists,
# but a dummy column that appears only in the test encoding becomes an all-zero
# column instead of raising KeyError.
x = x.reindex(columns=final_columns, fill_value=0)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
x_train.shape

In [None]:
y_train.shape

In [None]:
x_test.shape

In [None]:
y_test.shape

### Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits; `sc` is reused later to scale the Kaggle test data.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
x_train

In [None]:
x_test

### Final Model

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Final random forest, refitted on the features aligned with the test data.
# Only the tuned, non-default arguments are passed: the pasted estimator repr
# also carried criterion='mse' and min_impurity_split=None, which newer
# scikit-learn releases reject; every other argument in it was a library default.
regressor = RandomForestRegressor(
    n_estimators=1200,
    max_depth=50,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
)
regressor.fit(x_train, y_train)

In [None]:
y_pred = regressor.predict(x_test)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

## Prediction on test data

### Scaling

In [None]:
test_df.shape

In [None]:
# Scale the encoded test data with the scaler already fitted on the training
# split (transform only - the scaler is not refitted here).
test_x = sc.transform(test_df)

### Prediction

In [None]:
# Use the model to make predictions
predicted_prices = regressor.predict(test_x)

# We will look at the predicted prices to ensure we have something sensible.
print(predicted_prices)

In [None]:
len(predicted_prices)

### Submission

In [None]:
# Assemble the submission table: the Id of every test row next to its
# predicted sale price.
my_submission = pd.DataFrame(
    {
        'Id': test_df['Id'],
        'SalePrice': predicted_prices,
    }
)
# Any filename would do; 'submission.csv' is used here. index=False keeps the
# DataFrame index out of the file.
my_submission.to_csv('submission.csv', index=False)

In [None]:
my_submission.shape