In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import skew, norm, boxcox_normmax
from scipy.special import boxcox1p

import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

# Overview 

1. [Information about the data](#1)
2. [Exploratory Data Analysis](#2)
3. [Feature Selection and Preprocessing](#3)
4. [Modelling and Evaluating](#4)

# <a id="1"></a> 1. Information about the data

In [None]:
# Read the train and test CSV files from the house-prices input directory.
train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
# Report the (rows, columns) shape of each frame.
print(f"Train Shape: {train.shape}")
print(f"Test Shape: {test.shape}")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column names, non-null counts and dtypes of the training frame.
train.info()

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
train.describe().T

# <a id="2" ></a> 2. Exploratory Data Analysis

**Correlation coefficients between features. Each cell in the table shows the correlation between two variables. A high correlation between two features is undesirable because both features carry largely the same information.**

In [None]:

# Correlation heatmap of the numeric columns only.
# DataFrame.corr() raises on string-valued columns in pandas >= 2.0, so the
# non-numeric columns are filtered out explicitly before correlating.
numeric_train = train.select_dtypes(include="number")

plt.figure(figsize=(20,20))
sns.heatmap(numeric_train.corr(), cmap="coolwarm")
plt.show()

**There are 81 features, and since evaluating all of them would take a long time, I will only go through the features that I am curious about.**

These are:

    * SalePrice
    * YearBuilt and SalePrice
    * OverallQual and SalePrice
    * YearBuilt and OverallQual
    * Fireplaces and SalePrice
    * GrLivArea and SalePrice
    * GarageArea and SalePrice

In [None]:
# SalePrice: box plot of the target's distribution.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x=train["SalePrice"], color="#bf9e7e", ax=ax)
plt.show()

In [None]:
# YearBuilt and SalePrice: scatter of sale price against construction year
# with a fitted regression line.

plt.figure(figsize=(12,8))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors as positional arguments and raises a TypeError otherwise.
sns.regplot(x=train.YearBuilt, y=train.SalePrice,
            scatter_kws={"color":"#824155"},
            line_kws={"linewidth":3, "color": "#6ca19e" ,"label":"Mean Sale Price"},
            label="Sale Price")
# NOTE(review): the style is switched after this figure's axes already exist,
# so it presumably affects mainly the figures created by later cells — confirm
# that this ordering is intended.
plt.style.use("fivethirtyeight")
plt.title("YearBuilt and SalePrice")
plt.xlabel("YearBuilt")
plt.ylabel("SalePrice")
plt.legend()
plt.show()

In [None]:
# OverallQual and SalePrice
# YearBuilt and OverallQual
# One scatter plot covers both: year built vs. sale price, coloured by overall quality.

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    x=train["YearBuilt"],
    y=train["SalePrice"],
    hue=train["OverallQual"],
    palette="tab10",
    label="OverallQual",
    ax=ax,
)
ax.set_title("OverallQual by YearBuilt and SalePrice")
ax.legend(loc="upper left")
plt.show()

In [None]:
# Fireplaces Counts: share of houses for each number of fireplaces.

# groupby sorts the result by the Fireplaces value, so the wedge sizes are
# ordered by fireplace count.
fireplace_counts = train.groupby("Fireplaces")["SalePrice"].count()

# Pull only the first wedge out of the pie; sized from the data so the call
# does not break if the number of distinct fireplace counts changes.
explode = [0.1 if i == 0 else 0 for i in range(len(fireplace_counts))]

plt.figure(figsize=(8,8))

# Labels come from the same index as the wedge sizes. The previous
# train.Fireplaces.unique() returned values in order of first appearance,
# which does not match the sorted groupby order and mislabeled the wedges.
plt.pie(fireplace_counts,
        colors=["#427555","#45A0AD","#C27FBA","#7D1F52"],
        labels=fireplace_counts.index,
        shadow=True, wedgeprops={'edgecolor': 'black'}, autopct='%1.1f%%',
        explode=explode)

plt.title("Counts of Fireplaces")
plt.show()

In [None]:
# Fireplaces and SalePrice: one bar per fireplace count, drawn with seaborn's barplot.
fireplace_palette = ["#427555","#45A0AD","#C27FBA","#7D1F52"]

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=train["Fireplaces"], y=train["SalePrice"], palette=fireplace_palette, ax=ax)
ax.set_title("Fireplaces and SalePrice")
plt.show()

In [None]:
# GrLivArea and SalePrice: living area vs. sale price, coloured by overall quality.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    x=train["GrLivArea"],
    y=train["SalePrice"],
    hue=train["OverallQual"],
    palette="icefire",
    ax=ax,
)
ax.set_title("GrLivArea and SalePrice")
ax.legend()
plt.show()

In [None]:
# GarageArea and SalePrice: garage area vs. sale price in a single colour.

plt.figure(figsize=(12,8))
# No hue variable is mapped here, so the former palette="icefire" argument had
# no effect (newer seaborn versions warn that it is ignored); only the fixed
# point colour is kept.
sns.scatterplot(x=train.GarageArea, y=train.SalePrice, color="#4a8591")

plt.title("GarageArea and SalePrice")
plt.show()

# 3. <a id=3 ></a> Feature Selection and Preprocessing

In [None]:
# Partition the column names of each frame by dtype: object-typed columns are
# treated as categorical, everything else as numeric.
train_cat_cols = train.select_dtypes(include="object").columns
train_num_cols = train.select_dtypes(exclude="object").columns

test_cat_cols = test.select_dtypes(include="object").columns
test_num_cols = test.select_dtypes(exclude="object").columns

## 3.1 Numeric Columns Missing Values

### 3.1.1 Train Data

**Count of Missing Values of train_num_cols**

In [None]:
# Ten numeric training columns with the most missing values (count per column, largest first).
train[train_num_cols].isnull().sum().sort_values(ascending=False)[:10]

In [None]:
# Same top-10 as above, expressed as a fraction of the number of training rows.
train_num_missing = train[train_num_cols].isnull().sum().sort_values(ascending=False)
train_num_missing.head(10) / len(train)

**The percentage of missing values is not very high, so we can fill the missing values with the column mean.**

In [None]:
# Fill the missing values of these numeric training columns with the column mean.
# The result is assigned back to the frame: `train[col].fillna(..., inplace=True)`
# is a chained in-place call that triggers a FutureWarning in pandas 2.x and
# leaves the frame unchanged once Copy-on-Write is active.
for col in ["LotFrontage", "GarageYrBlt", "MasVnrArea"]:
    train[col] = train[col].fillna(train[col].mean())

# Re-check: the ten highest remaining missing-value counts among the numeric columns.
print("Train num cols missin value:",train[train_num_cols].isnull().sum().sort_values(ascending=False)[:10])

### 3.1.2 Test Data

**Count of Missing Values of test_num_cols**

In [None]:
# Missing-value count for every numeric test column, largest first.
test[test_num_cols].isnull().sum().sort_values(ascending=False)

**Missing Value Ratio of test_num_cols**

In [None]:
# Missing-value counts of the numeric test columns as a fraction of the number of test rows.
test_num_missing = test[test_num_cols].isnull().sum().sort_values(ascending=False)
test_num_missing / len(test)

**The percentage of missing values is not very high, so we can fill the missing values with the column mean.**

In [None]:
# Numeric test columns whose missing values are filled with the column mean.
test_mean_fill_cols = [
    "LotFrontage",
    "GarageYrBlt",
    "MasVnrArea",
    "BsmtHalfBath",
    "BsmtFullBath",
    "BsmtFinSF2",
    "GarageCars",
    "GarageArea",
    "TotalBsmtSF",
    "BsmtUnfSF",
    "BsmtFinSF1",
]

# Assign the filled column back instead of calling fillna(inplace=True) on
# `test[col]`: the chained in-place form triggers a FutureWarning in pandas 2.x
# and leaves the frame unchanged once Copy-on-Write is active.
for col in test_mean_fill_cols:
    test[col] = test[col].fillna(test[col].mean())

## 3.2 Categoric columns Missing Values

***Note:*** **When handling missing values in categorical columns: if a column's missing-value ratio is very high, I prefer to drop the column; if the ratio is not high, I prefer to fill the missing values with the most frequent value.**

### 3.2.1 Train Data

In [None]:
# Top-20 categorical training columns by missing values, as a fraction of the number of rows.
train_cat_missing = train[train_cat_cols].isnull().sum().sort_values(ascending=False)
train_cat_missing.head(20) / len(train)

**These columns have a very high missing-value ratio, so I will drop them:**
    
    PoolQC          0.997944
    MiscFeature     0.965045
    Alley           0.926662
    Fence           0.801234

In [None]:
# Remove the four mostly-empty categorical columns from the training frame.
sparse_cols = ["PoolQC","MiscFeature","Alley","Fence"]
train = train.drop(columns=sparse_cols)

In [None]:
# Fill value used for the missing entries of each categorical training column.
# GarageCond / GarageQual previously used "Ta" (lower-case "a"), which does not
# match the "TA" spelling used for the other quality columns and for the test
# frame; it introduced a separate "Ta" category.
train_cat_fill_values = {
    "FireplaceQu": "Gd",
    "GarageCond": "TA",
    "GarageQual": "TA",
    "GarageFinish": "Unf",
    "GarageType": "Attchd",
    "BsmtCond": "TA",
    "BsmtQual": "TA",
    "BsmtExposure": "No",
    "BsmtFinType2": "Unf",
    "BsmtFinType1": "Unf",
    "MasVnrType": "None",
    "MSZoning": "RL",
    "Utilities": "AllPub",
    "Functional": "Typ",
    "Exterior2nd": "VinylSd",
    "KitchenQual": "TA",
    "Electrical": "SBrkr",
}

# Assign the filled column back instead of calling fillna(inplace=True) on
# `train[col]`: the chained in-place form triggers a FutureWarning in pandas 2.x
# and leaves the frame unchanged once Copy-on-Write is active.
for col, fill_value in train_cat_fill_values.items():
    train[col] = train[col].fillna(fill_value)

### 3.2.2 Test Data

In [None]:
# Top-23 categorical test columns by number of missing values.
test[test_cat_cols].isnull().sum().sort_values(ascending=False)[:23]

In [None]:
# Remove the same four columns from the test frame that were dropped from train.
sparse_cols = ["PoolQC","MiscFeature","Alley","Fence"]
test = test.drop(columns=sparse_cols)

In [None]:
# Frequency of each Exterior1st category in the test frame (used to pick a fill value).
test["Exterior1st"].value_counts()

In [None]:
# Fill value used for the missing entries of each categorical test column.
# (The GarageFinish fill appeared twice in the original cell; once is enough.)
test_cat_fill_values = {
    "FireplaceQu": "Gd",
    "GarageCond": "TA",
    "GarageQual": "TA",
    "GarageFinish": "Unf",
    "GarageType": "Attchd",
    "BsmtCond": "TA",
    "BsmtQual": "TA",
    "BsmtExposure": "No",
    "BsmtFinType2": "Unf",
    # NOTE(review): the training frame fills BsmtFinType1 with "Unf" — confirm
    # that a different fill value for the test frame is intended.
    "BsmtFinType1": "GLQ",
    "MasVnrType": "None",
    "MSZoning": "RL",
    "Utilities": "AllPub",
    "Functional": "Typ",
    "Exterior2nd": "VinylSd",
    "KitchenQual": "TA",
    "SaleType": "WD",
    "Exterior1st": "VinylSd",
}

# Assign the filled column back instead of calling fillna(inplace=True) on
# `test[col]`: the chained in-place form triggers a FutureWarning in pandas 2.x
# and leaves the frame unchanged once Copy-on-Write is active.
for col, fill_value in test_cat_fill_values.items():
    test[col] = test[col].fillna(fill_value)

In [None]:
# Remaining missing-value count for every test column after imputation.
test.isnull().sum()

## 3.3 Categorical Columns Encode

In [None]:
# One-hot encode the categorical columns of train and test TOGETHER.
# Encoding each frame separately with drop_first=True can produce different
# dummy columns (a category present in only one frame) and even a different
# dropped reference level per feature, so the two feature matrices would not
# line up. Encoding the stacked frames guarantees identical columns.
train_cat_cols = train.select_dtypes(include=["object"]).columns
test_cat_cols = test.select_dtypes(include=["object"]).columns

n_train = len(train)
sale_price = train["SalePrice"]

# Stack the feature columns of both frames; the target exists only in train
# and is re-attached after encoding.
combined = pd.concat([train.drop(columns="SalePrice"), test], ignore_index=True)
combined_cat_cols = combined.select_dtypes(include=["object"]).columns
combined = pd.get_dummies(combined, columns=combined_cat_cols, drop_first=True)

# Split back by position: the first n_train rows are the training rows.
train = combined.iloc[:n_train].copy()
train.index = sale_price.index
train["SalePrice"] = sale_price

test = combined.iloc[n_train:].reset_index(drop=True)

## 3.4 Numerical Columns Skewness

**Skewness refers to a distortion or asymmetry that deviates from the symmetrical bell curve, or normal distribution, in a set of data.**
    
    skewness = 0 : symmetric distribution (e.g. the normal distribution).
    skewness > 0 : more weight in the right tail of the distribution.
    skewness < 0 : more weight in the left tail of the distribution.
    
    
**Positive skewness Normalization Methods :**

    1. Log Transform (my choice in this case)
    2. Root Transform
    3. Reciprocals Transformation
    
**Negative skewness Normalization Methods :**

    1. Square Transformation
    2. Cube Transformation
    3. Higher Powers

### 3.4.1 Train Data

In [None]:
# Skewness of every column listed in train_num_cols, largest first.
# train_num_cols was built from all non-object columns of the original training
# frame, so the target column is evaluated here as well if it is numeric.
skew_cols = train[train_num_cols].apply(skew).sort_values(ascending=False)

# Columns whose skewness exceeds 0.5 are treated as highly skewed.
high_skew = skew_cols.loc[skew_cols > 0.5]
print(high_skew)
skew_index = high_skew.index

# Replace each highly skewed column by log(1 + x).
train[skew_index] = np.log1p(train.loc[:, skew_index])

In [None]:
# Skewness of the numeric test columns, printed for inspection only.
test_skew_cols = test[test_num_cols].apply(lambda x: skew(x)).sort_values(ascending=False)

test_high_skew = test_skew_cols[test_skew_cols > 0.5]
print(test_high_skew)

# Apply log1p to the TEST frame. The original cell assigned to `train` here,
# which log-transformed the training features a second time and left the test
# features untransformed.
# The columns transformed are the ones selected for the training frame
# (`skew_index` from the previous cell), so both frames receive the same
# feature transformation; columns that do not exist in `test` are skipped.
test_skew_index = [col for col in skew_index if col in test.columns]
test[test_skew_index] = np.log1p(test[test_skew_index])

**All columns are Numerical**

In [None]:
# Dtype summary of the training frame after imputation and encoding.
train.info()

In [None]:
# Dtype summary of the test frame after imputation and encoding.
test.info()

In [None]:
# Detach the Id column from both frames, keeping the values for later reference.
# DataFrame.pop returns the column and removes it from the frame in one step.
train_id = train.pop("Id")
test_id = test.pop("Id")

In [None]:
# Target vector: the SalePrice column of the training frame.
y = train.loc[:, "SalePrice"]

# Feature matrix: the training columns that also exist in the test frame,
# in the test frame's column order.
X = train.loc[:, test.columns]

## 3.5 Feature Scaling

### 3.5.1 Train data Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the standardiser on the training features, then replace X by its scaled
# version (fit and transform done as two explicit steps).
scaler = StandardScaler()
scaler.fit(X)

X = scaler.transform(X)

### 3.5.2 Test Data Scaling

In [None]:
# Scale the test features with the scaler that was fitted on the training
# features in the previous cell. Fitting a second StandardScaler on the test
# frame would standardise it with its own mean/variance, so identical raw
# values would map to different scaled values in train and test.
test_cols = test.columns
test = scaler.transform(test)
# transform returns a plain array; restore the column labels.
test = pd.DataFrame(test, columns=test_cols)

test.head()

## 3.6 Splitting Train Data for Modelling

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=9)

# <a id="4"></a> 4. Modelling and Evaluating

In [None]:
from sklearn.metrics import mean_squared_error

## 4.1 Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression with default parameters on the training split.
lr = LinearRegression().fit(X_train, y_train)

# Score the held-out split: root of the mean squared error.
y_pred = lr.predict(X_test)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("RMSE of Linear Regression (Default Parameters): %.2f"%lr_rmse)

In [None]:
# Actual vs. predicted values of the held-out split for the linear regression.
plt.figure(figsize=(8,8))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors as positional arguments and raises a TypeError otherwise.
sns.regplot(x=y_test, y=y_pred, scatter_kws=dict(color="#7583EA"), line_kws=dict(color="#9EA2C1", linewidth=3))
plt.title("Linear Regression Actual vs Predict Train Data")
plt.xlabel("Actual Value")
plt.ylabel("Predicit Value")
plt.show()

## 4.2 Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with default hyper-parameters. random_state is fixed (same seed
# as the train/test split) so the reported RMSE is reproducible across runs.
dt_reg = DecisionTreeRegressor(random_state=9)
dt_reg.fit(X_train, y_train)

# Score the held-out split.
y_pred = dt_reg.predict(X_test)

print("RMSE of DT Regressor (Default Parameters): ",np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
# Actual vs. predicted values of the held-out split for the decision tree.
plt.figure(figsize=(8,8))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors as positional arguments and raises a TypeError otherwise.
sns.regplot(x=y_test, y=y_pred, scatter_kws=dict(color="#68813C"), line_kws=dict(color="#C7D134", linewidth=3))
plt.title("DT Regressor Actual vs Predict Train Data")
plt.xlabel("Actual Value")
plt.ylabel("Predicit Value")
plt.show()

## 4.3 Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters. random_state is fixed (same seed
# as the train/test split) so the reported RMSE is reproducible across runs.
rf_reg = RandomForestRegressor(random_state=9)
rf_reg.fit(X_train, y_train)

# Score the held-out split.
y_pred = rf_reg.predict(X_test)

print("RMSE of RF Regressor (Default Parameters): ",np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
# Actual vs. predicted values of the held-out split for the random forest.
plt.figure(figsize=(8,8))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors as positional arguments and raises a TypeError otherwise.
sns.regplot(x=y_test, y=y_pred, scatter_kws=dict(color="#A22F59"), line_kws=dict(color="#9F3C96", linewidth=3))
plt.title("RF Regressor Actual vs Predict Train Data")
plt.xlabel("Actual Value")
plt.ylabel("Predicit Value")
plt.show()

## 4.4 Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# GradientBoostingRegressor with default hyper-parameters (the original comment
# here said RandomForestRegressor). random_state is fixed (same seed as the
# train/test split) so the reported RMSE is reproducible across runs.
gb_reg = GradientBoostingRegressor(random_state=9)
gb_reg.fit(X_train, y_train)

# Score the held-out split.
y_pred = gb_reg.predict(X_test)

print("RMSE of GB Regressor (Default Parameters): ",np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
# Actual vs. predicted values of the held-out split for gradient boosting.
plt.figure(figsize=(8,8))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors as positional arguments and raises a TypeError otherwise.
sns.regplot(x=y_test, y=y_pred, scatter_kws=dict(color="#6F2BA7"), line_kws=dict(color="#A590B8", linewidth=3))
plt.title("GB Regressor Actual vs Predict Train Data")
plt.xlabel("Actual Value")
plt.ylabel("Predicit Value")
plt.show()

## 4.5 XGboost

In [None]:
import xgboost

# XGBoost regressor with default parameters.
xgb_reg = xgboost.XGBRegressor()

xgb_reg.fit(X_train, y_train)

# Score the held-out split.
y_pred = xgb_reg.predict(X_test)

# The label previously read "GB Regressor", which made this score look like the
# gradient-boosting result from the section above.
print("RMSE of XGB Regressor (Default Parameters): ",np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
# Actual vs. predicted values of the held-out split for XGBoost.
plt.figure(figsize=(8,8))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors as positional arguments and raises a TypeError otherwise.
sns.regplot(x=y_test, y=y_pred, scatter_kws=dict(color="#60C8B6"), line_kws=dict(color="#DCD768", linewidth=3))
plt.title("XGB Regressor Actual vs Predict Train Data")
plt.xlabel("Actual Value")
plt.ylabel("Predicit Value")
plt.show()

In [None]:
# Final XGBoost model used for the submission.
# The keyword is `n_estimators`; the previous spelling `n_estimator` is not an
# XGBRegressor parameter, so the intended number of trees was never applied.
xgb_best = xgboost.XGBRegressor(max_depth=5,
                                n_estimators=100,
                                reg_lambda=1)
xgb_best.fit(X_train, y_train)

# Creating Submission File

In [None]:
# Predict on the preprocessed test features.
log_predictions = xgb_best.predict(test)

# The models were trained on the SalePrice column as it stands after the
# skewness step, where highly skewed numeric columns are replaced by log1p.
# Predictions are therefore on the log1p scale and must be mapped back to
# prices with expm1 before they are submitted.
# NOTE(review): this assumes SalePrice was among the columns printed in the
# training `high_skew` output (skewness > 0.5) — confirm.
predictions = np.expm1(log_predictions)

# Build the submission in the layout of the sample file: Id, SalePrice.
sample_sub = pd.read_csv("../input/house-prices-advanced-regression-techniques/sample_submission.csv")
final_data = {'Id': sample_sub.Id, 'SalePrice': predictions}
final_submission = pd.DataFrame(data=final_data)
final_submission.to_csv('submission_file.csv',index =False)