# Housing Price Prediction - Advanced Regression Techniques. Score = 0.13275

We will be using advanced Regression techniques to predict the housing sales price with maximum accuracy.  

The project follows the structure below.

    Exploratory Data Analysis
    Data Visualization 
    Data Preprocessing 
    Feature Engineering
    Data Modeling
    Hyperparameter Optimization
    Submission 

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Import Libraries**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from scipy.stats import norm, skew
# statsmodels gets its conventional `sm` alias. It previously shared the `plt`
# alias with matplotlib, so it was overwritten by the very next import and
# could not be reached by name.
import statsmodels.api as sm
import matplotlib.pyplot as plt

%matplotlib inline

import xgboost as XGB
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

import sklearn

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error

import warnings
# Silence every warning for the rest of the session. No category is given, so
# this also hides deprecation notices from the plotting/modelling libraries.
warnings.filterwarnings(action="ignore")


# **Import Data**

In [1]:
# Read the competition's train and test CSV files, then summarise the
# training frame (column names, non-null counts, dtypes).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
    )
)

train.info()

In [1]:
# (rows, columns) of the training frame.
train.shape

# **Exploratory Data Analysis**

In [1]:
# Target variable: pull out SalePrice and print its summary statistics.

target = train['SalePrice']
target_summary = target.describe()
print(target_summary)

In [1]:
# Visual check of the skewness: plot the distribution of the raw SalePrice target.
# NOTE(review): recent seaborn releases flag `distplot` as deprecated - confirm
# against the installed version before upgrading.

sns.distplot(target)

In [1]:
# Shape statistics of the target distribution.
# They are stored under `target_*` names so that the `skew` function imported
# from scipy.stats is not overwritten by a plain float.
target_skew = target.skew()
target_kurt = target.kurt()

print ("Skewness: %f" %target_skew)
print ("Kurtosis: %f" %target_kurt)

In [1]:
# Log transformation of the target, intended to reduce the skew measured above.
# The *transformed* series is plotted against a fitted normal curve so the
# effect of the transform is visible (previously the raw target was plotted).

target_transform = np.log1p(target)
sns.distplot(target_transform, fit=norm);

In [1]:
# Correlation matrix for the numeric features.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 raises when
# DataFrame.corr() is called on a frame that still contains string columns,
# while older versions silently dropped them.

corr_mat = train.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(12,9))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr_mat, vmax = .8, square = True, ax=ax)

In [1]:
# Keep only the features whose absolute correlation with SalePrice exceeds 0.5
# and draw an annotated heatmap of their pairwise correlations.
# String columns are excluded before calling corr(), which raises on
# non-numeric data in pandas >= 2.0.
high_corr = train.select_dtypes(include=[np.number]).corr()
high_corr_features = high_corr.index[high_corr["SalePrice"].abs() > 0.5]

plt.figure(figsize=(10,10))
mat = sns.heatmap(train[high_corr_features].corr(), annot=True, cmap="RdYlGn")

* It's important to understand what each step does and how we benefit from it. 'OverallQual' is at the top of the list, with the highest correlation to 'SalePrice' (0.79).
* 'GarageCars' and 'GarageArea' are strongly correlated with each other (0.88).
* 'TotalBsmtSF' and '1stFlrSF' are also strongly correlated (0.82), so we can keep just one of them or add '1stFlrSF' to the total.
* 'TotRmsAbvGrd' and 'GrLivArea' also have a strong correlation (0.83); I decided to keep 'GrLivArea' because its correlation with 'SalePrice' is higher.

In [1]:
# Rank every column of the correlation matrix by its correlation with SalePrice
# (strongest first) and keep those whose coefficient lies in the closed
# interval [0.55, 1].
corr_SalePrice = high_corr["SalePrice"].sort_values(ascending=False)
in_range = (corr_SalePrice >= 0.55) & (corr_SalePrice <= 1)
features = corr_SalePrice[in_range]
print (features)

# Only the column labels are kept from here on.
features = features.index

In [1]:
# Display the labels of the selected high-correlation features.
features

In [1]:
# Set the target and the test ids aside, then stack train on top of test so both
# go through the same preprocessing. Id and SalePrice are removed from the
# combined feature frame.
y_train = train['SalePrice']
test_id = test['Id']
full_data = (
    pd.concat([train, test], axis=0, sort=False)
    .drop(columns=['Id', 'SalePrice'])
)

In [1]:
# Peek at the first rows of the combined train + test feature frame.
full_data.head()

# **Data Cleaning**

In [1]:
# Per-column missing-value report: absolute null count and fraction of rows,
# both sorted with the worst columns first.
null_mask = full_data.isnull()
total_null = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total_null, percent], axis=1, keys=['Total','Percent'])

missing_data.head(20)

In [1]:
# Drop every column with more than 5 missing values, then print the largest
# null count that remains in any single column.
sparse_columns = missing_data.index[missing_data['Total'] > 5]
full_data.drop(columns=sparse_columns, inplace=True)
print (full_data.isnull().sum().max())

In [1]:
# Null counts per remaining column, highest first; show the top 20.
total = full_data.isna().sum().sort_values(ascending=False)
total.head(20)

In [1]:
# (rows, columns) of the combined frame after dropping the sparse columns.
full_data.shape

In [1]:
# Numeric columns that still contain a few nulls; a missing value is filled
# with 0.
numeric_missed = ['BsmtFinSF1',
                  'BsmtFinSF2',
                  'BsmtUnfSF',
                  'TotalBsmtSF',
                  'BsmtFullBath',
                  'BsmtHalfBath',
                  'GarageArea',
                  'GarageCars']

# The loop variable is named `feature` (singular) so it no longer overwrites the
# `features` index of high-correlation columns built earlier in the notebook.
for feature in numeric_missed:
    full_data[feature] = full_data[feature].fillna(0)

In [1]:
# Categorical columns with a few nulls: fill each with its most frequent value.
categorical_missed = [
    'Exterior1st',
    'Exterior2nd',
    'SaleType',
    'MSZoning',
    'Electrical',
    'KitchenQual',
]

for feature in categorical_missed:
    # mode() returns the most frequent values; take the first one.
    most_common = full_data[feature].mode()[0]
    full_data[feature] = full_data[feature].fillna(most_common)

In [1]:
# Missing Functional values are treated as typical functionality. The dataset
# encodes that level as 'Typ'; the previous fill value 'Type' is not an existing
# level and produced a separate one-hot column for the imputed rows.
full_data['Functional'] = full_data['Functional'].fillna('Typ')


In [1]:
# Remove the Utilities column from the feature frame.
full_data.drop(columns=['Utilities'], inplace=True)

In [1]:
# One-hot encode the remaining categorical columns; no column list is passed,
# so pandas decides which columns to encode from their dtypes.
full_data = pd.get_dummies(full_data)

In [1]:
# Sanity check: largest per-column null count after imputation and encoding.
full_data.isnull().sum().max()

# **Features**

In [1]:
from scipy.stats import skew

# Measure the skewness of every non-object column of the encoded frame and
# apply log1p to each column whose absolute skewness exceeds 0.5.
numeric = full_data.dtypes[full_data.dtypes != 'object'].index
skewed = full_data[numeric].apply(skew).sort_values(ascending=False)
skewed = skewed[skewed.abs() > 0.5]

for col in skewed.index:
    full_data[col] = np.log1p(full_data[col])

In [1]:
# Combined area feature: basement + first floor + second floor.
# NOTE(review): this runs after the skew-based log1p step, so if any of these
# three columns were transformed there, this sums log values rather than raw
# square footage - confirm that is intended.
full_data['TotalSF'] = full_data['TotalBsmtSF'] + full_data['1stFlrSF'] + full_data['2ndFlrSF']

In [1]:
# Split the combined frame back by position: the first len(y_train) rows are
# the training rows, the remainder are the test rows.
n_train_rows = len(y_train)
x_train = full_data.iloc[:n_train_rows]
x_test = full_data.iloc[n_train_rows:]

In [1]:
# Shapes of the train and test feature matrices.
x_train.shape, x_test.shape

# **Data Modeling - XGBoost**

In [1]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error


# def rmse():
#     return mean_squared_error(y_actual, y_predicted, squared=False)

In [1]:


# Hyper-parameters of the XGBoost regressor, gathered in one mapping.
xgb_params = dict(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    random_state=7,
    nthread=-1,
)

# Build the regressor and fit it on the full training set.
model_xgb = XGB.XGBRegressor(**xgb_params)
model_xgb.fit(x_train, y_train)



In [1]:
# Predict sale prices for the test rows with the fitted XGBoost model.
y_predict = model_xgb.predict(x_test)


# Disabled Lasso prediction; `las` is not defined in the visible notebook.
# y_predict_las = las.predict(x_test)

# Display the predictions.
y_predict

# **Submission**

In [1]:
# Assemble the two-column submission (Id, SalePrice) and write it to a CSV
# file without the index column.
submission_xgb1 = pd.DataFrame({'Id': test_id, 'SalePrice': y_predict})
submission_xgb1.to_csv('mysubmission_xgb1.csv', index=False)

