In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from numpy import absolute
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the labelled training set and preview its first rows.
train = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/train.csv"
)
train.head()

In [None]:
# Read the test set (same competition folder as the training file) and preview it.
test = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/test.csv"
)
test.head()

In [None]:
# Move the Id column out of each frame: `pop` returns the column and removes it
# from the frame in place, so the ids live in train_ID / test_ID from here on.
train_ID = train.pop('Id')
test_ID = test.pop('Id')

In [None]:
# Plot every column against SalePrice to look for outliers.
# Columns with fewer than 20 distinct values get a categorical plot,
# everything else a scatter plot.
target = train['SalePrice']
for column in train.columns:
    values = train[column]
    if len(values.unique()) < 20:
        sns.catplot(x=values, y=target, data=train)
    else:
        sns.scatterplot(x=values, y=target)
    plt.show()

In [None]:
# Keep only houses with GrLivArea below 4500 and renumber the rows from 0.
keep_rows = train['GrLivArea'] < 4500
train = train[keep_rows].reset_index(drop=True)
print(train.shape)

Visualizing the target variable (SalePrice) in the training set

In [None]:
# Histogram (40 bins) of the target column, SalePrice, in the training set.
train['SalePrice'].hist(bins = 40)

The data is skewed, so we apply log1p to reduce the skewness.

In [None]:
# Replace SalePrice with log1p(SalePrice) to reduce skew, keep a copy of the
# transformed target in y_train, and re-plot the histogram.
# NOTE: the column is overwritten in place, so re-running this cell applies
# log1p a second time; re-run the notebook from the data-loading cells instead.
log_price = np.log1p(train["SalePrice"])
train["SalePrice"] = log_price
y_train = train["SalePrice"].reset_index(drop=True)

train["SalePrice"].hist(bins=40)

In [None]:
# Drop the target from the training frame, then stack the training and test
# features into one frame (with a fresh 0..n-1 index) so both are cleaned together.
train_data = train.drop(columns=['SalePrice'])
total_data = pd.concat([train_data, test], ignore_index=True)

In [None]:
# total_data holds the stacked train + test features; show its (rows, columns).
total_data.shape

In [None]:
# Number of columns that contain at least one missing value.
columns_with_missing = total_data.isna().any().sum()
print('columns containing missing values =', columns_with_missing)

In [None]:
# Count missing values per column (largest first) and keep only the columns
# that actually have some.
missing_counts = pd.DataFrame(total_data.isnull().sum().sort_values(ascending=False))
has_missing = missing_counts.iloc[:, 0] > 0
missing_columns = missing_counts[has_missing]

# Bar chart of the counts, with the same table printed as text.
plt.figure(figsize=(20, 10))
sns.barplot(x=missing_columns.index, y=missing_columns.iloc[:, 0])
plt.xticks(rotation=90)
print(missing_columns)
plt.show()

In [None]:
# These predictors are categories stored as numbers, so turn them into strings.
total_data['MSSubClass'] = total_data['MSSubClass'].apply(str)
for column in ('YrSold', 'MoSold'):
    total_data[column] = total_data[column].astype(str)

# Missing values replaced by a fixed label chosen per column.
constant_fills = {
    'Functional': 'Typ',
    'Electrical': "SBrkr",
    'KitchenQual': "TA",
    'PoolQC': "None",
}
for column, label in constant_fills.items():
    total_data[column] = total_data[column].fillna(label)

# Missing values replaced by the column's most frequent value.
for column in ('Exterior1st', 'Exterior2nd', 'SaleType'):
    most_common = total_data[column].mode()[0]
    total_data[column] = total_data[column].fillna(most_common)

In [None]:
# Impute LotFrontage with its median and MSZoning with its most frequent value.
# `Series.mode()` returns a Series (there can be ties), so its first entry is
# taken to get a scalar fill value; passing the Series itself to `fillna` aligns
# on the index and leaves almost every missing row untouched.
# The results are assigned back rather than using `inplace=True` on a column
# selection, which is not guaranteed to write through to `total_data`.
total_data["LotFrontage"] = total_data["LotFrontage"].fillna(total_data["LotFrontage"].median())
total_data["MSZoning"] = total_data["MSZoning"].fillna(total_data["MSZoning"].mode()[0])

In [None]:
# Numeric garage columns: a missing value becomes 0.
garage_numeric = ['GarageYrBlt', 'GarageArea', 'GarageCars']
total_data[garage_numeric] = total_data[garage_numeric].fillna(0)

# Categorical garage / basement columns: a missing value becomes the label 'None'.
garage_bsmt_categorical = [
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
]
total_data[garage_bsmt_categorical] = total_data[garage_bsmt_categorical].fillna('None')

In [None]:
# Every remaining object-dtype column gets its missing values set to 'None'.
objects = [name for name in total_data.columns if total_data[name].dtype == object]
filled_objects = total_data[objects].fillna('None')
total_data.update(filled_objects)

In [None]:
# Every numeric column gets its remaining missing values set to 0.
numerical = total_data.select_dtypes(include=np.number).columns.tolist()
total_data.update(total_data[numerical].fillna(0))

Feature Engineering

In [None]:
# Remove columns that are not used as features.
unwanted_columns = ['Utilities', 'Street', 'PoolQC']
total_data = total_data.drop(columns=unwanted_columns)

In [None]:
# Derived features built by combining existing columns.
first_floor = total_data['1stFlrSF']
second_floor = total_data['2ndFlrSF']

# Sum of the build year and the remodel year.
total_data['YrBltAndRemod'] = total_data['YearBuilt'] + total_data['YearRemodAdd']

# Basement plus both floors.
total_data['TotalSF'] = total_data['TotalBsmtSF'] + first_floor + second_floor

# Finished basement areas plus both floors.
total_data['Total_sqr_footage'] = (
    total_data['BsmtFinSF1'] + total_data['BsmtFinSF2'] + first_floor + second_floor
)

# Bathroom count where each half bath contributes 0.5.
half_weight = 0.5
total_data['Total_Bathrooms'] = (
    total_data['FullBath']
    + half_weight * total_data['HalfBath']
    + total_data['BsmtFullBath']
    + half_weight * total_data['BsmtHalfBath']
)

# Porch and deck areas added together, left to right.
porch_columns = ['OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF']
porch_total = total_data[porch_columns[0]]
for column in porch_columns[1:]:
    porch_total = porch_total + total_data[column]
total_data['Total_porch_sf'] = porch_total

In [None]:
# Binary indicators: 1 when the source column is greater than 0, otherwise 0.
indicator_sources = {
    'haspool': 'PoolArea',
    'has2ndfloor': '2ndFlrSF',
    'hasgarage': 'GarageArea',
    'hasbsmt': 'TotalBsmtSF',
    'hasfireplace': 'Fireplaces',
}
for flag, source in indicator_sources.items():
    total_data[flag] = total_data[source].gt(0).astype('int64')

In [None]:
# One-hot encode the non-numeric columns with pandas.get_dummies, printing the
# frame shape before and after so the growth in column count is visible.
print(total_data.shape)
encoded = pd.get_dummies(total_data)
final_data = encoded.reset_index(drop=True)
print(final_data.shape)

In [None]:
# The first len(y_train) rows of final_data are the training rows; the rest are
# the test rows.
n_train = len(y_train)
x_train = final_data.iloc[:n_train]
x_test = final_data.iloc[n_train:]

print('x_train', x_train.shape)
print('y_train', y_train.shape)
print('x_test', x_test.shape)

In [None]:
# Collect near-constant features: columns whose most frequent value
# (value_counts lists it first) covers more than 99.94% of the training rows.
n_rows = len(x_train)
overfit = [
    column
    for column in x_train.columns
    if x_train[column].value_counts().iloc[0] / n_rows * 100 > 99.94
]
# This dummy column is always dropped as well, regardless of the threshold.
overfit.append('MSZoning_C (all)')

# Drop the same columns from both feature sets and work on independent copies.
X_train = x_train.drop(columns=overfit).copy()
X_test = x_test.drop(columns=overfit).copy()
Y_train = y_train.copy()

print('X_train', X_train.shape)
print('Y_train', Y_train.shape)
print('X_test', X_test.shape)

Implementing the Model

In [None]:
# Define Model
# XGBoost regressor with a small learning rate and many shallow trees.
# - objective: 'reg:squarederror' is the current name of the squared-error
#   objective; the old alias 'reg:linear' is deprecated.
# - n_jobs replaces the deprecated `nthread` argument (-1 = use all cores).
# - random_state is the only seed passed: giving both `seed` and `random_state`
#   leaves it unclear which one the library actually uses.
model = XGBRegressor(learning_rate=0.01, n_estimators=3460,
                     max_depth=3, min_child_weight=0,
                     gamma=0, subsample=0.7,
                     colsample_bytree=0.7,
                     objective='reg:squarederror', n_jobs=-1,
                     scale_pos_weight=1,
                     reg_alpha=0.00006, random_state=42)

In [None]:
# 10-fold splitter with shuffling; the fixed random_state keeps the fold
# assignment the same from run to run.
kfolds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [None]:
# Evaluate the model with cross-validation over the `kfolds` splitter.
# scikit-learn reports *negative* MSE per fold, so negate it and take the square
# root to get an RMSE per fold (Y_train is on the log1p scale), then print the
# mean and standard deviation across folds.
neg_mse = cross_val_score(model, X_train, Y_train,
                          scoring="neg_mean_squared_error", cv=kfolds)
score = np.sqrt(-neg_mse)
print("xgboost: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
# Fit the model on the full training set; the value returned by fit() is kept in `xgb`.
xgb = model.fit(X_train, Y_train)

In [None]:
# Load the sample submission and overwrite its second column with the test
# predictions. The model predicts on the log1p scale, so expm1 converts the
# predictions back to prices.
submission = pd.read_csv("../input/house-prices-advanced-regression-techniques/sample_submission.csv")
log_predictions = model.predict(X_test)
submission.iloc[:, 1] = np.expm1(log_predictions)

In [None]:
# Adjust the tails of the predicted prices.
# Both cut-offs are computed once, before any value is changed.
q1 = submission['SalePrice'].quantile(0.0042)
q2 = submission['SalePrice'].quantile(0.99)


def _scale_low(price):
    """Multiply prices at or below the lower cut-off q1 by 0.77; leave the rest unchanged."""
    return price if price > q1 else price * 0.77


def _scale_high(price):
    """Multiply prices at or above the upper cut-off q2 by 1.1; leave the rest unchanged."""
    return price if price < q2 else price * 1.1


submission['SalePrice'] = submission['SalePrice'].apply(_scale_low)
submission['SalePrice'] = submission['SalePrice'].apply(_scale_high)


In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the submission to submission.csv in the working directory, without the row index.
submission.to_csv("submission.csv", index=False)