In [None]:
#including all required dependencies
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import preprocessing
from xgboost import XGBRegressor
import sklearn.metrics as metrics
import math
from scipy.stats import norm, skew
import seaborn as sns


In [None]:
# Load the training and test CSVs from the Kaggle input directory
# (training file first, then the test file).
df, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Import seaborn under the alias `sn`, which the plotting cells below use.
# NOTE(review): seaborn is also imported as `sns` in the first cell; both
# aliases refer to the same module, so one of them could be retired.
import seaborn as sn

In [None]:
# Plot the distribution of the raw (untransformed) target, SalePrice.
sn.displot(df['SalePrice'])

In [None]:
## SalePrice is right-skewed; neutralising the skew with a logarithmic (log1p) mapping

In [None]:
# Replace the target with log(1 + SalePrice) to reduce its right skew.
# NOTE: this overwrites the column in place, so the cell is not idempotent:
# re-running it without reloading `df` applies the log a second time.
# Every later use of df['SalePrice'] (including y_train) is on this log scale.
df['SalePrice'] = np.log1p(df['SalePrice'])

# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density normalisation is its documented axes-level replacement.
sn.histplot(df['SalePrice'], kde=True, stat='density');


In [None]:
# Right Skewness eliminated 

In [None]:
# Pairwise correlations between the numeric columns of the training frame.
# The frame also holds string (categorical) columns; recent pandas versions
# raise on those instead of silently skipping them, so select the numeric
# columns explicitly (this form works on old and new pandas alike).
df.select_dtypes(include='number').corr()

In [None]:
# Heat map to visualise the correlation data better. Drawn only for the
# features whose absolute correlation with the target (SalePrice) exceeds 0.6.
# Correlations are computed on numeric columns only: the frame also contains
# string columns, which recent pandas versions refuse to correlate.
corr = df.select_dtypes(include='number').corr()
Corr_high = corr.index[abs(corr["SalePrice"])>0.6]
plt.figure(figsize=(10,10))
g = sn.heatmap(df[Corr_high].corr(),annot=True)

In [None]:
## GarageCars and GarageArea are highly correlated with each other, as expected


In [None]:
# Keep the (log-scale) target and the test Ids aside, then stack train and
# test rows into one feature frame so both get identical preprocessing.
# NOTE(review): GarageArea is NOT removed here (nor anywhere in the visible
# cells), even though GarageCars correlates more strongly with SalePrice;
# only 'Id' and 'SalePrice' are dropped.
y_train = df['SalePrice']
test_id = test['Id']
all_data = (
    pd.concat([df, test], axis=0, sort=False)
    .drop(columns=['Id', 'SalePrice'])
)

In [None]:
# Summarise missing values per column: absolute count ('Total') and the
# fraction of rows affected ('Percent'), most affected columns first.
null_mask = all_data.isnull()
Total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([Total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(25)

In [None]:
# Discard every feature with more than 5 missing values, then print the
# largest per-column null count that remains.
sparse_cols = missing_data.index[missing_data['Total'] > 5]
all_data = all_data.drop(columns=sparse_cols)
print(all_data.isnull().sum().max())

In [None]:
# Null counts per remaining column, largest first.
total = all_data.isna().sum().sort_values(ascending=False)
total.head(19)

In [None]:
# Numeric basement/garage features: replace missing values with 0.
numeric_missed = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'GarageArea', 'GarageCars',
]

all_data = all_data.assign(
    **{col: all_data[col].fillna(0) for col in numeric_missed}
)


In [None]:
# Categorical features: replace missing values with the column's most
# frequent value (the first mode when several values tie).
categorical_missed = [
    'Exterior1st', 'Exterior2nd',
    'SaleType', 'MSZoning',
    'Electrical', 'KitchenQual',
]

all_data = all_data.assign(
    **{
        col: all_data[col].fillna(all_data[col].mode()[0])
        for col in categorical_missed
    }
)

In [None]:

# Fill missing 'Functional' entries with the fixed category 'Typ'.
# NOTE(review): presumably 'Typ' (typical) is the dataset's documented default
# for this field — confirm against the data description.
all_data['Functional'] = all_data['Functional'].fillna('Typ')


In [None]:
# Remove the 'Utilities' column from the feature frame.
# NOTE(review): presumably dropped because it carries almost no variation —
# confirm with a value_counts() before relying on that.
all_data = all_data.drop(columns=['Utilities'])


In [None]:
# Measure the skewness of every non-object column and keep the ones whose
# absolute skew exceeds 0.5 (so both right- and left-skewed features qualify).
numeric_feats = all_data.select_dtypes(exclude='object').columns
skewed_feats = all_data[numeric_feats].apply(skew).sort_values(ascending=False)
high_skew = skewed_feats[skewed_feats.abs() > 0.5]
high_skew

In [None]:
# Apply log(1 + x) to every feature flagged in `high_skew`.
# `high_skew` was selected on |skew| > 0.5, so left-skewed features are
# transformed as well, not only right-skewed ones.
all_data = all_data.assign(
    **{col: np.log1p(all_data[col]) for col in high_skew.index}
)


In [None]:
# Create TotalSF = basement + first-floor + second-floor square footage.
# This cell runs after the skew-correction cell, so any of the three columns
# listed in `high_skew` already holds log1p(value). Summing those directly
# would add logarithms rather than areas, so undo the transform (expm1) on the
# affected columns before summing. The stored columns themselves are untouched.
sf_parts = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']

total_sf = 0
for part in sf_parts:
    area = all_data[part]
    if part in high_skew.index:
        area = np.expm1(area)  # back to square feet
    total_sf = total_sf + area

all_data['TotalSF'] = total_sf

In [None]:
# Generate dummy (one-hot) columns for the categorical features.
# `all_data` holds the train and test rows stacked together, so both splits
# end up with the same set of dummy columns.
all_data = pd.get_dummies(all_data)
all_data.head()

In [None]:
# Split the combined frame back into train and test feature matrices.
# The training rows come first, one per entry in y_train.
n_train = len(y_train)
x_train = all_data.iloc[:n_train]
x_test = all_data.iloc[n_train:]

In [None]:
# Performing KFold Cross Validation
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

# Not used by the helpers below (they pass the "neg_mean_squared_error"
# scoring string); kept because other cells may reference it.
scorer = make_scorer(mean_squared_error,greater_is_better = False)


def _rmse_cv(model, features, target):
    """Return the per-fold RMSE of `model` from 5-fold shuffled cross-validation.

    The KFold splitter itself is passed as `cv`. Calling `.get_n_splits()` on
    it would return the plain integer 5, and cross_val_score would then build
    its own unshuffled folds, silently dropping shuffle and random_state.
    """
    kf = KFold(5, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, features, target,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)


def rmse_CV_train(model):
    """Cross-validated RMSE of `model` on the notebook's x_train / y_train.

    Returns an array with one RMSE value per fold.
    """
    return _rmse_cv(model, x_train, y_train)


def rmse_CV_test(model, y_test=None):
    """Cross-validated RMSE of `model` on the notebook's x_test.

    Parameters
    ----------
    model : estimator passed to cross_val_score.
    y_test : target values aligned with x_test. If omitted, a notebook-level
        `y_test` variable is used when one exists.

    Raises
    ------
    ValueError
        If no test targets are available. None of the visible cells defines
        `y_test`, so this is the expected outcome unless labels are supplied.
    """
    if y_test is None:
        # Fall back to a global of the same name, as the original body did.
        y_test = globals().get('y_test')
    if y_test is None:
        raise ValueError(
            "rmse_CV_test needs target values for x_test; pass them as y_test."
        )
    return _rmse_cv(model, x_test, y_test)

In [None]:
# Configure the XGBoost regressor and fit it on the full training set.
import xgboost as XGB

# Hyper-parameters collected in one place so they are easy to read and tune.
xgb_params = dict(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    random_state=7,
    nthread=-1,
)
the_model = XGB.XGBRegressor(**xgb_params)
the_model.fit(x_train, y_train)

In [None]:
# Predict on the test features, map the log-scale predictions back to prices
# with expm1 (the model was trained on log1p(SalePrice)), round down, and
# write the Id / SalePrice pairs to the submission CSV.
predicted_log_price = the_model.predict(x_test)
y_predict = np.floor(np.expm1(predicted_log_price))
sub = pd.DataFrame({'Id': test_id, 'SalePrice': y_predict})
sub.to_csv('mysubmission.csv', index=False)