In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and echo the full path of every file found.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### import packages


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import norm, skew
import pylab
%matplotlib inline

### read data

In [1]:
# Load the competition's train and test tables, and keep the test Id column aside
# before any later cell removes it.
TRAIN_CSV = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
TEST_CSV = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'

train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))
test_df = test['Id']

In [1]:
# Report (rows, columns) for the train and test frames, one per line.
for frame in (train, test):
    print(frame.shape)

In [1]:
# Preview the first rows of the training frame.
train.head()

### Exploratory Data Analysis

In [1]:
# Print the per-column summary of train (dtype and non-null count of each column).
train.info()

In [1]:
# Descriptive statistics of train's columns (count, mean, spread, quartiles).
train.describe()

In [1]:
# Count missing values per column of train, keep only the columns that have at
# least one, and draw the counts as a bar chart with tick labels rotated 75 degrees.
null_counts = train.isnull().sum()
missing = null_counts[null_counts > 0]
missing.plot.bar(rot=75)

In [1]:
# Partition train's columns by dtype: 'object' columns are the qualitative
# (categorical) features, every other column is a quantitative (numeric) feature.
# Column order within each list follows the order of train.columns.
quantitative, qualitative = [], []
for column_name, column_dtype in train.dtypes.items():
    bucket = qualitative if column_dtype == 'object' else quantitative
    bucket.append(column_name)

In [1]:
# Extract the target column and plot its distribution as a 50-bin histogram.
y = train.loc[:, 'SalePrice']
y.hist(bins=50)
plt.show()

### visualize quantitative features

In [1]:
# One 50-bin histogram per quantitative column, laid out on a 20x20 inch figure.
quantitative_data = train.loc[:, quantitative]
quantitative_data.hist(bins=50, figsize=(20, 20))
plt.show()

### plot the correlation of features with the target

In [1]:
# Pairwise correlation of the quantitative columns, then the SalePrice column of
# that matrix listed from the most positive to the most negative correlation.
quantitative_data = train[quantitative]
corr_matrix = quantitative_data.corr()
saleprice_corr = corr_matrix.loc[:, "SalePrice"]
saleprice_corr.sort_values(ascending=False)

In [1]:
# Heatmap of the quantitative correlation matrix on a diverging palette centred
# at 0 and clipped to [-1, 1], drawn with square cells on a 20x15 inch figure.
fig, ax = plt.subplots(figsize=(20, 15))
ax = sns.heatmap(
    quantitative_data.corr(),
    vmin=-1,
    vmax=1,
    center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    ax=ax,
)

### make a boxplot to show how the output changes with qualitative features

In [1]:
def boxplot(x, y, **kwargs):
    """Draw a seaborn box plot of ``y`` grouped by ``x`` with vertical x tick labels.

    Any extra keyword arguments are accepted but not used.
    """
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)


# Work on a copy so the plotting-only changes never reach `train`: every
# qualitative column becomes categorical, and missing entries get an explicit
# 'MISSING' category so they show up as their own box.
train_to_plot = train.copy()
for col in qualitative:
    as_category = train_to_plot[col].astype('category')
    if as_category.isnull().any():
        as_category = as_category.cat.add_categories(['MISSING']).fillna('MISSING')
    train_to_plot[col] = as_category

# Long format (SalePrice, variable, value), then one SalePrice box plot per
# qualitative feature, two facets per row with independent axes.
f = pd.melt(train_to_plot, id_vars=['SalePrice'], value_vars=qualitative)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(boxplot, "value", "SalePrice")

### handling missing values

In [1]:
# Number of missing values in each quantitative column of train.
train[quantitative].isnull().sum()

In [1]:
# Impute the quantitative columns, applying the same rules to train and test.
# Each frame is filled from its own data:
#   * LotFrontage            -> median LotFrontage of the row's Neighborhood
#   * MasVnrArea, GarageYrBlt -> 0
for frame in (train, test):
    frontage_by_neighborhood = frame.groupby("Neighborhood")["LotFrontage"]
    frame["LotFrontage"] = frontage_by_neighborhood.transform(
        lambda group: group.fillna(group.median())
    )
    for zero_filled_column in ('MasVnrArea', 'GarageYrBlt'):
        frame[zero_filled_column] = frame[zero_filled_column].fillna(0)

In [1]:
# Missing-value count per quantitative column of train, re-run after imputation.
train[quantitative].isnull().sum()

In [1]:
# Display the list of qualitative (object-dtype) column names.
qualitative

### handle missing values in categorical data

In [1]:
# Number of missing values in each qualitative column of train.
train[qualitative].isnull().sum()

In [1]:
# Replace missing categorical values with the explicit placeholder 'unknown'
# in both frames.
for frame in (train, test):
    frame[qualitative] = frame[qualitative].fillna('unknown')

In [1]:
# Confirm that no missing values remain in the qualitative columns of either frame.
# A notebook cell renders only its last expression, so the two checks are combined
# into one table; written as two bare expressions, the train result was computed
# and then silently discarded.
pd.DataFrame({
    'train': train[qualitative].isnull().sum(),
    'test': test[qualitative].isnull().sum(),
})

### feature selection

In [1]:
# Distinct values present in train's Alley column.
train['Alley'].unique()

In [1]:
# Remove the row identifier and a hand-picked set of features from both frames.
dropped_features = [
    "Id", "Fence", "CentralAir", "FireplaceQu", "PoolArea", "LowQualFinSF",
    "3SsnPorch", "MiscVal", 'RoofMatl', 'Street', 'Condition2', 'Utilities',
    'Heating',
]
train = train.drop(columns=dropped_features)
test = test.drop(columns=dropped_features)

In [1]:
# Drop a second set of numeric features, in place, from both frames.
more_dropped_features = ['GarageYrBlt', 'TotRmsAbvGrd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF']
for frame in (train, test):
    frame.drop(columns=more_dropped_features, inplace=True)

In [1]:
# Shapes of both frames after the drops, followed by the remaining missing-value
# count for every column of train.
for frame in (train, test):
    print(frame.shape)
train.isna().sum()

### feature encoding

In [1]:
# Ordinal encodings keyed by column, so a code shared by two features keeps its
# own rank in each.  A single flat {code: rank} dict cannot express that: 'Mod'
# and 'Sev' are levels of both LandSlope and Functional, and a repeated key in a
# dict literal silently keeps only its last value, which left LandSlope encoded
# as Gtl=3, Mod=5, Sev=2.  All other ranks are unchanged.
#
# DataFrame.replace() accepts this nested {column: {code: rank}} form directly
# and ignores entries for columns the frame does not contain.  Being per-column,
# it also leaves the 'unknown' placeholder of non-ordinal columns untouched.
# NOTE(review): the column assignments follow the competition's data description;
# confirm against it if columns are added.
quality_scale = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0, 'unknown': 0}
basement_finish_scale = {'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 2, 'LwQ': 1, 'Unf': -1,
                         'NA': 0, 'unknown': 0}

quality_columns = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
                   'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']
basement_finish_columns = ['BsmtFinType1', 'BsmtFinType2']

ordinal_ranking = {
    **dict.fromkeys(quality_columns, quality_scale),
    **dict.fromkeys(basement_finish_columns, basement_finish_scale),
    'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'NA': 0, 'unknown': 0},
    'LotShape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0, 'unknown': 0},
    'Utilities': {'AllPub': 4, 'NoSewr': 3, 'NoSeWa': 2, 'ELO': 1, 'unknown': 0},
    'LandSlope': {'Gtl': 3, 'Mod': 2, 'Sev': 1, 'unknown': 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0, 'unknown': 0},
    'Functional': {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5, 'Maj1': 4, 'Maj2': 3,
                   'Sev': 2, 'Sal': 1, 'unknown': 0},
    'GarageFinish': {'Fin': 2, 'RFn': 1, 'Unf': -1, 'NA': 0, 'unknown': 0},
}

In [1]:
# Substitute the codes listed in ordinal_ranking with their numeric ranks,
# applying the same mapping to both frames.
train, test = (frame.replace(ordinal_ranking) for frame in (train, test))

In [1]:
# Unordered categorical columns that get one-hot encoded.
nominal_features = [
    'Alley',
    'LandContour',
    'LotConfig',
    'Neighborhood',
    'Condition1',
    'BldgType',
    'RoofStyle',
    'MasVnrType',
    'Foundation',
    'GarageType',
    'PavedDrive',
    'SaleCondition',
]

In [1]:
# Preview the first rows of train's nominal columns.
train[nominal_features].head()

In [1]:
# One-hot encode the nominal features of both frames (first level dropped).
train = pd.get_dummies(data=train, columns=nominal_features, drop_first=True)
test = pd.get_dummies(data=test, columns=nominal_features, drop_first=True)

# The two frames are encoded independently, so a level that occurs in only one of
# them yields a dummy column the other lacks, and a model fitted on train could
# then not score test.  Align test to train's feature columns (everything except
# the target): dummies missing from test are added as all-zero columns, and
# dummies unseen in train are discarded.
test = test.reindex(columns=train.columns.drop('SalePrice'), fill_value=0)

In [1]:
# Report (rows, columns) for the train and test frames, one per line.
for frame in (train, test):
    print(frame.shape)


In [1]:
# Remaining categorical columns that are removed rather than encoded.
col_to_drop = [
    'MiscFeature',
    'Electrical',
    'SaleType',
    'Exterior2nd',
    'Exterior1st',
    'HouseStyle',
    'MSZoning',
]

In [1]:
# Remove the columns listed in col_to_drop from both frames.
train, test = (frame.drop(columns=col_to_drop) for frame in (train, test))

In [1]:
# Report (rows, columns) for the train and test frames, one per line.
for frame in (train, test):
    print(frame.shape)

In [1]:
# from sklearn.preprocessing import OrdinalEncoder
# col_to_encode = ['MiscFeature' , 'Electrical' , 'SaleType' , 'HouseStyle' , 'MSZoning']
# cols = ['Exterior2nd', 'Exterior1st', ]
# train.drop(columns = cols)
# test.drop(columns=cols)
# ordinal_encoder = OrdinalEncoder()
# # Assigning numerical values and storing in another column
# train[col_to_encode] = ordinal_encoder.fit_transform(train[col_to_encode])
# test[col_to_encode] = ordinal_encoder.transform(test[col_to_encode])
# for col in col_to_encode:
#     print(train[col].unique())

### model

In [1]:
# Separate the target (y) from the feature matrix (x).
target_column = 'SalePrice'
x, y = train.drop(columns=[target_column]), train[target_column]

In [1]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

split_options = dict(train_size=0.8, test_size=0.2, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(x, y, **split_options)

### train the model

In [1]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Four regressors fitted on the same training split.
model_1 = RandomForestRegressor(n_estimators=500, random_state=0)
xg_model = XGBRegressor(n_estimators=700, learning_rate=0.01)
model3 = LinearRegression()

# Ridge(normalize=True) was deprecated in scikit-learn 1.0 and removed in 1.2,
# where passing it raises TypeError.  The replacement given in the deprecation
# notice reproduces the old behaviour: scale the features (without centring) in a
# pipeline step and multiply alpha by the number of training samples.
ridge_alpha = 1.0
model4 = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=ridge_alpha * len(x_train), tol=0.001, solver='svd', random_state=0),
)

xg_model.fit(x_train, y_train)
model_1.fit(x_train, y_train)
model3.fit(x_train, y_train)
model4.fit(x_train, y_train)

### make predictions

In [1]:
# Validation-set predictions, in order: XGBoost, random forest, linear regression,
# ridge.
predicted, predicted1, preds, preds1 = (
    fitted_model.predict(x_val)
    for fitted_model in (xg_model, model_1, model3, model4)
)

### calculate error and accuracy

In [1]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np

# Validation-set predictions of the two tree ensembles, keyed by the model name
# printed in the report.
ensemble_predictions = {
    'XGBRegressor': predicted,
    'RandomForestRegressor': predicted1,
}

# Labels name the quantity actually computed: the first figure is the square root
# of the MSE (RMSE, previously printed as 'mean_squared_error'), and the last is
# the R2 coefficient of determination (previously printed as 'Accuracy', which is
# not a regression metric).
for model_name, y_pred in ensemble_predictions.items():
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print('root_mean_squared_error using {} : {}'.format(model_name, rmse))
print('*' * 100)
for model_name, y_pred in ensemble_predictions.items():
    print('mean_absolute_error using {} : {}'.format(model_name, mean_absolute_error(y_val, y_pred)))
print('*' * 100)
for model_name, y_pred in ensemble_predictions.items():
    print('r2_score using {} : {}'.format(model_name, r2_score(y_val, y_pred)))

In [1]:
# Validation metrics for the ridge model.  Labels name the quantity actually
# computed: the first figure is the square root of the MSE (RMSE) and the last is
# the R2 coefficient of determination, not an accuracy.
ridge_rmse = np.sqrt(mean_squared_error(y_val, preds1))
print('root_mean_squared_error using Ridge : {}'.format(ridge_rmse))
print('*' * 100)
print('mean_absolute_error using Ridge : {}'.format(mean_absolute_error(y_val, preds1)))
print('*' * 100)
print('r2_score using Ridge : {}'.format(r2_score(y_val, preds1)))



In [1]:
# test = test.fillna(0)
# np.where(np.isnan(test))
# # x_val.info()
# y_test_pred = model_1.predict(test)

In [1]:
# test['SalePrice']= y_test_pred
# test['Id']= test_df
# test[['Id','SalePrice']].to_csv('/kaggle/working/submission.csv', index=False)