In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path found under the Kaggle input directory, then list them one per line
input_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read, concatenate and explore the dataset

In [None]:
# Load the competition's training and test tables
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

# Report the (rows, columns) shape of each table
print(f"train data : {train.shape}")
print(f"test data : {test.shape}")

In [None]:
# Keep the Id column of each table aside before removing it
train_id, test_id = train['Id'], test['Id']

# Id is only a row identifier, not a model feature, so remove it from both tables
train = train.drop(columns='Id')
test = test.drop(columns='Id')

In [None]:
# Concatenate train and test so that preprocessing is applied to both at once
ntrain, ntest = train.shape[0], test.shape[0]

# pop() removes the target column from train and returns it as a Series
y_train = train.pop('SalePrice')

all_data = pd.concat([train, test], ignore_index=True)
print(f"all_data size is : {all_data.shape}")

In [None]:
# Preview the first rows of the combined train + test frame
all_data.head()

In [None]:
# Summarize column dtypes and non-null counts to spot columns with missing values
all_data.info()

# Preprocessing
## Drop columns that:
* have a lot of null values and are not expected to affect our model
* are redundant because they are already combined into another column (their sum is kept)

In [None]:
# Columns removed outright because they contain a lot of null values
sparse_columns = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu']
all_data = all_data.drop(columns=sparse_columns)

all_data.shape

In [None]:
# Looking at these rows, TotalBsmtSF is the sum of the three other basement columns
basement_parts = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF']
print(all_data[basement_parts + ['TotalBsmtSF']].head())

# Keep only the total; the three components are redundant
all_data = all_data.drop(columns=basement_parts)

all_data.shape

## Fill missing values of all columns
* categorical: fill with the most frequent value, or with an explicit "no basement" / "no garage" label (`NoB` / `NoG`)
* numeric: fill with zero or the median

In [None]:
# Categorical columns.
# The filled result is assigned back to the frame. The chained form
# `all_data[col].fillna(..., inplace=True)` operates on a temporary Series: pandas 2.x
# deprecates it and, with Copy-on-Write enabled, it leaves all_data unchanged.

# Replace nulls with the most frequent value of the column
categoral_mode = ['MSZoning', 'MasVnrType', 'Electrical', 'SaleType', 'Utilities', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'Functional']
for col in categoral_mode:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Replace NA with the explicit label 'NoB' (no basement)
no_basement = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
all_data[no_basement] = all_data[no_basement].fillna('NoB')

# Replace NA with the explicit label 'NoG' (no garage)
no_garage = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
all_data[no_garage] = all_data[no_garage].fillna('NoG')



In [None]:
# Numeric columns.
# The filled result is assigned back to the frame; chained `fillna(..., inplace=True)` on
# `all_data[col]` is deprecated in pandas 2.x and has no effect under Copy-on-Write.

# Replace nulls with the column median
# (the list keeps its original name `numric_mean`, but the statistic used is the median)
numric_mean = ['LotFrontage', 'GarageYrBlt']
for col in numric_mean:
    all_data[col] = all_data[col].fillna(all_data[col].median())

# Replace nulls with zero
numric_zero = ['MasVnrArea', 'GarageCars', 'GarageArea', 'BsmtFullBath', 'BsmtHalfBath', 'TotalBsmtSF']
all_data[numric_zero] = all_data[numric_zero].fillna(0)

In [None]:
# Re-check dtypes and non-null counts now that missing values have been filled
all_data.info()

# Encoding categorical columns:
* one-hot encoding
* ordinal encoding

In [None]:
# One-hot encode the nominal categorical columns
one_hot = [
    'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating',
    'Functional', 'GarageType', 'SaleType', 'SaleCondition',
]

# Each listed column is replaced by one indicator column per category
all_data = pd.get_dummies(data=all_data, columns=one_hot)

In [None]:
# Ordinal encoding for the ordered categorical columns.
# Each column maps its labels to increasing integers; 'NoB' / 'NoG' (no basement / no garage)
# are encoded as 0.
ordinal_mapping  = {'ExterQual'     : {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                    'ExterCond'     : {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                    'BsmtQual'      : {'NoB': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                    'BsmtCond'      : {'NoB': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                    'BsmtExposure'  : {'NoB': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
                    'BsmtFinType1'  : {'NoB': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
                    'BsmtFinType2'  : {'NoB': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
                    'HeatingQC'     : {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                    'CentralAir'    : {'N': 0, 'Y': 1},
                    'Electrical'    : {'Mix': 1, 'FuseP': 2, 'FuseF': 3, 'FuseA': 4, 'SBrkr': 5},
                    'KitchenQual'   : {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                    'GarageFinish'  : {'NoG': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
                    'GarageQual'    : {'NoG': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                    'GarageCond'    : {'NoG': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
                    'PavedDrive'    : {'N': 1, 'P': 2, 'Y': 3}
                    }

# Derive the column list from the mapping so the two can never drift apart
ordinal = list(ordinal_mapping)

for col, mapping in ordinal_mapping.items():
    # Values without a mapping entry (including already-encoded integers when the cell is
    # re-run) pass through unchanged. pd.to_numeric then guarantees a numeric dtype and
    # raises if an unexpected label is still present, instead of leaving an object column.
    all_data[col] = pd.to_numeric(all_data[col].map(lambda value, m=mapping: m.get(value, value)))

In [None]:
# Check column dtypes and non-null counts after the encoding steps
all_data.info()

In [None]:
# Preview the first rows of the encoded frame
all_data.head()

# Normalize numeric columns (currently disabled)

In [None]:
#from sklearn.preprocessing import MinMaxScaler

#mms = MinMaxScaler()
#all_data = pd.DataFrame(mms.fit_transform(all_data), columns=all_data.columns)
#all_data.head()

# Define model

In [None]:
# Recover the two tables by position: the first ntrain rows are the training rows,
# everything after them is the test set
x_train = all_data.iloc[:ntrain]
x_test = all_data.iloc[ntrain:]

In [None]:
from sklearn.model_selection import train_test_split

# This cell rebinds y_train to the 80% training part. Keep the complete target under its
# own name the first time through, so re-running the cell splits the full data again
# instead of failing on mismatched lengths between x_train and y_train.
if len(y_train) == len(x_train):
    y_full = y_train

# Split the labelled rows into train (80%) and validation (20%); fixed seed for a repeatable split
X_train, X_valid, y_train, y_valid = train_test_split(
    x_train, y_full, train_size=0.8, test_size=0.2, random_state=0
)


## 1 - Gradient Boosting regression model

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# random_state is fixed so that repeated runs of the notebook build the same model
final_GBR_model = GradientBoostingRegressor(
    learning_rate=0.01, n_estimators=1000, max_depth=4, random_state=0
)
final_GBR_model.fit(X_train, y_train)

# Score of the fitted model on the data it was trained on (not a held-out estimate)
final_GBR_model.score(X_train, y_train)

In [None]:
final_GBR_preds_valid = final_GBR_model.predict(X_valid)

# RMSE between the logs of the true and predicted prices on the validation split.
# The square root is taken explicitly rather than passing `squared=False`, which
# scikit-learn deprecated in 1.4 and slated for removal.
print("Rmse = ", np.sqrt(mean_squared_error(np.log(y_valid), np.log(final_GBR_preds_valid))))

In [None]:
# Effect of the learning rate on gradient boosting: fit one model per rate and record
# its validation RMSE (computed on log prices)
lr = [0.0001, 0.001, 0.003, 0.01, 0.02 , 0.1, 0.2]
rmse_GBR = []

for rate in lr:
    # Same seed for every model so the learning rate is the only thing that varies
    GBR_model = GradientBoostingRegressor(
        learning_rate=rate, n_estimators=1000, max_depth=4, random_state=0
    )
    GBR_model.fit(X_train, y_train)
    GBR_preds_valid = GBR_model.predict(X_valid)
    # Explicit square root instead of the deprecated `squared=False` argument
    rms = np.sqrt(mean_squared_error(np.log(y_valid), np.log(GBR_preds_valid)))
    rmse_GBR.append(rms)


In [None]:
# The learning rates span several orders of magnitude (1e-4 to 0.2), so a log x-axis
# keeps the small values distinguishable; markers show the individual measurements
fig, ax = plt.subplots()
ax.plot(lr, rmse_GBR, marker='o')
ax.set_xscale('log')
ax.set_xlabel('learning rate')
ax.set_ylabel('RMSE')
ax.set_title('Gradient boosting: validation RMSE vs learning rate')
plt.show()

## 2 - LinearSVR model


In [None]:
from sklearn.svm import LinearSVR

# Effect of the iteration budget on LinearSVR: fit one model per max_iter value and
# record its validation RMSE (computed on log prices)
iterations = [10, 50, 100, 200, 500, 1000]
rmse_svr = []

for n_iter in iterations:  # `n_iter` rather than `iter`, which would shadow the builtin
    # Fixed seed so repeated runs give the same curve
    svr_model = LinearSVR(epsilon=1, max_iter=n_iter, random_state=0)
    svr_model.fit(X_train, y_train)
    svr_preds_valid = svr_model.predict(X_valid)
    # NOTE(review): np.log yields NaN for non-positive predictions, which would make the
    # metric fail -- confirm the predictions stay positive for every iteration budget.
    # Explicit square root instead of the deprecated `squared=False` argument.
    rms = np.sqrt(mean_squared_error(np.log(y_valid), np.log(svr_preds_valid)))
    rmse_svr.append(rms)



#tree_preds_valid = tree_model.predict(X_valid)
#print("Rmse = ", mean_squared_error(np.log(y_valid), np.log(tree_preds_valid), squared=False))

In [None]:
# Validation RMSE of LinearSVR as a function of the iteration budget
fig, ax = plt.subplots()
ax.plot(iterations, rmse_svr)
ax.set_xlabel('num of iteration')
ax.set_ylabel('RMSE')
plt.show()

# Submission
* using Gradient Boosting regression model

In [None]:
# Predict sale prices for the test rows with the gradient boosting model
preds_test = final_GBR_model.predict(x_test)

# Build the two-column (Id, SalePrice) table and save it as the submission file
output = pd.DataFrame({'Id': test_id}).assign(SalePrice=preds_test)
output.to_csv('submission.csv', index=False)