In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost
import warnings 

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor

from scipy.stats import norm, skew
from scipy.special import boxcox1p


# Show every column and every row when a DataFrame is displayed.
# `pd.set_option` is the public entry point; `pd.pandas` only resolves by accident of pandas' own imports.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Silence all warnings for the rest of the notebook
warnings.filterwarnings('ignore')

In [None]:
# Load the competition's train and test CSV files

train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
# Report (rows, columns) of both raw frames
for caption, frame in (("Shape of train data:", train), ("Shape of test data:", test)):
    print(caption, frame.shape)

In [None]:
# Keep untouched copies of both raw frames (test_copy later supplies the submission Ids)

train_copy, test_copy = train.copy(), test.copy()

In [None]:
# Printing numeric columns and their count

# select_dtypes is the public API; numeric plus bool mirrors what the private
# _get_numeric_data() helper used to select.
num_cols = train.select_dtypes(include=['number', 'bool']).columns
display(num_cols)
print()
print("Count: ", len(num_cols))

In [None]:
# Drawing one box plot per numeric column to check for outliers

n_plot_cols = 3
# Ceiling division: enough rows for every numeric column, instead of a fixed 13x3 grid
n_plot_rows = -(-len(num_cols) // n_plot_cols)

plt.figure(figsize = (25,45))
for position, column in enumerate(num_cols, start=1):
  plt.subplot(n_plot_rows, n_plot_cols, position)
  # x= makes the orientation explicit; a bare positional argument is interpreted
  # differently across seaborn versions
  sns.boxplot(x=train[column])
  plt.xlabel(column)

In [None]:
# Row labels to be removed from the train set (presumably the outliers spotted in the
# box plots -- confirm). Each label is listed once; the original list repeated 1458.

index = [712, 1219, 1416, 1200, 1345, 1458, 773, 1248, 1423, 628, 973, 1459]
train = train.drop(labels = index, axis = 0)

In [None]:
# Report (rows, columns) after removing rows from the train set
for caption, frame in (("Shape of train data:", train), ("Shape of test data:", test)):
    print(caption, frame.shape)

In [None]:
# Missing-value count per train column, showing only columns that have at least one

Null_train = train.isna().sum()
Null_train.loc[Null_train.gt(0)]

In [None]:
# Columns removed from both train and test: the row identifier plus four columns
# the author judged to have the most missing values.

drop_columns = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id']
train, test = (frame.drop(columns=drop_columns) for frame in (train, test))

for caption, frame in (("Shape of train data:", train), ("Shape of test data:", test)):
    print(caption, frame.shape)

In [None]:
# Subset of the train set holding only the columns that still contain nulls

Null_train_data = train.loc[:, [
    'LotFrontage', 'FireplaceQu', 'MasVnrType', 'MasVnrArea',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical',
    'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
]]

In [None]:
#Creating a function with name 'analysis' for extracting data type, unique and null count

def analysis(data):
    """Summarise every column of `data`.

    Returns a DataFrame indexed by the column names of `data` with three columns:
    "Data Type" (the column dtype), "Unique Count" (distinct non-null values)
    and "Null Count" (missing values).
    """
    dtype_per_column = data.dtypes
    distinct_per_column = data.apply(pd.Series.nunique, axis=0)
    missing_per_column = data.isnull().sum()
    summary = {
        "Data Type": dtype_per_column,
        "Unique Count": distinct_per_column,
        "Null Count": missing_per_column,
    }
    return pd.DataFrame(summary)

In [None]:
# Data type, unique count and null count for the train columns that contain nulls
analysis(Null_train_data)

In [None]:
# Summary statistics of the three numeric train columns that contain nulls
Null_train_data[['LotFrontage', 'MasVnrArea', 'GarageYrBlt']].describe()

In [None]:
# Replacing numeric null values in the training set:
# LotFrontage with its mean, MasVnrArea and GarageYrBlt with their most frequent value

train_fill_values = {
    'LotFrontage': train['LotFrontage'].mean(),
    'MasVnrArea': train['MasVnrArea'].mode()[0],
    'GarageYrBlt': train['GarageYrBlt'].mode()[0],
}
for column, fill_value in train_fill_values.items():
    train[column] = train[column].fillna(fill_value)

In [None]:
# Missing-value count per test column, showing only columns that have at least one

Null_test = test.isna().sum()
Null_test.loc[Null_test.gt(0)]

In [None]:
#Analysing the null data in the test set
# (each column is listed once; the original selection repeated 'GarageQual',
#  which produced a duplicated column in the subset)

Null_test_data = test[['MSZoning', 'LotFrontage', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 
                         'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2',
                         'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 
                         'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCars','GarageArea', 
                         'GarageCond', 'SaleType']]
analysis(Null_test_data)

In [None]:
# Summary statistics of the numeric test columns that contain nulls

numeric_null_columns = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea',
]
Null_test_data[numeric_null_columns].describe()

In [None]:
# Replacing numeric null values in the test set, each column with its own
# mean or most frequent value (computed on the test set itself)

test_mean_filled = ['LotFrontage', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageArea']
test_mode_filled = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath',
                    'BsmtHalfBath', 'GarageYrBlt', 'GarageCars']

for column in test_mean_filled:
    test[column] = test[column].fillna(test[column].mean())

for column in test_mode_filled:
    test[column] = test[column].fillna(test[column].mode()[0])

In [None]:
# Creating a function to find columns that are highly correlated with an earlier column

def correlation(data, limit, absolute=False):
  """Return the names of columns whose correlation with an earlier column exceeds `limit`.

  Parameters
  ----------
  data : DataFrame
      Input frame. Only numeric and boolean columns take part; other columns are
      ignored, because DataFrame.corr() raises on non-numeric data in recent
      pandas versions.
  limit : float
      Threshold the pairwise correlation has to exceed (strictly).
  absolute : bool, default False
      When True, compare the absolute correlation so strongly negative pairs are
      reported too. The default keeps the original signed comparison.

  Returns
  -------
  set
      For every pair above the threshold, the name of the column that comes
      later in column order.
  """
  numeric_data = data.select_dtypes(include=['number', 'bool'])
  corr_matrix = numeric_data.corr()
  if absolute:
    corr_matrix = corr_matrix.abs()

  col = set()
  # Only the lower triangle (j < i) is visited, so each pair is checked once
  for i in range(len(corr_matrix)):
    for j in range(i):
      if corr_matrix.iloc[i, j] > limit:
        col.add(corr_matrix.columns[i])
  return col

In [None]:
# Getting the columns whose correlation with an earlier column is above 0.7

corr_columns = correlation(train, 0.7)
corr_columns

In [None]:
# Dropping 1stFlrSF, GarageArea and TotRmsAbvGrd from both sets
# (presumably picked from the highly correlated columns found above -- confirm)

correlated_features = ['1stFlrSF', 'GarageArea', 'TotRmsAbvGrd']
train = train.drop(columns=correlated_features)
test = test.drop(columns=correlated_features)
train.head()

In [None]:
# Move the target into its own single-column frame and remove it from the features

House_Price = train['SalePrice'].to_frame()
train = train.drop(columns='SalePrice')

In [None]:
# Histogram with KDE of the raw target

sale_price = House_Price['SalePrice']
sns.displot(sale_price, kde = True, color = 'Green')

In [None]:
# Histogram with KDE of the natural log of the target
log_sale_price = np.log(House_Price['SalePrice'])
sns.displot(log_sale_price, kde = True, color = 'Black')

In [None]:
# Replace the target by its natural log.
# Running this cell a second time would take the log again.

House_Price = np.log(House_Price['SalePrice']).to_frame()

In [None]:
# Report (rows, columns) of the feature frames right before they are stacked
for caption, frame in (("Shape of train data:", train), ("Shape of test data:", test)):
    print(caption, frame.shape)

In [None]:
# Stack train rows on top of test rows; the original row labels are kept here
data = pd.concat([train, test], axis=0, ignore_index=False)
data.shape

In [None]:
# Derived features, appended to `data` in this order

#data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']
#data["LivLotRatio"] = data['GrLivArea']/data['LotArea']

data = data.assign(
    # Sum of construction year and remodel year
    YrBltRemod=data['YearBuilt'] + data['YearRemodAdd'],
    # Half baths count as 0.5, above ground and in the basement
    TotalBathrooms=(data['FullBath'] + (0.5 * data['HalfBath']) +
                    data['BsmtFullBath'] + (0.5 * data['BsmtHalfBath'])),
    # All porch areas plus the wood deck
    TotalPorchSf=(data['OpenPorchSF'] + data['3SsnPorch'] +
                  data['EnclosedPorch'] + data['ScreenPorch'] +
                  data['WoodDeckSF']),
    # Same as above but without the three-season porch
    TotalOutsideSF=sum((data['WoodDeckSF'], data['OpenPorchSF'],
                        data['EnclosedPorch'], data['ScreenPorch'])),
    # Years between construction and sale
    HouseAge=data['YrSold'] - data['YearBuilt'],
    # Mean of the overall condition and overall quality ratings
    OverallCondQual=(data['OverallCond'] + data['OverallQual']) / 2,
)

In [None]:
# Names of the numeric columns of the combined frame.
# select_dtypes is the public API; numeric plus bool mirrors what the private
# _get_numeric_data() helper used to select.
data_num_cols = data.select_dtypes(include=['number', 'bool']).columns
data_num_cols

In [None]:
# Categorical columns = every column of `data` that is not in data_num_cols
data_cat_cols = data.columns.difference(data_num_cols)
data_cat_cols

In [None]:
# Split the combined frame into its numeric part and its categorical part

data_num_data = data[data_num_cols]
data_cat_data = data[data_cat_cols]

for caption, frame in (("Shape of num data:", data_num_data), ("Shape of cat data:", data_cat_data)):
    print(caption, frame.shape)

In [None]:
# Standardise the numeric columns; the scaler is fitted on the combined train+test rows.
# The result gets a fresh 0..n-1 index because it is rebuilt from a plain array.

s_scaler = StandardScaler()
data_num_data_s = pd.DataFrame(s_scaler.fit_transform(data_num_data), columns = data_num_cols)

In [None]:
# Encoding categorical variables as integer codes

# Missing categories become the literal string 'NA' so they get their own code
data_cat_data = data_cat_data.fillna('NA')

# The named encoder is now actually used (the original created `label` and then
# encoded with a second, anonymous LabelEncoder). fit_transform refits on every
# column, so each column gets its own independent codes.
label = LabelEncoder()
data_cat_data = data_cat_data.astype(str).apply(label.fit_transform)

In [None]:
# Remaining missing values per column in the (unscaled) numeric features
data_num_data.isnull().sum()

In [None]:
# Give both parts the same fresh 0..n-1 index, then join them side by side
data_num_data_s = data_num_data_s.reset_index(drop=True)
data_cat_data = data_cat_data.reset_index(drop=True)

data_new = pd.concat([data_num_data_s, data_cat_data], axis = 1)

In [None]:
# `data_new` holds the train rows first, followed by the test rows, on a fresh 0..n-1 index.
# The split point is derived from the row counts instead of a hard-coded 1447/1448, so it
# stays correct if the number of rows removed from the train set changes.
n_train_rows = len(train)
assert n_train_rows + len(test) == len(data_new), "train/test row counts do not add up to data_new"

train_new = data_new.iloc[:n_train_rows]
test_new = data_new.iloc[n_train_rows:]

print("Shape of train data:", train_new.shape)
print("Shape of test data:", test_new.shape)

In [None]:
# Hold out 20% of the training rows for validation (fixed seed for repeatability)

from sklearn.model_selection import train_test_split 

trainx, valx, trainy, valy = train_test_split(
    train_new,
    House_Price,
    test_size=0.2,
    random_state=1234,
)
for part in (trainx, valx):
    print(part.shape)

In [None]:
# Model fitting: XGBoost regressor on the log-transformed target
#
# Changes against the original call:
#  * the misspelled `colasample_bytree=0.2` is gone; XGBoost never used it, the
#    effective value was and still is colsample_bytree=1
#  * `missing=np.nan` instead of `missing=1`, which made XGBoost treat every
#    feature value equal to 1 (e.g. label-encoded category 1) as missing
#  * `reg:squarederror` is the current name of the deprecated `reg:linear` objective
#  * the deprecated `silent`, `nthread` and `seed` arguments (all None) are dropped;
#    `verbosity`, `n_jobs` and `random_state` already cover them

xgb = xgboost.XGBRegressor(base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             gamma=0.1, importance_type='gain', learning_rate=0.1,
             max_delta_step=0, max_depth=10, min_child_weight=1, missing=np.nan,
             n_estimators=100, n_jobs=1, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, verbosity=1)

xgb.fit(trainx,trainy)

In [None]:
# XGBoost predictions (log scale) on the training and validation parts

xgb_train_pred, xgb_val_pred = (xgb.predict(features) for features in (trainx, valx))

In [None]:
# Calculating RMSE values (on the log-price scale)

# sqrt(MSE) is used instead of `squared=False`, which recent scikit-learn releases no longer accept
xgb_train_rmse = np.sqrt(mean_squared_error(trainy, xgb_train_pred))
xgb_val_rmse = np.sqrt(mean_squared_error(valy, xgb_val_pred))

print("Train RMSE: ", xgb_train_rmse)
# This score comes from the held-out validation split, not from the competition test set
print("Validation RMSE: ", xgb_val_rmse)

In [None]:
# Predicting on the test rows; the model was fitted on the log of SalePrice,
# so these predictions are on the log scale

xgb_test_pred = xgb.predict(test_new)

In [None]:
# submission_xgb: test Ids next to the XGBoost predictions, exponentiated back to prices

submission_xgb = test_copy[['Id']].assign(SalePrice=np.exp(xgb_test_pred))
submission_xgb.head()

In [None]:
# Count predictions that became infinite after exponentiating
c = np.isinf(submission_xgb['SalePrice'].to_numpy()).sum()
print("It contains " + str(c) + " infinite values")

In [None]:
# Model fitting: Lasso with the penalty strength chosen by cross-validation
# from the candidate values below

lasso_alphas = [1, 0.1, 0.001, 0.0005]
lasso_model = LassoCV(alphas = lasso_alphas)

lasso_model.fit(trainx, trainy)
lasso_model.get_params()

In [None]:
# Lasso predictions (log scale) on the training and validation parts

lso_train_pred, lso_val_pred = (lasso_model.predict(features) for features in (trainx, valx))

In [None]:
# Calculating RMSE values (on the log-price scale)

# sqrt(MSE) is used instead of `squared=False`, which recent scikit-learn releases no longer accept
lso_train_rmse = np.sqrt(mean_squared_error(trainy, lso_train_pred))
lso_val_rmse = np.sqrt(mean_squared_error(valy, lso_val_pred))

print("Train RMSE: ", lso_train_rmse)
# This score comes from the held-out validation split, not from the competition test set
print("Validation RMSE: ",lso_val_rmse)

In [None]:
# Predicting on the test rows; the model was fitted on the log of SalePrice,
# so these predictions are on the log scale

lso_test_pred = lasso_model.predict(test_new)

In [None]:
# submission_ls: test Ids next to the Lasso predictions, exponentiated back to prices

submission_ls = test_copy[['Id']].assign(SalePrice=np.exp(lso_test_pred))
submission_ls.head()

In [None]:
# Count predictions that became infinite after exponentiating
a = np.isinf(submission_ls['SalePrice'].to_numpy()).sum()
print("It contains " + str(a) + " infinite values")

In [None]:
# Model fitting: ordinary least squares on the log-transformed target

lr = LinearRegression()
lr.fit(X=trainx, y=trainy)

In [None]:
# Linear-regression predictions (log scale) on the training and validation parts

lr_train_pred, lr_val_pred = (lr.predict(features) for features in (trainx, valx))

In [None]:
# Calculating RMSE values (on the log-price scale)

# sqrt(MSE) is used instead of `squared=False`, which recent scikit-learn releases no longer accept
lr_train_rmse = np.sqrt(mean_squared_error(trainy, lr_train_pred))
lr_val_rmse = np.sqrt(mean_squared_error(valy, lr_val_pred))

print("Train RMSE: ", lr_train_rmse)
# This score comes from the held-out validation split, not from the competition test set
print("Validation RMSE: ",lr_val_rmse)

In [None]:
# Predicting on the test rows; the model was fitted on the log of SalePrice,
# so these predictions are on the log scale

lr_test_pred = lr.predict(test_new)

In [None]:
# submission_lr: test Ids next to the linear-regression predictions, exponentiated back to prices

submission_lr = test_copy[['Id']].assign(SalePrice=np.exp(lr_test_pred))
submission_lr.head()

In [None]:
# Count predictions that became infinite after exponentiating
x = np.isinf(submission_lr['SalePrice'].to_numpy()).sum()
print("It contains " + str(x) + " infinite values")

In [None]:
# Model fitting: CatBoost regressor on the log-transformed target
# NOTE(review): fit() receives no eval_set, so early_stopping_rounds presumably never triggers -- confirm

cat = CatBoostRegressor(
    depth = 5,
    learning_rate = 0.1,
    early_stopping_rounds = 300,
    random_state = 0,
    verbose = False,
)
cat.fit(trainx, trainy)

In [None]:
# CatBoost predictions (log scale) on the training and validation parts

cat_train_pred, cat_val_pred = (cat.predict(features) for features in (trainx, valx))

In [None]:
# Calculating RMSE values (on the log-price scale)

# sqrt(MSE) is used instead of `squared=False`, which recent scikit-learn releases no longer accept
cat_train_rmse = np.sqrt(mean_squared_error(trainy, cat_train_pred))
cat_val_rmse = np.sqrt(mean_squared_error(valy, cat_val_pred))

print("Train RMSE: ", cat_train_rmse)
# This score comes from the held-out validation split, not from the competition test set
print("Validation RMSE: ",cat_val_rmse)

In [None]:
# Predicting on the test rows; the model was fitted on the log of SalePrice,
# so these predictions are on the log scale

cat_test_pred = cat.predict(test_new)

In [None]:
# submission_cat: test Ids next to the CatBoost predictions, exponentiated back to prices

submission_cat = test_copy[['Id']].assign(SalePrice=np.exp(cat_test_pred))
submission_cat.head()

In [None]:
# Count predictions that became infinite after exponentiating
b = np.isinf(submission_cat['SalePrice'].to_numpy()).sum()
print("It contains " + str(b) + " infinite values")

In [None]:
# Final prediction: plain average of the XGBoost, Lasso and CatBoost prices
# (the linear-regression predictions are left out of the blend)
blended_price = (submission_xgb['SalePrice']+submission_ls['SalePrice']+submission_cat['SalePrice'])/3
#submission_file['SalePrice'] = (submission_xgb['SalePrice']+submission_ls['SalePrice']+submission_lr['SalePrice']+submission_cat['SalePrice'])/4
submission_file = pd.DataFrame({'Id': test_copy['Id'], 'SalePrice': blended_price})
submission_file.head()


In [None]:
# Count blended predictions that are infinite
w = np.isinf(submission_file['SalePrice'].to_numpy()).sum()
print("It contains " + str(w) + " infinite values")

In [None]:
# Write the blended predictions to Submission.csv, without the DataFrame index column
submission_file.to_csv("Submission.csv", index = False)

In [None]:
#from tqdm import tqdm

In [None]:
#missing = test.isnull().sum()
#missing = missing[missing>0]
#train.drop(missing.index, axis=1, inplace=True)



#test.dropna(axis=1, inplace=True)


#submission = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')


#l_test = tqdm(range(0, len(test)), desc='Matching')
#for i in l_test:
#    for j in range(0, len(train)):
#        for k in range(1, len(test.columns)):
#            if test.iloc[i,k] == train.iloc[j,k]:
#                continue
#            else:
#                break
#        else:
#            submission.iloc[i, 1] = train.iloc[j, -1]
#            break
#l_test.close()

#submission.dropna(axis=0,inplace=True)

#submission.to_csv('tqdm_Submission.csv', index=False)
#print("Your submission was successfully saved!")