In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [None]:
# Load the training set, the test set and the sample submission file.
# The input directory is defined once so it can be changed in a single place.
DATA_DIR = '/kaggle/input/houseprices-new'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the first 10 rows of the training data.
train.head(10)

In [None]:
# Preview the first 10 rows of the test data.
test.head(10)

In [None]:
# Report the shape of the training frame, then of the test frame, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Separate the target from the features: 'SalePrice' goes to train_y,
# every other column of the training frame goes to train_x.
train_y = train['SalePrice']
train_x = train.drop(columns='SalePrice')

In [None]:
# Inspect the target: plot its distribution and report its skewness and kurtosis.
sale_price_skew = train_y.skew()
sale_price_kurt = train_y.kurt()
sns.distplot(train_y)
print('Skewness:', sale_price_skew, 'Kurtosis:', sale_price_kurt)

In [None]:
# 'SalePrice' is skewed, so the model is trained on its natural log instead.
# The log is always taken from the raw column in `train`, so re-running this
# cell cannot apply the transformation a second time.
train_y = np.log(train['SalePrice'])
sns.distplot(train_y, fit=norm)
# Open a new figure so the probability plot is not drawn over the histogram.
fig = plt.figure()
res = stats.probplot(train_y, plot=plt)

In [None]:
# Examine 'GrLivArea': histogram with a fitted normal curve, then a probability plot.
gr_liv_area = train_x['GrLivArea']
sns.distplot(gr_liv_area, fit=norm);
fig = plt.figure()
res = stats.probplot(gr_liv_area, plot=plt)

In [None]:
# 'GrLivArea' looks like an important feature and is skewed and peaked,
# so it is replaced by its natural log.
# The log is taken from the raw column in `train` and assigned by row position
# (train_x holds the same rows in the same order as train), so re-running this
# cell cannot log-transform the column twice.
train_x['GrLivArea'] = np.log(train['GrLivArea'].to_numpy())

gr_liv_area_log = train_x['GrLivArea']
sns.distplot(gr_liv_area_log, fit=norm);
# Open a new figure so the probability plot is not drawn over the histogram.
fig = plt.figure()
res = stats.probplot(gr_liv_area_log, plot=plt)

In [None]:
# Examine 'TotalBsmtSF': histogram with a fitted normal curve, then a probability plot.
total_bsmt_sf = train_x['TotalBsmtSF']
sns.distplot(total_bsmt_sf, fit=norm);
fig = plt.figure()
res = stats.probplot(total_bsmt_sf, plot=plt)

In [None]:
# 'TotalBsmtSF' is harder to handle because it takes the value 0 multiple times,
# so a plain log cannot be applied to it.
# Create the indicator feature 'BsmtPr' (basement present): 1 where
# TotalBsmtSF > 0 and 0 otherwise, built in a single vectorised step.
train_x['BsmtPr'] = (train_x['TotalBsmtSF'] > 0).astype(int)

In [None]:
# Log-transform the basement area as log(TotalBsmtSF + 1).
# log(0 + 1) == 0, so houses without a basement keep the value 0 and no row
# mask is needed. Assigning the whole column at once also avoids writing floats
# into part of the existing column (a dtype problem if that column is integer-typed).
# The values come from the raw column in `train` and are assigned by row
# position (train_x holds the same rows in the same order as train), so
# re-running this cell cannot apply the transformation twice.
train_x['TotalBsmtSF'] = np.log(train['TotalBsmtSF'].to_numpy() + 1)

In [None]:
# Plot 'TotalBsmtSF' again, restricted to the rows where it is greater than 0.
bsmt_area = train_x.loc[train_x['TotalBsmtSF'] > 0, 'TotalBsmtSF']
sns.distplot(bsmt_area, fit=norm)
fig = plt.figure()
res = stats.probplot(bsmt_area, plot=plt)

In [None]:
# Use the house 'Id' column as the row index of both frames.
train_x = train_x.set_index('Id')
test = test.set_index('Id')

In [None]:
# Tally how many columns train_x has of each dtype (this includes the object columns).
dtype_counts = train_x.dtypes.value_counts()
dtype_counts

In [None]:
# Collect the labels of every column whose dtype is object.
idx = train_x.columns[train_x.dtypes == 'object'].tolist()

In [None]:
# Stack the training rows on top of the test rows in a single frame.
conc = pd.concat((train_x, test))
conc

In [None]:
# One-hot encode every categorical (object) column whose label is listed in idx.
conc = pd.get_dummies(data=conc, columns=idx)

In [None]:
# Divide conc back into the training and test frames; both now carry the same
# dummy-variable columns.
# conc was built with the training rows first, so split by position using the
# row count of `train` instead of hard-coded Id values.
# .copy() makes both frames independent of conc, so later column assignments
# do not raise SettingWithCopyWarning or write through to conc.
n_train = len(train)
train_x = conc.iloc[:n_train].copy()
test = conc.iloc[n_train:].copy()

In [None]:
# Apply to the test rows the same feature engineering used for train_x:
# log of 'GrLivArea', the 'BsmtPr' indicator, and log(TotalBsmtSF + 1).
#
# 'BsmtPr' was only ever computed for the training rows, so after the concat it
# is NaN for every test row. It is recomputed here; selecting on
# test['BsmtPr'] == 1 would otherwise match nothing and silently skip the
# basement transformation for the whole test set.
test = test.copy()  # detach from conc so the assignments below cannot alter it
raw_test = conc.loc[test.index]  # untransformed test rows; keeps this cell re-runnable
test['BsmtPr'] = (raw_test['TotalBsmtSF'] > 0).astype(float)
test['GrLivArea'] = np.log(raw_test['GrLivArea'])
# log(0 + 1) == 0, so houses without a basement keep the value 0.
test['TotalBsmtSF'] = np.log(raw_test['TotalBsmtSF'] + 1)

In [None]:
# Print the label of every train_x column that contains at least one NaN.
# isnull().any() is evaluated once for the whole frame and also reports
# columns that are entirely NaN.
nan_columns = train_x.columns[train_x.isnull().any()]
for col in nan_columns:
    print(col)

In [None]:
# Substitute 0 for every missing value in both the training and the test frame.
train_x, test = train_x.fillna(0), test.fillna(0)

In [None]:
# Inspect train_x (its NaN values were replaced with 0 in the previous cell).
train_x

In [None]:
# Estimate the model's error with repeated k-fold cross-validation:
# 5 folds x 10 repeats. A RandomForestRegressor is re-fitted on each training
# split and scored by RMSE on the held-out split. train_y holds log(SalePrice),
# so the score is an RMSE on log prices.
SEED = 42  # fixed seed so the fold assignment and the forests are reproducible
kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=SEED)
rmse_train = []  # one validation RMSE per fold
model = RandomForestRegressor(random_state=SEED)

for train_index, val_index in kf.split(train_x):
    # kf.split yields positional row numbers, so select by position (.iloc)
    # rather than by label; this does not depend on how either index is numbered.
    x_tr, x_val = train_x.iloc[train_index], train_x.iloc[val_index]
    y_tr, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

    model.fit(x_tr, y_tr)
    predict = model.predict(x_val)
    rms = mean_squared_error(y_val, predict) ** 0.5
    rmse_train.append(rms)

# Mean validation RMSE over all folds and repeats.
val_score = sum(rmse_train) / len(rmse_train)
val_score

In [None]:
# Refit on the complete training set before predicting: after cross-validation
# `model` only holds the fit from the last fold, which saw a subset of the rows.
model.fit(train_x, train_y)

# The model predicts log(SalePrice); exponentiate to get prices.
log_predict = model.predict(test)
predict = np.exp(log_predict)

# RMSE in log space between these predictions and the 'SalePrice' column of the
# sample submission file.
# NOTE(review): sample_submission.csv presumably holds placeholder/baseline
# predictions rather than true prices, in which case this number measures
# distance from that baseline, not test accuracy - confirm.
rms = mean_squared_error(np.log(sample['SalePrice']), log_predict) ** 0.5
rms

In [None]:
# Write the predictions to a new csv file.
# The Ids are taken from the index of the test frame (set to 'Id' earlier) instead
# of a hard-coded range, so they always match the rows that were predicted.
sub = pd.DataFrame({'Id': test.index.to_numpy(), 'SalePrice': predict})
sub.to_csv('submission.csv', index=False)