In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as MAERROR
from sklearn.preprocessing import OneHotEncoder

In [None]:
import os

# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Load the competition files: train.csv -> hp, test.csv -> hppred
hp = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
hppred = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
# Preview the first rows of the training data
hp.head()

In [None]:
# The training frame was already previewed in the previous cell; preview the
# prediction (test) frame here so both inputs have been inspected.
hppred.head()

# Shape

In [None]:
# Dimensions (rows, columns) of the training frame, then of the prediction frame
for frame in (hp, hppred):
    print(frame.shape)
# Training data: 1460 rows and 81 columns.
# Test data: (1459, 80), i.e. it has one column fewer than the training data.

# So there are NaNs. How many values are missing, and where?

In [None]:
# Boolean mask of missing cells (True = NaN), first rows only
hp.isna().head() # isnull() also works

In [None]:
# Total number of missing cells across the whole training frame
hp.isnull().sum().sum()

In [None]:
# Number of missing cells in each column of the training frame
hp.isnull().sum()

In [None]:
# Per-column NaN counts, sorted from most to fewest, for the training and prediction frames.
# Each result is a Series: the index holds the column names, the values hold the NaN counts.
colnasum_tr = hp.isna().sum().sort_values(ascending=False)
colnasum_pr = hppred.isna().sum().sort_values(ascending=False)
for na_counts in (colnasum_tr, colnasum_pr):
    print(na_counts)

In [None]:
# Pandas has elementary plotting built in: plot the NaN count of every training
# column (in column order). Title and axis labels make the figure readable on
# its own; the trailing ';' suppresses the textual repr of the last call.
ax = hp.isnull().sum().plot(title='Missing values per column (training data)')
ax.set_xlabel('column')
ax.set_ylabel('number of NaNs');


In [None]:
# Plot only the first 20 entries of the NaN-count Series so the column names
# stay legible. Title and axis labels make the figure readable on its own.
ax = hp.isnull().sum().iloc[0:20].plot(title='Missing values in the first 20 columns (training data)')
ax.set_xlabel('column')
ax.set_ylabel('number of NaNs');

In [None]:
# NaN count of every column of the prediction (test) frame, in column order.
ax = hppred.isnull().sum().plot(title='Missing values per column (test data)')
ax.set_xlabel('column')
ax.set_ylabel('number of NaNs');

# Which columns contain NaNs?

In [None]:
# Keep only the training columns whose NaN count is positive
has_na = colnasum_tr > 0
colna = colnasum_tr[has_na]
print(colna)
colna.shape
# so there are 19 columns that contain NaNs

# Let us first try to analyze by dropping the columns with NANs

In [None]:
# Drop every column that contains at least one NaN (axis=1 removes columns, not rows).
# The two frames are processed independently, so they can end up with different
# sets of columns.
hp_dna = hp.dropna(axis=1)
hppred_dna = hppred.dropna(axis=1)

# Show a preview instead of dumping the whole frame
hp_dna.head()

In [None]:
# Sanity check: the column-dropped training frame should contain no missing values
hp_dna.isnull().sum().sum()
# A result of 0 means no NaNs remain

# Identify the target column 

In [None]:
# Columns of the NaN-free training frame
cols = hp_dna.columns
# Pick out the columns whose name contains 'Sale'
colsale = list(filter(lambda name: 'Sale' in name, cols))
colsale
# Of ['SaleType', 'SaleCondition', 'SalePrice'], 'SalePrice' is our target variable

In [None]:
# Columns of the prediction frame AFTER its NaN-containing columns were dropped
colspr = hppred_dna.columns
# Pick out the columns whose name contains 'Sale' (this overwrites the colsale from the training frame)
colsale = [col for col in colspr if 'Sale' in col]
colsale
# 'SalePrice' and 'SaleType' are not listed here. 'SalePrice' is the target to be predicted.
# NOTE(review): 'SaleType' is presumably absent only because it contains NaNs in the test file and
# was removed by dropna(axis=1) - confirm with hppred['SaleType'].isnull().sum()


# So the columns in hppred_dna and hp_dna might be different. First we have to figure out which columns match and which don't.

In [None]:
# Compare the two column indexes directly: Index.equals is True only when both
# hold the same labels in the same order. (Wrapping one side in a Series and
# calling Series.equals on an Index compares objects of different types and
# therefore reports False no matter what the labels are.)
cols.equals(colspr)
# Output 'False' implies the two frames have different columns

In [None]:
# Columns present in both frames, kept in the order of the training columns.
# A membership test on the Index replaces the nested loop over both column lists.
colsmatch = [col for col in cols if col in colspr]
print(colsmatch)
print('')
print('NUmber of matching columns=', len(colsmatch))

# The number of matching columns is 46. So form new training and test sets from just these matching columns.

In [None]:
# Target: the sale price, available in the training data only
hp_y = hp_dna['SalePrice']
# Features: restrict both frames to the columns they have in common
hp_dna_X = hp_dna.loc[:, colsmatch]
hp_dna_prX = hppred_dna.loc[:, colsmatch]

# Split the data into training and testing data

In [None]:
# Split into training and validation data (random_state fixed for reproducibility)
hptrain_X, hpval_X, hptrain_y, hpval_y = train_test_split(hp_dna_X, hp_y, random_state=2)

# The split keeps the shuffled original row labels. Replace them with a fresh
# 0..n-1 index so the frames line up row by row when they are later concatenated
# with other frames. drop=True discards the old labels directly instead of
# materialising them as an 'index' column that then has to be popped, and
# reassigning avoids mutating the split outputs in place.
hptrain_X = hptrain_X.reset_index(drop=True)
hpval_X = hpval_X.reset_index(drop=True)
# Reset the targets the same way so features and targets share the same labels.
hptrain_y = hptrain_y.reset_index(drop=True)
hpval_y = hpval_y.reset_index(drop=True)
hpval_X.head()

# The index is reset above because the output of train_test_split keeps the shuffled original row labels.
# Those labels must be replaced by a fresh 0..n-1 index so that the rows line up when the frames are later concatenated with other frames.

In [None]:
# Let us first work with numeric (integer-typed) columns only.
# 'Id' is a row identifier, not a property of the house, so it is excluded from
# the features (errors='ignore' keeps the cell working if 'Id' is not present).
Xtr_int = hptrain_X.select_dtypes(include=int).drop(columns='Id', errors='ignore')

# Use exactly the training columns for the other two sets so that all three
# frames are guaranteed to present the same features, in the same order, to the model.
IntCols = list(Xtr_int.columns)
Xval_int = hpval_X[IntCols]
Xpr_int = hp_dna_prX[IntCols]


# Let's use the Random Forest regressor 

In [None]:
# Random forest with default hyper-parameters; random_state is fixed so repeated runs are reproducible
RFmodel = RandomForestRegressor(random_state=100)

# Fit the model

In [None]:
# Train on the integer-typed training features against the training target
RFmodel.fit(Xtr_int,hptrain_y)

# Compute the MAE for Xtr_int

In [None]:
# Training-set MAE. mean_absolute_error is documented as (y_true, y_pred), so
# the true values go first; the numeric result is the same either way.
Xtr_int_mae = MAERROR(hptrain_y, RFmodel.predict(Xtr_int))
print(Xtr_int_mae)

# Compute the MAE for Xval_int

In [None]:
# Validation-set MAE. mean_absolute_error is documented as (y_true, y_pred), so
# the true values go first; the numeric result is the same either way.
pred_valy = RFmodel.predict(Xval_int)
Xval_int_mae = MAERROR(hpval_y, pred_valy)
print(Xval_int_mae)

In [None]:
# Make predictions on test data
#pred_y = RFmodel.predict(Xpr_int)
#pred_y.shape

# Refine the results by exploring the data and refining the model. Let's visualize the data and get some idea about correlations

# Above plot shows that predicted distribution has sharply decaying tail.

#  Refinement-1 of the model:

## Let us add categorical variables as well into analysis

In [None]:
# (rows, columns) of the object-typed (categorical) part of the training features
hptrain_X.select_dtypes(include=object).shape

In [None]:
# Boolean mask over the training columns: True where the dtype is 'object'
objs = (hptrain_X.dtypes == 'object')
# Show the names of the object columns (only the True entries of the mask)
print(objs[objs], '\n')

# The same names as a plain list
ObjCols = objs[objs].index.tolist()

# Repeat for the prediction features and check that both lists are identical
objs2 = (hp_dna_prX.dtypes == 'object')
ObjCols2 = np.array(objs2[objs2].index.tolist())
print(np.array_equal(ObjCols2, ObjCols))

# Because the samples are finite, the number of unique values per categorical column differs between the training, validation and test samples, as the next cell shows

In [None]:
# Total number of distinct category values over all object columns,
# for the validation, training and prediction sets (in that order)
n_val, n_train, n_pred = (
    frame[ObjCols].nunique().sum() for frame in (hpval_X, hptrain_X, hp_dna_prX)
)
print(n_val, n_train, n_pred)

# Therefore, retain only those object columns which have same number of unique values for the categorical variables

In [None]:
# Number of distinct values per object column in the training set
nuniq_train = hptrain_X[ObjCols].nunique()
# True where validation / prediction set has the same count as the training set
stmt1 = nuniq_train == hpval_X[ObjCols].nunique()
stmt2 = nuniq_train == hp_dna_prX[ObjCols].nunique()
stmt3 = stmt1 & stmt2
# Names of the columns whose counts agree in all three sets. Note that this
# compares only how many distinct values there are, not which values they are.
ObjColsCmn = list(stmt3[stmt3].index)
print('\n','Object columns that have same nunique() values in \
      train,validate and test samples are',ObjColsCmn,'\n')
# Categorical sub-frames restricted to those columns for the three sets
Xtrain = hptrain_X[ObjColsCmn]
Xval = hpval_X[ObjColsCmn]
Xpr = hp_dna_prX[ObjColsCmn]



# Let's use the OneHotEncoder from scikit-learn

In [None]:
# Fit the encoder ONCE, on the training data only, and reuse the fitted encoder
# for the validation and prediction sets. Re-fitting on each set would derive the
# output columns from that set's own categories, so the three encoded frames
# would only line up by coincidence.
# handle_unknown='ignore' encodes a category that was not seen during fitting
# as all zeros instead of raising.
Ohe = OneHotEncoder(handle_unknown='ignore')
Ohe.fit(hptrain_X[ObjColsCmn])

# Newer scikit-learn releases provide get_feature_names_out in place of
# get_feature_names; use whichever this installation has.
ohe_names = getattr(Ohe, 'get_feature_names_out', None) or Ohe.get_feature_names
OheCols = list(ohe_names(ObjColsCmn))


def ohe_to_frame(frame):
    """Return the one-hot encoding of frame[ObjColsCmn] as a dense DataFrame.

    The already fitted encoder ``Ohe`` is applied, the columns are named
    ``OheCols`` and the row index of ``frame`` is kept so the result can be
    concatenated column-wise with the rest of ``frame``.
    """
    cats = frame[ObjColsCmn]
    # The encoder returns a sparse matrix by default; .toarray() makes it dense
    # without relying on a version-specific constructor argument.
    return pd.DataFrame(Ohe.transform(cats).toarray(), columns=OheCols, index=cats.index)


# Encode from the source frames rather than from Xval / Xpr, which a later cell
# reassigns, so this cell gives the same result whenever it is re-run.
OheObjXtrain = ohe_to_frame(hptrain_X)
OheObjXval = ohe_to_frame(hpval_X)
OheObjXpr = ohe_to_frame(hp_dna_prX)



In [None]:
# Compare the shape of each categorical sub-frame with the shape of its
# one-hot encoded (OHE) counterpart; the OHE column counts should agree.
for label, raw, encoded in (
    ('shape of training sample is ', Xtrain, OheObjXtrain),
    ('shape of validation sample is ', Xval, OheObjXval),
    ('shape of test sample is ', Xpr, OheObjXpr),
):
    print(label, raw.shape, ' and shape of its OHE dataframe is ', encoded.shape)


# Now replace the categorical columns in the datasets with the OneHotEncoded columns

In [None]:
# Remove all object (categorical) columns from the three feature frames
XtrainObjDrop = hptrain_X.drop(columns=ObjCols)
XvalObjDrop = hpval_X.drop(columns=ObjCols)
XprObjDrop = hp_dna_prX.drop(columns=ObjCols)

# Append the one-hot encoded columns side by side (rows are matched on the index).
# Note: Xval and Xpr are rebound here; from now on they hold the full feature
# frames rather than the categorical sub-frames.
Xtr = pd.concat([XtrainObjDrop, OheObjXtrain], axis='columns')
Xval = pd.concat([XvalObjDrop, OheObjXval], axis='columns')
Xpr = pd.concat([XprObjDrop, OheObjXpr], axis='columns')

# Report the resulting shapes
print('Shape of training set is', XtrainObjDrop.shape)
print(hpval_X.shape, hptrain_X.shape)
print(Xval.shape, Xtr.shape, Xpr.shape)

# Now let's throw the data into RandomForest and see what comes out!

In [None]:
# instantiate a fresh model (rebinds RFmodel, replacing the previously fitted one)
RFmodel = RandomForestRegressor(random_state=100)

In [None]:
# Fit on every feature except 'Id': it is a row identifier and plays no role in
# the price. Dropping it by name does not depend on 'Id' being the first column.
RFmodel.fit(Xtr.drop(columns='Id'), hptrain_y)

# Compute MAE for Xtr

In [None]:
# Training-set MAE of the refined model ('Id' excluded, as during fitting).
# mean_absolute_error is documented as (y_true, y_pred), so the true values go first.
Xtr_mae = MAERROR(hptrain_y, RFmodel.predict(Xtr.drop(columns='Id')))
print(Xtr_mae)

# Compute MAE for Xval

In [None]:

# Validation-set MAE of the refined model ('Id' excluded, as during fitting).
# mean_absolute_error is documented as (y_true, y_pred), so the true values go first.
Xval_mae = MAERROR(hpval_y, RFmodel.predict(Xval.drop(columns='Id')))
print(Xval_mae)

# Predict 'SalePrice' for the test data (no MAE can be computed here, because the test data has no 'SalePrice' column to compare against)

In [None]:
# Predict on the test features, excluding 'Id' by name exactly as during fitting
pred_y = RFmodel.predict(Xpr.drop(columns='Id'))
pred_y.shape

In [None]:
# Pair every test-row Id with its predicted price and write the submission file
submission_columns = {'Id': Xpr['Id'], 'SalePrice': pred_y}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)