In [327]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [328]:
dataset_1 = pd.read_csv('train.csv')
dataset_2 = pd.read_csv('test.csv')
print("dataset_1 = ", dataset_1.shape, "dataset_2 = ", dataset_2.shape)

dataset_1 =  (1460, 81) dataset_2 =  (1459, 80)


So we know that row 1-1460 belongs to "train" dataset and row 1461 to 2919 belongs to "test" dataset.
Let's join the test and train test for some data preprocessing. We will divide them again later:

In [329]:
dataset = pd.concat([dataset_1, dataset_2], ignore_index = True)
dataset.index += 1 
dataset

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
2,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
3,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
4,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
5,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,
2916,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,
2917,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,
2918,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,


In [330]:
dataset.shape

(2919, 81)

In [331]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 1 to 2919
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2915 non-null   object 
 3   LotFrontage    2433 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          198 non-null    object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2917 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallC

There are some data here that are not int, but they are written in numbers so we change them to objects: (from https://www.kaggle.com/dhaneeshkarthikp/house-price/notebook)

In [332]:
dataset ['MSSubClass'] = dataset['MSSubClass'].apply(str)
dataset ['OverallQual'] = dataset ['OverallQual'].apply(str)
dataset ['OverallCond'] = dataset ['OverallCond'].apply(str)

Some data cleaning: (from https://www.kaggle.com/dhaneeshkarthikp/house-price/notebook)

In [333]:
dataset['TotalSF'] = dataset['TotalBsmtSF'] + dataset['1stFlrSF'] + dataset['2ndFlrSF']
dataset['TotalBathrooms'] = (dataset['FullBath'] + (0.5 * dataset['HalfBath']) + dataset['BsmtFullBath'] + (0.5 * dataset['BsmtHalfBath']))
dataset['TotalPorchSf'] = (dataset['OpenPorchSF'] + dataset['3SsnPorch'] + dataset['EnclosedPorch'] + dataset['ScreenPorch'] + dataset['WoodDeckSF'])
dataset["LivLotRatio"] = dataset['GrLivArea']/dataset['LotArea']

dataset = dataset.drop (['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath', 'OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF', 'GrLivArea', 'LotArea'], axis = 1)

We take out the Id column because it has no effect on the results. Also we take our the SalePrice which is out dependant variable.

In [334]:
y = dataset['SalePrice']
y = y.iloc[0:1460].values.reshape(-1,1)
y

array([[208500.],
       [181500.],
       [223500.],
       ...,
       [266500.],
       [142125.],
       [147500.]])

In [335]:
Id = dataset['Id']
Id = Id.iloc[1460:2920].values
Id

array([1461, 1462, 1463, ..., 2917, 2918, 2919], dtype=int64)

In [336]:
dataset.drop(['Id','SalePrice'], inplace=True, axis=1)

Dividing numerical and categorical values columns names:

In [337]:
datatypes = dict(dataset.dtypes)
objs=[]
nums=[]
for i in dataset.columns:
    if datatypes[i]=="O":
        objs.append(i)
    else:
        nums.append(i)

Dealing with the missing values:

In [338]:
for i in dataset.columns:
    nulls = dataset[i].isnull().sum()
    if nulls>0:
        print(nulls, "\t", i)

4 	 MSZoning
486 	 LotFrontage
2721 	 Alley
2 	 Utilities
1 	 Exterior1st
1 	 Exterior2nd
24 	 MasVnrType
23 	 MasVnrArea
81 	 BsmtQual
82 	 BsmtCond
82 	 BsmtExposure
79 	 BsmtFinType1
1 	 BsmtFinSF1
80 	 BsmtFinType2
1 	 BsmtFinSF2
1 	 BsmtUnfSF
1 	 Electrical
1 	 KitchenQual
2 	 Functional
1420 	 FireplaceQu
157 	 GarageType
159 	 GarageYrBlt
159 	 GarageFinish
1 	 GarageCars
1 	 GarageArea
159 	 GarageQual
159 	 GarageCond
2909 	 PoolQC
2348 	 Fence
2814 	 MiscFeature
1 	 SaleType
1 	 TotalSF
2 	 TotalBathrooms


In [339]:
cm = dataset.columns
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(dataset)
dataset = imputer.transform(dataset)
dataset = pd.DataFrame(dataset, columns=cm)

Dividing the dataset to numerical and categorical(object):

In [340]:
df_nums = dataset[nums]
df_objs = dataset[objs]

Changing the categorical variables to dummy variables: (only on categorical values)

In [341]:
objs

['MSSubClass',
 'MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [342]:
from sklearn.preprocessing import LabelEncoder
df_objs = df_objs.apply(LabelEncoder().fit_transform)
df_objs

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,10,3,1,0,3,3,0,4,0,5,...,1,1,4,4,2,0,2,2,8,4
1,5,3,1,0,3,3,0,2,0,24,...,1,1,4,4,2,0,2,2,8,4
2,10,3,1,0,0,3,0,4,0,5,...,1,1,4,4,2,0,2,2,8,4
3,11,3,1,0,0,3,0,0,0,6,...,5,2,4,4,2,0,2,2,8,0
4,10,3,1,0,0,3,0,2,0,15,...,1,1,4,4,2,0,2,2,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2,4,1,0,3,3,0,4,0,10,...,1,2,4,4,2,0,2,2,8,4
2915,2,4,1,0,3,3,0,4,0,10,...,4,2,4,4,2,0,2,2,8,0
2916,5,3,1,0,3,3,0,4,0,11,...,5,2,4,4,2,0,2,2,8,0
2917,14,3,1,0,3,3,0,4,0,11,...,1,2,4,4,2,0,2,2,8,4


Feature scaling the numerical values:

In [343]:
nums

['LotFrontage',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'LowQualFinSF',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'TotalSF',
 'TotalBathrooms',
 'TotalPorchSf',
 'LivLotRatio']

In [344]:
df_nums

Unnamed: 0,LotFrontage,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,LowQualFinSF,BedroomAbvGr,KitchenAbvGr,...,GarageCars,GarageArea,PoolArea,MiscVal,MoSold,YrSold,TotalSF,TotalBathrooms,TotalPorchSf,LivLotRatio
0,65,2003,2003,196,706,0,150,0,3,1,...,2,548,0,0,2,2008,2566,3.5,61,0.202367
1,80,1976,1976,0,978,0,284,0,3,1,...,2,460,0,0,5,2007,2524,2.5,298,0.131458
2,68,2001,2002,162,486,0,434,0,3,1,...,2,608,0,0,9,2008,2706,3.5,42,0.158756
3,60,1915,1970,0,216,0,540,0,3,1,...,3,642,0,0,2,2006,2473,2,307,0.179791
4,84,2000,2000,350,655,0,490,0,4,1,...,3,836,0,0,12,2008,3343,3.5,276,0.154137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,21,1970,1970,0,0,0,546,0,3,1,...,0,0,0,0,6,2006,1638,1.5,0,0.56405
2915,21,1970,1970,0,252,0,294,0,3,1,...,1,286,0,0,4,2006,1638,1.5,24,0.576558
2916,160,1960,1996,0,1224,0,0,0,4,1,...,2,576,0,0,9,2006,2448,2,474,0.0612
2917,62,1992,1992,0,337,0,575,0,3,1,...,0,0,0,700,7,2006,1882,1.5,112,0.092903


In [345]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
df_nums = pd.DataFrame(sc_x.fit_transform(df_nums), columns=nums)
sc_y = StandardScaler()
y = sc_y.fit_transform(y)

In [346]:
df_nums

Unnamed: 0,LotFrontage,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,LowQualFinSF,BedroomAbvGr,KitchenAbvGr,...,GarageCars,GarageArea,PoolArea,MiscVal,MoSold,YrSold,TotalSF,TotalBathrooms,TotalPorchSf,LivLotRatio
0,-0.127678,1.046258,0.896833,0.529034,0.581145,-0.293025,-0.934165,-0.101197,0.169927,-0.207698,...,0.306418,0.349364,-0.06315,-0.089592,-1.552184,0.157646,0.022662,1.586238,-0.762276,0.170425
1,0.567125,0.154764,-0.395604,-0.567016,1.178255,-0.293025,-0.629284,-0.101197,0.169927,-0.207698,...,0.306418,-0.058991,-0.06315,-0.089592,-0.446925,-0.602962,-0.029542,0.348705,0.719030,-0.444468
2,0.011283,0.980221,0.848965,0.338903,0.098189,-0.293025,-0.287999,-0.101197,0.169927,-0.207698,...,0.306418,0.627787,-0.06315,-0.089592,1.026753,0.157646,0.196673,1.586238,-0.881030,-0.207757
3,-0.359279,-1.859351,-0.682812,-0.567016,-0.494529,-0.293025,-0.046824,-0.101197,0.169927,-0.207698,...,1.619830,0.785561,-0.06315,-0.089592,-1.552184,-1.363569,-0.092932,-0.270061,0.775282,-0.025349
4,0.752406,0.947203,0.753229,1.390216,0.469187,-0.293025,-0.160586,-0.101197,1.385655,-0.207698,...,1.619830,1.685798,-0.06315,-0.089592,2.132012,0.157646,0.988424,1.586238,0.581524,-0.247803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,-2.165768,-0.043346,-0.682812,-0.567016,-0.968705,-0.293025,-0.033173,-0.101197,0.169927,-0.207698,...,-2.320407,-2.193574,-0.06315,-0.089592,-0.078505,-1.363569,-1.130785,-0.888827,-1.143540,3.306806
2915,-2.165768,-0.043346,-0.682812,-0.567016,-0.415500,-0.293025,-0.606531,-0.101197,0.169927,-0.207698,...,-1.006994,-0.866421,-0.06315,-0.089592,-0.815344,-1.363569,-1.130785,-0.888827,-0.993535,3.415270
2916,4.272743,-0.373528,0.561757,-0.567016,1.718287,-0.293025,-1.275450,-0.101197,1.385655,-0.207698,...,0.306418,0.479295,-0.06315,-0.089592,1.026753,-1.363569,-0.124005,-0.270061,1.819071,-1.053723
2917,-0.266639,0.683057,0.370284,-0.567016,-0.228904,-0.293025,0.032809,-0.101197,0.169927,-0.207698,...,-2.320407,-2.193574,-0.06315,1.144312,0.289914,-1.363569,-0.827508,-0.888827,-0.443514,-0.778806


Merging the two datas in one:

In [347]:
dataset_final = pd.concat([df_objs,df_nums], axis=1)
dataset_final

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageCars,GarageArea,PoolArea,MiscVal,MoSold,YrSold,TotalSF,TotalBathrooms,TotalPorchSf,LivLotRatio
0,10,3,1,0,3,3,0,4,0,5,...,0.306418,0.349364,-0.06315,-0.089592,-1.552184,0.157646,0.022662,1.586238,-0.762276,0.170425
1,5,3,1,0,3,3,0,2,0,24,...,0.306418,-0.058991,-0.06315,-0.089592,-0.446925,-0.602962,-0.029542,0.348705,0.719030,-0.444468
2,10,3,1,0,0,3,0,4,0,5,...,0.306418,0.627787,-0.06315,-0.089592,1.026753,0.157646,0.196673,1.586238,-0.881030,-0.207757
3,11,3,1,0,0,3,0,0,0,6,...,1.619830,0.785561,-0.06315,-0.089592,-1.552184,-1.363569,-0.092932,-0.270061,0.775282,-0.025349
4,10,3,1,0,0,3,0,2,0,15,...,1.619830,1.685798,-0.06315,-0.089592,2.132012,0.157646,0.988424,1.586238,0.581524,-0.247803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2,4,1,0,3,3,0,4,0,10,...,-2.320407,-2.193574,-0.06315,-0.089592,-0.078505,-1.363569,-1.130785,-0.888827,-1.143540,3.306806
2915,2,4,1,0,3,3,0,4,0,10,...,-1.006994,-0.866421,-0.06315,-0.089592,-0.815344,-1.363569,-1.130785,-0.888827,-0.993535,3.415270
2916,5,3,1,0,3,3,0,4,0,11,...,0.306418,0.479295,-0.06315,-0.089592,1.026753,-1.363569,-0.124005,-0.270061,1.819071,-1.053723
2917,14,3,1,0,3,3,0,4,0,11,...,-2.320407,-2.193574,-0.06315,1.144312,0.289914,-1.363569,-0.827508,-0.888827,-0.443514,-0.778806


Now its time to divide the test and train dataset again:

In [348]:
dataset_1 = dataset_final[0:1460]
dataset_2 = dataset_final[1460:2920]
print("dataset_1 = ", dataset_1.shape, "dataset_2 = ", dataset_2.shape)

dataset_1 =  (1460, 69) dataset_2 =  (1459, 69)


Training the model:

In [349]:
x = dataset_1.iloc[:,:].values

In [350]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size= 0.30, random_state = 123)

In [351]:
x_test

array([[10.        ,  3.        ,  1.        , ...,  0.34870526,
         0.05650478,  0.27215148],
       [11.        ,  4.        ,  1.        , ...,  0.96747163,
        -0.58101917,  0.01801896],
       [ 2.        ,  4.        ,  1.        , ..., -0.2700611 ,
        -0.8935309 ,  2.81972247],
       ...,
       [ 4.        ,  4.        ,  1.        , ..., -0.2700611 ,
        -1.14354029,  0.42016612],
       [10.        ,  3.        ,  1.        , ...,  0.34870526,
         0.38151698,  0.19224208],
       [10.        ,  3.        ,  1.        , ...,  1.58623799,
         1.24404938,  0.08542394]])

In [352]:
y_test

array([[ 5.23561686e-01],
       [-1.18265883e+00],
       [-6.41201394e-01],
       [ 1.17582901e+00],
       [-5.15281061e-01],
       [ 9.83170901e-01],
       [ 1.61277257e+00],
       [-6.72681478e-01],
       [-3.26400562e-01],
       [-4.27136828e-01],
       [-2.76032429e-01],
       [ 4.92081603e-01],
       [-5.15281061e-01],
       [-9.18226127e-01],
       [ 9.95762935e-01],
       [ 3.34681186e-01],
       [-8.30081894e-01],
       [ 1.82210253e+00],
       [ 3.91345336e-01],
       [-1.63963332e-01],
       [-3.76768695e-01],
       [-2.26923499e-01],
       [-8.99338077e-01],
       [-4.77504961e-01],
       [ 6.18001936e-01],
       [-4.52320895e-01],
       [ 3.52310033e-01],
       [ 9.92303972e-04],
       [-6.42460598e-01],
       [-9.30818160e-01],
       [-3.97545550e-01],
       [ 1.35843373e-02],
       [-4.77504961e-01],
       [ 2.26389700e-01],
       [ 2.34681256e+00],
       [ 1.14320604e-01],
       [-9.97439624e-02],
       [ 3.70305009e+00],
       [-5.5

Linear Regression:

In [353]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

LinearRegression()

In [354]:
y_pred = regressor.predict(x_test)

In [355]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.825354829365281

Decision Tree:

In [356]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(x_train, y_train)

DecisionTreeRegressor(random_state=0)

In [357]:
y_pred = regressor.predict(x_test)

In [358]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.7635821133946998

Random Forest:

In [359]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(x_train, y_train)

  regressor.fit(x_train, y_train)


RandomForestRegressor(n_estimators=10, random_state=0)

In [360]:
y_pred = regressor.predict(x_test)

In [361]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8776720290469712

Support Vector Regression (SVR):

In [362]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(x_train, y_train)

  return f(**kwargs)


SVR()

In [363]:
y_pred = regressor.predict(x_test)

In [364]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8410895546302299

It seems that random forest gives us the hughest accuracy. So we use that to predict the test data:

In [365]:
x = dataset_2.iloc[:,:].values

In [366]:
y_final = sc_y.inverse_transform(regressor.predict(x)).reshape(-1,1)
y_final

array([[114583.52319592],
       [185788.64892854],
       [181444.99791961],
       ...,
       [188781.12696728],
       [129625.02474613],
       [240827.99733774]])

In [370]:
Id.reshape(-1,1)

array([[1461],
       [1462],
       [1463],
       ...,
       [2917],
       [2918],
       [2919]], dtype=int64)

In [374]:
submission['Id'] = Id
submission['SalePrice'] = y_final
submission.to_csv('./submission.csv',index=False)