In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# After playing with the dataset, I have learned that variables "SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF" can be the best performing features for my model:

df = pd.read_csv("houseprice.csv", usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()

In [3]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000


In [4]:
df.shape

(1201, 10)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MSSubClass   1201 non-null   int64  
 1   MSZoning     1201 non-null   object 
 2   LotFrontage  1201 non-null   float64
 3   LotArea      1201 non-null   int64  
 4   Street       1201 non-null   object 
 5   LotShape     1201 non-null   object 
 6   YearBuilt    1201 non-null   int64  
 7   1stFlrSF     1201 non-null   int64  
 8   2ndFlrSF     1201 non-null   int64  
 9   SalePrice    1201 non-null   int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 103.2+ KB


In [6]:
for i in df.columns:
    print("Column name {} and unique values are {}".format(i, len(df[i].unique())))

Column name MSSubClass and unique values are 15
Column name MSZoning and unique values are 5
Column name LotFrontage and unique values are 110
Column name LotArea and unique values are 869
Column name Street and unique values are 2
Column name LotShape and unique values are 4
Column name YearBuilt and unique values are 112
Column name 1stFlrSF and unique values are 678
Column name 2ndFlrSF and unique values are 368
Column name SalePrice and unique values are 597


In [7]:
import datetime
datetime.datetime.now().year

2022

In [8]:
# Fixing variable YearBuilt

df["Total Years"] = datetime.datetime.now().year - df["YearBuilt"]

In [10]:
df.drop("YearBuilt", axis= "columns", inplace= True)

In [11]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', '1stFlrSF', '2ndFlrSF', 'SalePrice', 'Total Years'],
      dtype='object')

In [14]:
cat_features = ["MSSubClass", "MSZoning", "Street", "LotShape"]   # Creating categorical features
out_feature = "SalePrice"   # Dependent feature

In [17]:
lbl_encoders = {}
lbl_encoders["MSSubClass"] = LabelEncoder()
lbl_encoders["MSSubClass"].fit_transform(df["MSSubClass"])

array([5, 0, 5, ..., 6, 0, 0], dtype=int64)

In [18]:
lbl_encoders

{'MSSubClass': LabelEncoder()}

In [19]:
lbl_encoders={}
for feature in cat_features:
    lbl_encoders[feature]=LabelEncoder()
    df[feature]=lbl_encoders[feature].fit_transform(df[feature])

In [20]:
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,1stFlrSF,2ndFlrSF,SalePrice,Total Years
0,5,3,65.0,8450,1,3,856,854,208500,19
1,0,3,80.0,9600,1,3,1262,0,181500,46
2,5,3,68.0,11250,1,0,920,866,223500,21
3,6,3,60.0,9550,1,0,961,756,140000,107
4,5,3,84.0,14260,1,0,1145,1053,250000,22
...,...,...,...,...,...,...,...,...,...,...
1455,5,3,62.0,7917,1,3,953,694,175000,23
1456,0,3,85.0,13175,1,3,2073,0,210000,44
1457,6,3,66.0,9042,1,3,1188,1152,266500,81
1458,0,3,68.0,9717,1,3,1078,0,142125,72


In [28]:
# Stacking and converting into tensors
cat_features=np.stack([df['MSSubClass'],df['MSZoning'],df['Street'],df['LotShape']],1)
cat_features

array([[5, 3, 1, 3],
       [0, 3, 1, 3],
       [5, 3, 1, 0],
       ...,
       [6, 3, 1, 3],
       [0, 3, 1, 3],
       [0, 3, 1, 3]], dtype=int64)

In [29]:
# Converting numpy to tensors

cat_features=torch.tensor(cat_features,dtype=torch.int64)
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [30]:
# creating continuous variable

cont_features=[]
for i in df.columns:
    if i in ["MSSubClass", "MSZoning", "Street", "LotShape","SalePrice"]:
        pass
    else:
        cont_features.append(i)

In [31]:
cont_features

['LotFrontage', 'LotArea', '1stFlrSF', '2ndFlrSF', 'Total Years']

In [32]:
# Stacking continuous variable to a tensor

cont_values=np.stack([df[i].values for i in cont_features],axis=1)
cont_values=torch.tensor(cont_values,dtype=torch.float)
cont_values

tensor([[   65.,  8450.,   856.,   854.,    19.],
        [   80.,  9600.,  1262.,     0.,    46.],
        [   68., 11250.,   920.,   866.,    21.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    81.],
        [   68.,  9717.,  1078.,     0.,    72.],
        [   75.,  9937.,  1256.,     0.,    57.]])

In [33]:
cont_values.dtype

torch.float32

In [34]:
# Dependent feature 
y=torch.tensor(df['SalePrice'].values,dtype=torch.float).reshape(-1,1)
y

tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]])

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MSSubClass   1201 non-null   int64  
 1   MSZoning     1201 non-null   int32  
 2   LotFrontage  1201 non-null   float64
 3   LotArea      1201 non-null   int64  
 4   Street       1201 non-null   int32  
 5   LotShape     1201 non-null   int32  
 6   1stFlrSF     1201 non-null   int64  
 7   2ndFlrSF     1201 non-null   int64  
 8   SalePrice    1201 non-null   int64  
 9   Total Years  1201 non-null   int64  
dtypes: float64(1), int32(3), int64(6)
memory usage: 89.1 KB


In [36]:
cat_features.shape,cont_values.shape,y.shape

(torch.Size([1201, 4]), torch.Size([1201, 5]), torch.Size([1201, 1]))

In [37]:
len(df['MSSubClass'].unique())

15

In [38]:
# Embedding size for categorical columns

cat_dims=[len(df[col].unique()) for col in ["MSSubClass", "MSZoning", "Street", "LotShape"]]

In [39]:
cat_dims

[15, 5, 2, 4]

In [40]:
embedding_dim= [(x, min(50, (x + 1) // 2)) for x in cat_dims]

In [41]:
embedding_dim

[(15, 8), (5, 3), (2, 1), (4, 2)]

In [42]:
embed_representation=nn.ModuleList([nn.Embedding(inp,out) for inp,out in embedding_dim])
embed_representation

ModuleList(
  (0): Embedding(15, 8)
  (1): Embedding(5, 3)
  (2): Embedding(2, 1)
  (3): Embedding(4, 2)
)

In [43]:
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [44]:
cat_featuresz=cat_features[:4]
cat_featuresz

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        [6, 3, 1, 0]])

In [45]:
pd.set_option('display.max_rows', 500)
embedding_val=[]
for i,e in enumerate(embed_representation):
    embedding_val.append(e(cat_features[:,i]))

In [46]:
embedding_val

[tensor([[-0.1924, -1.1444,  0.4221,  ..., -2.0066,  0.2399, -0.8754],
         [-1.0750,  0.3050,  1.4061,  ..., -2.8568,  0.1895,  0.4468],
         [-0.1924, -1.1444,  0.4221,  ..., -2.0066,  0.2399, -0.8754],
         ...,
         [ 0.0351, -0.1737, -1.1010,  ...,  0.5755, -0.5605,  0.1473],
         [-1.0750,  0.3050,  1.4061,  ..., -2.8568,  0.1895,  0.4468],
         [-1.0750,  0.3050,  1.4061,  ..., -2.8568,  0.1895,  0.4468]],
        grad_fn=<EmbeddingBackward0>),
 tensor([[ 1.2751, -1.1662,  0.7804],
         [ 1.2751, -1.1662,  0.7804],
         [ 1.2751, -1.1662,  0.7804],
         ...,
         [ 1.2751, -1.1662,  0.7804],
         [ 1.2751, -1.1662,  0.7804],
         [ 1.2751, -1.1662,  0.7804]], grad_fn=<EmbeddingBackward0>),
 tensor([[-0.9912],
         [-0.9912],
         [-0.9912],
         ...,
         [-0.9912],
         [-0.9912],
         [-0.9912]], grad_fn=<EmbeddingBackward0>),
 tensor([[ 0.0161,  0.3098],
         [ 0.0161,  0.3098],
         [ 0.4404, -1.

In [47]:
z = torch.cat(embedding_val, 1)
z

tensor([[-0.1924, -1.1444,  0.4221,  ..., -0.9912,  0.0161,  0.3098],
        [-1.0750,  0.3050,  1.4061,  ..., -0.9912,  0.0161,  0.3098],
        [-0.1924, -1.1444,  0.4221,  ..., -0.9912,  0.4404, -1.1560],
        ...,
        [ 0.0351, -0.1737, -1.1010,  ..., -0.9912,  0.0161,  0.3098],
        [-1.0750,  0.3050,  1.4061,  ..., -0.9912,  0.0161,  0.3098],
        [-1.0750,  0.3050,  1.4061,  ..., -0.9912,  0.0161,  0.3098]],
       grad_fn=<CatBackward0>)

In [48]:
# Implement dropupout
droput=nn.Dropout(.4)

In [49]:
final_embed=droput(z)
final_embed

tensor([[-0.0000, -0.0000,  0.0000,  ..., -0.0000,  0.0000,  0.0000],
        [-0.0000,  0.5083,  0.0000,  ..., -1.6520,  0.0000,  0.0000],
        [-0.3206, -1.9073,  0.0000,  ..., -1.6520,  0.7340, -1.9267],
        ...,
        [ 0.0586, -0.0000, -1.8350,  ..., -0.0000,  0.0000,  0.5163],
        [-1.7917,  0.5083,  2.3435,  ..., -1.6520,  0.0268,  0.5163],
        [-1.7917,  0.0000,  0.0000,  ..., -0.0000,  0.0268,  0.5163]],
       grad_fn=<MulBackward0>)

In [50]:
# Creating a Feed Forward Neural Network

class FeedForwardNN(nn.Module):

    def __init__(self, embedding_dim, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(inp,out) for inp,out in embedding_dim])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        layerlist = []
        n_emb = sum((out for inp,out in embedding_dim))
        n_in = n_emb + n_cont
        
        for i in layers:
            layerlist.append(nn.Linear(n_in,i)) 
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        layerlist.append(nn.Linear(layers[-1],out_sz))
            
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x

In [51]:
len(cont_features)

5

In [52]:
torch.manual_seed(100)
model=FeedForwardNN(embedding_dim,len(cont_features),1,[100,50],p=0.4)

In [53]:
model

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)

In [54]:
# Defining loss and optimizer

loss_function=nn.MSELoss()
optimizer=torch.optim.Adam(model.parameters(),lr=0.01)

In [55]:
df.shape

(1201, 10)

In [56]:
cont_values

tensor([[   65.,  8450.,   856.,   854.,    19.],
        [   80.,  9600.,  1262.,     0.,    46.],
        [   68., 11250.,   920.,   866.,    21.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    81.],
        [   68.,  9717.,  1078.,     0.,    72.],
        [   75.,  9937.,  1256.,     0.,    57.]])

In [57]:
cont_values.shape

torch.Size([1201, 5])

In [58]:
batch_size=1200
test_size=int(batch_size*0.15)
train_categorical=cat_features[:batch_size-test_size]
test_categorical=cat_features[batch_size-test_size:batch_size]
train_cont=cont_values[:batch_size-test_size]
test_cont=cont_values[batch_size-test_size:batch_size]
y_train=y[:batch_size-test_size]
y_test=y[batch_size-test_size:batch_size]

In [59]:
len(train_categorical),len(test_categorical),len(train_cont),len(test_cont),len(y_train),len(y_test)

(1020, 180, 1020, 180, 1020, 180)

In [60]:
epochs=5000
final_losses=[]
for i in range(epochs):
    i=i+1
    y_pred=model(train_categorical,train_cont)
    loss=torch.sqrt(loss_function(y_pred,y_train)) ### RMSE
    final_losses.append(loss)
    if i%10==1:
        print("Epoch number: {} and the loss : {}".format(i,loss.item()))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch number: 1 and the loss : 200496.75
Epoch number: 11 and the loss : 200493.46875
Epoch number: 21 and the loss : 200489.140625
Epoch number: 31 and the loss : 200482.640625
Epoch number: 41 and the loss : 200473.25
Epoch number: 51 and the loss : 200461.375
Epoch number: 61 and the loss : 200446.4375
Epoch number: 71 and the loss : 200429.421875
Epoch number: 81 and the loss : 200407.9375
Epoch number: 91 and the loss : 200383.390625
Epoch number: 101 and the loss : 200355.234375
Epoch number: 111 and the loss : 200322.234375
Epoch number: 121 and the loss : 200291.296875
Epoch number: 131 and the loss : 200252.0625
Epoch number: 141 and the loss : 200206.390625
Epoch number: 151 and the loss : 200162.03125
Epoch number: 161 and the loss : 200112.15625
Epoch number: 171 and the loss : 200059.0625
Epoch number: 181 and the loss : 200006.1875
Epoch number: 191 and the loss : 199947.328125
Epoch number: 201 and the loss : 199881.5
Epoch number: 211 and the loss : 199815.078125
Epoch 

Epoch number: 1781 and the loss : 154073.21875
Epoch number: 1791 and the loss : 153770.21875
Epoch number: 1801 and the loss : 153127.953125
Epoch number: 1811 and the loss : 152820.375
Epoch number: 1821 and the loss : 152269.609375
Epoch number: 1831 and the loss : 152433.21875
Epoch number: 1841 and the loss : 151566.5
Epoch number: 1851 and the loss : 150672.109375
Epoch number: 1861 and the loss : 150731.8125
Epoch number: 1871 and the loss : 150067.765625
Epoch number: 1881 and the loss : 149282.53125
Epoch number: 1891 and the loss : 148349.671875
Epoch number: 1901 and the loss : 148611.21875
Epoch number: 1911 and the loss : 148932.078125
Epoch number: 1921 and the loss : 147677.109375
Epoch number: 1931 and the loss : 147302.390625
Epoch number: 1941 and the loss : 146604.140625
Epoch number: 1951 and the loss : 146429.046875
Epoch number: 1961 and the loss : 145957.671875
Epoch number: 1971 and the loss : 145540.703125
Epoch number: 1981 and the loss : 145697.546875
Epoch n

Epoch number: 3531 and the loss : 68138.796875
Epoch number: 3541 and the loss : 70636.984375
Epoch number: 3551 and the loss : 67985.9921875
Epoch number: 3561 and the loss : 66217.828125
Epoch number: 3571 and the loss : 66651.8046875
Epoch number: 3581 and the loss : 66239.625
Epoch number: 3591 and the loss : 66248.9921875
Epoch number: 3601 and the loss : 64368.00390625
Epoch number: 3611 and the loss : 65148.9765625
Epoch number: 3621 and the loss : 65090.1484375
Epoch number: 3631 and the loss : 63110.9453125
Epoch number: 3641 and the loss : 64636.06640625
Epoch number: 3651 and the loss : 62723.0078125
Epoch number: 3661 and the loss : 63747.4609375
Epoch number: 3671 and the loss : 64179.98046875
Epoch number: 3681 and the loss : 60966.24609375
Epoch number: 3691 and the loss : 61484.68359375
Epoch number: 3701 and the loss : 60379.84765625
Epoch number: 3711 and the loss : 61328.20703125
Epoch number: 3721 and the loss : 60002.23828125
Epoch number: 3731 and the loss : 57454

In [62]:
# Validating the test data

y_pred=""
with torch.no_grad():
    y_pred=model(test_categorical,test_cont)
    loss=torch.sqrt(loss_function(y_pred,y_test))
print('RMSE: {}'.format(loss))

RMSE: 38577.7109375


In [63]:
data_verify=pd.DataFrame(y_test.tolist(),columns=["Test"])

In [64]:
data_predicted=pd.DataFrame(y_pred.tolist(),columns=["Prediction"])

In [65]:
data_predicted

Unnamed: 0,Prediction
0,117531.585938
1,189521.1875
2,157312.328125
3,250937.25
4,212930.96875
5,212139.25
6,152360.6875
7,314240.125
8,141008.96875
9,374493.65625


In [66]:
final_output=pd.concat([data_verify,data_predicted],axis=1)
final_output['Difference']=final_output['Test']-final_output['Prediction']
final_output.head()

Unnamed: 0,Test,Prediction,Difference
0,130000.0,117531.585938,12468.414062
1,138887.0,189521.1875,-50634.1875
2,175500.0,157312.328125,18187.671875
3,195000.0,250937.25,-55937.25
4,142500.0,212930.96875,-70430.96875


In [69]:
# Saving the model

torch.save(model,'HousePrice.pt')

In [70]:
torch.save(model.state_dict(),'HouseWeights.pt')

In [71]:
# Loading the saved model

embs_size=[(15, 8), (5, 3), (2, 1), (4, 2)]
model1=FeedForwardNN(embs_size,5,1,[100,50],p=0.4)

In [72]:
model1.load_state_dict(torch.load('HouseWeights.pt'))

<All keys matched successfully>

In [73]:
model1.eval()

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)