In [None]:
import torch
import pandas as pd,numpy as np
from torch.utils.data import Dataset,DataLoader
from torch import optim
from torch import nn
import os
import matplotlib.pyplot as plt


In [None]:
# Ask cuDNN to auto-tune its algorithms. The flag is spelled `benchmark`;
# the previous spelling `benchmarks` did not set it.
torch.backends.cudnn.benchmark = True

In [None]:
# Load the training split of the house-prices data set
path = r'../input/house-prices-advanced-regression-techniques'
trainCsvPath = os.path.join(path, 'train.csv')
data = pd.read_csv(trainCsvPath)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

## Let's understand the data

### Train Data

In [None]:
# List the column names of the training frame
data.columns

In [None]:
# Drop every column that contains at least one missing value.
# Literal 'NA' strings are first turned into real NaNs so dropna() can see them.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
data = data.replace(to_replace='NA', value=np.nan)
# Reassign instead of mutating in place so re-running the cell is harmless
data = data.dropna(axis=1)

In [None]:
# Shape (rows, columns) of the frame after dropping the columns with missing values
data.shape

In [None]:
# Keep only the columns whose number of distinct values is at most 90% of the row count
# (a column in which every row is different is therefore excluded).
# Result: {column name: number of non-null entries in that column}.
rowCount = data.shape[0]
distinctPerColumn = data.nunique()
dataImpColumns = {
    name: data[name].count()
    for name in data.columns
    if distinctPerColumn[name] <= 0.9 * rowCount
}

In [None]:
# Keep only the columns selected above.
# .copy() makes the result an independent frame, so later in-place edits do not act on
# a slice of the previous frame (which pandas reports as SettingWithCopyWarning).
data = data[list(dataImpColumns.keys())].copy()

In [None]:
# Preview the category -> integer-code mapping for one column (MSZoning)
{category: code for code, category in enumerate(data['MSZoning'].unique())}

In [None]:
# Label-encoding tables, {column: {category: integer code}}, for every object-dtype column.
# The same tables are applied to the test data later on.
lableEncodedColumnsDict = {
    column: {category: code for code, category in enumerate(data[column].unique())}
    for column in data.columns
    if data[column].dtype == 'O'
}

In [None]:
# Names of the columns that have a label-encoding table
lableEncodedColumnsDict.keys()

In [None]:
# Let's label-encode the columns

In [None]:
# Replace every category with its integer code.
# The result is reassigned rather than edited in place, and infer_objects() converts the
# encoded object columns to a numeric dtype on pandas versions where replace() no longer
# downcasts object results automatically.
data = data.replace(lableEncodedColumnsDict).infer_objects()
data.head()

In [None]:
# Normalize the individual column
normalizeDict = {}
def normalize(series):
    """Standardize one column to zero mean and unit standard deviation.

    The (mean, std) pair that was used is stored in ``normalizeDict`` under the
    column name, so the same scaling can later be applied to other data or undone.

    Parameters
    ----------
    series : pandas.Series
        Numeric column; ``series.name`` is the key used in ``normalizeDict``.

    Returns
    -------
    pandas.Series
        ``(series - mean) / std``.
    """
    columnName = series.name
    mean, std = series.mean(), series.std()
    # A constant column has std 0 and a single-row column has std NaN; dividing by either
    # would fill the column with NaN/inf. Fall back to 1 so the column is only centred.
    if not std > 0:
        std = 1.0
    normalizeDict[columnName] = (mean, std)
    return (series - mean) / std

In [None]:
# Features: every column except the target, standardized column by column
datax = data.loc[:, data.columns != 'SalePrice'].apply(normalize)

# Target: SalePrice, standardized the same way and kept as a one-column frame
datay = data.loc[:, ['SalePrice']].apply(normalize)

In [None]:
# (features, target) as float32 tensors.
# These are plain inputs, not trainable parameters, so they are created without
# requires_grad: no gradient has to be computed or stored for the data itself.
datascaled = (
    torch.tensor(datax.values, dtype=torch.float32),
    torch.tensor(datay.values, dtype=torch.float32),
)

In [None]:
# Columns for which a (mean, std) scaling pair has been recorded
normalizeDict.keys()

## Model and Dataset Building

In [None]:
class datasetclass(Dataset):
    """Training dataset that serves (features, target) rows from tensors kept on a device.

    Parameters
    ----------
    features, targets : tensor-like, optional
        Row-aligned inputs and targets. When omitted, the module-level
        ``datascaled`` pair is used, which is the original behaviour.
    targetDevice : torch.device, optional
        Device on which the tensors are stored. Defaults to the module-level ``device``.

    Raises
    ------
    ValueError
        If features and targets do not have the same number of rows.
    """

    def __init__(self, features=None, targets=None, targetDevice=None):
        super(datasetclass, self).__init__()
        if features is None:
            features = datascaled[0]
        if targets is None:
            targets = datascaled[1]
        if targetDevice is None:
            targetDevice = device
        # detach(): the data are constants for training, so no autograd history from
        # earlier preprocessing should be carried into every batch.
        self.trainX = torch.as_tensor(features, dtype=torch.float32).detach().to(targetDevice)
        self.trainY = torch.as_tensor(targets, dtype=torch.float32).detach().to(targetDevice)
        if len(self.trainX) != len(self.trainY):
            raise ValueError(
                'features and targets must have the same number of rows, got '
                f'{len(self.trainX)} and {len(self.trainY)}'
            )

    def __getitem__(self, index):
        """Return the (features, target) pair stored at ``index``."""
        return self.trainX[index], self.trainY[index]

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.trainX)

In [None]:
class modelClass(nn.Module):
    """Fully connected regressor that also owns its optimizer and loss.

    Architecture: four hidden ``Linear(inputDim, inputDim)`` layers, each followed by a
    ReLU (the first three with batch normalization in between), then a final
    ``Linear(inputDim, outputDim)`` layer.

    Parameters
    ----------
    inputDim : int
        Number of input features; also the width of every hidden layer.
    outputDim : int
        Number of values predicted per sample.
    lr : float, optional
        Learning rate of the Adam optimizer (default 0.0001).

    Attributes
    ----------
    output : torch.Tensor
        Loss of the most recent ``backward`` call, detached from the autograd graph.
    """

    def __init__(self, inputDim, outputDim, lr=0.0001):
        super(modelClass, self).__init__()
        self.LinearLayer1 = nn.Linear(inputDim, inputDim)
        self.LinearLayer2 = nn.Linear(inputDim, inputDim)
        self.LinearLayer3 = nn.Linear(inputDim, inputDim)
        self.LinearLayer4 = nn.Linear(inputDim, inputDim)
        self.LinearLayer5 = nn.Linear(inputDim, outputDim)
        # Activations take no size argument: the first positional parameter of nn.ReLU is
        # `inplace` and that of nn.LeakyReLU is `negative_slope`, so passing inputDim there
        # silently configured something else.
        self.ReLULayer1 = nn.ReLU()
        self.ReLULayer2 = nn.ReLU()
        self.ReLULayer3 = nn.ReLU()
        self.ReLULayer4 = nn.ReLU()
        # Not used in forward(); kept so the attribute still exists.
        self.LReLULayer1 = nn.LeakyReLU()
        self.BatchNorm1 = nn.BatchNorm1d(inputDim)
        self.BatchNorm2 = nn.BatchNorm1d(inputDim)
        self.BatchNorm3 = nn.BatchNorm1d(inputDim)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.lossMSE = nn.modules.MSELoss()
        self.output = torch.tensor([0.1])

    def forward(self, x):
        """Map a batch of feature rows to predictions."""
        x = self.ReLULayer1(self.BatchNorm1(self.LinearLayer1(x)))
        x = self.ReLULayer2(self.BatchNorm2(self.LinearLayer2(x)))
        x = self.ReLULayer3(self.BatchNorm3(self.LinearLayer3(x)))
        x = self.ReLULayer4(self.LinearLayer4(x))
        return self.LinearLayer5(x)

    def lossFunc(self, y, yhat):
        """Mean squared error between targets ``y`` and predictions ``yhat``."""
        return self.lossMSE(yhat, y)

    def backward(self, x, y):
        """Run one optimization step on the batch ``(x, y)``.

        Computes the loss, back-propagates, updates the parameters, clears the
        gradients, and stores the loss value in ``self.output``.
        """
        yhat = self(x)
        loss = self.lossFunc(y, yhat)
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
        # Store a detached copy so that keeping the loss around does not keep the
        # whole autograd graph of this batch alive.
        self.output = loss.detach()

In [None]:
# Training configuration
bs, epochs = 512, 500               # mini-batch size, number of passes over the data
inputDim = datascaled[0].shape[1]   # number of feature columns
outputDim = 1                       # one predicted value per row

In [None]:
# Wrap the scaled training tensors and serve them in shuffled mini-batches
datasetObj = datasetclass()
data_loader = DataLoader(dataset=datasetObj, batch_size=bs, shuffle=True)

In [None]:
# Build the network (inputDim features in, outputDim values out) and move it to the
# selected device; the trailing expression displays the layer summary.
modelObj = modelClass(inputDim, outputDim).to(device)
modelObj

In [None]:
# NOTE(review): this re-creates the DataLoader with the same arguments as the earlier cell
data_loader = DataLoader(datasetObj, batch_size=bs, shuffle=True)

In [None]:
# Train for `epochs` passes over the data and record one loss value per epoch
# (the mean of that epoch's batch losses).
loss_values = []
modelObj.train()  # training mode does not change inside the loop, so set it once
for epoch in range(epochs):
    batch_losses = []
    for x, y in data_loader:
        modelObj.backward(x, y)
        # .item() stores a plain Python float, so the history holds neither an autograd
        # graph nor device memory, and matplotlib can plot it directly.
        batch_losses.append(modelObj.output.item())
    if batch_losses:
        loss_values.append(sum(batch_losses) / len(batch_losses))

# Draw the curve once after training instead of re-plotting the growing list every epoch
plt.plot(loss_values)
plt.xlabel('epoch')
plt.ylabel('mean batch MSE (normalized SalePrice)')
print('epoch is ', epoch , 'Final loss is ',modelObj.output)



## Denormalization and Testing Module

In [None]:
# Feature columns, in the same order in which the training features were built.
# A set difference would return them in arbitrary order, so the columns of the test
# matrix would not line up with the inputs the model was trained on.
dataImpColumnsWithoutSalePrice = [column for column in dataImpColumns if column != 'SalePrice']

In [None]:
# Load the test features and the SalePrice column of sample_submission.csv.
# NOTE(review): the sample submission is treated as the "actual" target further down;
# confirm that it really holds ground-truth prices rather than placeholder values.
dataValid = pd.read_csv(os.path.join(path,'test.csv'))
sampleSubmission = pd.read_csv(os.path.join(path,'sample_submission.csv'))[['SalePrice']]

In [None]:
# Restrict the test frame to the model's feature columns and replace missing values
# with 0 so that no NaN reaches the network
dataValid = dataValid.loc[:, dataImpColumnsWithoutSalePrice].fillna(value=0)
sampleSubmission = sampleSubmission.fillna(value=0)
print(dataValid.shape,sampleSubmission.shape)

In [None]:
# Encode the test categories with the tables built from the training data.
# Reassigned instead of edited in place; infer_objects() converts the encoded object
# columns to a numeric dtype on pandas versions where replace() no longer downcasts
# object results automatically.
dataValid = dataValid.replace(lableEncodedColumnsDict).infer_objects()

In [None]:
# Normalize the columns
def normalizeValid(series, stats=None):
    """Scale a column with a previously recorded (mean, std) pair.

    Parameters
    ----------
    series : pandas.Series
        Column to scale; ``series.name`` selects the scaling pair.
    stats : dict, optional
        Mapping ``{column name: (mean, std)}``. Defaults to the module-level
        ``normalizeDict``.

    Returns
    -------
    pandas.Series
        ``(series - mean) / std``.

    Raises
    ------
    KeyError
        If no scaling pair is recorded for ``series.name``.
    """
    if stats is None:
        stats = normalizeDict
    columnName = series.name
    if columnName not in stats:
        raise KeyError(f'no scaling factors recorded for column {columnName!r}')
    mean, std = stats[columnName]
    # A recorded std of 0 (or NaN) would turn the column into NaN/inf; only centre it.
    if not std > 0:
        std = 1.0
    return (series - mean) / std

In [None]:
# Scale the test features and the reference target with the recorded scaling factors
datascaledTestX = dataValid.apply(normalizeValid)
datascaledTestY = sampleSubmission.apply(normalizeValid)

In [None]:
# Convert the scaled frames to float32 tensors
datascaledTestX = torch.tensor(datascaledTestX.values).float()
datascaledTestY = torch.tensor(datascaledTestY.values).float()

In [None]:
# Batch Input the columns
class datasetclassValid(Dataset):
    """Dataset over the scaled test tensors; item ``i`` is the pair (testX[i], testY[i])."""

    def __init__(self):
        super().__init__()
        # Both tensors are taken from the module-level scaled test data
        self.testX, self.testY = datascaledTestX, datascaledTestY

    def __len__(self):
        """Number of rows, taken from the first dimension of the feature tensor."""
        return self.testX.shape[0]

    def __getitem__(self, index):
        """Return the (features, target) pair stored at ``index``."""
        sample = (self.testX[index], self.testY[index])
        return sample

In [None]:
# shuffle=False keeps the batches in the same order as the rows of the test tensors
datasetclassValidObj = datasetclassValid()
data_loader_valid = DataLoader(dataset=datasetclassValidObj, batch_size=bs, shuffle=False)

In [None]:
# Scaling factors recorded for the target column; used to map predictions back to prices
mean,std = normalizeDict['SalePrice']

In [None]:
## Submission Frame
# Empty one-column frame that will collect the predicted SalePrice values
submit = pd.DataFrame(columns=['SalePrice'])

In [None]:
# Predict batch by batch and map the outputs back to the original SalePrice scale
loss = []  # not used in this cell; kept so the name stays defined
modelObj.eval()  # inference mode does not change inside the loop, so set it once
batch_frames = []
with torch.no_grad():  # no gradients are needed for prediction
    for x, y in data_loader_valid:
        prediction = modelObj(x.to(device)) * std + mean
        batch_frames.append(pd.DataFrame(prediction.cpu().numpy(), columns=['SalePrice']))

# DataFrame.append was removed in pandas 2.0. Concatenating once is also cheaper than
# growing the frame batch by batch, and re-running the cell no longer duplicates rows.
if batch_frames:
    submit = pd.concat(batch_frames, ignore_index=True)
else:
    submit = pd.DataFrame(columns=['SalePrice'])

In [None]:
# Show the last predicted rows
submit.tail()

In [None]:
# Write the predictions to disk without the frame index.
# NOTE(review): only the SalePrice column is written (no Id column) and the file name is
# spelled 'submisson.csv'; confirm both match what the submission consumer expects.
submit.to_csv('submisson.csv',index=False)

## Next Steps
1. Feature-importance analysis is needed to choose the features used for training and to improve accuracy.
2. EDA needs to be done.