#Prerequisite....

01. Theoretical knowledge of Deep Learning
02. ANN with Pytorch
03. Feature Engineering (Categorical -- Embedding Layer, Continuous Variables)
04. Pythonic Class to create feed Forward NN's


### Tabular Dataset --> a dataset organized in rows and columns

#### Handling the Data
01. Categorical Features -- Embedding Layers (in PyTorch)
02. Continuous Features -- NA  

In [None]:
from google.colab import files
file = files.upload()

In [None]:
import pandas as pd

In [None]:
#Lets Take most important Columns based on previous experience and dropping all NAN values
df=pd.read_csv('houseprice.csv',usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
                                         "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Report how many distinct values each column holds.
# Columns with only a few distinct values are the likely categorical features.
for column_name in df.columns:
    distinct_count = len(df[column_name].unique())
    print("Column name {} and unique values are {}".format(column_name, distinct_count))

In [None]:
#Lets fix feature "YearBuilt" like difference between year built and todays date
#It will help to see Age of Buildings
import datetime
datetime.datetime.now().year

In [None]:
df["Total Years"] = datetime.datetime.now().year-df['YearBuilt']

In [None]:
df.drop("YearBuilt", axis = 1, inplace = True)

In [None]:
df.columns

In [None]:
# Split the columns into categorical inputs and the regression target.
# Categorical columns: label-encoded below and later fed to embedding layers.
cat_features = ["MSSubClass", "MSZoning", "Street", "LotShape"]
# Target column; the name must match the "SalePrice" column loaded from the CSV.
out_features = "SalePrice"

In [None]:
df["MSSubClass"].unique()

In [None]:
# Replace every category of each categorical column with an integer index.
# One-hot encoding is deliberately not used: the embedding layers need a single
# integer label per category. LabelEncoder sorts the distinct values and assigns
# consecutive labels, which later serve as embedding lookup indices.
from sklearn.preprocessing import LabelEncoder
lbl_encoders = {column: LabelEncoder() for column in cat_features}
for column, encoder in lbl_encoders.items():
    df[column] = encoder.fit_transform(df[column])

df

In [None]:
import numpy as np

In [None]:
# Stacking and Converting Into Tensors

cat_features = np.stack([df["MSSubClass"], df["MSZoning"], df["Street"], df["LotShape"]],1)
cat_features

In [None]:
# Convert numpy to Tensors
# IMPORTANT ---> Categorical featires never be converted in to float
import torch
cat_features = torch.tensor(cat_features, dtype = torch.int64)
cat_features

In [None]:
# Continuous features: every column that is neither categorical nor the target.
non_continuous = ["MSSubClass", "MSZoning", "Street", "LotShape", "SalePrice"]
cont_features = [column for column in df.columns if column not in non_continuous]
cont_features

In [None]:
#Stacking Coninuous variable to a tensor
cont_values = np.stack([df[i].values for i in cont_features], axis = 1)
cont_values = torch.tensor(cont_values, dtype = torch.float)
cont_values

In [None]:
cont_values.dtype

In [None]:
# Dependent Features
y = torch.tensor(df["SalePrice"].values, dtype=torch.float).reshape(-1,1) #using reshape will give you 2D tensor

In [None]:
df.info()

In [None]:
#Shape of cat, cont and y features
cat_features.shape, cont_values.shape, y.shape

# Embedding for Categorical Columns

In [None]:
len(df["MSSubClass"].unique())

In [None]:
#Counting unique values in each columns
cat_dims = [len(df[col].unique()) for col in ["MSSubClass","MSZoning","Street","LotShape"]]
cat_dims #Input Dimension for NN layers

In [None]:
# Embedding size rule of thumb: output dimension = min(50, (number of categories + 1) // 2),
# i.e. half the category count, rounded up, and never more than 50.
# Example: a column with 15 distinct values gets min(50, (15 + 1) // 2) = 8
# (adding 1 before the integer division rounds up; plain 15 // 2 would give 7).
# Each entry is an (input dimension, output dimension) pair for one categorical column.
embedding_dim = [
    (n_categories, min(50, (n_categories + 1) // 2))
    for n_categories in cat_dims
]

In [None]:
embedding_dim
#for "MSSubClass" my input dimension is 15 and outpur dimension is 8
#for "MSZoning" my input dimension is 5 and outpur dimension is 3


#*Data Pre Processing steps Completed..........................*

# Creating Embedding Layer and NN

In [None]:
import torch.nn as nn
import torch.nn.functional as F
embed_representation = nn.ModuleList([nn.Embedding(inp,out) for inp,out in embedding_dim])
embed_representation

In [None]:
cat_features

In [None]:
#just for example...
cat_featuresz = cat_features[:4]
cat_featuresz

In [None]:
# Look up the embedding vectors for every categorical column:
# the k-th embedding layer is applied to the k-th column of cat_features.

pd.set_option('display.max_rows', 500)
embedding_val = [
    layer(cat_features[:, col_idx])
    for col_idx, layer in enumerate(embed_representation)
]

embedding_val


#So the first value 5 in cat_featuresz will be represented as 8 values in first row of embedding_val
#value 0 in cat_featuresz will be represented as 8 values in second row of embedding_val
# ..... for all "MSSubClass" 
#Next we will get only 3 values for second column...
#SO On,,,,,,

In [None]:
# embedding_val not stcked properly so,

z = torch.cat(embedding_val,1) #cat--concat in torch
z

In [None]:
## Implement Dropout to overcome overfitting
dropout = nn.Dropout(.4) #.4 = 40%


In [None]:
final_embed = dropout(z)
final_embed

In [None]:
##### Create a Feed Forward Neural Network
import torch
import torch.nn as nn
import torch.nn.functional as F
class FeedForwardNN(nn.Module):
    """Feed-forward network for tabular data with categorical and continuous inputs.

    Each categorical column goes through its own embedding layer; the embedding
    outputs are concatenated, passed through dropout, joined with the
    batch-normalised continuous features and fed through a stack of
    Linear -> ReLU -> BatchNorm1d -> Dropout blocks followed by a final Linear layer.

    Args:
        embedding_dim: iterable of (number of categories, embedding size) pairs,
            one per categorical column, in column order.
        n_cont: number of continuous input features.
        out_sz: number of output units.
        layers: sizes of the hidden layers, in order. May be empty, in which case
            the concatenated inputs are connected directly to the output layer.
        p: dropout probability used after the embeddings and after every hidden layer.
    """

    def __init__(self, embedding_dim, n_cont, out_sz, layers, p=0.5):
        super().__init__()  # initialise nn.Module bookkeeping (parameter/sub-module registration)
        # Materialise once: the pairs are iterated twice below.
        embedding_dim = list(embedding_dim)

        self.embeds = nn.ModuleList([nn.Embedding(inp, out) for inp, out in embedding_dim])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)  # normalises the continuous features

        n_emb = sum(out for _, out in embedding_dim)  # total width of all embeddings
        n_in = n_emb + n_cont  # width of the concatenated input

        layerlist = []
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width
        # n_in is the last hidden width, or the raw input width when `layers` is
        # empty (indexing layers[-1] would raise IndexError in that case).
        layerlist.append(nn.Linear(n_in, out_sz))

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Run the network.

        Args:
            x_cat: integer tensor indexed as x_cat[:, i], holding the category
                index of the i-th categorical column for every row.
            x_cont: tensor of continuous features; it is concatenated with the
                embeddings along dimension 1.

        Returns:
            The output of the final linear layer.
        """
        embeddings = [embed(x_cat[:, i]) for i, embed in enumerate(self.embeds)]
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        return self.layers(x)

In [None]:
len(cont_features)

In [None]:
torch.manual_seed(100) #If u use 100 then the same parameters will be assigned over every itiration
model = FeedForwardNN(embedding_dim,len(cont_features),1,[100,50],p=0.4)


In [None]:
model

#Define Loss And Optimizer

In [None]:
model.parameters

In [None]:
loss_function = nn.MSELoss() #Later convert this in to RMSE
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01) #model.parameters --> these are the generator and it will throw output one by one | to know this u can use "model.parameters()"

In [None]:
df.shape

In [None]:
print(cont_values)
print(cont_values.shape)

In [None]:
# Train/test split: sequential slices, no shuffling
batch_size = 1200
test_size = int(batch_size * 0.15)  # 15% of batch_size, i.e. 180 rows held out for testing
split_at = batch_size - test_size   # rows [0, 1020) train, rows [1020, 1200) test

train_categorical, test_categorical = cat_features[:split_at], cat_features[split_at:batch_size]
train_cont, test_cont = cont_values[:split_at], cont_values[split_at:batch_size]
y_train, y_test = y[:split_at], y[split_at:batch_size]

In [None]:
len(train_categorical),len(test_categorical),len(train_cont),len(test_cont),len(y_train),len(y_test)

In [None]:

epochs = 500
final_losses = []
# Make sure dropout and batch norm run in training mode, even if the model
# was switched to eval mode earlier in the session.
model.train()
for epoch in range(1, epochs + 1):
    y_pred = model(train_categorical, train_cont)
    loss = torch.sqrt(loss_function(y_pred, y_train))  # RMSE
    # Record a plain float: storing the tensor itself keeps it attached to
    # autograd and it cannot be handed to NumPy/matplotlib later.
    final_losses.append(loss.item())
    if epoch % 10 == 1:
        print("Epoch number: {} and the loss : {}".format(epoch, loss.item()))
    optimizer.zero_grad()
    loss.backward()  # backpropagation
    optimizer.step()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Convert every recorded loss to a plain float before plotting: tensors that
# still require grad cannot be converted to NumPy by matplotlib.
plt.plot(range(epochs), [float(loss_value) for loss_value in final_losses])
plt.ylabel('RMSE Loss')
plt.xlabel('epoch');

In [None]:
#### Validate the Test Data
# Switch to evaluation mode: dropout is disabled and batch norm uses its
# running statistics instead of the statistics of the test batch.
model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    y_pred = model(test_categorical, test_cont)
    loss = torch.sqrt(loss_function(y_pred, y_test))  # RMSE on the held-out rows
print('RMSE: {}'.format(loss))

In [None]:
data_verify=pd.DataFrame(y_test.tolist(),columns=["Test"])

In [None]:
data_predicted=pd.DataFrame(y_pred.tolist(),columns=["Prediction"])

In [None]:
data_predicted

In [None]:
final_output=pd.concat([data_verify,data_predicted],axis=1)
final_output['Difference']=final_output['Test']-final_output['Prediction']
final_output.head()

In [None]:
#### Saving The Model
#### Save the model
# Extension for saving pytorch model is ".pt"
torch.save(model,'HousePrice.pt')

In [None]:
torch.save(model.state_dict(),'HouseWeights.pt')

In [None]:
### Loading the saved Model
embs_size=[(15, 8), (5, 3), (2, 1), (4, 2)]
model1=FeedForwardNN(embs_size,5,1,[100,50],p=0.4)

In [None]:
# Load the saved weights (state_dict) from disk into the freshly built model
saved_state = torch.load('HouseWeights.pt')
model1.load_state_dict(saved_state)

In [None]:
model1.eval()


# ..............................THANK YOU............................
---

