In [None]:
import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.functional as F
from torch.utils.data import DataLoader   #for batch seperation
from torchvision.transforms import ToTensor  ####convert tensors to vectors
from torch.utils import data

In [None]:
# Load the training and test splits of the Kaggle House Prices competition data.
train_df=pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_df=pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Summarise the columns, their dtypes and non-null counts.
train_df.info()

In [None]:
# Display the training columns whose dtype is not 'object'.
train_df.select_dtypes( exclude= 'object')

In [None]:
# Keep the non-object columns of the training data in `f`.
f=train_df.select_dtypes( exclude= 'object')

In [None]:
# Correlation of every column in `f` with 'SalePrice' (one value per column).
correlations = f.corr()['SalePrice']

In [None]:
# Rank the columns of `f` by their correlation with 'SalePrice', strongest first.
# The correlation matrix is computed first; sorting f[['SalePrice']] directly would
# only order the raw sale prices and say nothing about feature correlations.
sorted_correlations = f.corr()[['SalePrice']].sort_values(by='SalePrice', ascending=False)

In [None]:
# Remove exact duplicate rows from the training data only.
# The test split is deliberately left untouched: its rows are later paired by
# position with the labels read from sample_submission.csv, so dropping any test
# row would leave features and labels with different lengths.
train_df = train_df.drop_duplicates()

## Removing unnecessary columns

In [None]:
# Columns removed from both splits before modelling.
unused_columns = [
    'Id', 'Street', 'Utilities', 'LandSlope', 'Condition2', 'RoofMatl',
    'CentralAir', 'LowQualFinSF', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', "MiscFeature", 'Fence', 'PoolQC', 'Alley', 'MasVnrType',
    'FireplaceQu',
]
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

## Merging correlated columns

In [None]:
# Add the engineered columns "n_bathrooms" and "area_with_basement" to both
# train_df and test_df (each frame is modified in place).
for df in (train_df, test_df):
    # Full baths count as 1 and half baths as 0.5, in the basement and above ground.
    df["n_bathrooms"] = (
        df["BsmtFullBath"]
        + df["BsmtHalfBath"] * 0.5
        + df["FullBath"]
        + df["HalfBath"] * 0.5
    )
    # Above-ground living area plus total basement area.
    df["area_with_basement"] = df["GrLivArea"] + df["TotalBsmtSF"]

In [None]:
# The source columns of the engineered features are now redundant, so they are
# removed from both splits.
merged_source_columns = [
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
    'GrLivArea', 'TotalBsmtSF',
]
train_df = train_df.drop(columns=merged_source_columns)
test_df = test_df.drop(columns=merged_source_columns)

In [None]:
# Re-check columns, dtypes and non-null counts of the training data at this point.
train_df.info()

In [None]:
# Allow up to 100 rows in displayed output before pandas truncates it.
pd.set_option('display.max_rows',100)
# Number of missing values in each training column.
train_df.isna().sum()

## Filling null values Categorical

#### Here `c` and `n` are lists containing, respectively, the categorical and numerical training columns that have null values

In [None]:
# Split the training columns that contain missing values into two lists:
# `c` for object-dtype columns and `n` for every other dtype.
train_null_columns = [
    name for name in train_df.columns if train_df[name].isnull().sum() > 0
]
c = [name for name in train_null_columns if train_df[name].dtype == 'O']
n = [name for name in train_null_columns if not (train_df[name].dtype == 'O')]
print(c,n)

### Here `cc` and `nnn` are lists containing, respectively, the categorical and numerical test columns that have null values

In [None]:
# Same split for the test data: `cc` holds the object-dtype columns with missing
# values, `nnn` the remaining columns with missing values.
test_null_columns = [
    name for name in test_df.columns if test_df[name].isnull().sum() > 0
]
cc = [name for name in test_null_columns if test_df[name].dtype == 'O']
nnn = [name for name in test_null_columns if not (test_df[name].dtype == 'O')]
print(cc,nnn)

### filling nan with some values

In [None]:
# Fill the missing values of every categorical training column listed in `c`.
# The value found in the column's second row is used when that row is populated.
# When the second row is itself missing, the column's most frequent value is used
# instead, so no column is silently left with NaNs (a fallback branch that merely
# repeats the first condition can never run).
for col in c:
    column = train_df[col]
    if not column.isnull().any():
        continue
    fill_value = column.iloc[1]
    if pd.isna(fill_value):
        modes = column.mode()  # mode() ignores missing values
        if modes.empty:
            # Column is entirely missing: there is nothing to fill with.
            continue
        fill_value = modes.iloc[0]
    # Assign the result back rather than calling fillna(inplace=True) on a column
    # selection, which can act on a temporary copy and leave train_df unchanged.
    train_df[col] = column.fillna(fill_value)

In [None]:
# Fill the missing values of every categorical test column listed in `cc`.
# The value found in the column's second row is used when that row is populated.
# When the second row is itself missing, the column's most frequent value is used
# instead, so no column is silently left with NaNs (a fallback branch that merely
# repeats the first condition can never run).
for col in cc:
    column = test_df[col]
    if not column.isnull().any():
        continue
    fill_value = column.iloc[1]
    if pd.isna(fill_value):
        modes = column.mode()  # mode() ignores missing values
        if modes.empty:
            # Column is entirely missing: there is nothing to fill with.
            continue
        fill_value = modes.iloc[0]
    # Assign the result back rather than calling fillna(inplace=True) on a column
    # selection, which can act on a temporary copy and leave test_df unchanged.
    test_df[col] = column.fillna(fill_value)

In [None]:
# Report the remaining null count of every categorical test column in `cc`,
# framed by separator lines.
for col in cc:
    print('-*-*'*10, col, test_df[col].isnull().sum(), '___+++__'*8, sep='\n')

In [None]:
# Report the remaining null count of every categorical training column in `c`,
# framed by separator lines.
for col in c:
    print('-*-*'*10, col, train_df[col].isnull().sum(), '___+++__'*8, sep='\n')

## Filling null values continuous

In [None]:
# Replace missing values in each numerical training column of `n` with the mean
# of that column's non-missing values.
for col in n:
    column_mean = train_df[col].mean()
    train_df[col] = train_df[col].fillna(column_mean)

In [None]:
# Replace missing values in each numerical test column of `nnn` with the mean of
# that column's non-missing values.
for col in nnn:
    column_mean = test_df[col].mean()
    test_df[col] = test_df[col].fillna(column_mean)

In [None]:
# Print each numerical test column of `nnn` followed by its remaining null count.
for col in nnn:
    print(col, test_df[col].isnull().sum(), sep='\n')

In [None]:
# Print each numerical training column of `n` followed by its remaining null count.
for col in n:
    print(col, train_df[col].isnull().sum(), sep='\n')

### fill nan values

In [None]:
# Pairwise correlations between the training columns listed in `n`.
a = train_df[n]
a.corr()

In [None]:
# Summary statistics of the training data, transposed so each column is a row.
train_df.describe().T

In [None]:
# Summary statistics restricted to the object-dtype training columns.
train_df.describe(include = 'O').T

In [None]:
# Summary statistics of the test data.
test_df.describe()

## training label

In [None]:
# Training target: the natural logarithm (base e) of 'SalePrice'.
train_out = np.log(train_df['SalePrice'])

In [None]:
# Display the log-transformed training target.
train_out

In [None]:
# The target has been extracted above, so remove it from the feature frame.
train_df = train_df.drop(columns='SalePrice')
train_df.head()

## testing label

In [None]:
# Read the 'SalePrice' column of sample_submission.csv and log-transform it so it
# is on the same scale as the training target.
# NOTE(review): sample_submission.csv presumably holds placeholder predictions
# rather than true sale prices - confirm before reading the resulting loss as a
# genuine test metric.
submission_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
test_actualout = np.log(submission_df['SalePrice'])
test_actualout

In [None]:
# Names of the object-dtype columns of train_df; select_dtypes replaces a manual
# loop over the columns that checks each dtype.
cate = train_df.select_dtypes(include  = 'O').columns
cate

In [None]:
# Number of object-dtype columns found.
len(cate)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode every categorical column in `cate` as integer labels.
# A fresh encoder is fitted per column on the categories of BOTH splits, so a given
# category maps to the same integer in train_df and test_df. Fitting separately on
# each split assigns codes by each split's own sorted categories, which makes the
# encoded test features inconsistent with what the model was trained on.
for col in cate:
    # astype(str) keeps the encoder input uniformly typed even if a missing value
    # survived the earlier filling step.
    train_values = train_df[col].astype(str)
    test_values = test_df[col].astype(str)
    l = LabelEncoder()
    l.fit(pd.concat([train_values, test_values], ignore_index=True))
    train_df[col] = l.transform(train_values)
    test_df[col] = l.transform(test_values)

In [None]:
# Display the training features after encoding.
train_df

In [None]:
# Display the test features after encoding.
test_df

In [None]:
# Show the Python type of train_df.
type(train_df)

In [None]:
# Pairwise correlations between all training feature columns.
train_df.corr()

In [None]:
# Collect the dtype of every training column into a one-column frame `g`.
g = pd.DataFrame(train_df.dtypes,columns=['dtype'])
# NOTE(review): this displays train_df, not `g` - confirm which one was meant to be shown.
train_df

## Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features: the scaler is fitted on the training features only and
# the same fitted transform is then applied to the test features.
scaler = StandardScaler()
X_train=scaler.fit_transform(train_df)
X_test = scaler.transform(test_df)

## dataframe to torch dataloaders

In [None]:
# Variable is a deprecated no-op wrapper around tensors and is no longer used in
# this cell; the import is retained only in case other cells still reference it.
from torch.autograd import Variable
# Convert the features and targets to float32 tensors.
# torch.as_tensor accepts both NumPy arrays and existing tensors, so this cell can
# be re-run without failing on inputs that are already tensors. The pandas targets
# are converted through NumPy explicitly instead of relying on how torch.tensor
# interprets a Series.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
Y_train = torch.as_tensor(train_out.to_numpy(), dtype=torch.float32)
X_test = torch.as_tensor(X_test, dtype=torch.float32)
Y_test = torch.as_tensor(test_actualout.to_numpy(), dtype=torch.float32)

In [None]:
# Pair every training row with its target and serve them in shuffled mini-batches.
batch = 64
train_dataset = data.TensorDataset(X_train, Y_train)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch)

In [None]:
# Pair every test row with its label and serve them in mini-batches.
test_dataset = data.TensorDataset(X_test,Y_test)
batch_size = 64
# Evaluation data is not shuffled: ordering has no effect on the averaged loss, and
# a fixed order keeps any per-row output aligned with the rows of the test set.
test_dataloader = DataLoader(test_dataset,batch_size = batch_size,shuffle = False)

## Activation function

In [None]:
import torch.nn as nn

# Standalone ReLU activation instance.
relu = nn.ReLU()
# Display the training feature tensor.
(X_train)

## Neuralnetwork Architecture

In [None]:
class Neuralnetwork(nn.Module):
    """Fully connected regression network: n_input -> n_hidden -> 256 -> n_output.

    Both hidden layers are followed by a ReLU. The output layer is left linear:
    a ReLU on a regression output clamps predictions at zero and gives zero
    gradient whenever the pre-activation is negative, which can stop the network
    from learning at all.

    Args:
        n_input: number of input features per sample.
        n_hidden: width of the first hidden layer.
        n_output: number of values predicted per sample.
    """

    def __init__(self,n_input,n_hidden,n_output):
        super().__init__()
        self.layer1 = nn.Linear(n_input,n_hidden)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(n_hidden,256)
        self.out = nn.Linear(256,n_output)

    def forward(self,x):
        """Return predictions of shape (batch, n_output) for x of shape (batch, n_input)."""
        # Only the module's own activation is used, so the class does not depend
        # on any notebook-level variable being defined.
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        return self.out(x)

## GPU & optimizer Setup

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One input per feature column, a 512-unit first hidden layer and a single output.
model = Neuralnetwork(X_train.shape[1],512,1)
model = model.to(device)
# Mean-squared-error loss, optimised with plain SGD at a learning rate of 0.01.
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(),lr = 0.01)
model

## Training

In [None]:
def training(train_dataloader,model,loss_,optimizer):
    """Run one training epoch over ``train_dataloader``.

    Args:
        train_dataloader: iterable yielding ``(X, y)`` batches of tensors.
        model: the ``nn.Module`` to train; batches are moved to the device its
            parameters live on.
        loss_: callable ``loss_(pred, target)`` returning a scalar tensor.
        optimizer: optimizer built over ``model.parameters()``.

    The loss of every fifth batch is printed.
    """
    model.train()

    # Take the device from the model itself instead of a notebook-level variable,
    # so the function works no matter which cells have been run.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    for batch, (X, y) in enumerate(train_dataloader):
        X = X.to(target_device)
        y = y.to(target_device)

        pred = model(X)
        # The model returns (batch, 1) while the targets arrive as (batch,). Left
        # as is, the loss would broadcast the pair to (batch, batch) and compare
        # every prediction with every target, so align the target first.
        if y.shape != pred.shape and y.numel() == pred.numel():
            y = y.reshape(pred.shape)
        los = loss_(pred, y)

        # Backpropagation: clear stale gradients, compute new ones, update weights.
        optimizer.zero_grad()
        los.backward()
        optimizer.step()

        if batch % 5 == 0:
            print(f'loss:{los.item()}')

In [None]:
from sklearn.metrics import mean_absolute_error
#mean_absolute_error(y.detach().cpu().numpy(), pred.detach().cpu().numpy())
##.detach().cpu().numpy()   this is to move between cpu and gpu 

In [None]:
def test(model,data,loss_):
    model.eval()
    test_loss = 0
    size  = len(X_test)
    
    with torch.no_grad():
        
        for batch,(X,y) in enumerate(data):
            X = X.to(device)
            y = y.to(device)
    
            pred = model(X)
        
            batch_loss = loss_fn(pred, y)
            #print('test_loss is ',test_loss)
            #print('batch_loss is          ',batch_loss)
            #print('batch_loss of items is           ',   batch_loss.item())
            
            test_loss += batch_loss.item()
    
        average_loss = test_loss / size
        print(f'Test Loss: {average_loss:.4f}')

In [None]:
# Train for `epoch` passes over the training data, evaluating after each pass.
epoch = 10
for t in range(epoch):
    # t is zero-based; report epochs starting from 1.
    print('-----------------epoch number -----',t + 1)
    training(train_dataloader,model,loss_fn,optimizer)
    test(model,test_dataloader,loss_fn)