<a href="https://colab.research.google.com/github/sharlenechen0113/Real-Estate-Price-Prediction/blob/main/model_resnet_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the CSV and the map images used by the
# later cells are read from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.datasets as dset
import torchvision.transforms as T
import torchvision.models
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import numpy as np
import pandas as pd
from PIL import Image
from sklearn import preprocessing, metrics, model_selection

In [None]:
# Runtime configuration shared by the training and evaluation cells.
USE_GPU = True
dtype = torch.float

# CUDA is used only when it is both requested and actually available.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

# Constant to control how frequently we print train loss
print_every = 100

print('using device:', device)

In [None]:
# Placeholder values; a later cell assigns the actual numbers.
train_mean = train_std = 0.0

In [None]:
# Build the lookup tables that connect a map-image file to an integer index.
from os import listdir, path

# os.listdir returns entries in arbitrary order; sorting keeps the
# file -> index assignment identical from one run to the next.
images_list = sorted(listdir('/content/drive/MyDrive/SC201_Final_Project/images/map'))

# image_dic_index:  integer index -> file name without its extension
# image_dic_latlng: file name without its extension -> integer index
# splitext removes whatever extension is present instead of assuming that the
# last four characters are always something like ".png".
image_dic_index = {i: path.splitext(name)[0] for i, name in enumerate(images_list)}
image_dic_latlng = {stem: i for i, stem in image_dic_index.items()}

# Print a short summary instead of dumping every file name into the output.
print('{} map images found'.format(len(images_list)))
print(images_list[:5])
print(list(image_dic_latlng.items())[:5])

In [None]:
# Column means of the most recent training split (column name -> mean).
# Filled by data_preprocess(mode='Train') and reused to impute mode='Test' data.
nan_cache = {}
def data_preprocess(filename,mode='Train',random_state=None):
  """
  Read the housing CSV, attach each row's image index and impute missing values.

  Inputs:
  - filename: path of the CSV file to read.
  - mode: 'Train' splits the rows 80/20 into train/validation sets and imputes
    NaNs with the column means of the training part; 'Test' imputes NaNs with
    the means remembered in nan_cache by the last 'Train' call.
  - random_state: (Optional) seed forwarded to train_test_split so the split
    can be reproduced; None keeps the previous unseeded behaviour.

  Returns:
  - mode == 'Train': (train_data, val_data, train_y, val_y), where the *_y
    Series are the 'price_30000' column removed from the feature frames.
  - mode == 'Test': the imputed DataFrame. If no 'Train' call has filled
    nan_cache yet, nothing is imputed.

  Raises:
  - KeyError if a row's key has no entry in image_dic_latlng.
  - ValueError if mode is neither 'Train' nor 'Test'.
  """
  data = pd.read_csv(filename)
  # One key is written as '25_...' in the CSV; rewrite it to the '25.0_...'
  # form (presumably the spelling used by the image file name - confirm).
  data.loc[data['lat_lng'] == '25_121.50227','lat_lng'] = '25.0_121.50227'
  # The third CSV column holds the lookup key (presumably 'lat_lng' - TODO
  # confirm). .iloc makes the positional access explicit; the former row[2]
  # relied on a deprecated positional fallback of Series.__getitem__.
  data['index'] = [image_dic_latlng[key] for key in data.iloc[:, 2]]
  if mode == 'Train':
    # split data to train and validation datasets
    train_data, val_data = model_selection.train_test_split(
        data, test_size=0.2, random_state=random_state)
    # numeric_only skips the string columns, which make DataFrame.mean()
    # raise on recent pandas versions.
    means = train_data.mean(numeric_only=True)
    train_data = train_data.fillna(means)
    val_data = val_data.fillna(means)
    # Remember one scalar per column; the old loop stored the whole Series
    # under every key, which made the 'Test' branch unusable.
    nan_cache.clear()
    nan_cache.update(means.to_dict())
    train_y = train_data.pop('price_30000')
    val_y = val_data.pop('price_30000')
    return train_data, val_data, train_y, val_y
  elif mode == 'Test':
    return data.fillna(nan_cache)
  raise ValueError("mode must be 'Train' or 'Test', got {!r}".format(mode))

In [None]:
FILE = '/content/drive/MyDrive/SC201_Final_Project/Data/new_data_0902.csv' # Path of the CSV passed to data_preprocess; change it to your own file path

In [None]:
# Split the CSV into training/validation features and labels
# (data_preprocess holds out 20% of the rows for validation).
train_data, val_data, train_y, val_y = data_preprocess(FILE,mode='Train')
print(val_data.shape)


In [None]:
# NOTE(review): presumably the mean / standard deviation of the price target
# taken from an earlier run - they are not used by any cell shown here; confirm.
train_mean = 154170.694
train_std = 79570.40139

# Columns excluded from the model input.
to_drop = ['unit_price','price_15000','zoning', 'lat','lng','lat_lng','unit_berth_price','compartmented','management_committee','floors_area','establishment','clothing_store','home_goods_store','store','local_government_office','university','natural_feature','health','tourist_attraction','transit_station','food']

# One drop() call per frame: a missing column raises KeyError before anything
# is removed, whereas the former pop() loop could leave the frames half-stripped.
train_data = train_data.drop(columns=to_drop)
val_data = val_data.drop(columns=to_drop)

print(train_data.columns)
print(val_data.shape)

In [None]:
# load a single map image from Drive as a tensor
def load_imgs(lat_lng):
  """
  Open the map image '<lat_lng>.png' and return it as a tensor.

  Inputs:
  - lat_lng: file name (without the '.png' extension) of an image inside the
    project's images/map folder.

  Returns: the result of torchvision's ToTensor transform applied to the image
  after conversion to RGB.
  """
  image_path = '/content/drive/MyDrive/SC201_Final_Project/images/map/{}.png'.format(lat_lng)
  # The context manager closes the underlying file explicitly instead of
  # relying on Pillow's lazy clean-up; convert() returns an independent image,
  # so it stays valid after the file is closed.
  with Image.open(image_path) as image_file:
    rgb_image = image_file.convert('RGB')
  return T.ToTensor()(rgb_image)

In [None]:
class HousingDataset(torch.utils.data.Dataset):
  """Dataset yielding (feature row, map image tensor, label) triples."""
  def __init__(self,dataset,labels):
    """Store the feature table and the labels as float32 tensors."""
    label_column = torch.tensor(labels.values,dtype=torch.float32)
    self.y_train = label_column.view(len(labels),1)  # one label per row, shape (N, 1)
    self.x_train = torch.tensor(dataset.values,dtype=torch.float32)
    print("HousingDataset shape {}, {}".format(self.x_train.shape, self.y_train.shape))
  def __len__(self):
    """Number of samples."""
    return self.y_train.shape[0]
  def __getitem__(self,idx):
    """Return the full feature row, the image it points to, and its label."""
    features = self.x_train[idx]
    # Column 71 of the feature row carries the image index as a float.
    image_name = image_dic_index[int(features[71])]
    return features, load_imgs(image_name), self.y_train[idx]

In [None]:
# A Dataset hands out one example at a time; wrapping it in a DataLoader groups
# the examples into mini-batches of BATCH_SIZE. The training loader reshuffles
# on every pass, the 'Test' loader keeps the dataset order.
BATCH_SIZE = 32
def form_minibatch(data,mode='Train'):
  """
  Wrap a dataset in a DataLoader.

  Inputs:
  - data: the dataset to batch.
  - mode: 'Train' gives a shuffling loader, 'Test' a sequential one.

  Returns: the DataLoader; for any other mode, `data` itself unchanged.
  """
  if mode not in ('Train', 'Test'):
    return data
  return DataLoader(data, batch_size=BATCH_SIZE, shuffle=(mode == 'Train'))

# Training batches are reshuffled; validation batches keep the row order.
train_dl = form_minibatch(HousingDataset(train_data, train_y), mode='Train')
eval_dl = form_minibatch(HousingDataset(val_data, val_y), mode='Test')
# len() of a DataLoader is its number of mini-batches.
print("training size {}, eval size {}".format(*map(len, (train_dl, eval_dl))))

In [None]:
# model = MyModel()
class MyModel(nn.Module):
    """
    Two-branch classifier: an MLP over the tabular features and a CNN over the
    map image. Each branch outputs `output_size` values; the two outputs are
    concatenated and mapped to `output_size` scores by one linear layer.
    """
    def __init__(self,input_size,output_size):
        """
        Inputs:
        - input_size: number of tabular features fed to the MLP branch.
        - output_size: width of each branch's output and of the final scores.
        """
        super().__init__()
        # Tabular branch: input_size -> 60 -> 45 -> output_size, with batch
        # normalisation after every linear layer.
        self.nn = nn.Sequential(
        nn.Linear(input_size, 60),
        nn.BatchNorm1d(60),
        nn.ReLU(),
        nn.Linear(60, 45),
        nn.BatchNorm1d(45),
        nn.ReLU(),
        nn.Linear(45, output_size),
        nn.BatchNorm1d(output_size),
        )
        # Original note: "epoch 25-29" (meaning unclear - presumably the epochs
        # this configuration was trained for; confirm).
        # NOTE(review): newer torchvision releases replace `pretrained=` with
        # a `weights=` argument - confirm against the installed version.
        resnet = torchvision.models.resnet50(pretrained=True)
        # The backbone parameters stay trainable, i.e. they are fine-tuned
        # together with the rest of the model rather than frozen.
        for param in resnet.parameters():
          param.requires_grad = True
        

        # Image branch. Module order inside this Sequential fixes both the
        # parameter names and the order of random initialisation.
        self.features2 = nn.Sequential(
              # Resample every input image to 224 x 224.
              nn.AdaptiveAvgPool2d((224,224)),
              # First six child modules of the ResNet-50 (presumably the stem
              # plus the first two residual stages - verify against torchvision).
              *list(resnet.children())[:6],

              # expected N x 512 x 28 x 28
              nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, padding=1),
              nn.BatchNorm2d(256),
              nn.ReLU(),

              # N x 256 x 28 x 28
              nn.Conv2d(in_channels=256, out_channels=128, kernel_size=3, padding=1),
              nn.BatchNorm2d(128),
              nn.ReLU(),

              # N x 128 x 28 x 28
              nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, padding=1),
              nn.BatchNorm2d(64),
              nn.ReLU(),

              # N x 64 x 28 x 28  
              nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, padding=1),
              nn.BatchNorm2d(32),
              nn.ReLU(),

              nn.MaxPool2d(kernel_size=2, stride=2),  

              # N x 32 x 14 x 14
              nn.Conv2d(in_channels=32, out_channels=16, kernel_size=3, padding=1),
              nn.BatchNorm2d(16),
              nn.ReLU(),

              # N x 16 x 14 x 14
              nn.Conv2d(in_channels=16, out_channels=8, kernel_size=3, padding=1),
              nn.BatchNorm2d(8),
              nn.ReLU(),

              nn.MaxPool2d(kernel_size=2, stride=2),

              # N x 8 x 7 x 7 (the Linear layer below requires exactly 8*7*7 inputs)
              
              nn.Flatten(),
              nn.Linear(8*7*7,output_size)
          )
        # Scalar that train() hands to the learning-rate scheduler; starts at 0.
        self.metrics = 0
        # Fuses the two concatenated branch outputs into the final scores.
        self.fc_out = nn.Sequential(
            nn.Linear(output_size*2,output_size)
        )


    def forward(self,x1,x2):
        """
        Inputs:
        - x1: batch of tabular feature rows for the MLP branch.
        - x2: batch of map images for the CNN branch.

        Returns: scores with one row per sample and `output_size` columns.
        """
        x1 = self.nn(x1)
        x1 = x1.view(x1.size(0), -1)  # keep the batch dimension, flatten the rest
        x1 = F.relu(x1)

        x2 = self.features2(x2)
        x2 = x2.view(x2.size(0), -1)
        x2 = F.relu(x2)
        # Concatenate in dim1 (feature dimension)
        x = torch.cat((x1, x2),1)
        scores = self.fc_out(x)
        return scores

# 71 tabular input features, 33 output scores per sample.
net = MyModel(input_size=71, output_size=33)

# NOTE(review): weight_decay values of 0.99 / 0.98 are far larger than commonly
# used L2 penalties - confirm they are intentional.
optimizer = optim.Adam(net.parameters(), lr=1e-4, weight_decay=0.99)
optimizer_SGD_momentum = optim.SGD(net.parameters(), lr=1e-5, momentum=0.9, weight_decay=0.98)

# The scheduler is attached to the Adam optimizer only.
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10)

In [None]:
# Training loss of every iteration, appended by train() and plotted afterwards.
loss_history=[]
def train(data1, data2, model, optimizer, epochs=1, scheduler=None):
    """
    Train a model on real estate property features and extracted map
    features using the PyTorch Module API.

    Inputs:
    - data1: training data; yields (features, image, label) batches
    - data2: validation data, passed to check_accuracy
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    - scheduler: (Optional) learning-rate scheduler stepped once per epoch with
      the mean training loss; defaults to the module-level `lr_scheduler`.
      Pass one explicitly when `optimizer` is not the optimizer that
      `lr_scheduler` was built for.

    Returns: Nothing. Prints the loss and validation accuracy during training,
    appends every iteration's loss to `loss_history`, and stores the mean loss
    of the latest epoch in `model.metrics`.
    """
    if scheduler is None:
        scheduler = lr_scheduler
    num_features = 71  # leading feature columns; the column after them is the image index
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    criterion = nn.CrossEntropyLoss()  # stateless, so it is built once
    for e in range(epochs):
        epoch_loss = 0.0
        num_batches = 0
        for t, (x,x2,y) in enumerate(data1):
            # Re-enter training mode on every iteration because
            # check_accuracy() below leaves the model in evaluation mode.
            model.train()
            x1 = x[:, :num_features].to(device=device, dtype=dtype)
            x2 = x2.to(device=device,dtype=dtype)
            # Flatten (N, 1) labels to (N,). torch.squeeze would also drop the
            # batch dimension when a batch holds a single sample.
            y = y.to(device=device, dtype=torch.long).reshape(-1)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            scores = model(x1,x2)
            loss = criterion(scores,y)

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each  parameter of the model.
            loss.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()

            loss_value = loss.item()
            loss_history.append(loss_value)
            epoch_loss += loss_value
            num_batches += 1

            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss_value))
                print('Checking accuracy on validation set')
                num_correct, num_samples, acc = check_accuracy(data2, model)
                print('Epoch %d, Got %d / %d correct (%.2f)' % (e,num_correct, num_samples, acc))
        # Give the scheduler a real signal: the value it used to receive was
        # never updated from its initial 0.
        if num_batches:
            model.metrics = epoch_loss / num_batches
        scheduler.step(model.metrics)


In [None]:
def check_accuracy(data, model):
    """
    Measure classification accuracy of `model` on `data`.

    Inputs:
    - data: iterable of (features, image, label) batches
    - model: the model to evaluate; it is switched to evaluation mode and
      left in that mode on return.

    Returns: (num_correct, num_samples, accuracy in percent). The counts are
    Python ints; the accuracy is 0.0 when `data` yields no samples.
    """
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for x,x2,y in data:
            # The first 71 columns are the model's tabular inputs.
            x1 = x[:, :71].to(device=device, dtype=dtype)  # move to device, e.g. GPU
            x2 = x2.to(device=device,dtype=dtype)
            # Flatten (N, 1) labels to (N,) without losing the batch dimension.
            y = y.to(device=device, dtype=torch.long).reshape(-1)
            scores = model(x1,x2)
            _, preds = scores.max(1)
            # Accumulate a plain int so the caller never receives a device tensor.
            num_correct += int((preds == y).sum().item())
            num_samples += preds.size(0)
    # Guard the division so an empty loader does not raise ZeroDivisionError.
    acc = num_correct / num_samples if num_samples else 0.0
    return num_correct, num_samples, 100*acc

In [None]:
# Train `net` with the Adam optimizer for 28 epochs, validating on eval_dl.
train(train_dl, eval_dl, net,optimizer,epochs=28)

In [None]:
import matplotlib.pyplot as plt

# Plot the values stored in loss_history against their position in the list.
fig, ax = plt.subplots()
ax.plot(range(len(loss_history)), loss_history)
ax.set_xlabel('iteration')
ax.set_ylabel('training loss')
ax.set_title('Training Loss history')
plt.show()