In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import math

from PIL import Image
from numpy import asarray

import torch
from torch import nn
from torch.nn import functional as F
import torchvision
import torchvision.transforms as transforms

from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import cv2
# processing
from sklearn.model_selection import train_test_split


# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))



# Loading data

In [None]:
# Read the listing table and take a first look at its size, leading rows, schema and summary statistics.
data = pd.read_csv('../input/house-prices-and-images-socal/socal2.csv')
print(data.shape)
display(data.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None".
data.info()
display(data.describe())

In [None]:
# Histogram of the `price` column.
data['price'].hist()

In [None]:
# Histogram of the `sqft` column.
data['sqft'].hist()

## Loading images

In [None]:
# Exploratory scan: decode every picture, collect the distinct array shapes and
# count how many differ from the dominant (311, 415, 3) layout.
print(data.image_id.max())
imgs = []
img_shapes = []
c = 0
# range() stops before its end value, so add 1 to also inspect the image whose
# id equals the maximum.
for i in range(data.image_id.max() + 1):
    try:
        # The context manager closes the file handle once the pixels are copied out.
        with Image.open('../input/house-prices-and-images-socal/socal2/socal_pics/{}.jpg'.format(i)) as image:
            img_data = asarray(image)
        # Channels-first copy; raises ValueError for arrays that are not 3-D,
        # in which case the shape is not recorded below.
        # NOTE(review): this keeps every decoded full-size image in memory.
        imgs.append(np.transpose(img_data, (2,0,1)))
        img_shapes.append(img_data.shape)
        if (311, 415, 3) != img_data.shape:
            c+=1
    except (OSError, ValueError) as err:
        # Best effort: report the offending id (missing/unreadable file or
        # unexpected array rank) and keep scanning.
        print(i, type(err).__name__)

print(len(imgs))
print(set(img_shapes))
print(c)

We keep only the images whose shape is (311, 415, 3), along with the data rows that reference them.

In [None]:
# Collect the ids of every picture that cannot be used: wrong array shape or
# unreadable file. Rows referencing these ids are filtered out before the split.
exclude_list = []
# range() stops before its end value, so add 1 to also check the largest id.
for i in range(data.image_id.max() + 1):
    if i%1000==0:
        print(i)
    try:
        # Close each file as soon as its pixels have been copied into an array.
        with Image.open('../input/house-prices-and-images-socal/socal2/socal_pics/{}.jpg'.format(i)) as image:
            pixels = asarray(image)
    except OSError:
        # A missing or corrupt file cannot be used later either, so exclude
        # it instead of aborting the whole scan.
        exclude_list.append(i)
        continue
    if (311, 415, 3) != pixels.shape:
        exclude_list.append(i)

print(len(exclude_list))
print(exclude_list)

# exclude_list = [0, 14, 373, 477, 482, 550, 717, 747, 752, 903, 1128, 1208, 1312, 1697, 1725, 1908, 2385, 2554, 2600, 2729, 3545, 3747, 4754, 4923, 4925, 4926, 4927, 4928, 4929, 4936, 4949, 5013, 5037, 5062, 5117, 5118, 5119, 5120, 5121, 5124, 5129, 5134, 5135, 5136, 5177, 5180, 5312, 5313, 5368, 5413, 5510, 5511, 5524, 5530, 5533, 5551, 5591, 5640, 5648, 5657, 5705, 5712, 5713, 5719, 5720, 5725, 5728, 5744, 5757, 5782, 5833, 5862, 5906, 5913, 5924, 5925, 5934, 5935, 5941, 5942, 5953, 6000, 6108, 6109, 6114, 6115, 6142, 6165, 6304, 6305, 6306, 6322, 6483, 6488, 6489, 6502, 6503, 6506, 6509, 6510, 6511, 6526, 6649, 6672, 6697, 6698, 6701, 6714, 6770, 6835, 6836, 6854, 6872, 6873, 6893, 6895, 6948, 6969, 6992, 7029, 7033, 7037, 7039, 7065, 7090, 7093, 7108, 7144, 7162, 7163, 7193, 7291, 7297, 7321, 7333, 7336, 7484, 7485, 7493, 7496, 7518, 7523, 7526, 7535, 7542, 7677, 7678, 7681, 7682, 7684, 7685, 7736, 7743, 7751, 7816, 7874, 7876, 7877, 7888, 7907, 7969, 8238, 8709, 8809, 8886, 8895, 9081, 9194, 9492, 10000, 10488, 10609, 11821, 11822, 11998, 12024, 12498, 12623, 14476, 14481, 14875]

# Data Processing

In [None]:
# Number of missing values per column, largest first, keeping only columns
# that actually contain missing values.
null_count = (
    data.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
null_count

In [None]:
# Drop the rows whose picture was excluded, then hold out 30% of the remainder.
# `price` is the target; `citi` and `street` are removed from the features,
# while `image_id` stays so the matching picture can be looked up later.
temp = data.loc[~data.image_id.isin(exclude_list)]
x_train, x_test, y_train, y_test = train_test_split(
    temp.drop(columns=['price', 'citi', 'street']),
    temp['price'],
    test_size=0.30,
    random_state=42,
)
# One shape per line, in the order: x_train, y_train, x_test, y_test.
print(*(part.shape for part in (x_train, y_train, x_test, y_test)), sep='\n')

In [None]:
%%time
# Load the picture of every training row, resized to 64x64, in row order.
cnt=0
images_path='../input/house-prices-and-images-socal/socal2/socal_pics'
# NOTE(review): the buffer is uint32 although the dataset later casts each
# image to uint8; a uint8 buffer would presumably suffice - confirm before changing.
x_train_images=np.zeros((x_train.shape[0],64,64,3),dtype='uint32')
for i in x_train.image_id:
    image_file=images_path+'/'+str(i)+'.jpg'
    sample=cv2.imread(image_file)
    # cv2.imread signals failure by returning None rather than raising; fail
    # here with the offending path instead of a cryptic cv2.resize error.
    if sample is None:
        raise FileNotFoundError('Could not read image: '+image_file)
    # Note: this rebinds the notebook-level name `imgs` to a single resized image.
    imgs=cv2.resize(sample,(64,64))
    x_train_images[cnt]=imgs
    cnt+=1

print("No. of images: ",cnt)

In [None]:
%%time
# Load the picture of every test row, resized to 64x64, in row order.
cnt=0
images_path='../input/house-prices-and-images-socal/socal2/socal_pics'
# NOTE(review): the buffer is uint32 although the dataset later casts each
# image to uint8; a uint8 buffer would presumably suffice - confirm before changing.
x_test_images=np.zeros((x_test.shape[0],64,64,3),dtype='uint32')
for i in x_test.image_id:
    image_file=images_path+'/'+str(i)+'.jpg'
    sample=cv2.imread(image_file)
    # cv2.imread signals failure by returning None rather than raising; fail
    # here with the offending path instead of a cryptic cv2.resize error.
    if sample is None:
        raise FileNotFoundError('Could not read image: '+image_file)
    # Note: this rebinds the notebook-level name `imgs` to a single resized image.
    imgs=cv2.resize(sample,(64,64))
    x_test_images[cnt]=imgs
    cnt+=1

print("No. of images: ",cnt)

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset pairing four tabular features with one picture and one target per row.

    Args:
        x: DataFrame providing the columns 'n_citi', 'bed', 'bath' and 'sqft'.
        img: indexable collection of images, positionally aligned with ``x``;
            each item must be reshapeable to (64, 64, 3).
        y: pandas object holding the targets, positionally aligned with ``x``.
    """

    def __init__(self, x, img, y):
        tabular = x[['n_citi', 'bed', 'bath', 'sqft']].to_numpy()
        self.x = torch.tensor(tabular).to(torch.float32)
        self.img = img
        self.y = torch.tensor(y.to_numpy()).to(torch.float32)

    def __len__(self):
        """Number of samples, taken from the tabular feature tensor."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return a dict with the features ('x'), the target ('y') and the image tensor ('img')."""
        return {
            'x': self.x[idx],
            'y': self.y[idx],
            # Cast to uint8 and force the 64x64x3 layout before handing the
            # array to torchvision's to_tensor conversion.
            'img': torchvision.transforms.functional.to_tensor(
                self.img[idx].astype(np.uint8).reshape((64, 64, 3))
            ),
        }
    
BATCH_SIZE = 256

# Training batches are reshuffled every epoch.
train_dataset = MyDataset(x_train,x_train_images, y_train)
dataLoader_train = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

# Evaluation needs no shuffling: a fixed order keeps the batch composition, and
# therefore the averaged per-batch test loss, identical from one pass to the next.
test_dataset = MyDataset(x_test,x_test_images, y_test)
dataLoader_test = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=False)


# Modeling

In [None]:
class Model(nn.Module):
    """Two-branch regressor: a small CNN for the image and an MLP for the tabular features.

    The image branch ends in a global max-pool, yielding 64 values per sample;
    the tabular branch yields 128. Both are concatenated and mapped to a
    single output value.

    Args:
        input_shape: number of tabular features per sample.
    """

    def __init__(self, input_shape):
        super().__init__()

        # Image branch: two unpadded convolutions, each followed by ReLU and batch norm.
        self.conv = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 64, kernel_size=5),
            nn.ReLU(),
            nn.BatchNorm2d(64),
        )

        # Collapse each of the 64 feature maps to its maximum, then flatten to (batch, 64).
        self.flatten = nn.Sequential(nn.AdaptiveMaxPool2d(1), nn.Flatten())

        # Tabular branch: input_shape -> 256 -> 128.
        self.fc = nn.Sequential(
            nn.Linear(input_shape, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
        )

        # Head over the concatenated 64 image + 128 tabular features.
        self.final_fc = nn.Sequential(
            nn.Linear(64 + 128, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

    def forward(self, x, img):
        """Return a (batch, 1) tensor computed from tabular features ``x`` and images ``img``."""
        image_features = self.flatten(self.conv(img))
        tabular_features = self.fc(x)
        # Image features come first in the concatenation.
        merged = torch.cat((image_features, tabular_features), dim=1)
        return self.final_fc(merged)
    
# Four tabular inputs, matching the four feature columns selected by MyDataset.
model = Model(input_shape=4)
print(model)
# Mean squared error objective, optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=5e-4)

In [None]:
%%time

n_epochs = 10
print('started!')
for epoch in range(n_epochs):
    # ---- training pass ----
    train_batch_loss = 0
    model.train()
    for step, batch in enumerate(dataLoader_train):
        x = batch["x"]
        img = batch["img"]
        y = batch["y"]

        optimizer.zero_grad()
        outputs = model(x = x, img = img)
        # The model returns shape (batch, 1); take column 0 to match the targets.
        loss = criterion(outputs[:,0], y)
        loss.backward()
        # Exactly one parameter update per batch: stepping a second time would
        # apply the same gradients twice. Gradients are cleared by the
        # zero_grad() call at the top of the next iteration.
        optimizer.step()
        train_batch_loss += loss.item()

    # ---- evaluation pass (no gradient tracking) ----
    test_batch_loss = 0
    model.eval()
    with torch.no_grad():
        for step, batch in enumerate(dataLoader_test):
            x = batch["x"]
            img = batch["img"]
            y = batch["y"]
            outputs = model(x = x, img = img)
            loss = criterion(outputs[:,0], y)
            test_batch_loss += loss.item()

    # Reported losses are the mean of the per-batch mean losses.
    print('epoch {}/{} finished with train loss: {} and test loss: {}'.format(epoch+1, n_epochs,
                                                                              train_batch_loss / len(dataLoader_train),
                                                                              test_batch_loss / len(dataLoader_test)))

# Persist only the learned parameters (state dict), not the module object.
torch.save(model.state_dict(), './model_two_input')

In [None]:
def reg_report(true, pred, name='Test'):
    """Print regression diagnostics comparing predictions with ground truth.

    Prints RSS, RSE, TSS, R squared, MSE, MAE and the share of predictions
    that fall within +/-10% of the true value.

    Args:
        true: array-like of observed values.
        pred: array-like of predicted values, same length as ``true``.
        name: label used in the report header.

    Returns:
        None; the report is written to stdout.
    """
    # Flatten to 1-D float arrays so lists, Series and (n, 1) columns all work
    # and are compared positionally.
    true = np.asarray(true, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()

    # Compute the shared sums once, vectorised, instead of repeated Python-level sums.
    residuals = pred - true
    rss = np.sum(residuals ** 2)
    tss = np.sum((true - true.mean()) ** 2)

    # RSE uses n - 2 degrees of freedom; report nan when that is not positive
    # instead of failing with a division-by-zero or math domain error.
    dof = len(pred) - 2
    rse = math.sqrt(rss / dof) if dof > 0 else float('nan')
    # R squared is undefined when the targets have no variance.
    r_squared = 1 - rss / tss if tss != 0 else float('nan')

    print("\n{} Results :\n".format(name))
    print("RSS :", rss)
    print("RSE :", rse)
    print("TSS :", tss)
    print("R Squared :", r_squared)
    print("MSE :", (residuals ** 2).mean())
    print('MAE :', np.abs(residuals).mean())
    # NOTE(review): the band [true*0.9, true*1.1] is only meaningful for positive targets.
    print('Accuracy with 10% :', ((pred <= true * 1.1) & (true * 0.9 <= pred)).mean())
    

def eval_report(y_train, pred_train,y_test, pred_test):
    """Print the regression report for the training split, then for the test split."""
    splits = (
        ('Train', y_train, pred_train),
        ('Test', y_test, pred_test),
    )
    for split_name, actual, predicted in splits:
        reg_report(actual, predicted, name=split_name)
    
def res(dataLoader, name = 'Test'):
    """Run the notebook-level ``model`` over ``dataLoader`` and print its regression report.

    Args:
        dataLoader: iterable of batches, each a dict with keys "x", "img" and "y".
        name: label passed through to the report header.
    """
    targets, predictions = [], []
    model.eval()
    with torch.no_grad():
        for batch in dataLoader:
            outputs = model(x=batch["x"], img=batch["img"])
            # Targets and predictions are appended batch by batch, so they stay paired.
            targets.extend(batch["y"].tolist())
            predictions.extend(outputs[:, 0].tolist())

    reg_report(true=np.array(targets), pred=np.array(predictions), name=name)

# Final metrics: test split first, then the training split.
for loader, split_name in ((dataLoader_test, 'Test'), (dataLoader_train, 'Train')):
    res(loader, name=split_name)