In [1]:
import pandas as pd
import numpy as np

# Read in the data
df = pd.read_csv('other/train.csv')
y = df["Habitability_score"].values
ids = df['Property_ID'].values  
df.drop(['Habitability_score', 'Property_ID'], axis=1, inplace=True)
n_data = df.shape[0]

In [2]:
class DataPrep(object):
    def __init__(self, df, y):
        self.df = df.copy()
        self.y = y.copy()
        self.n_data = df.shape[0]
        self.arrs = []
        self.cols = []
        self.nans = []
    
    def search_nan(self):
        for i in self.df:
            for j in self.df[i]:
                if j != j:
                    self.nans.append(i)
                    break

    def train_test_split(self, test_size=0.2):
        if self.prepared_df is not None:
            n_train = int(self.n_data*(1-test_size))
            arr_data = self.prepared_df.to_numpy()
            arr_y = self.y
            np.random.seed(31)
            p = np.random.permutation(len(self.y))
            arr_data = arr_data[p]
            arr_y = arr_y[p]
            return arr_data[:n_train], arr_y[:n_train], arr_data[n_train:], arr_y[n_train:]
        else:
            print("Run prepare() first!!!")
        
    def fill_categorical_nan_and_one_hot_encode(self, col):
        arr = self.df[col].to_numpy()
        dct = dict()
        nan_ind = []

        for i in range(len(arr)):
            element = arr[i]
            if isinstance(element, str) and element != 'nan':
                try:
                    dct[element] += 1
                except:
                    dct[element] = 1
            else:
                nan_ind.append(i)

        nonnan_len = len(arr) - len(nan_ind)
        for i in dct:
            dct[i] = dct[i]/nonnan_len
        for i in nan_ind:
            # arr[i] = np.random.choice(list(dct.keys()), p=list(dct.values()))
            arr[i] = "nan"
        freq_arr = np.zeros(len(dct.keys()))
        uni_vals = np.unique(arr).tolist()
        uni_vals.remove("nan")
        encoded_arr = np.zeros((self.n_data, len(uni_vals)))
        encod_dict = dict()
        for i in range(len(uni_vals)):
            encod_dict[uni_vals[i]] = i
        for i in range(len(freq_arr)):
            freq_arr[i] = dct[uni_vals[i]]
        for i in range(self.n_data):
            if arr[i] == "nan":
                encoded_arr[i] = freq_arr.copy()
            else:
                encoded_arr[i][encod_dict[arr[i]]] = 1
        return encoded_arr, encod_dict

    def fill_numerical_nan(self, col):
        return self.df[col].fillna(self.df[col].mean())

    def one_hot_encode_not_nan(self, col):
        arr = self.df[col].to_numpy()
        dct = dict()

        for i in range(len(arr)):
            element = arr[i]
            try:
                dct[element] += 1
            except:
                dct[element] = 1

        uni_vals = np.unique(arr).tolist()
        encoded_arr = np.zeros((self.n_data, len(uni_vals)))
        encod_dict = dict()
        for i in range(len(uni_vals)):
            encod_dict[uni_vals[i]] = i
        for i in range(self.n_data):
            encoded_arr[i][encod_dict[arr[i]]] = 1
        return encoded_arr, encod_dict

    def normalize_numerical(self, arr):
        return (arr - arr.mean())/arr.std()

    def prepare(self):
        d_types = self.df.dtypes
        self.search_nan()
        for i in self.df:
            if i in self.nans:
                if d_types[i] == "object":
                    ret_tuple = self.fill_categorical_nan_and_one_hot_encode(i)
                    for k in ret_tuple[0].T:
                        self.arrs.append(k)
                    for j in ret_tuple[1]:
                        self.cols.append(i+'_'+j)
                else:
                    self.arrs.append(self.normalize_numerical(self.fill_numerical_nan(i)))
                    self.cols.append(i)
            else:
                if d_types[i] == "object":
                    ret_tuple = self.one_hot_encode_not_nan(i)
                    for k in ret_tuple[0].T:
                        self.arrs.append(k)
                    for j in ret_tuple[1]:
                        self.cols.append(i+'_'+j)
                else:
                    self.arrs.append(self.normalize_numerical(self.df[i]))
                    self.cols.append(i)
        self.prepared_df = pd.DataFrame(np.array(self.arrs).T, columns=self.cols)
        return pd.DataFrame(np.array(self.arrs).T, columns=self.cols), self.y


In [3]:
def calculate_loss(net, X, y):
    y_hat = net.test(X)
    y_hat = np.array(y_hat.tolist())
    y_hat = y_hat.squeeze()
    y = np.array(y.tolist())
    y = y.squeeze()
    return np.mean((y_hat - y)**2)

def plot(loss, valloss):
    import matplotlib.pyplot as plt
    plt.plot(loss, label = 'train loss')
    plt.plot(valloss, label = 'validation loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()


In [4]:
d_set = DataPrep(df, y)
df_new, labels = d_set.prepare()
X_train, y_train, X_val, y_val = d_set.train_test_split()
"(%d, %d), (%d, %d)" % (X_train.shape[0], X_train.shape[1], X_val.shape[0], X_val.shape[1])

'(31599, 31), (7900, 31)'

In [5]:
df

Unnamed: 0,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review
0,Apartment,106,,1,Semi_Furnished,0.0,No,Once in a day - Morning,5.89,Slightly below average,Medium,90.0,3.86
1,Apartment,733,2.0,2,Unfurnished,1.0,No,Once in a day - Evening,4.37,Well below average,Medium,96.0,3.55
2,Apartment,737,4.0,2,Fully Furnished,0.0,No,Once in a day - Morning,7.45,Slightly below average,Medium,121.0,3.81
3,Apartment,900,3.0,2,Unfurnished,2.0,Yes,Once in a day - Morning,6.16,Well above average,Medium,100.0,1.34
4,Bungalow,2238,14.0,6,Fully Furnished,0.0,No,All time,5.46,Well below average,Medium,116.0,4.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39494,Single-family home,1120,3.0,2,,0.0,No,All time,5.55,Slightly above average,Medium,80.0,3.56
39495,Apartment,445,1.0,3,Fully Furnished,1.0,No,All time,5.70,Slightly above average,Medium,86.0,2.93
39496,Bungalow,3780,6.0,6,Unfurnished,0.0,Yes,Once in two days,6.84,Well below average,Medium,137.0,3.80
39497,Single-family home,1266,3.0,1,Semi_Furnished,0.0,No,Once in a day - Morning,4.60,Slightly above average,,88.0,3.25


In [6]:
X = df_new.to_numpy()
y = labels
n_features = X.shape[1]
"%d data available with %d features (some are one hot encoded)"%(n_data, n_features)

'39499 data available with 31 features (some are one hot encoded)'

In [None]:
from NeuralNetwork import NeuralNetwork as nn
from data_layer import Dataset
from pyflow import Tensor, L1Loss, L2Loss, FullyConnected, SGD, CrossEntropyLoss, ReLU, Sigmoid
from gc import collect
collect()

y_val = Tensor(y_val[:, None, None]) if type(y_val) == np.ndarray else y_val
batch_size = 2048
dataset = Dataset(X, y, batch_size, True)
loss_layer = L2Loss()
net = nn(dataset, loss_layer)

in_size = n_features
hidden_sizes = (16, 4, 2)
out_size = 1

momentum = 0.8
lr = 1e-1
Optimizer = 'Adam'

my_layers = []
my_activations = []
my_layers.append(FullyConnected(in_size, hidden_sizes[0], lr, Optimizer, 0.9, 0.999))
my_layers.append(FullyConnected(hidden_sizes[0], hidden_sizes[1], lr, Optimizer, 0.9, 0.999))
my_layers.append(FullyConnected(hidden_sizes[1], hidden_sizes[2], lr, Optimizer, 0.9, 0.999))
my_layers.append(FullyConnected(hidden_sizes[2], out_size, lr, Optimizer, 0.9, 0.999))

my_activations.append(Sigmoid())
my_activations.append(Sigmoid())
my_activations.append(Sigmoid())

for i in range(len(my_layers)):
    net.append_layer(my_layers[i])
    if i < len(my_activations):
        net.append_layer(my_activations[i])
epochs = 250
loss, valloss = net.train(epochs, cross_val = True, valset = (X_val, y_val), verbose = True)
collect()
name_str = f'{hidden_sizes} {batch_size}, {lr}, {momentum}, {Optimizer}, {my_activations[0].__class__.__name__}, {loss_layer.__class__.__name__}, {epochs}, {loss[-1]}, {valloss[-1]}'
np.save(f'runs/loss_with_{name_str}.npy', loss)
np.save(f'runs/valloss_with_{name_str}.npy', valloss)
with open('runs/train_hist.txt', 'a') as f:
    print(name_str, file=f)

Epoch:    1
	Train Loss: 5080.08	Val Loss: 4666.12
Epoch:    2
	Train Loss: 4286.95	Val Loss: 3921.30
Epoch:    3
	Train Loss: 3589.15	Val Loss: 3272.18
Epoch:    4
	Train Loss: 2984.94	Val Loss: 2713.58
Epoch:    5
	Train Loss: 2467.51	Val Loss: 2237.59
Epoch:    6
	Train Loss: 2028.65	Val Loss: 1835.87
Epoch:    7
	Train Loss: 1660.10	Val Loss: 1500.25
Epoch:    8
	Train Loss: 1353.78	Val Loss: 1222.81
Epoch:    9
	Train Loss: 1101.95	Val Loss: 996.00
Epoch:   10
	Train Loss: 897.27	Val Loss: 812.73
Epoch:   11
	Train Loss: 732.88	Val Loss: 666.42
Epoch:   12
	Train Loss: 602.47	Val Loss: 551.05
Epoch:   13
	Train Loss: 500.32	Val Loss: 461.24
Epoch:   14
	Train Loss: 421.35	Val Loss: 392.23
Epoch:   15
	Train Loss: 361.10	Val Loss: 339.89
Epoch:   16
	Train Loss: 315.76	Val Loss: 300.73
Epoch:   17
	Train Loss: 282.10	Val Loss: 271.82
Epoch:   18
	Train Loss: 257.45	Val Loss: 250.76
Epoch:   19
	Train Loss: 239.66	Val Loss: 235.62
Epoch:   20
	Train Loss: 226.99	Val Loss: 224.89
Epo

FileNotFoundError: [Errno 2] No such file or directory: 'runs/loss_with_(16, 4, 2) 2048, 0.1, 0.8, Adam, Sigmoid, L2Loss, 250, 199.69340270145713, 201.6522922608246.npy'

In [None]:
calculate_loss(net, X_val, y_val), calculate_loss(net, X, y)
import matplotlib.pyplot as plt
t = np.arange(0, epochs)
loss = net.loss
valloss = net.vallosses
plt.plot(t, loss, label = 'train loss')
plt.plot(t, valloss, label = 'validation loss')
plt.legend()
print(valloss)

In [None]:
net.save_model('model_Adam_48_valllos')

In [None]:
from os import listdir
for i in listdir('runs'):
    if i.startswith('loss_with_'):
        name = i[10:]
        train_loss = np.load(f'runs/{i}', allow_pickle=True)
        val_loss = np.load(f'runs/valloss_with_{name}', allow_pickle=True)
        labels = name[:-4]
        plot(train_loss, val_loss)
        splitted = labels.split(', ')
        try:
            splitted[-2] = float(splitted[-2])
            splitted[-1] = float(splitted[-1])
            print(", ".join(splitted[:-2]), "%.2f"%splitted[-2], "%.2f"%splitted[-1])
        except:
            print(labels)

In [1]:
from NeuralNetwork import NeuralNetwork as nn
from data_layer import Dataset
from pyflow import Tensor, L1Loss, L2Loss, FullyConnected, SGD, CrossEntropyLoss, ReLU
from gc import collect
import numpy as np
collect()

layer = FullyConnected(31, 16, 1e-3, 'SGD')
layer2 = FullyConnected(16, 8, 1e-3, 'SGD')
layer3 = FullyConnected(8, 4, 1e-3, 'SGD')
layer4 = FullyConnected(4, 1, 1e-3, 'SGD')

net = nn('SGD', path='saved_models/model_Adam_48_valllos')
net.append_layer(layer)
net.append_layer(ReLU())
net.append_layer(layer2)
net.append_layer(ReLU())
net.append_layer(layer3)
net.append_layer(ReLU())
net.append_layer(layer4)

net.loss_layer = L1Loss()

net.save_model('model')

In [None]:
net2 = nn('SGD')
net2.load_model('model')

In [2]:
for i in range(len(net.layers)):
    if net.layers[i].__class__.__name__ == 'FullyConnected':
        W1 = np.array(net.layers[i].weights.tolist())[0]
        W2 = np.array(net2.layers[i].weights.tolist())[0]
        b1 = np.array(net.layers[i].bias.tolist())[0]
        b2 = np.array(net2.layers[i].bias.tolist())[0]
        print(np.allclose(W1, W2))
        print(np.allclose(b1, b2))

NameError: name 'net2' is not defined

In [7]:
net.layers[-1]

0.592845 0.844266 0.857946 0.847252 

In [12]:
import pickle as pkl

with open('data.pkl', 'wb') as f:
    pkl.dump(df_new.to_numpy(), f)

with open('labels.pkl', 'wb') as f:
    pkl.dump(labels, f)