In [182]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
import os
import torch # type: ignore
import torch.nn as nn # type: ignore
from torch.utils.data import Dataset, DataLoader # type: ignore
import torch.optim as optim # type: ignore
from os.path import join
import datetime
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler


%matplotlib inline

PATH_TO_DATA = os.path.join("../data", "Task3")

In [183]:
full_df = pd.read_csv(os.path.join(PATH_TO_DATA, "housing.csv"))


In [184]:
numerical_features=list(full_df.columns)
numerical_features.remove("ocean_proximity")
numerical_features.remove("median_house_value")
print(numerical_features)

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']


In [185]:
max_house_age = full_df["housing_median_age"].max()
full_df["age_clipped"]=full_df["housing_median_age"] == max_house_age

In [186]:
full_df["median_house_value_log"] = np.log1p(full_df["median_house_value"])


In [187]:
skewed_features = [
    "households",
    "median_income",
    "population",
    "total_bedrooms",
    "total_rooms",
]
log_numerical_features = []
for f in skewed_features:
    full_df[f + "_log"] = np.log1p(full_df[f])
    log_numerical_features.append(f + "_log")

In [188]:
lin = LinearRegression()

# we will train our model based on all numerical non-target features with not NaN total_bedrooms
appropriate_columns = full_df.drop(
    [
        "median_house_value",
        "median_house_value_log",
        "ocean_proximity",
        "total_bedrooms_log",
    ],
    axis=1,
)
train_data = appropriate_columns[~pd.isnull(full_df).any(axis=1)]
lin.fit(train_data.drop(["total_bedrooms"], axis=1), train_data["total_bedrooms"])
full_df["total_bedrooms_is_nan"] = pd.isnull(full_df).any(axis=1).astype(int)
full_df["total_bedrooms"].loc[pd.isnull(full_df).any(axis=1)] = lin.predict(
    full_df.drop(
        [
            "median_house_value",
            "median_house_value_log",
            "total_bedrooms",
            "total_bedrooms_log",
            "ocean_proximity",
            "total_bedrooms_is_nan",
        ],
        axis=1,
    )[pd.isnull(full_df).any(axis=1)]
)
full_df['total_bedrooms_log']=np.log1p(full_df['total_bedrooms'])
full_df=full_df.dropna()


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  full_df["total_bedrooms"].loc[pd.isnull(full_df).any(axis=1)] = lin.predict(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

In [189]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20639 entries, 0 to 20639
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   longitude               20639 non-null  float64
 1   latitude                20639 non-null  float64
 2   housing_median_age      20639 non-null  float64
 3   total_rooms             20639 non-null  float64
 4   total_bedrooms          20639 non-null  float64
 5   population              20639 non-null  float64
 6   households              20639 non-null  float64
 7   median_income           20639 non-null  float64
 8   median_house_value      20639 non-null  float64
 9   ocean_proximity         20639 non-null  object 
 10  age_clipped             20639 non-null  bool   
 11  median_house_value_log  20639 non-null  float64
 12  households_log          20639 non-null  float64
 13  median_income_log       20639 non-null  float64
 14  population_log          20639 non-null  flo

In [190]:
ocean_proximity_dummies = pd.get_dummies(
    full_df["ocean_proximity"],
    drop_first=True,
)
dummies_names = list(ocean_proximity_dummies.columns)
full_df = pd.concat([full_df, ocean_proximity_dummies[: full_df.shape[0]]], axis=1)

full_df = full_df.drop(["ocean_proximity"], axis=1)


In [191]:
sf_coord = [-122.4194, 37.7749]
la_coord = [-118.2437, 34.0522]

full_df["distance_to_SF"] = np.sqrt(
    (full_df["longitude"] - sf_coord[0]) ** 2
    + (full_df["latitude"] - sf_coord[1]) ** 2
)

full_df["distance_to_LA"] = np.sqrt(
    (full_df["longitude"] - la_coord[0]) ** 2
    + (full_df["latitude"] - la_coord[1]) ** 2
)


In [192]:
# features_to_scale = (
#     numerical_features + log_numerical_features+["distance_to_SF", "distance_to_LA"]
# )

features_to_scale = (
   log_numerical_features+["distance_to_SF", "distance_to_LA"]
)

In [193]:
scaler = StandardScaler()

scaled_features = pd.DataFrame(
    scaler.fit_transform(full_df[features_to_scale]),
    columns=features_to_scale,
    index= full_df.index,
)

X=pd.concat([full_df[dummies_names+['age_clipped']], scaled_features], axis=1, ignore_index = True)
y=full_df["median_house_value"]


In [194]:
X=X.to_numpy(dtype=np.float32)
y=y.to_numpy(dtype=np.float32)

In [195]:
# X=full_df.drop(columns=['median_house_value', 'ocean_proximity']).to_numpy(dtype=np.float32)
# y=full_df['median_house_value'].to_numpy(dtype=np.float32)

In [196]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [197]:
X_train = torch.tensor(X_train).float()
y_train = torch.tensor(y_train).view(-1, 1).float()

X_test = torch.tensor(X_test).float()
y_test = torch.tensor(y_test).view(-1, 1).float()

datasets = torch.utils.data.TensorDataset(X_train, y_train)
train_iter = torch.utils.data.DataLoader(datasets, batch_size=64, shuffle=True)

In [198]:
class NeuralNet(nn.Module):

    def __init__(
        self, input_dimension, output_dimension, n_hidden_layers, neurons, retrain_seed, output_activation=None
    ):
        super(NeuralNet, self).__init__()
        # Number of input dimensions n
        self.input_dimension = input_dimension
        # Number of output dimensions m
        self.output_dimension = output_dimension
        # Number of neurons per layer
        self.neurons = neurons
        # Number of hidden layers
        self.n_hidden_layers = n_hidden_layers
        # Activation function
        # self.activation = nn.Tanh()
        # self.activation = nn.LeakyReLU()
        self.activation=nn.ReLU()
        # self.output_activation=nn.ReLU()
        if output_activation is None:
            self.output_activation = nn.Identity()

        self.input_layer = nn.Linear(self.input_dimension, self.neurons)
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(self.neurons, self.neurons) for _ in range(n_hidden_layers - 1)]
        )
        self.output_layer = nn.Linear(self.neurons, self.output_dimension)
        self.retrain_seed = retrain_seed
        # Random Seed for weight initialization
        self.init_xavier()

    def forward(self, x):
        # The forward function performs the set of affine and non-linear transformations defining the network
        # (see equation above)
        x = self.activation(self.input_layer(x))
        for k, l in enumerate(self.hidden_layers):
            x = self.activation(l(x))
        return self.output_activation(self.output_layer(x))

    def init_xavier(self):
        torch.manual_seed(self.retrain_seed)

        def init_weights(m):
            if type(m) == nn.Linear and m.weight.requires_grad and m.bias.requires_grad:
                g = nn.init.calculate_gain("tanh")
                torch.nn.init.xavier_uniform_(m.weight, gain=g)
                # torch.nn.init.xavier_normal_(m.weight, gain=g)
                m.bias.data.fill_(0)

        self.apply(init_weights)

In [199]:
class CalHousing:
    def __init__(self, n_hidden_layers, n_neurons, train_df, target_df,X_valid,y_valid, seed):
        self.n_hidden_layers=n_hidden_layers
        self.n_neurons = n_neurons
        self.seed=seed
        self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
        train_tensor = torch.tensor(train_df.values.astype(np.float32),dtype=torch.float32)
        target_tensor = torch.tensor(self.scale_targets(target_df), dtype=torch.float32)
        self.data = DataLoader(
            torch.utils.data.TensorDataset(train_tensor, target_tensor),
            batch_size=64,
            shuffle=False
        )
        self.model=NeuralNet(
            input_dimension=train_df.shape[1],
            output_dimension=1,
            n_hidden_layers=n_hidden_layers,
            neurons=n_neurons,
            retrain_seed=self.seed

        ).to(self.device)
        self.X=train_df
        self.y=target_df

        self.X_valid=X_valid
        self.y_valid=y_valid

        self.metadata_file=join("../logs","task_3" ,"metadata.json")

    def compute_loss(self, inputs, targets, verbose=True):
        preds=self.model(inputs)
        # targets=targets/10000
        res=targets-preds
        loss=torch.mean(res**2)
        # if verbose: print("Total loss: ", round(loss.item(), 4))
        return loss

    def save(self, loss_history):
        filename=join("..","models", "task_3", datetime.datetime.now().strftime("%m-%d %H:%M:%S")+".pt")
        salient_info={}
        salient_info["n_hidden_layers"]=self.n_hidden_layers
        salient_info["n_neurons"]=self.n_neurons
        salient_info["final_loss"]=loss_history[-1]
        salient_info["min_loss"]=min(loss_history)
        salient_info["model_path"]=filename
        salient_info["seed"]=self.seed

        torch.save(self.model.state_dict(),filename )
        with open(self.metadata_file, "a") as f:
            json.dump(salient_info, f)

    def scale_targets(self, target_df):
        y=target_df.values.astype(np.float32)
        self.scaler=StandardScaler()
        return(self.scaler.fit_transform(y.reshape(-1, 1)))
    
    # def validate(self, X,y):
    #     self.model.eval()
    #     with torch.no_grad():
    #         inputs=torch.tensor(X.values.astype(np.float32),dtype=torch.float32).to(self.device)
    #         targets=torch.tensor(self.scale_targets(y),dtype=torch.float32).to(self.device)
    #         loss=self.compute_loss(inputs, targets)
    #         return loss
    
    def RMSE(self, validation=True):
        if validation:
            X=self.X_valid
            y=self.y_valid
        else:
            X=self.X
            y=self.y
        self.model.eval()
        with torch.no_grad():
            inputs=torch.tensor(X.values.astype(np.float32),dtype=torch.float32).to(self.device)
            # targets=torch.tensor(self.scale_targets(y),dtype=torch.float32).to(self.device)
            preds=self.model(inputs)
            preds=self.scaler.inverse_transform(preds.cpu().numpy())
            loss=np.sqrt(np.mean((y.values-preds)**2))
            return loss



        

In [200]:
model=NeuralNet(12,1,4,40,13)
model.train()
criterion = nn.MSELoss(reduction='sum')

In [201]:
batch_size = 50
num_epochs = 1500
learning_rate = 0.0001

def train(model_inp, num_epochs = num_epochs):
    optimizer = torch.optim.RMSprop(model_inp.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for inputs, labels in train_iter:
            # forward pass
            outputs = model_inp(inputs)
            # defining loss
            loss = criterion(outputs, labels)
            # zero the parameter gradients
            optimizer.zero_grad()
            # computing gradients
            loss.backward()
            # accumulating running loss
            running_loss += loss.item()
            # updated weights based on computed gradients
            optimizer.step()
        if epoch % 20 == 0:    
            print('Epoch [%d]/[%d] running accumulative loss across all batches: %.3f' %
                  (epoch + 1, num_epochs, running_loss))
        running_loss = 0.0

In [202]:
train(model)

Epoch [1]/[1500] running accumulative loss across all batches: 810669977632768.000


Epoch [21]/[1500] running accumulative loss across all batches: 279391773261824.000
Epoch [41]/[1500] running accumulative loss across all batches: 89194355277824.000
Epoch [61]/[1500] running accumulative loss across all batches: 68248307572736.000
Epoch [81]/[1500] running accumulative loss across all batches: 60780762267648.000
Epoch [101]/[1500] running accumulative loss across all batches: 57238434209792.000
Epoch [121]/[1500] running accumulative loss across all batches: 55629554581504.000
Epoch [141]/[1500] running accumulative loss across all batches: 54811090812928.000
Epoch [161]/[1500] running accumulative loss across all batches: 54294554697728.000
Epoch [181]/[1500] running accumulative loss across all batches: 53910180880384.000
Epoch [201]/[1500] running accumulative loss across all batches: 53595560861696.000
Epoch [221]/[1500] running accumulative loss across all batches: 53317472026624.000
Epoch [241]/[1500] running accumulative loss across all batches: 53069982007296

In [203]:
model.eval()
outputs = model(X_test)
err = np.sqrt(mean_squared_error(outputs.detach().numpy(), y_test.detach().numpy()))
print("Validation: ", err)
print("Training: ", np.sqrt(mean_squared_error(model(X_train).detach().numpy(), y_train.detach().numpy())))

Validation:  57505.145
Training:  56603.266
