In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import stats
import numpy as np

from sklearn.metrics import accuracy_score

import torch.utils.data as data_utils
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
from torch.autograd import Variable

from torch.utils import data
from torch import nn, optim
import torch.nn.functional as F

## Load data

In [2]:
dir_data = "/Users/romulo/Documents/Dataset/Earthquake Damage/"
#dir_data = "Dataset/"
df_x_train = pd.read_csv(dir_data+"train_values.csv",index_col="building_id")
df_y_train = pd.read_csv(dir_data+"train_labels.csv",index_col="building_id")
df_x_test = pd.read_csv(dir_data+"test_values.csv",index_col="building_id")

# df_x_train = df_x_train[:10000]
# df_y_train = df_y_train[:10000]
# df_x_test = df_x_test[:10000]

data_train = df_x_train.merge(df_y_train, how='left', left_index=True, right_index=True)



print("len train:",len(data_train))
print("len train:",len(df_x_test))

len train: 260601
len train: 86868


## Preparate data

In [3]:
# get y train
y_train = data_train['damage_grade'].values

# remove y of data_train
data_train = data_train.drop('damage_grade', 1)

In [4]:
# let's put the train data and test data together to make get_dummies and then divide
df_x_all = data_train.append(df_x_test)
print("len all:",len(df_x_all))

# get dummies from cat columns
cat_var = [key for key in dict(df_x_all.dtypes) if dict(df_x_all.dtypes)[key] in ['object'] ]
df_x_all = pd.get_dummies(df_x_all, prefix=cat_var, columns=cat_var)

#divide x_train and x_test
x_train = df_x_all.iloc[:len(data_train)]
x_test = df_x_all.iloc[len(data_train):]

len all: 347469


In [5]:
# clear memory
del df_x_all,df_x_train,df_y_train,data_train,df_x_test

In [6]:
#get x_dev and y_dev (10% from train)
x_train, x_dev, y_train, y_dev = train_test_split( x_train, y_train, test_size=0.1, random_state=42)

y_train = np.array([x-1 for x in y_train])
y_dev = np.array([x-1 for x in y_dev])


In [7]:
print("len x_train : %d  len y_train: %d " %(len(x_train),len(y_train)) )
print("len x_dev   : %d  len y_dev  : %d " %(len(x_dev),len(y_dev)) )
print("len x_test  : %d" %(len(x_test)) )

len x_train : 234540  len y_train: 234540 
len x_dev   : 26061  len y_dev  : 26061 
len x_test  : 86868


In [8]:
## Pytorch

In [9]:
#prepate data for pytorch
class MyDataset(data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, my_data, labels):
        'Initialization'
        self.data = my_data
        self.labels = labels

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.data)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        x = self.data[index]
        y = self.labels[index]

        return x,y

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


x_train_tensor = torch.from_numpy(x_train.values).float().to(device)
y_train_tensor = torch.from_numpy(y_train).long().to(device)

x_dev_tensor = torch.from_numpy(x_dev.values).float().to(device)
y_dev_tensor = torch.from_numpy(y_dev).long().to(device)

my_train = MyDataset(x_train_tensor,y_train_tensor)
my_dev = MyDataset(x_dev_tensor,y_dev_tensor )



trainloader = torch.utils.data.DataLoader(my_train, batch_size=64, shuffle=True)
devloader = torch.utils.data.DataLoader(my_dev, batch_size=64, shuffle=True)

In [10]:
class Classifier(nn.Module):
    def __init__(self,IN_DIM,num_class):
        super().__init__()
        self.fc1 = nn.Linear(IN_DIM, 500)
        self.fc2 = nn.Linear(500, 300)
        
        self.fc3 = nn.Linear(300, 300)
        self.fc4 = nn.Linear(300, 200)
        self.fc5 = nn.Linear(200, 100)
        self.fc6 = nn.Linear(100, 30)
        
        self.fc7 = nn.Linear(30, num_class)
        
        self.dropout = nn.Dropout(p=0.2)
        
    def forward(self, x):
        
        x = F.relu(self.fc1(x))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        x = self.dropout(F.relu(self.fc4(x)))
        x = F.relu(self.fc5(x))
        x = self.dropout(F.relu(self.fc6(x)))
        
        x = F.log_softmax(self.fc7(x), dim=1)
        
        return x

In [13]:
# TODO: Create the network, define the criterion and optimizer
_,IN_DIM = my_dev.data.shape
num_class = 4
model = Classifier(IN_DIM,num_class)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.00001)

In [14]:
epochs = 30
steps = 0

train_losses, test_losses = [], []

for e in range(epochs):
    running_loss = 0
    for points, labels in trainloader:
        
        optimizer.zero_grad()
        
        log_ps = model(points)
        loss = criterion(log_ps, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
    else:
        test_loss = 0
        accuracy = 0
        
        # Turn off gradients for validation, saves memory and computations
        with torch.no_grad():
            for points, labels in devloader:
                log_ps = model(points)
                test_loss += criterion(log_ps, labels)
                
                ps = torch.exp(log_ps)
                top_p, top_class = ps.topk(1, dim=1)
                equals = top_class == labels.view(*top_class.shape)
                accuracy += torch.mean(equals.type(torch.FloatTensor))
                
        train_losses.append(running_loss/len(devloader))
        test_losses.append(test_loss/len(devloader))

        print("Epoch: {}/{}.. ".format(e+1, epochs),
              "Training Loss: {:.3f}.. ".format(running_loss/len(devloader)),
              "Dev Loss: {:.3f}.. ".format(test_loss/len(devloader)),
              "Dev Accuracy: {:.3f}".format(accuracy/len(devloader)))

Epoch: 1/30..  Training Loss: 8.704..  Dev Loss: 0.916..  Dev Accuracy: 0.568
Epoch: 2/30..  Training Loss: 8.210..  Dev Loss: 0.916..  Dev Accuracy: 0.568
Epoch: 3/30..  Training Loss: 8.200..  Dev Loss: 0.915..  Dev Accuracy: 0.568
Epoch: 4/30..  Training Loss: 8.200..  Dev Loss: 0.915..  Dev Accuracy: 0.568
Epoch: 5/30..  Training Loss: 8.200..  Dev Loss: 0.915..  Dev Accuracy: 0.568
Epoch: 6/30..  Training Loss: 8.200..  Dev Loss: 0.916..  Dev Accuracy: 0.567
Epoch: 7/30..  Training Loss: 8.200..  Dev Loss: 0.915..  Dev Accuracy: 0.568


KeyboardInterrupt: 