# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as torchdata

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Read both splits from the input directory listed above; the test file was
# previously read from '../data/', unlike every other input in this script.
df_train = pd.read_csv('../input/train.tsv', sep='\t')
df_test = pd.read_csv('../input/test.tsv', sep='\t')

def _add_category_features(frame):
    """Return *frame* with category-level and brand-presence columns appended.

    ``category_name`` is split on its first two ``/`` separators into the
    columns ``cate_b`` (big), ``cate_m`` (middle) and ``cate_s`` (small);
    levels that are absent become NaN.  ``has_brand`` is 1 where
    ``brand_name`` is present and 0 where it is null.
    """
    levels = frame.category_name.str.split('/', expand=True, n=2)
    # Guarantee exactly three columns even when no row carries all three
    # levels, then turn the ``None`` placeholders left by the split into NaN.
    levels = levels.reindex(columns=range(3)).fillna(np.nan)
    levels.columns = ['cate_b', 'cate_m', 'cate_s']  # category big, middle, small
    frame = pd.concat([frame, levels], axis=1)
    frame['has_brand'] = frame.brand_name.notnull().astype(int)
    return frame


# Apply identical feature engineering to the train and test frames
df_train = _add_category_features(df_train)
df_test = _add_category_features(df_test)

def _index_categories(categories):
    """Number *categories* from 1 in iteration order; the ``np.nan`` key is set to 0."""
    index = {}
    for position, category in enumerate(categories, 1):
        index[category] = position
    index[np.nan] = 0
    return index


# Distinct values of each category level found in the training frame
cate_b_unique, cate_m_unique, cate_s_unique = (
    df_train[level].unique() for level in ('cate_b', 'cate_m', 'cate_s')
)
cate_b_dict = _index_categories(cate_b_unique)
cate_m_dict = _index_categories(cate_m_unique)
cate_s_dict = _index_categories(cate_s_unique)

# Columns fed to the model: the target ('price') first, then the features
use_cols = ['price', 'item_condition_id', 'has_brand', 'shipping', 'cate_b', 'cate_m', 'cate_s']
# Independent working copies; the test frame has no 'price' column to select
df = df_train.loc[:, use_cols].copy()
df_t = df_test.loc[:, use_cols[1:]].copy()

# find the test-set categories that do not appear in the given set of known categories
def check_cate(test_cates, unique):
    """Return the string categories in *test_cates* that are absent from *unique*.

    Non-string entries (such as the NaN that stands for a missing category)
    are skipped.  The result keeps the order, and any repeats, of
    *test_cates*.  Both arguments may be any iterable of hashable values.
    """
    # Build the lookup once so each membership test is O(1) instead of
    # rescanning ``unique`` for every candidate.
    known = set(unique)
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_.
    return [c for c in test_cates if isinstance(c, str) and c not in known]
    
# For every category level, report the test values never seen in training
for cat, cat_unique in zip(['cate_b', 'cate_m', 'cate_s'],
                           [cate_b_unique, cate_m_unique, cate_s_unique]):
    print(cat)
    print('='*30)
    # check_cate returns the test values that are missing from the train uniques
    not_in_list = check_cate(df_test[cat].unique(), cat_unique)
    print('num of test categories that are not in train data:', len(not_in_list))
    if not_in_list:
        print(not_in_list)
    print('='*30)

# change categories to numbers
def _encode_categories(column, mapping):
    """Return *column* with each category replaced by its integer id.

    Strings are looked up in *mapping*; strings that are not in it, and any
    non-string value (a missing category), are encoded as 0.
    """
    def lookup(value):
        # Decide "missing" by type instead of relying on a NaN dictionary
        # key, whose lookup only succeeds for the very same NaN object.
        return mapping.get(value, 0) if isinstance(value, str) else 0

    return column.map(lookup).astype(int)


for cat, cat_dict in zip(['cate_b', 'cate_m', 'cate_s'], [cate_b_dict, cate_m_dict, cate_s_dict]):
    # Assign whole columns: the former ``df_t[cat].loc[mask] = 0`` was a
    # chained assignment that is not guaranteed to write back to df_t.
    df[cat] = _encode_categories(df[cat], cat_dict)
    df_t[cat] = _encode_categories(df_t[cat], cat_dict)
    
    
# Mini-batch size handed to the training DataLoader
BATCH_SIZE = 512
# Raw array form of the prepared frames (the train array keeps 'price' in column 0)
data_train, data_test = df.values, df_t.values

class CustomDataset(torchdata.Dataset):
    """Tensor-backed dataset over a 2-D numeric array.

    With ``train=True`` (the default) column 0 holds the price target, which
    is stored as ``log(price + 1)`` in ``self.y``; the remaining columns are
    the features in ``self.x`` and items are ``(x, y)`` pairs.

    With ``train=False`` the array carries no target column: every column is
    a feature, ``self.y`` is ``None`` and items are the feature rows alone.

    Raises:
        ValueError: if *data* is not two-dimensional.
    """

    def __init__(self, data, train=True):
        # Also accepts object-dtype arrays of numbers (e.g. DataFrame.values)
        data = np.asarray(data, dtype=np.float64)
        if data.ndim != 2:
            raise ValueError('data must be 2-D, got shape {}'.format(data.shape))
        self.train = train
        if train:
            features, prices = data[:, 1:], data[:, :1]
            # log1p(p) == log(p + 1) but keeps precision for prices near zero
            self.y = torch.from_numpy(np.log1p(prices).astype(np.float32))
        else:
            features = data
            self.y = None
        self.x = torch.from_numpy(np.ascontiguousarray(features, dtype=np.float32))

    def __getitem__(self, index):
        # One sample: features only when the dataset has no targets
        if self.y is None:
            return self.x[index]
        return self.x[index], self.y[index]

    def __len__(self):
        # Length of the data set (number of rows)
        return len(self.x)
        
# Training batches: reshuffled on every pass, with a final batch smaller
# than BATCH_SIZE discarded (drop_last)
train_dataset = CustomDataset(data_train)
train_loader = torchdata.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)
                                   

class Network(nn.Module):
    """MLP with two hidden layers whose forward pass returns ``log(raw + 1)``.

    Each hidden layer is Linear -> BatchNorm1d -> LeakyReLU; ``raw`` is the
    output of the final linear layer.

    Args:
        I: number of input features.
        H: sequence holding the two hidden-layer widths ``(H1, H2)``.
        O: number of outputs.
    """

    # Lower bound applied to ``raw + 1`` before taking the logarithm
    LOG_EPS = 1e-6

    def __init__(self, I, H, O):
        super(Network, self).__init__()
        self.I = I
        self.H1 = H[0]
        self.H2 = H[1]
        self.O = O

        self.l1 = nn.Linear(self.I, self.H1)
        self.l2 = nn.Linear(self.H1, self.H2)
        self.l3 = nn.Linear(self.H2, self.O)
        self.activation = nn.LeakyReLU()
        self.bn1 = nn.BatchNorm1d(self.H1)
        self.bn2 = nn.BatchNorm1d(self.H2)

    def forward(self, inputs):
        """Return the log-space output ``log(raw + 1)`` for *inputs*."""
        hidden = self.activation(self.bn1(self.l1(inputs)))
        hidden = self.activation(self.bn2(self.l2(hidden)))
        raw = self.l3(hidden)
        # The linear output is unbounded; the log of a non-positive number is
        # NaN or -inf, so bound the argument from below.  Wherever
        # raw + 1 >= LOG_EPS the result equals the unclamped log(raw + 1).
        return torch.log(torch.clamp(raw + 1, min=self.LOG_EPS))

    def predict(self, inputs):
        """Return ``exp(forward(inputs)) - 1``, undoing the log transform."""
        outputs = self(inputs)
        return torch.exp(outputs) - 1

# building model
EPOCH = 10  # number of passes over the training set
LR = 0.1    # learning rate passed to Adam
# Input width = every column of the training array except the price target
# in column 0 (replaces a hard-coded 6).
I = data_train.shape[1] - 1

model = Network(I, [40, 20], 1)
loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# training model
model.train()
for epoch in range(EPOCH):
    losses = []
    for i, (inputs, lg_targets) in enumerate(train_loader):
        inputs = inputs.view(-1, I)

        optimizer.zero_grad()
        # Call the module itself rather than .forward() so hooks still run
        lg_outputs = model(inputs)
        loss = loss_function(lg_outputs, lg_targets)
        loss.backward()
        optimizer.step()

        # The loss is a 0-dim tensor; indexing it with [0] raises on
        # PyTorch >= 0.4, so read the Python float with .item().
        losses.append(loss.item())
        if i % 500 == 0:
            # Mean loss since the previous report
            print("[{0}/{1}] [{2}/{3}] mean_loss : {4:.3f}"
                  .format(epoch, EPOCH, i, len(train_loader), np.mean(losses)))
            losses = []

# Convert the test features to a float32 tensor; the array is already
# (rows, features), so no hard-coded reshape is needed.
data_test = torch.from_numpy(np.asarray(data_test, dtype=np.float32))
model.eval()  # BatchNorm switches to its running statistics
with torch.no_grad():
    # The original passed an undefined name ``test`` here
    pred = model.predict(data_test)

sub = pd.read_csv('../input/sample_submission_stg2.csv', index_col='test_id')
# predict() returns shape (rows, 1); flatten to one price per row
sub['price'] = pred.numpy().ravel()
# Make sure the output directory exists before writing
os.makedirs('./data', exist_ok=True)
sub.to_csv('./data/torch_model.csv', index=True)