In [22]:
df_train = pd.read_csv('./data/train.tsv', sep='\t')
df_test = pd.read_csv('./data/test.tsv', sep='\t')

In [23]:
df_cate = df_train.category_name.str.split('/', expand=True, n=2).fillna(np.nan)
df_cate.columns = ['cate_b', 'cate_m', 'cate_s']  # category big, middle, small
df_train = pd.concat([df_train, df_cate], axis=1)
df_train['has_brand'] = (~df_train.brand_name.isnull()).astype(int)

In [24]:
df_cate = df_test.category_name.str.split('/', expand=True, n=2).fillna(np.nan)
df_cate.columns = ['cate_b', 'cate_m', 'cate_s']  # category big, middle, small
df_test = pd.concat([df_test, df_cate], axis=1)
df_test['has_brand'] = (~df_test.brand_name.isnull()).astype(int)

In [25]:
cate_b_unique = df_train.cate_b.unique()
cate_b_dict = {cat: i for i, cat in enumerate(cate_b_unique, 1)}
cate_b_dict[np.nan] = 0
cate_m_unique = df_train.cate_m.unique()
cate_m_dict = {cat: i for i, cat in enumerate(cate_m_unique, 1)}
cate_m_dict[np.nan] = 0
cate_s_unique = df_train.cate_s.unique()
cate_s_dict = {cat: i for i, cat in enumerate(cate_s_unique, 1)}
cate_s_dict[np.nan] = 0

In [26]:
use_cols = ['price', 'item_condition_id', 'has_brand', 'shipping', 'cate_b', 'cate_m', 'cate_s']
df = df_train.copy()
df = df.loc[:, use_cols]
df_t = df_test.copy()
df_t = df_t.loc[:, use_cols[1:]]

check if category data is in the test data

In [27]:
def check_cate(test_cates, unique):
    c_list = []
    for c in test_cates:
        if ((unique == c).sum() == 0) & (type(c) == str):
            c_list.append(c)
    return c_list

In [28]:
for cat, cat_unique in zip(['cate_b', 'cate_m', 'cate_s'], 
                           [cate_b_unique, cate_m_unique, cate_s_unique]):
    print(cat)
    print('='*30)
    not_in_list = check_cate(df_test[cat].unique(), cat_unique)
    print('num of categories that are not in test data:', len(not_in_list))
    if not_in_list:
        print(not_in_list)
    print('='*30)

cate_b
num of categories that are not in test data: 0
cate_m
num of categories that are not in test data: 0
cate_s
num of categories that are not in test data: 12
['Album', 'Amigurumi', 'Pretend', 'Professional & Trade', 'Tandem', 'Home', 'Fiber Art', 'Portraits', 'Rails & Rail Guards', 'Bedroom', 'Computer', 'Rugs']


In [29]:
for cat, cat_dict in zip(['cate_b', 'cate_m', 'cate_s'], [cate_b_dict, cate_m_dict, cate_s_dict]):
    df[cat] = df[cat].map(cat_dict.get)
    df_t[cat] = df_t[cat].map(cat_dict.get)
    df_t[cat].loc[df_t[cat].isnull()] = 0
    df_t[cat] = df_t[cat].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## Random Forest Regressor

In [69]:
X_train = df.iloc[:, 1:].values
y_train = df.iloc[:, :1].values
# y_train = np.log(y_train + 1)
X_test = df_t.values

model: RandomForestRegressor

In [70]:
from sklearn.ensemble import RandomForestRegressor

In [71]:
model = RandomForestRegressor()

In [72]:
model.fit(X_train, y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [73]:
pred = model.predict(X_test)

In [80]:
sub = pd.read_csv('./data/sample_submission.csv', index_col='test_id')
sub['price'] = np.exp(pred) - 1
sub.to_csv('./data/random_forest_regressor.csv', index=True)

In [65]:
a = pd.read_csv('./data/random_forest_regressor.csv')

## deep learning model: pytorch

In [115]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as torchdata

In [124]:
BATCH_SIZE = 512
SPLIT_RATE = 0.8
data = df.values
n_row, n_col = data.shape
split_idx = int(n_row*0.8)
data_train = data[:split_idx, :]
data_valid = data[split_idx:, :]
print(data_train.shape[0], data_valid.shape[0])

1186028 296507


In [134]:
class CustomDataset(torchdata.Dataset):
    def __init__(self, data, train=True):
        n_row, n_col = data.shape
        self.x = torch.FloatTensor(data[:, 1:]).contiguous().view(-1, n_col-1)
        self.y = torch.FloatTensor(np.log(data[:, :1] + 1)).contiguous().view(-1, 1)
        
    def __getitem__(self, index):
        # 인덱스에 해당하는 데이터셋 리턴
        return self.x[index], self.y[index]
        
    def __len__(self):
        # 데이터셋 수
        return len(self.x)

In [135]:
train_dataset = CustomDataset(data=data_train)
train_loader = torchdata.DataLoader(dataset=train_dataset,
                                   batch_size=BATCH_SIZE, 
                                   shuffle=True, 
                                   drop_last=True)
valid_dataset = CustomDataset(data=data_valid)
valid_loader = torchdata.DataLoader(dataset=valid_dataset,
                                   batch_size=BATCH_SIZE, 
                                   shuffle=True, 
                                   drop_last=True)

In [140]:
class Network(nn.Module):
    def __init__(self, I, H, O):
        super(Network, self).__init__()
        self.I = I
        self.H1 = H[0]
        self.H2 = H[1]
        self.O = O
        
        self.l1 = nn.Linear(self.I, self.H1)
        self.l2 = nn.Linear(self.H1, self.H2)
        self.l3 = nn.Linear(self.H2, self.O)
        self.activation = nn.LeakyReLU()
        self.bn1 = nn.BatchNorm1d(self.H1)
        self.bn2 = nn.BatchNorm1d(self.H2)
        
    def forward(self, inputs):
        outputs = self.bn1(self.l1(inputs))
        outputs = self.activation(outputs)
        outputs = self.bn2(self.l2(outputs))
        outputs = self.activation(outputs)
        
        return torch.log(self.l3(outputs) + 1)
    
    def predict(self, inputs):
        outputs = self.forward(inputs)
        return torch.exp(outputs) - 1

In [143]:
def evaluation(data_loader, model):
    model.eval() # for batch norm at test time!
    loss_function = nn.MSELoss(size_average=False)
    losses=0
    for i, (inputs, lg_targets) in enumerate(data_loader):
        inputs, lg_targets = Variable(inputs).view(-1, I), Variable(lg_targets)
        lg_outputs = model.forward(inputs)
        losses += loss_function(lg_outputs, lg_targets).data[0]
    return losses/len(data_loader.dataset)

In [145]:
EPOCH=5
LR=0.1
I = 6

model = Network(I, [50, 30], 1)
loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

In [146]:
model.train()
for epoch in range(EPOCH):
    losses=[]
    for i, (inputs, lg_targets) in enumerate(train_loader):
        inputs, lg_targets = Variable(inputs).view(-1, I), Variable(lg_targets)

        model.zero_grad()
        lg_outputs = model.forward(inputs)
        loss = loss_function(lg_outputs, lg_targets)
        loss.backward()
        optimizer.step()

        losses.append(loss.data[0])
        if i % 500 == 0:
            print("[{0}/{1}] [{2}/{3}] mean_loss : {4:.3f}"\
                  .format(epoch, EPOCH, i, len(train_loader), np.mean(losses)))
            losses=[]

[0/5] [0/2316] mean_loss : 8.593
[0/5] [500/2316] mean_loss : 0.531
[0/5] [1000/2316] mean_loss : 0.481
[0/5] [1500/2316] mean_loss : 0.477
[0/5] [2000/2316] mean_loss : 0.478
[1/5] [0/2316] mean_loss : 0.477
[1/5] [500/2316] mean_loss : 0.470
[1/5] [1000/2316] mean_loss : 0.466
[1/5] [1500/2316] mean_loss : 0.465
[1/5] [2000/2316] mean_loss : 0.463
[2/5] [0/2316] mean_loss : 0.469
[2/5] [500/2316] mean_loss : 0.462
[2/5] [1000/2316] mean_loss : 0.457
[2/5] [1500/2316] mean_loss : 0.457
[2/5] [2000/2316] mean_loss : 0.458
[3/5] [0/2316] mean_loss : 0.565
[3/5] [500/2316] mean_loss : 0.459
[3/5] [1000/2316] mean_loss : 0.453
[3/5] [1500/2316] mean_loss : 0.453
[3/5] [2000/2316] mean_loss : 0.455
[4/5] [0/2316] mean_loss : 0.470
[4/5] [500/2316] mean_loss : 0.450
[4/5] [1000/2316] mean_loss : 0.453
[4/5] [1500/2316] mean_loss : 0.451
[4/5] [2000/2316] mean_loss : 0.452


In [147]:
evaluation(valid_loader, model)

0.4514838425060501

In [149]:
test = torch.FloatTensor(X_test).contiguous().view(-1, 6)

In [151]:
model.eval()
pred = model.predict(Variable(test))

In [156]:
pred.data.numpy()

array([[12.317826 ],
       [ 9.913862 ],
       [26.39264  ],
       ...,
       [16.87075  ],
       [15.803221 ],
       [12.1100025]], dtype=float32)

In [157]:
sub = pd.read_csv('./data/sample_submission.csv', index_col='test_id')
sub['price'] = pred.data.numpy()
sub.to_csv('./data/torch_model.csv', index=True)

## deep learning model: keras

In [30]:
import os 

from keras.models import Sequential
from keras.layers import Dense, Activation, BatchNormalization, LeakyReLU
from keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import KFold

%matplotlib inline
import matplotlib.pyplot as plt

In [31]:
BATCH_SIZE = 512
N_SPLIT = 10
I = 6
H = [40, 20]
O = 1
ALPHA = 0.1
EPOCH = 15

filepath = 'model-improvement-{epoch:02d}-{val_mean_squared_error:.2f}.hdf5'
data = df.values
X = data[:, 1:]
y = np.log(data[:, :1] + 1.)
cv = KFold(n_splits=N_SPLIT, shuffle=True)

In [9]:
def create_model(I, H, O, ALPHA):
    model = Sequential()

    model.add(Dense(H[0], input_dim=I, kernel_initializer='uniform'))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=ALPHA))

    model.add(Dense(H[1], input_dim=H[0], kernel_initializer='uniform'))
    model.add(BatchNormalization())
    model.add(LeakyReLU(alpha=ALPHA))

    model.add(Dense(O, input_dim=H[1], activation='linear'))
    
    return model

In [10]:
def get_callbacks(filepath, patience=2):
    early_stopping = EarlyStopping(monitor='val_mean_squared_error', patience=2, mode='min')
    checkpoint = ModelCheckpoint(filepath, monitor='val_mean_squared_error', 
                                 verbose=1, save_best_only=True, mode='min')
    return [early_stopping, checkpoint]

In [None]:
for cv_idx, (train_idx, valid_idx) in enumerate(list(cv.split(X, y))):
    print('CV IDX:', cv_idx)
    print('='*40)
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]
    
    model = None
    model = create_model(I, H, O, ALPHA)
    
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])
    
    callbacks = get_callbacks(filepath, patience=2)
    
    hist = model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCH, 
                     validation_data=(X_valid, y_valid),
                     callbacks=callbacks)

Train on 1334281 samples, validate on 148254 samples
Epoch 1/15

In [17]:
def get_mean_scores(N_SPLIT):
    dir_list = os.listdir()
    score_list = []
    for i in range(N_SPLIT):
        cv_list = [filename for filename in dir_list if filename.split('_')[0] == ('cv'+str(i))]
        score_list.append(get_scores(cv_list))
    
    print('Average Cross Validation Score', np.mean(score_list))
    return score_list

def get_scores(cv_list):
    score = min([float(filename.split('-')[3][:4]) for filename in cv_list])
    return score

def get_best_model(num):
    dir_list = os.listdir()
    cv_list = [filename for filename in dir_list if filename.split('_')[0] == ('cv'+str(num))]
    max_epoch = 0
    
    for filename in cv_list:
        if int(filename.split('-')[2]) > max_epoch:
            max_epoch = int(filename.split('-')[2])
            best_model_file = filename
            
    return best_model_file

In [None]:
score_list = get_mean_scores(N_SPLIT)

In [None]:
filename = get_best_model(np.argmin(score_list))
filename

In [None]:
best_model = create_model(I, H, O, ALPHA)
best_model.load_weights(filename)

In [None]:
X_test = df_t.values
pred = best_model.predict(X_test)

In [None]:
sub = pd.read_csv('./data/sample_submission.csv', index_col='test_id')
sub['price'] = np.exp(pred) - 1.
sub.to_csv('./keras_cv044.csv', index=True)