In [1]:
import copy
import itertools
import pickle
import warnings

warnings.filterwarnings("ignore")

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.utils.data
import torchvision.transforms as transforms
from IPython.display import clear_output
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
import plotly
from plotly.offline import iplot
import plotly.graph_objs as go

In [3]:
# Initialise plotly's offline notebook mode; this must run before `iplot` is used further down.
plotly.offline.init_notebook_mode(connected=True)

In [4]:
import sys
sys.path.append("../")
from dataset.dataset import *
from util.util import *
from model.model import *

---
<font size="5">Overviews</font>
1. Introduction
1. CelebA dataset
1. Dimension Reduction
---

# Introduction

---
This notebook introduces the dimension reduction I used for LCGAN; it is the part colored purple in the diagram below.
However, as I mentioned in our paper,
<span style="color:red">how the dimension reduction is conducted is optional.</span>
In this experiment, I used the feature importance of [LightGBM](https://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf); its documentation is [here](https://lightgbm.readthedocs.io/en/latest/).

<img src="./../data/images/whole_picture.png" width="500">

---
The dimension reduction is carried out in the following procedures:
1. In $n$ dimensions, conduct classification using LightGBM
1. Extract the $k$ most important features (by feature importance)
1. Conduct classification with every choice of $m$ features out of the $k$ features
1. The $m$-dimensional vector with the highest accuracy is employed as the relational label

Here $n$ and $m$ are the dimensions of the original latent code and the reduced latent code respectively, and $k$ is the number of candidate features.
In this experiment, $n=128$, $k=10$, and $m=3$ are used.

---

# Get celebA dataset

In [5]:
# Paths handed to `FaceDataset` below, relative to this notebook: presumably the
# aligned CelebA PNG images and the folder holding the attribute labels.
# Adjust both to your local layout.
root = "./../../research/sound_dataset/celebA/img_align_celeba_png/"
label_root = "../../research/sound_dataset/celebA/label_folder/"

In [6]:
# Human-readable description of each class, indexed by class id (0..7).
# The three binary attributes vary from slowest to fastest as
# gender -> smiling -> age, which reproduces the hand-written ordering.
label_discription = [
    f"{gender}, {smile}, {age}"
    for gender in ("male", "female")
    for smile in ("smiling", "not_smiling")
    for age in ("young", "old")
]

In [7]:
# CelebA attribute numbers referenced below:
#    1  5 o'clock shadow     17  goatee         32  smiling
#   11  blurry               21  male           36  wearing hat
#   14  chubby               23  mustache       40  young
#   15  double chin          25  no beard
#   16  eyeglasses           31  sideburns
#
# The dict is consumed by `FaceDataset`. Presumably "existed" lists attributes
# a sample must have, "delete" attributes whose presence excludes a sample, and
# "class" the binary attributes combined into the class id -- TODO confirm
# against `dataset/dataset.py`.
dataset_label = {
    "existed": [25],
    "delete": [1, 11, 14, 15, 16, 17, 23, 31, 36],
    "class": [21, 32, 40],
}

In [8]:
# One class id per combination of the binary class attributes:
# 3 attributes -> 2**3 = 8 ids, 0..7.
classes = tuple(range(2**len(dataset_label["class"])))
classes

(0, 1, 2, 3, 4, 5, 6, 7)

In [9]:
# Channel-wise normalisation (the values match the commonly used ImageNet
# statistics). NOTE: it is defined here but not part of either pipeline below.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Pre-processing pipelines keyed by split. Both crop the centre 128x128 region;
# only the training pipeline adds a random horizontal flip.
transform = {
    "train": transforms.Compose([
        transforms.CenterCrop((128, 128)),
        transforms.Resize((128, 128)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
    ]),
    "test": transforms.Compose([
        transforms.CenterCrop((128, 128)),
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
    ]),
}

---
Only the test set is needed in this notebook.

---

In [10]:
# Build the test split with the deterministic (no-flip) pipeline.
testset = FaceDataset(
    root,
    label_root,
    transform["test"],
    dataset_label,
    classes,
    data_type="test",
    train_num=5000,
    val_num=500,
    test_num=500,
)
# shuffle=False keeps the sample order fixed, so latent codes and labels
# collected later line up between runs.
testloader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=256,
    shuffle=False,
)
len(testset)

4000

# Dimension Reduction
---
I conducted the dimension reduction via the feature importance in LightGBM.
First of all, we obtain the latent codes using the previously trained model.
Then the dimension reduction is conducted as explained at the beginning.

## Preparation

---
I also use a function that runs the test process.

---

In [11]:
def do_test_VAE(net, testloader, device="cuda", mode="train", beta=None):
    """Run ``net`` over ``testloader`` without gradients and collect the results.

    Parameters
    ----------
    net :
        Model whose call returns ``(reconstruction, latent)`` and which exposes
        ``net.loss(images, beta=...)``.
    testloader :
        Iterable of batches; ``batch[0]`` holds the images, ``batch[1]`` the labels.
    device :
        Device the images are moved to before the forward pass.
    mode :
        ``"train"`` or ``"eval"``; selects ``net.train()`` / ``net.eval()``.
        Any other value makes the function return ``None`` without running.
    beta :
        Weight forwarded to ``net.loss``. When ``None`` (the default) the
        notebook-level variable ``beta`` is used, which is what earlier
        revisions of this function always did.

    Returns
    -------
    None | tuple
        ``(labels, inputs, outputs, mean_loss, latents)`` as NumPy values.
        ``labels`` is a flat float64 array; the other arrays are the per-batch
        results concatenated along axis 0; ``mean_loss`` is the mean of the
        per-batch losses. For an empty loader the arrays are empty and
        ``mean_loss`` is NaN.
    """
    if mode == "train":
        net.train()
    elif mode == "eval":
        net.eval()
    else:
        return None

    if beta is None:
        # Backward compatibility: fall back to the module/notebook-level `beta`.
        beta = globals()["beta"]

    label_chunks = []
    loss_values = []
    latent_chunks, input_chunks, output_chunks = [], [], []
    with torch.no_grad():
        for data in testloader:
            images = data[0].to(device)
            output, z = net(images)
            label_chunks.append(data[1].to("cpu").detach().numpy().ravel())
            loss_values.append(net.loss(images, beta=beta).to("cpu").detach().numpy())
            latent_chunks.append(z.to("cpu").detach().numpy())
            input_chunks.append(images.to("cpu").detach().numpy())
            output_chunks.append(output.to("cpu").detach().numpy())

    # Starting from an empty float64 array reproduces the dtype promotion of
    # the former `np.append` accumulation (labels come back as float64).
    labels = np.concatenate([np.array([])] + label_chunks)

    if not latent_chunks:
        # Empty loader: return empty results instead of failing on unbound names.
        return labels, np.array([]), np.array([]), np.nan, np.array([])

    # Concatenate once at the end instead of re-allocating on every batch.
    inputs = np.concatenate(input_chunks, axis=0)
    outputs = np.concatenate(output_chunks, axis=0)
    latents = np.concatenate(latent_chunks, axis=0)
    return labels, inputs, outputs, np.mean(loss_values), latents

---
I prepared the model trained in the previous notebook.

---

In [12]:
# Latent dimensionality and loss weight. `beta` is read as a global by
# `do_test_VAE`, so it must be defined before that function is called.
z_dim = 128
beta = 0.1
# Fall back to the CPU when no GPU is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

net = VAE(z_dim=z_dim, nch_input=3, nch=64, device=device).to(device)
# Alternative checkpoint (512-dim colour model), kept for reference:
# model_path = f"../data/parameters/VAE_color_celebA_ndim512_beta0.1_lr0.0001_epoch99.pth"
# NOTE(review): the file name hard-codes ndim128 / beta0.1; keep it in sync with `z_dim` and `beta` above.
model_path = f"../data/parameters/VAE_celebA_ndim128_beta0.1_lr0.0001_epoch66.pth"
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
model_param = torch.load(model_path, map_location=device)
net.load_state_dict(model_param)

<All keys matched successfully>

---
Get the test results.
***latents*** indicates the latent code.

---

In [13]:
# Encode the whole test set in eval mode. The 4th return value (mean loss) is not needed here.
labels, inputs, outputs, _, latents = do_test_VAE(net, testloader, device, "eval")

---
*latents* is reshaped into *X* so that it can be used as the input data of LightGBM, and *y* (the class labels) is used as the target.

---

In [14]:
# z_ has shape (z_dim, n_samples): one row per latent dimension. Later cells
# pick individual latent dimensions with `z_[idx, :]`.
z_ = np.reshape(latents, (-1, z_dim)).T
# Feature matrix (n_samples, z_dim) and label vector as independent copies.
# NumPy's own copy is used so the cell does not depend on the `copy` module.
X = z_.T.copy()
y = np.copy(labels)

---
Data is divided into train, validation, and test data in proportions of 0.75, 0.1875, and 0.0625 respectively (two successive splits with scikit-learn's default `test_size=0.25`; see the shapes printed below).

---

In [15]:
# First split: 75 % train vs. 25 % hold-out (scikit-learn's default test size).
X_train, X_rest, y_train, y_rest = train_test_split(X, y, random_state=42)
# Second split: the hold-out part is divided again into validation and test.
X_valid, X_test, y_valid, y_test = train_test_split(X_rest, y_rest, random_state=42)
del X_rest, y_rest

# Wrap every part in a DataFrame.
X_train, X_valid, X_test = (pd.DataFrame(part) for part in (X_train, X_valid, X_test))
y_train, y_valid, y_test = (pd.DataFrame(part) for part in (y_train, y_valid, y_test))
X_train.shape, X_valid.shape, X_test.shape

((3000, 128), (750, 128), (250, 128))

## Training of LightGBM
I'm gonna train LightGBM and conduct dimension reduction according to the procedure below.

1. In $n$ dimensions, conduct classification using LightGBM
1. Extract the $k$ most important features (by feature importance)
1. Conduct classification with every choice of $m$ features out of the $k$ features
1. The $m$-dimensional vector with the highest accuracy is employed as the relational label

Here $n$ and $m$ are the dimensions of the original latent code and the reduced latent code respectively, and $k$ is the number of candidate features.
In this experiment, $n=128$, $k=10$, and $m=3$ are used.

### Classification with $n(128)$-dimension vector

In [16]:
# LightGBM datasets for the full 128-dimensional classification. The validation
# set is declared with the training set as its `reference`.
# NOTE: `objective` reads `lgb_train` / `lgb_valid` as globals.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

---
Since LightGBM is susceptible to overfitting, I used Optuna for hyperparameter tuning.
The cells below carry it out; in this experiment the search runs for 1000 trials.

---

In [25]:
def objective(trial):
    """Optuna objective: validation error rate of a LightGBM multiclass model.

    The function takes only ``trial`` (as Optuna requires) and reads the data
    from the notebook-level names ``lgb_train``, ``lgb_valid``, ``X_valid``,
    ``y_valid`` and ``classes``. Whatever those names hold at call time is
    used, which is how the same objective is reused for the 3-feature search
    later in the notebook.

    Parameters
    ----------
    trial :
        Optuna trial used to sample ``lr`` (log-uniform in [1e-2, 1]) and the
        integer knobs ``num_leaves``, ``max_depth``, ``min_child_samples``
        (each 1..5) and ``max_bin`` (5..9).

    Returns
    -------
    float
        ``1 - accuracy`` on the validation set (to be minimised).
    """
    lr = trial.suggest_loguniform("lr", 1e-2, 1)
    num_leaves = trial.suggest_int("num_leaves", 1, 5)
    max_depth = trial.suggest_int("max_depth", 1, 5)
    min_child_samples = trial.suggest_int("min_child_samples", 1, 5)
    max_bin = trial.suggest_int("max_bin", 5, 9)

    # The sampled integers are coarse steps: 8*s-1 yields 7, 15, ..., 39 and
    # 2**s-1 yields 31, 63, ..., 511.
    lgbm_params = {
        "objective": "multiclass",
        "num_class": len(classes),
        "learning_rate": lr,
        "num_leaves": 8*num_leaves-1,
        "min_child_samples": 8*min_child_samples-1,
        "max_depth": 8*max_depth-1,
        "max_bin": 2**max_bin - 1
    }
    model = lgb.train(lgbm_params, lgb_train,
                      valid_sets=lgb_valid,
                      num_boost_round=1000,
                      early_stopping_rounds=100,
                      verbose_eval=False
                     )

    # Class probabilities at the best early-stopped iteration -> hard labels.
    y_prob = model.predict(X_valid, num_iteration=model.best_iteration)
    y_pred = np.argmax(y_prob, axis=1)

    # Error rate as the quantity to minimise.
    return 1-accuracy_score(y_valid, y_pred)

In [None]:
# Number of hyperparameter configurations to try (this search is slow).
TRIAL_SIZE = 1000
# Seed the sampler so that a re-run explores the same sequence of trials.
# The study minimises by default, which matches the error rate returned by `objective`.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=TRIAL_SIZE, n_jobs=1)

---
These are the best parameters.

---

In [30]:
# Translate the best sampled values back into LightGBM parameters, using the
# same scaling as inside `objective`.
bp = study.best_params
lgbm_params = dict(
    objective="multiclass",
    num_class=len(classes),
    learning_rate=bp["lr"],
    num_leaves=8 * bp["num_leaves"] - 1,
    min_child_samples=8 * bp["min_child_samples"] - 1,
    max_depth=8 * bp["max_depth"] - 1,
    max_bin=2 ** bp["max_bin"] - 1,
)
lgbm_params

{'objective': 'multiclass',
 'num_class': 8,
 'learning_rate': 0.11567471689689475,
 'num_leaves': 7,
 'min_child_samples': 31,
 'max_depth': 31,
 'max_bin': 127}

---
Since this search is also time-consuming, the optimum parameters I obtained are written down in the output above; to skip the search, assign those values to `lgbm_params` directly.

---

---
Let's train LightGBM with optimum hyper parameters.

---

In [31]:
# Final 128-dimensional model: up to 1000 boosting rounds, stopped early once
# the validation metric has not improved for 100 rounds.
model = lgb.train(
    params=lgbm_params,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=lgb_valid,
    early_stopping_rounds=100,
    verbose_eval=False,
)

---
Check the train, validation, and test results.

---

In [32]:
# Accuracy on the training split (most probable class at the best iteration).
y_pred = model.predict(X_train, num_iteration=model.best_iteration).argmax(axis=1)
accuracy = accuracy_score(y_train, y_pred)
("train", accuracy)

('train', 1.0)

In [33]:
# Accuracy on the validation split (most probable class at the best iteration).
y_pred = model.predict(X_valid, num_iteration=model.best_iteration).argmax(axis=1)
accuracy = accuracy_score(y_valid, y_pred)
("valid", accuracy)

('valid', 0.5253333333333333)

In [34]:
# Accuracy on the held-out test split (most probable class at the best iteration).
y_pred = model.predict(X_test, num_iteration=model.best_iteration).argmax(axis=1)
accuracy = accuracy_score(y_test, y_pred)
("test", accuracy)

('test', 0.524)

---
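Since the labels of some images in the CelebA dataset are difficult to distinguish by appearance,
the model is somewhat confused (note also the gap between the training accuracy of 1.0 and the validation/test accuracy of about 0.52, i.e. the model overfits).
I think this is good enough for ranking the features.
OK! Let's get the important features for this classification.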

---

In [35]:
# Two-column integer table: column 0 is the latent-dimension index, column 1
# the LightGBM importance of that dimension.
a = np.arange(z_.shape[0]).reshape(-1, 1)
importance = np.hstack((a, np.reshape(model.feature_importance(), (-1, 1))))

In [36]:
# How many of the top-ranked features to display.
num = 10
# Rows ordered from most to least important; `arg_sort` keeps the matching
# latent-dimension indices (column 0) and is reused by later cells.
importance_sort = importance[importance[:, 1].argsort()[::-1]]
arg_sort = importance_sort[:, 0]
pd.DataFrame(
    importance_sort[:num],
    index=[ordinal(rank) for rank in range(1, num + 1)],
    columns=["feature", "importance"],
)

Unnamed: 0,feature,importance
1st,100,400
2nd,91,347
3rd,73,300
4th,47,298
5th,6,295
6th,120,284
7th,71,283
8th,46,261
9th,66,251
10th,11,239


### Extraction of the important $k(10)$ features and $m(3)$-feature classification
---
As the title says, I extracted the $k(10)$ most important features.
Then I conducted classification with $m(3)$ features out of the $k$ features, for all possible combinations.
Hyperparameter tuning is also carried out individually for each of these classifications.
After that, the $m$-dimensional vector with the highest accuracy is selected.

---

---
***k***: it represents \# of features I extracted.

***m***: it represents the dimension of final label.

---

In [37]:
# k: number of top-ranked latent dimensions kept as candidates.
# m: dimensionality of the final (reduced) label vector.
k, m = 10, 3

In [38]:
# Try every m-subset of the k most important latent dimensions, tune a LightGBM
# classifier on each subset, and record its test accuracy.
# `objective` reads X_valid / y_valid / lgb_train / lgb_valid as globals, so
# those names are deliberately re-bound on every iteration.
# NOTE(review): the winning subset is later chosen by *test* accuracy, so that
# accuracy is an optimistic estimate.
v_list = []
acc_list = []
combinations = list(itertools.combinations(range(k), m))
for i, v in enumerate(combinations, start=1):
    v = np.array(v)
    # Rows of z_ belonging to the selected latent dimensions -> (m, n_samples).
    z_3d = z_[arg_sort[v], :]
    X = z_3d.T.copy()
    np.random.seed(0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, random_state=42)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # `np.random.seed` does not control Optuna's sampler, so seed the sampler
    # explicitly to make the search repeatable.
    study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=0))
    study.optimize(objective, n_trials=100)
    bp = study.best_params
    lgbm_params = {
        "objective": "multiclass",
        "num_class": len(classes),
        "learning_rate": bp["lr"],
        "num_leaves": 8*bp["num_leaves"]-1,
        "min_child_samples": 8*bp["min_child_samples"]-1,
        "max_depth": 8*bp["max_depth"]-1,
        "max_bin": 2**bp["max_bin"]-1
    }
    # Drop the tuning log of this subset before printing the progress line.
    clear_output(wait=True)

    model = lgb.train(lgbm_params, lgb_train,
                      valid_sets=lgb_valid,
                      num_boost_round=1000,
                      early_stopping_rounds=100,
                      verbose_eval=False
                     )

    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred = np.argmax(y_pred, axis=1)
    accuracy = accuracy_score(y_test, y_pred)
    acc_list.append(accuracy)
    v_list.append(v)
    print(f"{i} / {len(combinations)}" )

120 / 120


---
Extract the best 3 features.

---

In [40]:
# Subsets ordered from highest to lowest test accuracy.
feature_sort = np.array(v_list)[np.argsort(acc_list), :][::-1, :]
# Map the best subset (ranks within the top k) back to latent-dimension indices.
# Index with the array itself: wrapping it in a list (`arg_sort[[...]]`) only
# gave a 1-D result through legacy tuple-style indexing; on current NumPy it
# yields shape (1, m), which would make `z_3d` three-dimensional.
arange = np.sort(arg_sort[feature_sort[0]])
z_3d = z_[arange, :]
arange

array([  6,  91, 100])

---
The extracted features might differ when you train LightGBM yourself, due to the instability of training.

In my experiment, the important features were *6*, *91*, and *100* in the VAE I trained.

---

### Take an average by the classes
---
In order to use it as the label, I took the average of the extracted vectors per class, so that the resulting array has shape (8, 3): one 3-dimensional vector for each of the 8 classes.

---

In [41]:
# Dimensionality of the reduced label = number of selected latent dimensions.
# (`dim` was previously used here without being defined in this notebook.)
dim = z_3d.shape[0]

# Mean of the selected latent dimensions over all samples of each class.
class_means = []
for lbl in classes:
    array = z_3d[:, labels==lbl]
    mean = np.mean(array, axis=1)
    print(lbl, mean)
    class_means.append(mean)

# One row per class, one column per reduced dimension. float64 matches the
# dtype the former `np.append`-based accumulation produced.
center = np.asarray(class_means, dtype=np.float64).reshape(-1, dim)

0 [-0.17578965  0.27388418 -0.41167694]
1 [ 0.20066902  0.22555672 -0.32801148]
2 [0.0912234  0.18957677 0.35079104]
3 [0.40836665 0.14090845 0.1924517 ]
4 [-0.34577882 -0.49618778 -0.7121413 ]
5 [-0.19866806 -0.5255212  -0.61670256]
6 [ 0.07906556 -0.50958645 -0.01499732]
7 [ 0.122086   -0.5108191   0.13009055]


---
Visualization of the label.

You can control it interactively.

---

In [42]:
# One 3-D marker per class centre, named with the class description.
# The loop variable is called `point` rather than `m`, so the notebook-level
# `m` (the reduced dimension) is no longer overwritten by running this cell.
trace = []
for lbl in range(len(classes)):
    point = center[lbl]
    # Scatter3d expects sequences, hence the length-1 slices.
    _x = point[0:1]
    _y = point[1:2]
    _z = point[2:]
    trace.append(go.Scatter3d(x=_x, y=_y, z=_z, mode="markers", name=f"{label_discription[classes[lbl]]}",
                              marker = dict(size=8)))
layout = go.Layout(width=700, height=500)
data = trace
fig = dict(data=data, layout=layout)
iplot(fig)

In [46]:
# The reduced dimension in the file name is taken from `center` itself
# (rows = classes, columns = reduced dimensions), so this cell no longer
# depends on a separately defined `dim`.
save_path = f'../data/parameters/VAE_celebA_{len(classes)}classes_ndim{z_dim}to{center.shape[1]}.pickle'
# Flip to True to write the class centres to disk.
save = False
if save:
    with open(save_path, 'wb') as f:
        pickle.dump(center, f)

---
I have demonstrated how I conducted the dimension reduction.
However, again, this step is optional: you can use any familiar dimension-reduction method, such as permutation importance or PCA.
In the next notebook, I will implement LCGAN.

---