In [27]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import pandas as pd
import numpy as np
import bs4
import pickle as pkl
from PIL import Image, ImageDraw, ImageOps
import os
import math
import random 

import torch
import torch.nn as nn
from torch.autograd import Variable
from utils.loader import load_new_dataset, load_dataset, maxvalue, minvalue
from utils.model_arg import ModelArg
from utils.evaluate import correct
from utils.selection import getModel
from utils.plot_res import plotpkl, plot
from utils.test import _get_single_block, get_score_correct
from datetime import datetime
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

import itertools

# Generate Neccessary Data

In [None]:
! python parse_feature_from_vega.py

In [None]:
! python generate_empirical_score.py

# Hand-Crafted Features

In [2]:
metric_ids = ['whiteSpaceArea', 'graphicSpread', 'graphicDistance','graphicSize', 'graphicSizeVar', 'graphicSizeMin','groupSizeVar', 'groupDistanceMin', 'graphicXSymmetry', 'graphicXAsymmetry', 'graphicYSymmetry', 'graphicYAsymmetry', 'textXsymmetry', 'textXAsymmetry', 'textYsymmetry','textYAsymmetry']
weights  = [0.2, 50, 50, 50, 50, 125, 2.5, 250, 50, 0.2, 50, 0.2, 50, 0.2, 50, 0.2]
groups = [[0,1,2], [3,4,5], [6,7], [8,9,10,11,12,13,14,15], list(range(0, 16))]
group_names = ['White Space', 'Scale', 'Unity', 'Balance', 'All']

In [3]:
def transform_pairwise(X, y):
    """Transforms data into pairs with balanced labels for ranking
    Transforms a n-class ranking problem into a two-class classification
    problem. Subclasses implementing particular strategies for choosing
    pairs should override this method.
    In this method, all pairs are choosen, except for those that have the
    same target value. The output is an array of balanced classes, i.e.
    there are the same number of -1 as +1
    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data
    y : array, shape (n_samples,) or (n_samples, 2)
        Target labels. If it's a 2D array, the second column represents
        the grouping of samples, i.e., samples with different groups will
        not be considered.
    Returns
    -------
    X_trans : array, shape (k, n_feaures)
        Data as pairs
    y_trans : array, shape (k,)
        Output class labels, where classes have values {-1, +1}
    """
    X_new = []
    y_new = []
    y = np.asarray(y)
    if y.ndim == 1:
        y = np.c_[y, np.ones(y.shape[0])]
    comb = itertools.combinations(range(X.shape[0]), 2)
    for k, (i, j) in enumerate(comb):
        if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
            # skip if same target or different group
            continue
        X_new.append(X[i] - X[j])
        y_new.append(np.sign(y[i, 0] - y[j, 0]))
        # output balanced classes
        if y_new[-1] != (-1) ** k:
            y_new[-1] = - y_new[-1]
            X_new[-1] = - X_new[-1]
    return np.asarray(X_new), np.asarray(y_new).ravel()


class RankSVM(SGDClassifier):
    """Performs pairwise ranking with an underlying SGDClassifer model
    Input should be a n-class ranking problem, this object will convert it
    into a two-class classification problem, a setting known as
    `pairwise ranking`.
    Authors: Fabian Pedregosa <fabian@fseoane.net>
             Alexandre Gramfort <alexandre.gramfort@inria.fr>
    https://gist.github.com/2071994
    """

    def fit(self, X, y):
        """
        Fit a pairwise ranking model.
        Parameters
        ----------
        X : array, shape (n_samples, n_features)
        y : array, shape (n_samples,) or (n_samples, 2)
        Returns
        -------
        self
        """
        X_trans, y_trans = transform_pairwise(X, y)
        super(RankSVM, self).fit(X_trans, y_trans)
        return self

    def predict(self, X):
        pred = super(RankSVM, self).predict(X)
        # preds are mapped to {-1,1}
        # FIXME only works in this example!!!
        pred[pred == -1] = 0
        return pred

    def score(self, X, y):
        """
        Because we transformed into a pairwise problem, chance level is at 0.5
        """
        X_trans, y_trans = transform_pairwise(X, y)
        return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)

In [31]:
exp = 'exp1'
if exp == 'exp1':
    metricsDf = pd.read_csv('../dataset/exp1/metrics.csv')
    labelDf = pd.read_csv('../dataset/exp1/turk_results.csv')
else:
    metricsDf = pd.read_csv('../dataset/exp2/metrics.csv')
    labelDf = pd.read_csv('../dataset/exp2/turk_results.csv')

labelDf['goodName'] = labelDf['good'].apply(lambda x: x.split('/')[-1].split('.')[0])
labelDf['badName'] = labelDf['bad'].apply(lambda x: x.split('/')[-1].split('.')[0])
labelDf.head()

goodPara = metricsDf.merge(labelDf, left_on='name', right_on = 'goodName')
badPara = metricsDf.merge(labelDf, left_on='name', right_on = 'badName')

for idx, group in enumerate(group_names):
    print(group)
    metricsInThisGroup = [metric_ids[i] for i in groups[idx]]
    print(metricsInThisGroup)

    winnerPara = goodPara[metricsInThisGroup]
    loserPara = badPara[metricsInThisGroup]
    winnerPara.head()

    differences = (winnerPara - loserPara).fillna(0).values

    Y = []
    X = differences.copy()
    for idx, d in enumerate(X):
        if idx % 2:
            Y.append(0)
            X[idx] = -d
        else:
            Y.append(1)

    testAccs = []
    for MCVDidx in range(0,10):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, \
                                                            random_state=random.randint(1,10000))

        Accs = []
        Names = []
        for clf, name in ((SGDClassifier(max_iter=100, alpha=0.01), "plain sgd"),
                        (RankSVM(alpha=0.0001, tol=1e-3, loss='hinge'), 'RankSVM'),
        #                   (SGDClassifier(max_iter=1000, alpha=0.01,
        #                                  loss='roc_pairwise_ranking'), "pairwise sgd"),
                          (RankSVM(alpha=0.1, loss='hinge'), 'RankSVM'),
                          ):
            clf.fit(X_train, Y_train)

            pred = clf.predict(X_test)
            Accs.append(metrics.accuracy_score(Y_test, pred))
            Names.append(clf)

        testAccs.append(Accs)

    print(pd.DataFrame(testAccs, columns=Names).mean())
    print("-------------------------")


White Space
['whiteSpaceArea', 'graphicSpread', 'graphicDistance']
SGDClassifier(alpha=0.01, max_iter=100)    0.554608
RankSVM()                                  0.572355
RankSVM(alpha=0.1)                         0.576109
dtype: float64
-------------------------
Scale
['graphicSize', 'graphicSizeVar', 'graphicSizeMin']
SGDClassifier(alpha=0.01, max_iter=100)    0.575768
RankSVM()                                  0.583276
RankSVM(alpha=0.1)                         0.571331
dtype: float64
-------------------------
Unity
['groupSizeVar', 'groupDistanceMin']




SGDClassifier(alpha=0.01, max_iter=100)    0.506826
RankSVM()                                  0.500341
RankSVM(alpha=0.1)                         0.517747
dtype: float64
-------------------------
Balance
['graphicXSymmetry', 'graphicXAsymmetry', 'graphicYSymmetry', 'graphicYAsymmetry', 'textXsymmetry', 'textXAsymmetry', 'textYsymmetry', 'textYAsymmetry']
SGDClassifier(alpha=0.01, max_iter=100)    0.568942
RankSVM()                                  0.584642
RankSVM(alpha=0.1)                         0.568942
dtype: float64
-------------------------
All
['whiteSpaceArea', 'graphicSpread', 'graphicDistance', 'graphicSize', 'graphicSizeVar', 'graphicSizeMin', 'groupSizeVar', 'groupDistanceMin', 'graphicXSymmetry', 'graphicXAsymmetry', 'graphicYSymmetry', 'graphicYAsymmetry', 'textXsymmetry', 'textXAsymmetry', 'textYsymmetry', 'textYAsymmetry']
SGDClassifier(alpha=0.01, max_iter=100)    0.513311
RankSVM()                                  0.556314
RankSVM(alpha=0.1)                         

# Rank SVM

In [37]:
args = ModelArg(
    batch_size=500,
    need_seq=False,
    need_param=True,
    hybrid_linearlist=[64, 256, 128, 64, 32, 16, 8, 4],
    need_img=False,
    h=100,
    w=300,
    val_split_ratio=0.2,
    weights=[1]*6,    #[1,1,1],
    img_path='../dataset/exp2/img',
    label_path='../dataset/exp2/turk_results.csv',
    param_path='../dataset/exp2/parameters.csv'
)
seed = random.randint(1, 10000)     
torch.manual_seed(seed)

device = torch.device("cuda:0")
train_loader, val_loader = load_new_dataset(args, device)

for batch in train_loader:
    features, groundTruth, _, _ = batch
    gp0param = features[0][1]
    gp1param = features[1][1]
    gp1, gp2 = groundTruth
    
winnerPara = gp0param.cpu().detach().numpy()
loserPara = gp1param.cpu().detach().numpy()

differences = winnerPara - loserPara

Y = []
X = differences.copy()
for idx, d in enumerate(X):
    if idx % 2:
        Y.append(0)
        X[idx] = -d
    else:
        Y.append(1)
        
for clf, name in ((SGDClassifier(max_iter=100, alpha=0.01), "plain sgd"),
                (RankSVM(alpha=0.0001, tol=1e-3), 'RankSVM'),
#                   (SGDClassifier(max_iter=1000, alpha=0.01,
#                                  loss='roc_pairwise_ranking'), "pairwise sgd"),
                  (RankSVM(max_iter=1000, alpha=0.1, loss='hinge'), 'RankSVM'),
                  ):
    clf.fit(X, Y)
    print(clf)
    pred = clf.predict(X)

#     print(metrics.f1_score(Y, pred))
    print(metrics.accuracy_score(Y, pred))

size=1133 split=226 train_indices=907 val_indices=226
SGDClassifier(alpha=0.01, max_iter=100)
0.6265356265356266
RankSVM()
0.6265356265356266
RankSVM(alpha=0.1)
0.6167076167076168
