src/shogun/transfer/multitask/LibLinearMTL.cpp

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2011-2012 Christian Widmer
 * Written (W) 2007-2010 Soeren Sonnenburg
 * Copyright (c) 2007-2009 The LIBLINEAR Project.
 * Copyright (C) 2007-2012 Fraunhofer Institute FIRST and Max-Planck-Society
 */

#include <vector>

#include <shogun/lib/config.h>

#ifdef HAVE_LAPACK
#include <shogun/base/Parameter.h>
#include <shogun/base/progress.h>
#include <shogun/features/DotFeatures.h>
#include <shogun/io/SGIO.h>
#include <shogun/lib/Signal.h>
#include <shogun/lib/Time.h>
#include <shogun/optimization/liblinear/tron.h>
#include <shogun/transfer/multitask/LibLinearMTL.h>

using namespace shogun;


	CLibLinearMTL::CLibLinearMTL() : CLinearMachine()
{
	// Default construction: every member default and the parameter
	// registration are handled by init().
	init();
}

/**
 * Construct a machine with a single cost constant and training data.
 *
 * @param C cost constant, used for both C1 and C2
 * @param traindat training features
 * @param trainlab training labels
 */
CLibLinearMTL::CLibLinearMTL(
		float64_t C, CDotFeatures* traindat, CLabels* trainlab)
: CLinearMachine()
{
	// start from the defaults, then override what this constructor specifies
	init();

	// the same cost is applied to both cost constants
	C1 = C2 = C;

	// unlike the default constructor, this one switches the bias on
	use_bias = true;

	set_features(traindat);
	set_labels(trainlab);
}


/**
 * Reset all tunables to their defaults and register the members with the
 * parameter framework (SG_ADD).
 */
void CLibLinearMTL::init()
{
	// default values
	C1 = 1;
	C2 = 1;
	use_bias = false;
	set_max_iterations();
	epsilon = 1e-5;

	// cost constants are available for model selection
	SG_ADD(&C1, "C1", "C Cost constant 1.", MS_AVAILABLE);
	SG_ADD(&C2, "C2", "C Cost constant 2.", MS_AVAILABLE);

	// the remaining parameters are registered but not offered for model selection
	SG_ADD(
		&use_bias, "use_bias", "Indicates if bias is used.", MS_NOT_AVAILABLE);
	SG_ADD(&epsilon, "epsilon", "Convergence precision.", MS_NOT_AVAILABLE);
	SG_ADD(
		&max_iterations, "max_iterations", "Max number of iterations.",
		MS_NOT_AVAILABLE);
}

/** Destructor; this class has no clean-up of its own to perform here. */
CLibLinearMTL::~CLibLinearMTL() = default;

/**
 * Train the multitask machine on the currently set (or the given) features.
 *
 * Builds a liblinear_problem from the features and labels, runs
 * solve_l2r_l1l2_svc() with C1/C2 as the positive/negative cost, and then
 * sets the bias and weight vector of the underlying linear machine.
 *
 * @param data optional training features; when non-NULL they must have the
 *        FP_DOT property and replace the features set so far
 * @return true once training has completed
 *
 * Reports an error through SG_ERROR when the features are not dot features
 * or when the number of vectors and the number of labels disagree.
 */
bool CLibLinearMTL::train_machine(CFeatures* data)
{
	ASSERT(m_labels)

	if (data)
	{
		if (!data->has_property(FP_DOT))
			SG_ERROR("Specified features are not of type CDotFeatures\n")

		set_features((CDotFeatures*) data);
	}
	ASSERT(features)
	m_labels->ensure_valid();

	int32_t num_train_labels=m_labels->get_num_labels();
	int32_t num_feat=features->get_dim_feature_space();
	int32_t num_vec=features->get_num_vectors();

	if (num_vec!=num_train_labels)
	{
		SG_ERROR("number of vectors %d does not match "
				"number of training labels %d\n",
				num_vec, num_train_labels);
	}

	// one extra slot holds the bias term when the bias is enabled
	const int32_t w_len = use_bias ? num_feat+1 : num_feat;

	float64_t* training_w = SG_MALLOC(float64_t, w_len);
	memset(training_w, 0, sizeof(float64_t)*w_len);

	liblinear_problem prob;
	prob.n=w_len;
	prob.l=num_vec;
	prob.x=features;
	prob.y=SG_MALLOC(float64_t, prob.l);
	prob.use_bias=use_bias;

	// copy the labels and count the positives in the same pass
	int pos = 0;
	for (int32_t i=0; i<prob.l; i++)
	{
		prob.y[i]=((CBinaryLabels*)m_labels)->get_label(i);
		if (prob.y[i]==+1)
			pos++;
	}
	int neg = prob.l - pos;

	SG_INFO("%d training points %d dims\n", prob.l, prob.n)
	SG_INFO("%d positives, %d negatives\n", pos, neg)

	double Cp=C1;
	double Cn=C2;
	solve_l2r_l1l2_svc(&prob, epsilon, Cp, Cn);

	// NOTE: the solver is not handed training_w, so the buffer still holds
	// the zeros written above; the per-task solution is kept in V and alphas.
	if (use_bias)
		set_bias(training_w[num_feat]);
	else
		set_bias(0);

	SG_FREE(prob.y);

	SGVector<float64_t> w(num_feat);
	for (int32_t i=0; i<num_feat; i++)
		w[i] = training_w[i];
	set_w(w);

	// w holds its own copy, so the scratch buffer can be released
	// (it used to be leaked on every call)
	SG_FREE(training_w);

	return true;
}

// A coordinate descent algorithm for
// L1-loss and L2-loss SVM dual problems
//
//  min_\alpha  0.5(\alpha^T (Q + D)\alpha) - e^T \alpha,
//    s.t.      0 <= alpha_i <= upper_bound_i,
//
//  where Qij = yi yj xi^T xj and
//  D is a diagonal matrix
//
// In L1-SVM case:
//		upper_bound_i = Cp if y_i = 1
//		upper_bound_i = Cn if y_i = -1
//		D_ii = 0
// In L2-SVM case:
//		upper_bound_i = INF
//		D_ii = 1/(2*Cp)	if y_i = 1
//		D_ii = 1/(2*Cn)	if y_i = -1
//
// Given:
// x, y, Cp, Cn
// eps is the stopping tolerance
//
// solution will be put in the members V (one weight vector per task) and alphas

#undef GETI
#define GETI(i) (y[i]+1)
// To support weights for instances, use GETI(i) (i)


void CLibLinearMTL::solve_l2r_l1l2_svc(const liblinear_problem *prob, double eps, double Cp, double Cn)
{


	int l = prob->l;
	int w_size = prob->n;
	int i, s, iter = 0;
	double C, d, G;
	double *QD = SG_MALLOC(double, l);
	int *index = SG_MALLOC(int, l);
	//double *alpha = SG_MALLOC(double, l);

	int32_t *y = SG_MALLOC(int32_t, l);
	int active_size = l;
	// PG: projected gradient, for shrinking and stopping
	double PG;
	double PGmax_old = CMath::INFTY;
	double PGmin_old = -CMath::INFTY;
	double PGmax_new, PGmin_new;

	// matrix W
	V = SGMatrix<float64_t>(w_size,num_tasks);

	// save alpha
	alphas = SGVector<float64_t>(l);


	// default solver_type: L2R_L2LOSS_SVC_DUAL
	double diag[3] = {0.5/Cn, 0, 0.5/Cp};
	double upper_bound[3] = {CMath::INFTY, 0, CMath::INFTY};
	if(true)
	{
		diag[0] = 0;
		diag[2] = 0;
		upper_bound[0] = Cn;
		upper_bound[2] = Cp;
	}

	int n = prob->n;

	if (prob->use_bias)
		n--;

	// set V to zero
	for(int32_t k=0; k<w_size*num_tasks; k++)
	{
		V.matrix[k] = 0;
	}

	// init alphas
	for(i=0; i<l; i++)
	{
		alphas[i] = 0;
	}

	for(i=0; i<l; i++)
	{
		if(prob->y[i] > 0)
		{
			y[i] = +1;
		}
		else
		{
			y[i] = -1;
		}
		QD[i] = diag[GETI(i)];
		QD[i] += prob->x->dot(i, prob->x,i);
		index[i] = i;
	}

	auto pb = progress(range(10));
	CTime start_time;
	while (iter < max_iterations && !cancel_computation())
	{
		if (m_max_train_time > 0 && start_time.cur_time_diff() > m_max_train_time)
			break;

		PGmax_new = -CMath::INFTY;
		PGmin_new = CMath::INFTY;

		for (i=0; i<active_size; i++)
		{
			int j = CMath::random(i, active_size-1);
			CMath::swap(index[i], index[j]);
		}

		for (s=0;s<active_size;s++)
		{
			i = index[s];
			int32_t yi = y[i];
			int32_t ti = task_indicator_lhs[i];
			C = upper_bound[GETI(i)];

			// we compute the inner sum by looping over tasks
			// this update is the main result of MTL_DCD
		    typedef std::map<index_t, float64_t>::const_iterator map_iter;

			float64_t inner_sum = 0;
			for (map_iter it=task_similarity_matrix.data[ti].begin(); it!=task_similarity_matrix.data[ti].end(); it++)
			{

				// get data from sparse matrix
				int32_t e_i = it->first;
                float64_t sim = it->second;

				// fetch vector
				float64_t* tmp_w = V.get_column_vector(e_i);
				inner_sum += sim * yi * prob->x->dense_dot(i, tmp_w, n);

				//possibly deal with bias
				//if (prob->use_bias)
				//	G+=w[n];
			}

			// compute gradient
			G = inner_sum-1.0;

			// check if point can be removed from active set
			PG = 0;
			if (alphas[i] == 0)
			{
				if (G > PGmax_old)
				{
					active_size--;
					CMath::swap(index[s], index[active_size]);
					s--;
					continue;
				}
				else if (G < 0)
					PG = G;
			}
			else if (alphas[i] == C)
			{
				if (G < PGmin_old)
				{
					active_size--;
					CMath::swap(index[s], index[active_size]);
					s--;
					continue;
				}
				else if (G > 0)
					PG = G;
			}
			else
				PG = G;

			PGmax_new = CMath::max(PGmax_new, PG);
			PGmin_new = CMath::min(PGmin_new, PG);

			if(fabs(PG) > 1.0e-12)
			{
				// save previous alpha
				double alpha_old = alphas[i];

				// project onto feasible set
				alphas[i] = CMath::min(CMath::max(alphas[i] - G/QD[i], 0.0), C);
				d = (alphas[i] - alpha_old)*yi;

				// update corresponding weight vector
				float64_t* tmp_w = V.get_column_vector(ti);
				prob->x->add_to_dense_vec(d, i, tmp_w, n);


				//if (prob->use_bias)
				//	w[n]+=d;
			}
		}

		iter++;
		float64_t gap=PGmax_new - PGmin_new;
		pb.print_absolute(
		    gap, -CMath::log10(gap), -CMath::log10(1), -CMath::log10(eps));

		if(gap <= eps)
		{
			if(active_size == l)
				break;
			else
			{
				active_size = l;
				PGmax_old = CMath::INFTY;
				PGmin_old = -CMath::INFTY;
				continue;
			}
		}
		PGmax_old = PGmax_new;
		PGmin_old = PGmin_new;
		if (PGmax_old <= 0)
			PGmax_old = CMath::INFTY;
		if (PGmin_old >= 0)
			PGmin_old = -CMath::INFTY;
	}

	pb.complete_absolute();
	SG_INFO("optimization finished, #iter = %d\n",iter)
	if (iter >= max_iterations)
	{
		SG_WARNING("reaching max number of iterations\nUsing -s 2 may be faster"
				"(also see liblinear FAQ)\n\n");
	}


	delete [] QD;
	//delete [] alpha;
	delete [] y;
	delete [] index;
}


/**
 * Compute the primal objective for the weight matrix returned by get_W():
 *
 *   obj =   sum_t     0.5 * <w_t, w_t>
 *         + sum_{s,t} 0.5 * L[s,t] * <w_s, w_t>
 *         + C1 * sum_i max(0, 1 - y_i * <w_{t_i}, x_i>)
 *
 * where w_t is column t of W, L[s,t] is read from graph_laplacian at linear
 * offset s*num_tasks+t, and t_i = task_indicator_lhs[i].
 *
 * Python prototype this was derived from:
 *
 *   num_param = param.shape[0]
 *   num_dim = len(all_xt[0])
 *   num_tasks = int(num_param / num_dim)
 *   num_examples = len(all_xt)
 *
 *   # vector to matrix
 *   W = param.reshape(num_tasks, num_dim)
 *
 *   obj = 0
 *   reg_obj = 0
 *   loss_obj = 0
 *
 *   assert len(all_xt) == len(all_xt) == len(task_indicator)
 *
 *   # L2 regularizer
 *   for t in xrange(num_tasks):
 *       reg_obj += 0.5 * np.dot(W[t,:], W[t,:])
 *
 *   # MTL regularizer
 *   for s in xrange(num_tasks):
 *       for t in xrange(num_tasks):
 *           reg_obj += 0.5 * L[s,t] * np.dot(W[s,:], W[t,:])
 *
 *   # loss
 *   for i in xrange(num_examples):
 *       ti = task_indicator[i]
 *       t = all_lt[i] * np.dot(W[ti,:], all_xt[i])
 *       # hinge
 *       loss_obj += max(0, 1 - t)
 *
 *   # combine to final objective
 *   obj = reg_obj + C * loss_obj
 *   return obj
 *
 * @return value of the primal objective
 */
float64_t CLibLinearMTL::compute_primal_obj()
{
	// the message used to claim "DONE" before any work had been done
	SG_INFO("starting to compute Primal OBJ\n")

	// calculate objective value
	SGMatrix<float64_t> W = get_W();

	float64_t obj = 0;
	int32_t num_vec = features->get_num_vectors();
	int32_t w_size = features->get_dim_feature_space();

	// L2 regularizer
	for (int32_t t=0; t<num_tasks; t++)
	{
		float64_t* w_t = W.get_column_vector(t);

		for(int32_t i=0; i<w_size; i++)
		{
			obj += 0.5 * w_t[i]*w_t[i];
		}
	}

	// MTL regularizer
	for (int32_t s=0; s<num_tasks; s++)
	{
		float64_t* w_s = W.get_column_vector(s);
		for (int32_t t=0; t<num_tasks; t++)
		{
			float64_t* w_t = W.get_column_vector(t);
			float64_t l = graph_laplacian.matrix[s*num_tasks+t];

			for(int32_t i=0; i<w_size; i++)
			{
				obj += 0.5 * l * w_s[i]*w_t[i];
			}
		}
	}

	// loss
	for(int32_t i=0; i<num_vec; i++)
	{
		// each example is scored with the weight vector of its own task
		int32_t ti = task_indicator_lhs[i];
		float64_t* w_t = W.get_column_vector(ti);
		float64_t residual = ((CBinaryLabels*)m_labels)->get_label(i) * features->dense_dot(i, w_t, w_size);

		// hinge loss
		obj += C1 * CMath::max(0.0, 1 - residual);
	}

	SG_INFO("DONE to compute Primal OBJ, obj=%f\n",obj)

	return obj;
}

/**
 * Compute the dual objective from the stored alphas and the matrix V:
 *
 *   obj = sum_i alphas[i] - 0.5 * sum_{s,t} M(s,t) * <v_s, v_t>
 *
 * where v_s is column s of V and M is task_similarity_matrix.
 *
 * Python prototype of the quadratic term (example-wise formulation):
 *
 *   num_xt = len(xt)
 *   for i in xrange(num_xt):
 *       for j in xrange(num_xt):
 *           s = task_indicator[i]
 *           t = task_indicator[j]
 *           obj -= 0.5 * M[s,t] * alphas[i] * alphas[j] * lt[i] * lt[j] * np.dot(xt[i], xt[j])
 *   return obj
 *
 * @return value of the dual objective
 */
float64_t CLibLinearMTL::compute_dual_obj()
{
	SG_INFO("starting to compute DUAL OBJ\n")

	const int32_t n_examples = features->get_num_vectors();

	float64_t dual = 0;

	// linear term: plain sum of the dual variables
	int32_t idx = 0;
	while (idx < n_examples)
	{
		dual += alphas[idx];
		++idx;
	}

	// quadratic term, evaluated task-pair-wise on the columns of V instead
	// of example-pair-wise
	const int32_t dim = features->get_dim_feature_space();

	for (int32_t a = 0; a < num_tasks; ++a)
	{
		float64_t* col_a = V.get_column_vector(a);

		for (int32_t b = 0; b < num_tasks; ++b)
		{
			float64_t* col_b = V.get_column_vector(b);
			const float64_t sim = task_similarity_matrix(a, b);
			const float64_t half_sim = 0.5 * sim;

			for (int32_t k = 0; k < dim; ++k)
				dual -= half_sim * col_a[k] * col_b[k];
		}
	}

	/*
	// naive example-pair-wise implementation, kept for reference
	float64_t tmp_val2 = 0;

	for(int32_t i=0; i<num_vec; i++)
	{
		int32_t ti_i = task_indicator_lhs[i];
		for(int32_t j=0; j<num_vec; j++)
		{
			// look up task similarity
			int32_t ti_j = task_indicator_lhs[j];

			const float64_t ts = task_similarity_matrix(ti_i, ti_j);

			// compute objective
			tmp_val2 -= 0.5 * alphas[i] * alphas[j] * ts * ((CBinaryLabels*)m_labels)->get_label(i) *
				((CBinaryLabels*)m_labels)->get_label(j) * features->dot(i, features,j);
		}
	}
	*/

	return dual;
}


/**
 * Duality gap of the current solution.
 *
 * Currently a stub: no gap is computed and the result is always 0.0.
 *
 * @return 0.0
 */
float64_t CLibLinearMTL::compute_duality_gap()
{
	const float64_t gap = 0.0;
	return gap;
}


#endif //HAVE_LAPACK