In [408]:
#####Multitask linear regression model#####
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import numpy.matlib
import scipy.linalg
import itertools
import math
from scipy import sparse
from pandas.tools.plotting import scatter_matrix
from numpy.random import *
from scipy import optimize
import seaborn as sns
import time

#np.random.seed(98537)

In [409]:
####データの生成####
##データの設定
tasks = 75
n1 = np.random.poisson(np.random.gamma(30, 1/0.1, tasks), tasks)  
n2 = np.random.poisson(np.random.gamma(25, 1/0.1, tasks), tasks)  
n = n1 + n2
N1 = np.sum(n1)   
N2 = np.sum(n2)
N = N1 + N2

In [410]:
##入力変数を生成
#入力変数の設定
k1 = 16; k2 = 12; k3 = 15; k41 = 7; k42 = 6
dt = [i for i in range(tasks)]
dt1 = [i for i in range(tasks)]
dt2 = [i for i in range(tasks)]

#タスクごとに入力変数を生成
for i in range(tasks):
    x1 = np.array(np.random.random(n[i]*k1)).reshape((n[i], k1))
    x2 = np.zeros((n[i], k2))
    for j in range(k2):
        prob = np.random.uniform(0.25, 0.55, 1)
        x2[:, j] = np.random.binomial(1, prob, n[i])
    x3 = np.random.poisson(np.repeat(np.random.gamma(0.5, 0.75/1.0, k3), n[i]).reshape(n[i], k3, order="F"))
    x41 = np.random.multinomial(1, np.random.dirichlet(np.repeat(3.0, k41), 1).reshape(k41), n[i])
    x41 = np.delete(x41, np.argmin(np.sum(x41, axis=0)), axis=1)
    x42 = np.random.multinomial(1, np.random.dirichlet(np.repeat(3.0, k42), 1).reshape(k42), n[i])
    x42 = np.delete(x42, np.argmin(np.sum(x42, axis=0)), axis=1)
    dt[i] = np.concatenate((np.repeat(1, n[i])[:, np.newaxis], x1, x2, x3, x41, x42), axis=1)
    dt1[i] = dt[i][np.arange(n1[i]), ]
    dt2[i] = dt[i][np.delete(np.arange(n[i]), np.arange(n1[i])), ]
k = dt[i].shape[1]

In [411]:
##パラメータを生成
#回帰パラメータを生成
alpha = np.append(np.array([20.0]), np.random.normal(0, 0.75, k-1))
beta = (alpha + np.random.multivariate_normal(np.random.uniform(-2.5, 2.5, k),
                                              np.diag(np.append(2.0, np.repeat(0.5, k-1))), tasks)).T
alphat = alpha.copy(); betat = beta.copy()

#誤差パラメータ
tau = np.array([1.0]) 
Sigma = np.random.gamma(3.0, 1/2.0, tasks)
taut = tau.copy(); Sigmat = Sigma.copy() 

In [412]:
##正規分布から応答変数を生成
#データの格納用配列
y1 = [i for i in range(tasks)]
y2 = [i for i in range(tasks)]

for i in range(tasks):
    #モデルの期待値
    mu1 = np.dot(dt1[i], beta[:, i])
    mu2 = np.dot(dt2[i], beta[:, i])

    #応答変数を生成
    y1[i] = mu1 + np.random.normal(0, Sigma[i], n1[i])
    y2[i] = mu2 + np.random.normal(0, Sigma[i], n2[i])

In [413]:
####Multitask linear regression modelを推定####
##アルゴリズムの設定
#正則化パラメータの候補値
Lambda = np.append(np.array([0.01, 0.05]), np.arange(0.1, 2.5, 0.1))
candidates = Lambda.shape[0]

#パラメータの格納用配列
beta_array = np.zeros((k, tasks, candidates))
Sigma_array = np.zeros((tasks, candidates))
LLho = np.repeat(0.0, Lambda.shape[0])
MSEs = np.repeat(0.0, Lambda.shape[0]) 

#ベストなパラメータでの対数尤度
LLbest = np.repeat(0.0, tasks)
MSEbest = np.repeat(0.0, tasks) 
for j in range(tasks):
    mu = np.dot(dt2[j], betat[:, j])
    LLbest[j] = np.sum(scipy.stats.norm.logpdf(y2[j], mu, Sigmat[j]))
    MSEbest[j] = np.sum(np.power(y2[j] - mu, 2)) / n2[j]

In [414]:
##グリッドサーチで最適な正則化パラメータを推定
for rp in range(candidates):
    #正則化パラメータの逆行列
    inv_lambda = np.linalg.inv(np.diag(np.repeat(Lambda[rp], k))) 
        
    #正則化最小二乗法で回帰パラメータの初期値を設定
    beta0 = np.zeros((k, tasks))
    Sigma0 = np.repeat(0.0, tasks)
    for j in range(tasks):
        x = dt1[j]
        XXV = np.dot(x.T, x) + inv_lambda
        inv_XXV = np.linalg.inv(XXV)
        beta0[:, j] = np.dot(np.dot(inv_XXV, x.T), y1[j])
        Sigma0[j] = np.sum(np.power(y1[j] - np.dot(x, beta0[:, j]), 2)) / np.array([n1[j]], dtype="float")
        
        ##マルチタスク学習でパラメータを推定
        #回帰パラメータの事前分布を設定
        alpha_sums = 0
        for j in range(tasks):
            alpha_sums += n1[j] * beta0[:, j]
        alpha = alpha_sums / N1

        #データの格納用配列
        beta = np.zeros((k, tasks))
        Sigma = np.repeat(0.0, tasks)
        LL = np.repeat(0.0, tasks)
        LL0 = np.repeat(0.0, tasks)
        MSE = np.repeat(0.0, tasks)
        MSE0 = np.repeat(0.0, tasks)
        
        for j in range(tasks):
            #正則化最小二乗法で回帰パラメータを推定
            x = dt1[j]
            Xy = np.dot(x.T, y1[j])
            XXV = np.dot(x.T, x) + inv_lambda
            inv_XXV = np.linalg.inv(XXV)
            beta[:, j] = np.dot(inv_XXV, Xy + np.dot(inv_lambda, alpha))

            #標準偏差を推定
            er = y1[j] - np.dot(x, beta[:, j])
            Sigma[j] = np.sum(np.power(er, 2)) / np.array([n1[j]], dtype="float")

            #テストデータに対する対数尤度と二乗誤差
            mu = np.dot(dt2[j], beta[:, j])
            mu0 = np.dot(dt2[j], beta0[:, j])
            LL[j] = np.sum(scipy.stats.norm.logpdf(y2[j], mu, Sigma[j]))
            LL0[j] = np.sum(scipy.stats.norm.logpdf(y2[j], mu0, Sigma0[j]))
            MSE[j] = np.sum(np.power(y2[j] - mu, 2)) / n2[j]
            MSE0[j] = np.sum(np.power(y2[j] - mu0, 2)) / n2[j]
            
    #対数尤度の和を表示
    beta_array[:, :, rp] = beta
    Sigma_array[:, rp] = Sigma
    LLho[rp] = np.sum(LL)
    MSEs[rp] = np.sum(MSE)
    print(np.round(np.array([LLho[rp], np.sum(LL0), np.sum(LLbest), MSEs[rp], np.sum(MSE0), np.sum(MSEbest)]), 1))

  x = np.asarray((x - loc)/scale, dtype=dtyp)


[-63203.5 -72572.2 -31050.3    923.5   1516.2    219.2]
[-48823.9 -55867.9 -31050.3    474.5    692.     219.2]
[-43853.8 -50114.8 -31050.3    375.6    530.5    219.2]
[-40640.5 -45264.1 -31050.3    311.3    420.7    219.2]
[-40431.9 -42984.8 -31050.3    288.9    373.5    219.2]
[-41334.9 -41745.  -31050.3    278.7    346.7    219.2]
[-42685.8 -41064.5 -31050.3    273.4    329.6    219.2]
[-44174.4 -40724.4 -31050.3    270.4    317.7    219.2]
[-45636.1 -40606.2 -31050.3    268.6    309.1    219.2]
[-46983.1 -40638.7 -31050.3    267.5    302.7    219.2]
[-48173.9 -40776.2 -31050.3    266.8    297.7    219.2]
[-49195.7 -40987.6 -31050.3    266.4    293.7    219.2]
[-50053.8 -41251.2 -31050.3    266.2    290.6    219.2]
[-50763.  -41550.9 -31050.3    266.     288.     219.2]
[-51342.5 -41875.3 -31050.3    266.     285.8    219.2]
[-51812.3 -42215.5 -31050.3    266.     284.     219.2]
[-52191.2 -42564.8 -31050.3    266.     282.4    219.2]
[-52495.9 -42918.1 -31050.3    266.     281.1   