In [66]:
import pandas as pd
import numpy as np
from scipy import stats
from rl687.policies.Linear_Softmax import LinearSoftmax
import csv
from sklearn.model_selection import train_test_split
import cma

In [67]:
class data_preprocess:
    def __init__(self,data_path,num_batches=5):
        self.path=data_path
        self.num_batches=num_batches
        data=[]
        with open(self.path,newline='',encoding='utf-8') as csv_file:
            file=csv.reader(csv_file,delimiter=',')
            for i, H in enumerate(file):
                data.append(H)
        self.data=np.asarray(data)
        self.data_batches=[]
        self.N=200000
        x=self.N//num_batches
        for i in range(num_batches):
            start=i*x
            end=start+x if start+x<=200000 else 200000
            self.data_batches.append(self.data[start:end])
        self.D_s=[]
        self.D_c=[]
        
    def split_data(self,split):
        for i in range(self.num_batches):
            D_c, D_s = train_test_split(self.data_batches[i],test_size=split)
            self.D_c.append(D_c)
            self.D_s.append(D_s)


        
    

In [68]:
class HCOPE():
    def __init__(self,D_c,theta_b,num_actions,num_features,order,n_Ds,delta,lower_bound):
            self.D_c=D_c
            self.theta_b=theta_b
            self.num_actions=num_actions
            self.num_state_features=num_features
            self.order=order
            self.Ds_size=n_Ds
            self.delta=delta
            self.b=lower_bound
            self.pb=LinearSoftmax(self.num_actions,self.order, self.num_state_features,0.001,self.theta_b,sigma=1)
    
    def PDIS(self,H,pe):
        prev_prod=1
        pdis=0
        for i in range(len(H)//3):
            s=np.float(H[i*3])
            #print(i)
            a=int(H[i*3+1])
            r=np.float(H[i*3+2])
            #print(pe.policy(s,a))
            prod=(float(pe.policy(s,a))/float(self.pb.policy(s,a)))*prev_prod
            pdis+=r*prod
            prev_prod=prod
        return pdis
    
    def thetaTopolicy(self,theta):
        p=LinearSoftmax(self.num_actions,self.order, self.num_state_features,0.001,theta,sigma=1)
        #p.theta=theta
        return p
    
    def estimate_J(self,data,theta_e):
        policy_b=self.thetaTopolicy(self.theta_b)
        policy_e=self.thetaTopolicy(theta_e)
        J=0
        pdis_list=[]
        n=data.shape[0]
        for H in data:
            pdis_H=self.PDIS(H,policy_e)
            J+=pdis_H
            pdis_list.append(pdis_H)
        J/=n
        pdis_list=np.array(pdis_list)
        return J,pdis_list
    
    def ssd(self,J,pdis_list):
        n=len(pdis_list)
        return ((np.sum((pdis_list-J)**2)/(n-1))**(0.5))

    def student_t_test(self,D,theta_e):
        J,PDIS_list=self.estimate_J(D,theta_e)
        std=self.ssd(J,PDIS_list)
        n=len(PDIS_list)
        return (J-((std/n**(0.5))*stats.t.ppf(1-self.delta,n-1)))
    
    def optimise_PDIS(self,theta_e):
        J,PDIS_list=self.estimate_J(self.D_c,theta_e)
        std=self.ssd(J,PDIS_list)
        g=J-2*((std/self.Ds_size**(0.5))*stats.t.ppf(1-self.delta,self.Ds_size-1))
        if g<self.b:
            return 10000
        else: return -J
           

In [69]:
def CMAES_policy(D_c, theta_e,theta_b,num_actions,num_features,order,n_Ds,delta,b):
    pdis = HCOPE(D_c,theta_b,num_actions,num_features,order,n_Ds,delta,b)
    es = cma.CMAEvolutionStrategy(theta_b, 1.0)
    ret = es.optimize(pdis.optimise_PDIS)
    print(es.result_pretty())
    return ret

In [70]:
num_a=2
num_dims=1
order=1
theta_e=np.array([1,1,0.01,-0.01])
theta_b=np.array([0.01,-0.01,1,1])
delta=0.04
b=1

In [71]:
data=data_preprocess('data.csv',1)
data.split_data(0.3)

In [72]:

D_c=data.D_c[0]
D_s=data.D_s[0]
n_Ds=len(D_s)

In [None]:
param=CMAES_policy(D_c, theta_e,theta_b,num_a,num_dims,order,n_Ds,delta,b)

(4_w,8)-aCMA-ES (mu_w=2.6,w_1=52%) in dimension 4 (seed=358473, Fri Dec 13 03:26:57 2019)
Iterat #Fevals   function value  axis ratio  sigma  min&max std  t[m:s]
    1      8 -1.060232103814293e+01 1.0e+00 1.15e+00  1e+00  1e+00 1:20.5
    2     16 -1.332411941103739e+01 1.5e+00 1.55e+00  2e+00  2e+00 2:43.4
    3     24 -1.415148570931315e+01 1.7e+00 1.86e+00  2e+00  3e+00 4:00.9
    4     32 -1.430537938602692e+01 2.0e+00 2.41e+00  2e+00  4e+00 5:21.3
    5     40 -1.430803829334102e+01 2.3e+00 3.09e+00  2e+00  5e+00 6:40.5
    6     48 -1.430806148371529e+01 2.9e+00 4.02e+00  3e+00  7e+00 7:57.9
    7     56 -1.430806148561103e+01 3.3e+00 5.77e+00  4e+00  1e+01 9:22.6
    8     64 -1.430806148561121e+01 3.4e+00 7.83e+00  6e+00  1e+01 10:41.9


In [None]:
print(param.result_pretty())

In [36]:
#policies 
[0.6627, 0.9739, -0.9077, -0.9822]
[1,1,-1,-1]
[1.4950253 , 1.38426051,  -2.05568217, -0.69637852]
[10.22528905, 8.28499955, -0.93434349, -9.96264375]

SyntaxError: invalid character in identifier (<ipython-input-36-46db6b4ceedd>, line 5)

In [49]:
#test on safety dataset
theta_e=np.asarray([1.4950253 , 1.38426051,  -2.05568217, -0.69637852])
D_s=data.D_s[0]
test=HCOPE(D_c,theta_b,num_a,num_dims,order,n_Ds,0.01,b)
print(test.student_t_test(D_s,theta_e))

6.39792703206637
