In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from rl687.policies.Linear_Softmax import LinearSoftmax
import csv
from sklearn.model_selection import train_test_split
import cma
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

In [None]:
class data_preprocess:
    """Load a CSV file of histories and cut it into contiguous batches.

    Every CSV row is kept as one record of strings. The first ``N`` records
    are divided into ``num_batches`` equally sized, contiguous batches;
    ``split_data`` then divides each batch into a candidate part (``D_c``)
    and a safety part (``D_s``).

    Attributes:
        path: Location of the CSV file that was read.
        num_batches: Number of batches requested.
        data: NumPy array holding every row of the file.
        N: Number of leading rows distributed over the batches.
        data_batches: List with one slice of ``data`` per batch.
        D_c: Per-batch first output of ``train_test_split`` (filled by
            ``split_data``).
        D_s: Per-batch second output of ``train_test_split`` (filled by
            ``split_data``).
    """

    def __init__(self,data_path,num_batches=5,max_rows=200000):
        """Read ``data_path`` and build ``num_batches`` contiguous batches.

        Args:
            data_path: Path of the CSV file to read.
            num_batches: How many batches to create.
            max_rows: Upper limit on the number of rows used for batching.
                The default keeps the cap of 200000 rows that was previously
                hard-coded.
        """
        self.path=data_path
        self.num_batches=num_batches
        with open(self.path,newline='',encoding='utf-8') as csv_file:
            rows=list(csv.reader(csv_file,delimiter=','))
        self.data=self._to_array(rows)
        # Never plan for more rows than the file holds; otherwise the first
        # batch swallows a short file and the remaining batches are empty.
        self.N=min(max_rows,len(self.data))
        batch_size=self.N//num_batches
        self.data_batches=[
            self.data[k*batch_size:(k+1)*batch_size] for k in range(num_batches)
        ]
        self.D_s=[]
        self.D_c=[]

    @staticmethod
    def _to_array(rows):
        """Convert the parsed rows to an array, tolerating ragged rows."""
        try:
            return np.asarray(rows)
        except ValueError:
            # Rows of different lengths cannot form a rectangular array on
            # current NumPy; keep them as a 1-D array of row lists instead.
            ragged=np.empty(len(rows),dtype=object)
            for k,row in enumerate(rows):
                ragged[k]=row
            return ragged

    def split_data(self,split,random_state=None):
        """Split every batch with ``train_test_split``.

        The previous results are discarded first, so re-running the cell does
        not pile up duplicate entries in ``D_c`` / ``D_s``.

        Args:
            split: Forwarded as ``test_size``; the "test" part becomes the
                safety set ``D_s`` and the rest the candidate set ``D_c``.
            random_state: Optional seed forwarded to ``train_test_split`` to
                make the split reproducible.
        """
        self.D_c=[]
        self.D_s=[]
        for batch in self.data_batches:
            D_c, D_s = train_test_split(batch,test_size=split,random_state=random_state)
            self.D_c.append(D_c)
            self.D_s.append(D_s)


        
    

In [None]:
class HCOPE():
    """Off-policy evaluation of softmax policies from logged histories.

    A history ``H`` is a flat sequence ``s0, a0, r0, s1, a1, r1, ...``. The
    class estimates the return of an evaluation policy from histories that
    were generated by the behaviour policy ``theta_b`` and derives a
    Student-t lower bound on that estimate.
    """

    def __init__(self,D_c,theta_b,num_actions,num_features,order,n_Ds,delta,lower_bound):
        """Store the settings and build the behaviour policy.

        Args:
            D_c: Histories used by ``optimise_PDIS``.
            theta_b: Parameters of the behaviour policy.
            num_actions: Forwarded to ``LinearSoftmax``.
            num_features: Forwarded to ``LinearSoftmax`` as the number of
                state features.
            order: Forwarded to ``LinearSoftmax``.
            n_Ds: Sample size used for the bound in ``optimise_PDIS``.
            delta: Tail probability of the Student-t quantile.
            lower_bound: Threshold the predicted bound has to reach in
                ``optimise_PDIS``.
        """
        self.D_c=D_c
        self.theta_b=theta_b
        self.num_actions=num_actions
        self.num_state_features=num_features
        self.order=order
        self.Ds_size=n_Ds
        self.delta=delta
        self.b=lower_bound
        # 0.001 and sigma=1 are LinearSoftmax settings; their meaning is
        # defined in rl687 and is not visible from this notebook.
        self.pb=LinearSoftmax(self.num_actions,self.order, self.num_state_features,0.001,self.theta_b,sigma=1)

    def PDIS(self,H,pe):
        """Return the per-decision importance-sampled return of one history.

        Computes ``sum_t r_t * prod_{j<=t} pe(s_j, a_j) / pb(s_j, a_j)``.

        Args:
            H: Flat sequence of (state, action, reward) triples; entries may
                be strings as read from CSV.
            pe: Evaluation policy exposing ``policy(s, a)``.
        """
        prev_prod=1
        pdis=0
        for i in range(len(H)//3):
            # ``np.float`` was removed in NumPy 1.24; it was only an alias of
            # the builtin ``float``, so the parsed values are unchanged.
            s=float(H[i*3])
            a=int(H[i*3+1])
            r=float(H[i*3+2])
            # Running product of the importance ratios up to step i.
            prod=(float(pe.policy(s,a))/float(self.pb.policy(s,a)))*prev_prod
            pdis+=r*prod
            prev_prod=prod
        return pdis

    def thetaTopolicy(self,theta):
        """Build a ``LinearSoftmax`` policy with parameters ``theta``."""
        p=LinearSoftmax(self.num_actions,self.order, self.num_state_features,0.001,theta,sigma=1)
        return p

    def estimate_J(self,data,theta_e):
        """Estimate the return of ``theta_e`` on ``data``.

        Args:
            data: Sequence of histories.
            theta_e: Parameters of the evaluation policy.

        Returns:
            Tuple ``(J, pdis_list)``: the mean of the per-history PDIS values
            and the array of those values.
        """
        policy_e=self.thetaTopolicy(theta_e)
        values=[self.PDIS(H,policy_e) for H in data]
        # Plain left-to-right summation keeps the floating point result the
        # same as accumulating inside the loop.
        J=sum(values)/len(data)
        return J,np.array(values)

    def ssd(self,J,pdis_list):
        """Return the sample standard deviation (n-1 denominator) around ``J``."""
        n=len(pdis_list)
        return ((np.sum((pdis_list-J)**2)/(n-1))**(0.5))

    def student_t_test(self,D,theta_e):
        """Return ``(J, bound)`` for ``theta_e`` evaluated on ``D``.

        ``bound`` is ``J - std/sqrt(n) * t_{1-delta, n-1}`` with ``n`` the
        number of histories in ``D``.
        """
        J,PDIS_list=self.estimate_J(D,theta_e)
        std=self.ssd(J,PDIS_list)
        n=len(PDIS_list)
        return (J,(J-((std/n**(0.5))*stats.t.ppf(1-self.delta,n-1))))

    def optimise_PDIS(self,theta_e):
        """Objective for the optimiser (to be minimised).

        Estimates ``J`` on ``self.D_c`` and predicts a bound using twice the
        Student-t margin computed with ``self.Ds_size`` samples.

        Returns:
            ``100000`` when the predicted bound is below ``self.b``,
            otherwise ``-J``.
        """
        J,PDIS_list=self.estimate_J(self.D_c,theta_e)
        std=self.ssd(J,PDIS_list)
        g=J-2*((std/self.Ds_size**(0.5))*stats.t.ppf(1-self.delta,self.Ds_size-1))
        if g<self.b:
            return 100000
        else: return -J
           

In [None]:
def CMAES_policy(D_c, theta_e,theta_b,num_actions,num_features,order,n_Ds,delta,b):
    """Search for policy parameters with CMA-ES.

    The search starts at ``theta_b`` with step size 1 and box bounds
    ``[-2, 2]`` and minimises ``HCOPE.optimise_PDIS`` on ``D_c``. After every
    generation whose best objective value so far is below -9, the best
    parameter vector found so far is printed and recorded.

    Args:
        D_c: Histories the objective is evaluated on.
        theta_e: Not used by this function.
        theta_b: Behaviour policy parameters; also the search start point.
        num_actions, num_features, order, n_Ds, delta, b: Forwarded to
            ``HCOPE``.

    Returns:
        Tuple ``(policies, es)``: the recorded parameter vectors and the
        ``CMAEvolutionStrategy`` instance.
    """
    policies=[]
    objective = HCOPE(D_c,theta_b,num_actions,num_features,order,n_Ds,delta,b)
    generation=0
    es = cma.CMAEvolutionStrategy(theta_b, 1,{'bounds': [-2, 2]})
    while (not es.stop()):
        # Sample 20 candidates, score them, and feed the scores back.
        candidates = es.ask(20)
        losses = [objective.optimise_PDIS(theta) for theta in candidates]
        es.tell(candidates, losses)
        generation+=1
        print(generation)
        # result[1] is the best objective value so far, result[0] its solution.
        if es.result[1]<-9:
            best_theta=es.result[0]
            print(best_theta)
            policies.append(best_theta)
    return policies,es

In [5]:
# Problem dimensions forwarded to HCOPE: number of actions, number of
# state features, and the order setting.
num_a, num_dims, order = 2, 1, 1

# Parameter vectors of the behaviour policy and of an evaluation policy.
theta_b = np.array([0.01, -0.01, 1, 1])
theta_e = np.array([1, 1, 0.01, -0.01])

# Passed to HCOPE as ``delta`` and ``lower_bound`` respectively.
delta, b = 0.005, 1.03

In [6]:
# Read data.csv into 4 batches, then split every batch with test_size=0.3
# (the "test" part of each batch is stored in data.D_s, the rest in data.D_c).
data = data_preprocess('data.csv', num_batches=4)
data.split_data(split=0.3)

In [None]:
# Run the CMA-ES search on the selected batch (index 3 only) and collect
# both the recorded candidate policies and the optimiser's best solution.
policies, best_policies = [], []
for i in range(3,4):
    D_c, D_s = data.D_c[i], data.D_s[i]
    n_Ds = len(D_s)
    found, es = CMAES_policy(D_c, theta_e,theta_b,num_a,num_dims,order,n_Ds,delta,b)
    policies.extend(found)
    best_policies.append(es.result[0])

In [17]:
# Safety test: keep only the candidate policies whose Student-t lower bound
# exceeds ``lower_bound`` and whose estimated return J exceeds 11.
# NOTE(review): the bound is evaluated on ``data.data`` (every loaded row),
# not on a held-out split from ``data.D_s`` -- confirm this is intended.
n_Ds=len(data.D_s[0])
policies=[[9.80061, -5.59132, -7.88554, -6.54961],[1.409911 , 0.12306 ,-1.40863 ,-0.12423],[5.48074, 9.9999, -6.461319, -9.9999], [21.95001, 14.88005, -2.09167, -20.34627], [10.75273, 5.06190, -4.29105, -15.63130], [1.22538825, -0.01030995, -1.99951519, 1.21804982], [1.22538825, -0.01030995, -1.99951519, 1.21804982], [1.22539, -0.01031, -1.99951, 1.21804], [1.409911, 0.12306, -1.40863, -0.123835], [0.9335642, 1.15941849, -1.9291111, 1.93718108], [0.9335642, 1.15941849, -1.9291111, 1.93718108]]
lower_bound=2*1.03
x=[0]*len(policies)
remove_indices=[]
test=HCOPE(data.data,theta_b,num_a,num_dims,order,n_Ds,0.001,b)
for i in range(len(policies)):
    theta_e=np.asarray(policies[i])
    J,score=test.student_t_test(data.data,theta_e)
    print(J)
    if score>lower_bound and J>11:
        x[i]=score
    else:
        remove_indices.append(i)

# Deleting by index inside a forward loop shifts every later index, which
# removes the wrong entries (or raises IndexError). Rebuild both lists from
# the original indices instead.
rejected=set(remove_indices)
x=[s for k,s in enumerate(x) if k not in rejected]
policies=[cand for k,cand in enumerate(policies) if k not in rejected]

print(list(sorted(zip(x,policies))))

# Surviving policies ordered from the highest to the lowest bound.
p=[k for _,k in sorted(zip(x,policies),reverse=True)]
print(p)


14.663721916730209
13.185056183720905
14.73550637601263
14.761009332069476
14.726845064785643
13.330839941004339
13.330839941004339
13.33083904295923
13.184987720226935
12.995315819539037
12.995315819539037
[(11.324171161804838, [0.9335642, 1.15941849, -1.9291111, 1.93718108]), (11.324171161804838, [0.9335642, 1.15941849, -1.9291111, 1.93718108]), (11.336572189261018, [1.409911, 0.12306, -1.40863, -0.123835]), (11.336585302853353, [1.409911, 0.12306, -1.40863, -0.12423]), (11.579032823181072, [1.22539, -0.01031, -1.99951, 1.21804]), (11.579034518408122, [1.22538825, -0.01030995, -1.99951519, 1.21804982]), (11.579034518408122, [1.22538825, -0.01030995, -1.99951519, 1.21804982]), (12.260721837502599, [9.80061, -5.59132, -7.88554, -6.54961]), (12.350342740280603, [10.75273, 5.0619, -4.29105, -15.6313]), (12.383541637971714, [21.95001, 14.88005, -2.09167, -20.34627]), (12.388158717930644, [5.48074, 9.9999, -6.461319, -9.9999])]
[[5.48074, 9.9999, -6.461319, -9.9999], [21.95001, 14.88005, -

In [16]:
p=[[9.80061, -5.59132, -7.88554, -6.54961],[1.409911 , 0.12306 ,-1.40863 ,-0.12423],[5.48074, 9.9999, -6.461319, -9.9999], [21.95001, 14.88005, -2.09167, -20.34627], [10.75273, 5.06190, -4.29105, -15.63130], [1.22538825, -0.01030995, -1.99951519, 1.21804982], [1.22538825, -0.01030995, -1.99951519, 1.21804982], [1.22539, -0.01031, -1.99951, 1.21804], [1.409911, 0.12306, -1.40863, -0.123835], [0.9335642, 1.15941849, -1.9291111, 1.93718108], [0.9335642, 1.15941849, -1.9291111, 1.93718108]]

# Write one parameter vector per file, 1.csv ... 100.csv, cycling through
# ``p`` when it holds fewer than 100 entries.
if not p:
    # The nested while/for loop used here before never terminated for an
    # empty list.
    raise ValueError("p must contain at least one policy")
for j in range(1,101):
    # newline='' lets the csv module control line endings; without it,
    # Windows writes '\r\r\n' and the files show blank rows.
    with open(str(j)+'.csv', "w", newline='') as csvfile:
        csvwriter = csv.writer(csvfile,  delimiter=',')
        csvwriter.writerow(list(p[(j-1)%len(p)]))

