# Satellite Environment with OpenAI Gym

In [12]:
from gym import Env
from gym.spaces import *
import numpy as np
import random
import matplotlib.pyplot as plt

In [13]:
"""
State of satellite
P: power of satellite
S1: memory in sensor 1
S2: memory in sensor 2
S3: memory in sensor 3
E: electronics of satellite
O: orbital motion of satellite
T: time of the day-> ionosphere behavior

State Space:
S = (P, S1, S2, S3, E, O, T)
P = {100, 99, 98, …, 3, 2, 1, 0}
S1 = {100, 99, 98, …, 3, 2, 1, 0}
S2 = {100, 99, 98, …, 3, 2, 1, 0}
S3 = {100, 99, 98, …, 3, 2, 1, 0}
E = {0, 1} not working / working
O = {0, 1} not in range / in range
T = {0, 1, 2, 3, 4} night, dawn, morning, afternoon, dusk

Action Space:
A = {transmit, not transmit} = {1, 0}

Step Function:
*If a1= transmit = 1, then:
    P(state=s) transitions to a lower value P(state=s’) ---> power discharges
    Ex: P=100 -> a1 (transmit) -> P=95
    S1(state=s) transitions to a lower value S1(state=s’) ---> free memory in sensor 1 decreases (data transferred)
    Ex: S1=100 -> a1 (transmit) -> S1=96.5

*If a1= not transmit = 0, then:
    P(state=s) transitions to a higher value P(state=s’) ---> power charges
    Ex: P=70 -> a1 (not transmit) -> P=73
    S1(state=s) remains the same S1(state=s’) ---> free memory in sensor 1 constant (no data transferred)
    Ex: S1=70 -> a1 (not transmit) -> S1=70

Reward:
*If a1=1 (transmit), then:    ---> For each transmission we get reward
    reward +=5 
*If P(state=s’)>=30, then:    ---> Rewards for maintaining power more than 30%
    reward +=2 
*If we transmit all the data within the time satellite is within range then:
    reward += 5*(remaining time for satellite in range)  --->More rewards for quicker transmission

Termination:
An action can occur every minute for an hour (when the satellite is within range) or until the sensor memory is full i.e. terminate if:
*S1(state=s) = 0 --->Transmission complete
*O(state=s) = 0  --->Satellite out of range

"""

class SatelliteEnv(Env):
    def __init__(self):
        # A = {transmit, not transmit} = {1, 0}
        self.action_space = Discrete(2)
        # S = (P, S1, S2, S3, E, O, T)
        #Note currently only using P,S1,O
        self.observation_space = Tuple((Box(0, 100, shape=(2,)),MultiDiscrete([2])))  # (P, S1, S2, S3, E, O, T)
        # Set start states
        self.state = np.zeros(3)
        for i in range(3): 
            if i<2:
                self.state[i] = 100
            elif i<6:
                self.state[i] = 1
        
        #Transmission Time 60 mins
        self.transmission_time = 60
        
    def step(self, action):
        #A = {transmit, not transmit} = {1, 0}
        #Update States

        if self.state[0]>=5: #no action 1 if P < 5 
            if action==1:
                self.state[0] += -5   #discharging
                self.state[1] += -3 #update free memory
        if action==0:
            self.state[0] += +3 #charging
            
        #Check States within bounds
        for i in range(2):
            if self.state[i]>100:
                self.state[i] = 100  
            elif self.state[i]< 0:
                self.state[i] = 0
             
        # Reduce transmission time by 1 minute
        self.transmission_time -= 1
        
        # Calculate reward
        reward = 0
        if self.state[0]>=30:
            reward += 2
        else: 
            reward += 0
        if action==1:
            reward += 5
        if self.state[1]==0:
            reward += 100
        
        # Check if transmission time is done
        if self.transmission_time <= 0:
            self.state[2] = 0
    
        if 0 in self.state[[1,2]]:
            done = True
        else:
            done = False
        
        # Set placeholder for info
        info = {}
        
        # Return step information
        return self.state, reward, done, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        # Reset states
        self.state = np.zeros(3)
        for i in range(3): 
            if i<2:
                self.state[i] = 100 #random.randint(90,100)
            else:
                self.state[i] = 1
        # Reset time
        self.transmission_time = 60 
        return self.state
    

In [14]:
env = SatelliteEnv()

In [15]:
env.observation_space.sample()

(array([86.32203, 88.84843], dtype=float32), array([0], dtype=int64))

In [22]:
episodes = 1
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    a=[]
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        a.append(action)
        score+=reward
    out_states = ','.join(map('{:6}'.format, n_state[[0,1,2]]))
    print('Episode:{0:3}   Score:{1:5,.1f}   Transmissions:{2:4}   Total Time:{3:4}   Final States(P,S,O):{4:5}'.format(episode,score,a.count(1),np.size(a), out_states))

Episode:  1   Score:266.0   Transmissions:  36   Total Time:  60   Final States(P,S,O):   4.0,   1.0,   0.0


# Value Iteration

### S_lookup function

In [23]:
P = np.arange(0,101,1)
S1 = np.append([0],np.arange(1,101,3))
O = np.arange(2)
A = np.arange(2)
state = np.array([P,S1,O])
N = np.size(state[0])*np.size(state[1])*np.size(state[2])
S_lookup = np.zeros([N,3])
i = 0

for l in range(np.size(state[0])):
    for m in range(np.size(state[1])):
        for n in range(np.size(state[2])):
            S_lookup[i] = [state[0][l],state[1][m],state[2][n]]
            i = i + 1
S_lookup = S_lookup.astype(int)
dims = S_lookup.max (0)+1

  state = np.array([P,S1,O])


### Populating matrices T(s, a, s') and R(s, a)

In [7]:
S = np.arange(0, 7070)
A = [0, 1]
T = np.zeros([7070, 2, 7070])  # s, a, s'
R = np.zeros([7070, 2])
gamma = 1
dims = S_lookup.max (0)+1

# Populate matrices
for s in S:
    p, s1, o = S_lookup[s]
    # action = 0 transition matrix
    pp = p + 3
    if pp > 100:
        pp = 100
    s1p = s1
    op = o
    n_state = np.array([pp, s1p, op])
    sp = np.where(np.in1d(np.ravel_multi_index(S_lookup.T,dims),np.ravel_multi_index(n_state.T,dims)))[0][0]
    T[s, 0, sp] += 1

    # action = 1 transition matrix
    if p >= 5:
        pp = p - 5
        s1p = s1 - 3
        if s1p < 0:
            s1p = 0
        op = o
        n_state = np.array([pp, s1p, op])
        sp = np.where(np.in1d(np.ravel_multi_index(S_lookup.T,dims),np.ravel_multi_index(n_state.T,dims)))[0][0]
        T[s, 1, sp] += 1
        
    ## Fill Reward matrices
    if p >= 30:
        R[s,0] += 2
        R[s,1] += 2
    R[s,1] += 5
    if s1 == 0:
        R[s,0] += 100
        R[s,1] += 100 


### Implementing value iteration

In [8]:
Q = np.zeros([7070, 2])
U = np.arange(0, 7070)
Policies = []
threshold = 0.5
#for i in range(100):  # when ready, change to while loop until convergence
for s in range(len(S)):
    for a in range(len(A)):
        Q[s, a] = R[s,a] + (gamma * sum([T[s, a, sp] * U[sp] for sp in S]))
    Uold = U
    U = np.max(Q, axis=1)
    if max(abs(Uold - U)) < threshold:
        print('stopped after ' + str(s) + ' loops.' )
        break
Policy = np.argmax(Q, axis=1)
        

### Testing Policy 

In [11]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    a=[]
    action = 1
    while not done:
        #env.render()
        #action = env.action_space.sample()
        a.append(action)
        n_state, reward, done, info = env.step(action)
        p, s1, o = n_state
        score+=reward
        n_state_3 = np.array([int(p), int(s1), int(o)]) 
        n_state_idx = np.where(np.in1d(np.ravel_multi_index(S_lookup.T,dims),np.ravel_multi_index(n_state_3.T,dims)))[0][0]
        action = Policy[n_state_idx]
    out_states = ','.join(map('{:6}'.format, n_state[[0,1,2]]))
    print('Episode:{0:3}   Score:{1:5,.1f}   Transmissions:{2:4}   Total Time:{3:4}   Final States(P,S,O):{4:5}'.format(episode, score, a.count(1), np.size(a), out_states))

Episode:  1   Score:328.0   Transmissions:  60   Total Time:  60   Final States(P,S,O):   0.0,  40.0,   0.0
Episode:  2   Score:328.0   Transmissions:  60   Total Time:  60   Final States(P,S,O):   0.0,  40.0,   0.0
Episode:  3   Score:328.0   Transmissions:  60   Total Time:  60   Final States(P,S,O):   0.0,  40.0,   0.0
Episode:  4   Score:328.0   Transmissions:  60   Total Time:  60   Final States(P,S,O):   0.0,  40.0,   0.0
Episode:  5   Score:328.0   Transmissions:  60   Total Time:  60   Final States(P,S,O):   0.0,  40.0,   0.0
