In [37]:
import gym
import numpy as np

def feature_extractor(state):
    """
    Simple feature extractor for state representation.

    Parameters:
    - state: Current state from the environment.

    Returns:
    - features: Extracted features from the state.
    """
    # For simplicity, use the state itself as features
    return np.array(state)

def linear_function_approximation(weights, features):

    return np.dot(weights, features)

def td_prediction(env, num_episodes, alpha, gamma):
    """
    Temporal Difference (TD) prediction for state values using linear function approximation.

    Parameters:
    - env: Gym environment.
    - num_episodes: Number of episodes for training.
    - alpha: Learning rate.
    - gamma: Discount factor.

    Returns:
    - weights: Learned weights for the linear approximation.
    """
    num_features = len(feature_extractor(env.reset()))
    weights = np.zeros((num_features,num_features))

    for episode in range(num_episodes):
        state = env.reset()
        features = feature_extractor(state)[0]
        features = np.array(features).reshape(2,2)
        while True:
            # Choose an action using an exploration-exploitation strategy (e.g., epsilon-greedy)
            action = env.action_space.sample()  # Random action for illustration

            # Take the chosen action and observe the next state and reward
            next_state, reward, done, a , b = env.step(action)
            next_features = feature_extractor(next_state)
            next_features = np.array(next_features).reshape(2,2)
            # TD error calculation
            td_error = reward + gamma * linear_function_approximation(weights, next_features) - \
                       linear_function_approximation(weights, features)

            # Update weights based on the TD error and features
            #print(alpha * td_error * features)
            weights += alpha * td_error * features
            if done:
                break

            # Move to the next state
            state = next_state
            features = next_features

    return weights

# Example usage:
env = gym.make('CartPole-v1')
num_episodes = 1000
alpha = 0.01
gamma = 0.99

learned_weights = td_prediction(env, num_episodes, alpha, gamma)

# Test the learned weights
test_state = env.reset()
test_features = feature_extractor(test_state)
estimated_value = linear_function_approximation(learned_weights, np.array(test_features)[0].reshape(2,2))
print("Estimated Value for the Test State:", estimated_value)


  return np.array(state)


Estimated Value for the Test State: [[-0.0730509   0.03693237]
 [-0.40206735  0.21885557]]
