In [1]:
import os
from operator import itemgetter

from IPython.display import display, clear_output

import boto3
import hcl2

In [2]:
from helper import wait_until

# Define status predicates

In [3]:
def is_volume_available(response: dict) -> bool:
    """Tell whether the first volume listed in ``response`` is available.

    The state that was read is also shown in the cell output, replacing the
    previously displayed value.

    Args:
        response: Mapping with a non-empty ``'Volumes'`` list whose first
            entry has a ``'State'`` string (the notebook passes the result
            of ``ec2_client.describe_volumes`` through ``wait_until``).

    Returns:
        True when the state equals ``'available'``, ignoring case.
    """
    current_state = response['Volumes'][0]['State']
    # Replace the previous status output rather than appending to it.
    clear_output(wait=True)
    display(current_state)
    is_ready = current_state.lower() == 'available'
    return is_ready


In [4]:
def is_cluster_active(response: dict) -> bool:
    """Tell whether the cluster described by ``response`` is active.

    The status that was read is also shown in the cell output, replacing the
    previously displayed value.

    Args:
        response: Mapping with a ``'cluster'`` entry that has a ``'status'``
            string (presumably an EKS ``describe_cluster`` result — confirm
            against the caller).

    Returns:
        True only when the status is exactly ``'ACTIVE'``.
    """
    cluster_status = response['cluster']['status']
    # Replace the previous status output rather than appending to it.
    clear_output(wait=True)
    display(cluster_status)
    is_active = cluster_status == 'ACTIVE'
    return is_active


In [5]:
def is_node_group_active(response: dict) -> bool:
    """Tell whether the node group in ``response`` has reached a final status.

    The status that was read is also shown in the cell output, replacing the
    previously displayed value.

    Args:
        response: Mapping with a ``'nodegroup'`` entry that has a
            ``'status'`` string (presumably an EKS ``describe_nodegroup``
            result — confirm against the caller).

    Returns:
        True when the status is ``'ACTIVE'`` or ``'CREATE_FAILED'``.
        NOTE(review): a failed creation also returns True, presumably so a
        polling loop stops instead of waiting forever — confirm that callers
        check the actual status afterwards.
    """
    group_status = response['nodegroup']['status']
    # Replace the previous status output rather than appending to it.
    clear_output(wait=True)
    display(group_status)
    finished = group_status in ('ACTIVE', 'CREATE_FAILED')
    return finished


# Load configuration and create AWS clients

In [6]:
# Folder that holds the Terraform configuration used by this notebook.
# Set the TERRAFORM_FOLDER environment variable to point somewhere else;
# when it is unset or empty the original hard-coded location is used.
TERRAFORM_FOLDER = os.environ.get('TERRAFORM_FOLDER') or '/terraform/eberron-agent'

In [7]:
# Parse provider.tf and variables.tf and merge them into one mapping; on a
# key clash the value from variables.tf (loaded last) wins.
terraform = {}
for tf_name in ("provider.tf", "variables.tf"):
    with open(os.path.join(TERRAFORM_FOLDER, tf_name), "r") as tf_file:
        terraform = terraform | hcl2.load(tf_file)

# Pull the defaults this notebook needs out of the variable declarations.
# Assumes every entry of terraform['variable'] is a mapping keyed by the
# variable name — TODO confirm against the hcl2 output.
for variable_block in terraform['variable']:
    if 'region' in variable_block:
        REGION = variable_block['region']['default']
    elif 'cluster_name' in variable_block:
        CLUSTER_NAME = variable_block['cluster_name']['default']
    elif 'purpose' in variable_block:
        PURPOSE = variable_block['purpose']['default']

# Tags applied to volumes created further down in the notebook.
TAGS = {'cluster': CLUSTER_NAME, 'purpose': PURPOSE}

# Filter used to find existing volumes and snapshots.
# NOTE(review): the purpose values are literals while new volumes are tagged
# with PURPOSE — confirm PURPOSE is one of these values, otherwise a volume
# created by this notebook would not be found on the next run.
VOLUME_FILTERS = [
    {
        'Name': 'tag:purpose',
        'Values': ['kubyterlab-llm', 'llm'],
    },
]

In [8]:
# One boto3 session bound to the region read from the Terraform variables;
# the EKS and EC2 clients are both created from it.
session = boto3.Session(region_name=REGION)
eks_client, ec2_client = (session.client(service) for service in ('eks', 'ec2'))

# Create or Restore Volume

In [9]:
# Look for volumes that carry one of the purpose tags in VOLUME_FILTERS.
# NOTE(review): boto3 normally reports API failures as botocore ClientError,
# which this RuntimeError handler would presumably not catch — confirm
# whether the fallback to an empty list is ever taken.
try:
    response = ec2_client.describe_volumes(Filters=VOLUME_FILTERS)
    volumes = response.get('Volumes', [])
except RuntimeError:
    volumes = list()
volume_ids = list(map(itemgetter('VolumeId'), volumes))
volume_ids

['vol-0270b18622c0ec3de']

In [10]:
# Choose the volume to reuse. When several volumes match the tag filter, use
# the most recently created one (as the earlier TODO asked for) instead of
# failing on an assertion; zero or one match behaves exactly as before.
if volumes:
    if len(volumes) > 1:
        print(f'{len(volumes)} volumes match the filter; using the most recently created one.')
    # 'CreateTime' is the creation timestamp EC2 returns for each volume.
    latest_volume = max(volumes, key=itemgetter('CreateTime'))
    volume_id = latest_volume['VolumeId']
    # Remember the zone of the chosen volume, taken from the same record.
    availability_zone = latest_volume['AvailabilityZone']
else:
    # Nothing to reuse; later cells restore from a snapshot or create a volume.
    volume_id = None
volume_id

'vol-0270b18622c0ec3de'

In [11]:
# Bind both names up front so later cells can test them even when the lookup
# is skipped or finds nothing; previously they stayed undefined in those
# cases and the following cells raised NameError.
snapshots = []
snapshot_id = None
if not volume_id:
    # No reusable volume: look for snapshots carrying the same purpose tags.
    response = ec2_client.describe_snapshots(Filters=VOLUME_FILTERS)
    snapshots = response.get('Snapshots', [])
    if snapshots:
        # Newest first, judged by the time each snapshot was started.
        sorted_snapshots = sorted(snapshots, key=itemgetter('StartTime'), reverse=True)
        snapshot_id = sorted_snapshots[0]['SnapshotId']
        print(snapshot_id)

In [12]:
# Stop here unless there is an existing volume or a snapshot to restore from.
# snapshot_id is only evaluated when volume_id is falsy (short-circuit `or`).
# NOTE(review): the next cell has a branch that creates a blank volume when
# there are no snapshots; this assertion prevents ever reaching it — confirm
# which of the two behaviours is intended.
assert volume_id or snapshot_id

In [13]:
if not volume_id:
    # No reusable volume: create one in the first zone the region reports.
    response = ec2_client.describe_availability_zones()
    availability_zones = response['AvailabilityZones']
    availability_zone = availability_zones[0]['ZoneName']

    # Arguments shared by both creation paths.
    volume_request = {
        'AvailabilityZone': availability_zone,
        'VolumeType': 'gp3',
        'TagSpecifications': [
            {
                'ResourceType': 'volume',
                'Tags': [{'Key': key, 'Value': value} for key, value in TAGS.items()],
            },
        ],
    }
    if snapshots:
        # Restore from the snapshot selected in the earlier cell.
        volume_request['SnapshotId'] = snapshot_id
    else:
        # Blank volume of a fixed size.
        # NOTE(review): EBS_VOLUME_SIZE is not assigned in any cell visible
        # in this notebook — confirm where it is supposed to come from.
        volume_request['Size'] = EBS_VOLUME_SIZE
    response = ec2_client.create_volume(**volume_request)

    # Block until the new volume reports the 'available' state, then record it.
    new_volume_id = response['VolumeId']
    wait_until(ec2_client.describe_volumes, {'VolumeIds': [new_volume_id]}, is_volume_available)
    volume_id = new_volume_id
volume_id, availability_zone

('vol-0270b18622c0ec3de', 'ca-central-1a')

# Create Cluster With Terraform

In [14]:
%%time
# Run `terraform init -upgrade` against the Terraform folder; %%time reports the duration.
!terraform -chdir=$TERRAFORM_FOLDER init -upgrade

[0m[1mInitializing the backend...[0m
[0m[1mUpgrading modules...[0m
Downloading registry.terraform.io/terraform-aws-modules/eks/aws 20.33.1 for eks...
- eks in .terraform/modules/eks
- eks.eks_managed_node_group in .terraform/modules/eks/modules/eks-managed-node-group
- eks.eks_managed_node_group.user_data in .terraform/modules/eks/modules/_user_data
- eks.fargate_profile in .terraform/modules/eks/modules/fargate-profile
Downloading registry.terraform.io/terraform-aws-modules/kms/aws 2.1.0 for eks.kms...
- eks.kms in .terraform/modules/eks.kms
- eks.self_managed_node_group in .terraform/modules/eks/modules/self-managed-node-group
- eks.self_managed_node_group.user_data in .terraform/modules/eks/modules/_user_data
Downloading registry.terraform.io/terraform-aws-modules/vpc/aws 5.18.1 for vpc...
- vpc in .terraform/modules/vpc
[0m[1mInitializing provider plugins...[0m
- Finding hashicorp/null versions matching ">= 3.0.0"...
- Finding hashicorp/kubernetes versions matching "~> 2.3

In [15]:
%%time
# Check that the configuration in the Terraform folder is valid before planning.
!terraform -chdir=$TERRAFORM_FOLDER validate

[32m[1mSuccess![0m The configuration is valid.
[0m
CPU times: user 193 ms, sys: 33.1 ms, total: 226 ms
Wall time: 17.5 s


In [16]:
%%time
# Compute the execution plan and save it as ./plan.out for the apply cell.
!terraform -chdir=$TERRAFORM_FOLDER plan -out=./plan.out

[0m[1mdata.aws_availability_zones.available: Reading...[0m[0m
[0m[1mmodule.eks.data.aws_partition.current[0]: Reading...[0m[0m
[0m[1mmodule.eks.module.kms.data.aws_partition.current[0]: Reading...[0m[0m
[0m[1mmodule.eks.module.kms.data.aws_caller_identity.current[0]: Reading...[0m[0m
[0m[1mmodule.eks.data.aws_caller_identity.current[0]: Reading...[0m[0m
[0m[1mmodule.eks.data.aws_iam_policy_document.assume_role_policy[0]: Reading...[0m[0m
[0m[1mmodule.eks.data.aws_partition.current[0]: Read complete after 0s [id=aws][0m
[0m[1mmodule.eks.data.aws_iam_policy_document.assume_role_policy[0]: Read complete after 0s [id=2830595799][0m
[0m[1mmodule.eks.module.kms.data.aws_partition.current[0]: Read complete after 0s [id=aws][0m
[0m[1mmodule.eks.data.aws_iam_policy_document.custom[0]: Reading...[0m[0m
[0m[1mmodule.eks.data.aws_iam_policy_document.custom[0]: Read complete after 0s [id=513122117][0m
[0m[1mmodule.eks.module.kms.data.aws_caller_identity.cur

In [17]:
%%time
# Apply the saved plan (./plan.out) without an interactive approval prompt.
!terraform -chdir=$TERRAFORM_FOLDER apply  -auto-approve ./plan.out

[0m[1mmodule.eks.aws_iam_policy.custom[0]: Creating...[0m[0m
[0m[1mmodule.eks.aws_cloudwatch_log_group.this[0]: Creating...[0m[0m
[0m[1maws_iam_role.eks_cluster: Creating...[0m[0m
[0m[1mmodule.eks.aws_iam_role.this[0]: Creating...[0m[0m
[0m[1mmodule.vpc.aws_vpc.this[0]: Creating...[0m[0m
[0m[1maws_iam_role.eks_nodes: Creating...[0m[0m
[0m[1mmodule.eks.aws_cloudwatch_log_group.this[0]: Creation complete after 1s [id=/aws/eks/eberron-agent/cluster][0m
[0m[1mmodule.eks.aws_iam_role.this[0]: Creation complete after 1s [id=eberron-agent-cluster-20250131153258470700000002][0m
[0m[1mmodule.eks.aws_iam_policy.custom[0]: Creation complete after 1s [id=arn:aws:iam::275678099358:policy/eberron-agent-cluster-20250131153258457100000001][0m
[0m[1maws_iam_role.eks_cluster: Creation complete after 1s [id=eberron-agent-eks-cluster][0m
[0m[1maws_iam_role.eks_nodes: Creation complete after 1s [id=EKS_Worker_Node_Role][0m
[0m[1mmodule.eks.aws_iam_role_policy_attach

In [18]:
# Fetch the cluster description and rebuild its OIDC provider URL from the
# last path segment of the issuer that EKS reports.
cluster_response = eks_client.describe_cluster(name=CLUSTER_NAME)
issuer = cluster_response['cluster']['identity']['oidc']['issuer']
oidc_id = issuer.rsplit('/', 1)[-1]
oidc_url = f'https://oidc.eks.{REGION}.amazonaws.com/id/{oidc_id}'
oidc_url

'https://oidc.eks.ca-central-1.amazonaws.com/id/638FE75EEC473044A3FDB37E95EC4B47'

In [19]:
# Show which AWS principal the CLI is currently authenticated as.
!aws sts get-caller-identity

{
    "UserId": "275678099358",
    "Account": "275678099358",
    "Arn": "arn:aws:iam::275678099358:root"
}


In [42]:
# List the managed policies attached to an IAM user.
# NOTE(review): the value given as --user-name is the account ID reported by
# get-caller-identity (whose Arn is the root principal), and the recorded
# output is a NoSuchEntity error — confirm whether this cell is still needed.
!aws iam list-attached-user-policies --user-name 275678099358


An error occurred (NoSuchEntity) when calling the ListAttachedUserPolicies operation: The user with name 275678099358 cannot be found.


In [40]:
# Print only the API server endpoint of the cluster.
!aws eks describe-cluster --name $CLUSTER_NAME --query "cluster.endpoint"


"https://EA632662E52405E6EB99992C92BF36FD.gr7.ca-central-1.eks.amazonaws.com"


In [None]:
# Your current IAM principal doesn’t have access to Kubernetes objects on this cluster.
# This may be due to the current user or role not having Kubernetes RBAC permissions to describe cluster resources or not having an entry in the cluster’s auth config map.

# NodeCreationFailure                          Instances failed to join the kubernetes cluster

In [37]:
# Dump the aws-auth ConfigMap from the kube-system namespace as YAML.
# The recorded run timed out connecting to the cluster endpoint and was interrupted.
!kubectl get configmap -n kube-system aws-auth -o yaml

E0130 22:54:26.673421    2719 memcache.go:265] couldn't get current server API group list: Get "https://EA632662E52405E6EB99992C92BF36FD.gr7.ca-central-1.eks.amazonaws.com/api?timeout=32s": dial tcp 10.0.2.34:443: i/o timeout
E0130 22:54:56.672465    2719 memcache.go:265] couldn't get current server API group list: Get "https://EA632662E52405E6EB99992C92BF36FD.gr7.ca-central-1.eks.amazonaws.com/api?timeout=32s": dial tcp 10.0.2.34:443: i/o timeout
^C
