# Import

In [1]:
import os
from time import sleep
from time import time as unixtime
from typing import Callable, List
import random
from math import ceil
import yaml
import string
import json
from urllib.parse import urlparse
from time import sleep
import re
from operator import itemgetter
import subprocess
from tempfile import NamedTemporaryFile

from dotenv import load_dotenv
from IPython.display import display, clear_output

import boto3
import kubernetes
from kubernetes.client.rest import ApiException

from pyhelm3 import Client as HelmClient


In [2]:
load_dotenv('/.env')

True

In [3]:
from helper import wait_until
from helper.ec2 import (get_vpcs_ids, get_internet_gateway_ids_attached_to_vpc, 
                        get_route_table_ids_for_vpc, route_to_gateway_exists, 
                        get_subnet_ids_in_vpc, get_security_group_ids)
from helper.k8s import (get_one_running_pod, get_jupyter_token_from_pod)

# Define

In [4]:
REGION = 'ca-central-1'
CLUSTER_NAME = 'kubyterlab-llm'
EBS_VOLUME_SIZE = 500 # GiB
TAGS = {'cluster': CLUSTER_NAME, 'purpose': 'llm'}  # Do not change the keys, they are hardcoded throughout.
CLUSTER_TAGS = {'cluster': CLUSTER_NAME}
VOLUME_FILTERS = [
    {'Name': f'tag:purpose', 
     'Values': ['kubyterlab-llm', 'llm']}]
K8S_VERSION = os.environ['K8S_VERSION']  # '1.30'
K8S_VERSION = '.'.join(K8S_VERSION.split('.')[:2]) if len(K8S_VERSION.split('.')) > 2 else K8S_VERSION
INSTANCE_TYPES = {'gpu': ['g4dn.2xlarge'], 'default': ['t3.medium']}
ALLOWED_PORTS = [80, 443, 22]

In [5]:
def check_all_vpcs_available(response: dict) -> True:
    if not 'Vpcs' in response:
        raise ValueError
    return all([vpc.get('State', '') == 'available' for vpc in response['Vpcs']])

In [6]:
def is_cluster_active(response: dict) -> bool:
    status = response['cluster']['status']
    clear_output(wait=True)
    display(status)
    return status == 'ACTIVE'


In [7]:
def is_node_group_active(response: dict) -> bool:
    status = response['nodegroup']['status']
    clear_output(wait=True)
    display(status)
    return status in ['ACTIVE', 'CREATE_FAILED']


In [8]:
def is_snapshot_completed(response: dict) -> bool:
    state = response['Snapshots'][0]['State']
    clear_output(wait=True)
    display(state)
    return state.lower() == 'completed'


In [9]:
def is_volume_available(response: dict) -> bool:
    state = response['Volumes'][0]['State']
    clear_output(wait=True)
    display(state)
    return state.lower() == 'available'


# Instantiate

In [10]:
session = boto3.Session(region_name=REGION)
eks_client = session.client('eks')
ec2_client = session.client('ec2')
iam_client = session.client('iam')

aws_account_id = boto3.client('sts').get_caller_identity().get('Account')

# Create or Restore Volume

In [12]:
try:
    response = ec2_client.describe_volumes(Filters=VOLUME_FILTERS)
    volumes = response.get('Volumes', [])
except RuntimeError:
    volumes = []
volume_ids = [volume['VolumeId'] for volume in volumes]
volume_ids

['vol-021e0e2f73fe34170']

In [13]:
assert len(volume_ids) <= 1  # TODO: Get the latest one if more than one.
if volume_ids:
    volume_id = volume_ids[0]
    availability_zone = volumes[0]['AvailabilityZone']
else:
    volume_id = None

In [14]:
if not volume_id:
    response = ec2_client.describe_snapshots(Filters=VOLUME_FILTERS)
    snapshots = response.get('Snapshots', [])
    if snapshots:
        sorted_snapshots = sorted(snapshots, key=itemgetter('StartTime'), reverse=True)
        snapshot_id = sorted_snapshots[0]['SnapshotId']
        print(snapshot_id)

In [15]:
if not volume_id:
    response = ec2_client.describe_availability_zones()
    availability_zones = response['AvailabilityZones']
    availability_zone = availability_zones[0]['ZoneName']
    # availability_zone = f'{REGION}a'
    if snapshots:
        # TODO: Change this to the latest snapshot!!
        response = ec2_client.create_volume(
            SnapshotId=snapshot_id,
            Size=EBS_VOLUME_SIZE,
            AvailabilityZone=availability_zone,
            VolumeType='gp3',
            TagSpecifications=[
                {
                    'ResourceType': 'volume',
                    'Tags': [{'Key': 'purpose', 'Value': 'llm'}]
                }
            ]
        )
    else:
        response = ec2_client.create_volume(
            Size=EBS_VOLUME_SIZE,
            AvailabilityZone=availability_zones[0]['ZoneName'],
            VolumeType='gp3',
            TagSpecifications=[
                {
                    'ResourceType': 'volume',
                    'Tags': [{'Key': 'purpose', 'Value': 'llm'}]
                }
            ]
        )
    wait_until(ec2_client.describe_volumes, {'VolumeIds': [response['VolumeId']]}, is_volume_available)
    volume_id = response['VolumeId']
volume_id, availability_zone

('vol-021e0e2f73fe34170', 'ca-central-1a')

In [16]:
# TODO: Change the parts below to use Terraform

# Create VPC

In [17]:
vpc_ids = get_vpcs_ids(ec2_client, TAGS)
vpc_exists = len(vpc_ids) > 0

if len(vpc_ids) > 1:
    raise RuntimeError
elif vpc_exists:
    vpc_id = vpc_ids[0]

# Create VPC
if not vpc_exists:
    print('Creating VPC...')
    vpc_response = ec2_client.create_vpc(
        CidrBlock='10.0.0.0/16',
        TagSpecifications=[
            {
                'ResourceType': 'vpc',
                'Tags': [{'Key': tag, 'Value': TAGS[tag] }for tag in TAGS]
            }
        ]
    )
    vpc_id = vpc_response['Vpc']['VpcId']
wait_until(ec2_client.describe_vpcs, {'VpcIds': [vpc_id]}, check_all_vpcs_available)

# Create Internet Gateway
igw_ids = get_internet_gateway_ids_attached_to_vpc(ec2_client, vpc_id)
igw_exists = len(igw_ids) > 0

if len(igw_ids) > 1:
    raise RuntimeError
elif igw_exists:
    igw_id = igw_ids[0]


if not igw_exists:
    print('Creating Internet Gateway...')
    # Create an Internet Gateway
    igw_response = ec2_client.create_internet_gateway()
    igw_id = igw_response['InternetGateway']['InternetGatewayId']
    
    # Attach Internet Gateway to VPC
    ec2_client.attach_internet_gateway(
        InternetGatewayId=igw_id,
        VpcId=vpc_id
    )

# Create a route table
route_table_ids = get_route_table_ids_for_vpc(ec2_client, vpc_id)
route_table_exists = len(route_table_ids) > 0

if not route_table_exists:
    print('Creating a route table...')
    # Create a route table
    route_table_response = ec2_client.create_route_table(VpcId=vpc_id)
    route_table_id = route_table_response['RouteTable']['RouteTableId']

is_route_created = False
for route_table_id in route_table_ids:
    if route_to_gateway_exists(ec2_client, route_table_id, igw_id):
        is_route_created = True
        break

if not is_route_created:
    # Create a route to the Internet Gateway
    ec2_client.create_route(
        RouteTableId=route_table_id,
        DestinationCidrBlock='0.0.0.0/0',
        GatewayId=igw_id
    )

vpc_id, igw_id, route_table_id

('vpc-0ba31ec9327f8e890', 'igw-0aab310379559d7cf', 'rtb-0fc1da438a5ff2898')

# Create Subnets

In [18]:
subnet_ids = get_subnet_ids_in_vpc(ec2_client, vpc_id)
min_subnet_count = 2
current_subnet_count = len(subnet_ids)

# if current_subnet_count < min_subnet_count:
    # response = ec2_client.describe_availability_zones()
    
    # availability_zones = [az['ZoneName'] for az in response['AvailabilityZones']]
    # random.shuffle(availability_zones)
    # TODO: Following logic is still necessary for situations with larger subnets. Change to something reasonable.
    # availability_zones *= ceil(min_subnet_count / len(availability_zones))
    

while current_subnet_count < min_subnet_count:
    i = current_subnet_count + 1
    print(f'Creating subnet #{i}...')
    subnet_response = ec2_client.create_subnet(
        VpcId=vpc_id,
        CidrBlock=f'10.0.{i}.0/24',
        # AvailabilityZone=availability_zone,
        AvailabilityZone=availability_zones[current_subnet_count]['ZoneName'],
        TagSpecifications=[
            {
                'ResourceType': 'subnet',
                'Tags': [{'Key': tag, 'Value': TAGS[tag]} for tag in TAGS]
            }
        ]
    )
    subnet_id = subnet_response['Subnet']['SubnetId']
    subnet_ids.append(subnet_id)
    current_subnet_count = len(subnet_ids)

# check if any subnet is public, add the table to first subnet if not

In [19]:
subnets_response = ec2_client.describe_subnets(SubnetIds=subnet_ids)
assert len(subnets_response) == len(subnet_ids) == min_subnet_count
is_any_subnet_open_to_public = False
for subnet in subnets_response['Subnets']:
    if subnets_response.get('MapPublicIpOnLaunch', False):
        # I could also look at the route tables, and see if they are assigned to any subnet. ec2_client.describe_route_tables(RouteTableIds=[route_table_id]) RouteTables.SubnetId (optional parameter)
        public_subnet_id = subnet['SubnetId']
        is_any_subnet_open_to_public = True

if not is_any_subnet_open_to_public:
    for subnet in subnets_response['Subnets']:
        if subnet['CidrBlock'] == '10.0.1.0/24':
            public_subnet_id = subnet['SubnetId']

            # Associate the public subnet with the route table
            ec2_client.associate_route_table(
                RouteTableId=route_table_id,
                SubnetId=public_subnet_id
            )
            
            # Modify the public subnet to auto-assign public IPs
            ec2_client.modify_subnet_attribute(
                SubnetId=public_subnet_id,
                MapPublicIpOnLaunch={"Value": True}
            )
            break
            

# Create Security Group

In [20]:
security_group_ids = get_security_group_ids(ec2_client, TAGS)

security_group_exists = len(security_group_ids) > 0

if len(security_group_ids) > 1:
    raise RuntimeError
elif security_group_exists:
    security_group_id = security_group_ids[0]

# The security group gets torn down when deleted, so there is no need to check the rules and rewrite all of them.

if not security_group_exists:
    print('Creating Security Group...')
    response = ec2_client.create_security_group(
        GroupName=f'eks-cluster-sg-{CLUSTER_NAME}',
        TagSpecifications=[
            {
                'ResourceType': 'security-group', 
                'Tags': [{'Key': tag, 'Value': TAGS[tag]} for tag in TAGS]
            }
        ],
        Description=f'Security group for EKS cluster: {CLUSTER_NAME}',
        VpcId=vpc_id
    )
    security_group_id = response['GroupId']

    # TODO: Can I make the CidrIp more restrictive for the next deployment? Load Balancer needs to have a static IP, probably through the Kubernetes YAML?
    for port in ALLOWED_PORTS:
        ec2_client.authorize_security_group_ingress(
            GroupId=security_group_id,
            IpPermissions=[
                {
                    'IpProtocol': 'tcp',
                    'FromPort': port,
                    'ToPort': port,
                    'IpRanges': [{'CidrIp': '0.0.0.0/0'}]
                }
            ]
        )
    
security_group_id

'sg-0ac7721dab43cc0fc'

In [21]:
volume_id, vpc_id, igw_id, route_table_id, public_subnet_id, subnet_ids, security_group_id

('vol-021e0e2f73fe34170',
 'vpc-0ba31ec9327f8e890',
 'igw-0aab310379559d7cf',
 'rtb-0fc1da438a5ff2898',
 'subnet-0374c1250c2c5914e',
 ['subnet-09561967a87d79ac3', 'subnet-0374c1250c2c5914e'],
 'sg-0ac7721dab43cc0fc')

# Create Cluster

In [22]:
tear_down_and_rebuild = False
response = eks_client.list_clusters()
clusters = response['clusters']
if CLUSTER_NAME not in clusters:
    eks_client.create_cluster(name=CLUSTER_NAME, 
                              version=K8S_VERSION, 
                              roleArn=f'arn:aws:iam::{aws_account_id}:role/EKS_Cluster_Role', 
                              resourcesVpcConfig={'subnetIds': subnet_ids,
                                                  'securityGroupIds': [security_group_id],
                                                  'endpointPublicAccess': True,
                                                  'endpointPrivateAccess': False
                              },
                              tags=TAGS,
                             )
else:
    if tear_down_and_rebuild:
        pass
        # TODO: Create a new cluster with a temp name.
        # Wait until the cluster has finished forming
        # Delete the old cluster
        # Wait until deletion is complete
        # Rename the new cluster
        
wait_until(eks_client.describe_cluster, {'name': CLUSTER_NAME}, is_cluster_active, timeout=7 * 60)
response = eks_client.describe_cluster(name=CLUSTER_NAME)
assert response['cluster']['status'] == 'ACTIVE'

'ACTIVE'

# Create IAM Role for Node Groups

In [23]:
# Define the role name
role_name = 'EKS_Cluster_Role'  # WARNING: I get a weird error if I call this role anything else. The call looks for this particular name, and I do not know how to override it.

# Create the trust policy for the role
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": [
                    "eks.amazonaws.com",
                    "ec2.amazonaws.com"
                ]
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

# Create the IAM role
try:
    response = iam_client.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(trust_policy),
        Description='Role for EKS Node Group'
    )
    node_role_arn = response['Role']['Arn']
    print(f"Created role: {node_role_arn}")
except iam_client.exceptions.EntityAlreadyExistsException:
    response = iam_client.get_role(RoleName=role_name)
    node_role_arn = response['Role']['Arn']
    print(f"Role {role_name} already exists. Arn: {node_role_arn}.")
    # TODO: Check if trust_policy is correct.

# Attach necessary policies
policies = [
    'AmazonEKSWorkerNodePolicy',
    'AmazonEC2ContainerRegistryReadOnly',
    'AmazonEKS_CNI_Policy',
    'AmazonEKSClusterPolicy',
    # 'AmazonSSMManagedInstanceCore',
]
# Policy AmazonSSMManagedInstanceCore is not necessary, I used it for debugging, to connect to the node and run commands. 

for policy in policies:
    try:
        iam_client.attach_role_policy(
            RoleName=role_name,
            PolicyArn=f'arn:aws:iam::aws:policy/{policy}'
        )
        print(f"Attached policy {policy} to role {role_name}.")
    except Exception as e:
        print(f"Error attaching policy {policy}: {e}")

Role EKS_Cluster_Role already exists. Arn: arn:aws:iam::275678099358:role/EKS_Cluster_Role.
Attached policy AmazonEKSWorkerNodePolicy to role EKS_Cluster_Role.
Attached policy AmazonEC2ContainerRegistryReadOnly to role EKS_Cluster_Role.
Attached policy AmazonEKS_CNI_Policy to role EKS_Cluster_Role.
Attached policy AmazonEKSClusterPolicy to role EKS_Cluster_Role.


# Add Node Groups

In [24]:
# TODO: This part may also need a wait.

In [25]:
node_groups = eks_client.list_nodegroups(clusterName=CLUSTER_NAME)
# TODO: Make this static and put to the top, with different `diskSize` and `scalingConfig`
node_group_name = 'gpu'
ami_type = 'AL2_x86_64_GPU'
if node_group_name in node_groups['nodegroups']:
    response = eks_client.describe_nodegroup(clusterName=CLUSTER_NAME, nodegroupName=node_group_name)
    status = response['nodegroup']['status']
    if status == 'CREATE_FAILED':
        print('Node group exists, but failed to create. Deleting...')
        failed_node_group_info = response

        eks_client.delete_nodegroup(clusterName=CLUSTER_NAME, nodegroupName=node_group_name)
    
        wait_until(eks_client.list_nodegroups, {'clusterName': CLUSTER_NAME}, lambda x: node_group_name not in x['nodegroups'])
        node_groups = eks_client.list_nodegroups(clusterName=CLUSTER_NAME)
    else:
        print('Node group exists.')



if not node_groups['nodegroups']:
    print('Creating node group...')
    response = eks_client.create_nodegroup(
        clusterName=CLUSTER_NAME,
        nodegroupName=node_group_name,
        scalingConfig={
            'desiredSize': 1,
            'minSize': 1,
            'maxSize': 1
        },
        diskSize=50,  # Size in GiB
        subnets=[public_subnet_id],
        nodeRole=node_role_arn,
        amiType=ami_type,
        instanceTypes=INSTANCE_TYPES[node_group_name],
        labels={
            'gpu-memory': 'true'
        },
        taints=[
            {
                'key': 'nvidia.com/gpu',
                'value': 'true',
                'effect': 'NO_SCHEDULE'
            }
        ]
    )
wait_until(eks_client.describe_nodegroup, {'clusterName': CLUSTER_NAME, 'nodegroupName': node_group_name}, is_node_group_active)

'ACTIVE'

True

In [26]:
node_groups = eks_client.list_nodegroups(clusterName=CLUSTER_NAME)
# TODO: Make this static and put to the top, with different `diskSize` and `scalingConfig`
node_group_name = 'default'
ami_type = 'AL2_x86_64'
if node_group_name in node_groups['nodegroups']:
    response = eks_client.describe_nodegroup(clusterName=CLUSTER_NAME, nodegroupName=node_group_name)
    status = response['nodegroup']['status']
    if status == 'CREATE_FAILED':
        print('Node group exists, but failed to create. Deleting...')
        failed_node_group_info = response

        eks_client.delete_nodegroup(clusterName=CLUSTER_NAME, nodegroupName=node_group_name)
    
        wait_until(eks_client.list_nodegroups, {'clusterName': CLUSTER_NAME}, lambda x: node_group_name not in x['nodegroups'])
        node_groups = eks_client.list_nodegroups(clusterName=CLUSTER_NAME)
    else:
        print('Node group exists.')


# TODO: See if lower disksize will work.
if node_group_name not in node_groups['nodegroups']:
    print('Creating node group...')
    response = eks_client.create_nodegroup(
        clusterName=CLUSTER_NAME,
        nodegroupName=node_group_name,
        scalingConfig={
            'desiredSize': 2,
            'minSize': 2,
            'maxSize': 2
        },
        diskSize=1000,  # Size in GiB
        subnets=[public_subnet_id],
        nodeRole=node_role_arn,
        amiType=ami_type,
        instanceTypes=INSTANCE_TYPES[node_group_name],
    )
wait_until(eks_client.describe_nodegroup, {'clusterName': CLUSTER_NAME, 'nodegroupName': node_group_name}, is_node_group_active)

'ACTIVE'

True

# Initialize Kubernetes

In [27]:
!aws eks update-kubeconfig --name $CLUSTER_NAME --region $REGION

Added new context arn:aws:eks:ca-central-1:275678099358:cluster/kubyterlab-llm to /root/.kube/config


# Initialize Helm

In [28]:
!helm repo add aws-ebs-csi-driver https://kubernetes-sigs.github.io/aws-ebs-csi-driver

"aws-ebs-csi-driver" has been added to your repositories


In [29]:
!helm repo add eks https://aws.github.io/eks-charts

"eks" has been added to your repositories


In [30]:
!helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx

"ingress-nginx" has been added to your repositories


In [31]:
!helm repo update

Hang tight while we grab the latest from your chart repositories...
...Successfully got an update from the "aws-ebs-csi-driver" chart repository
...Successfully got an update from the "ingress-nginx" chart repository
...Successfully got an update from the "eks" chart repository
Update Complete. ⎈Happy Helming!⎈


# Create a Role to Install the CSI Driver and Give Permissions Using the OIDC Provider

In [32]:
cluster_response = eks_client.describe_cluster(name=CLUSTER_NAME)
oidc_id = cluster_response['cluster']['identity']['oidc']['issuer'].split('/')[-1]
oidc_url = f'https://oidc.eks.{REGION}.amazonaws.com/id/{oidc_id}'
oidc_url

'https://oidc.eks.ca-central-1.amazonaws.com/id/26168D03C1C42A23884C0AE1BA33F5A7'

In [33]:
!eksctl utils associate-iam-oidc-provider --cluster $CLUSTER_NAME --approve

[36m2025-02-22 23:59:41 [ℹ]  IAM Open ID Connect provider is already associated with cluster "kubyterlab-llm" in "ca-central-1"
[0m

In [34]:
amazon_eks_ebs_csi_driver_policy_name = 'AmazonEKS_EBS_CSI_Driver_Policy'
amazon_eks_ebs_csi_driver_policy = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": [
        "ec2:CreateSnapshot",
        "ec2:AttachVolume",
        "ec2:DetachVolume",
        "ec2:ModifyVolume",
        "ec2:DescribeAvailabilityZones",
        "ec2:DescribeInstances",
        "ec2:DescribeSnapshots",
        "ec2:DescribeTags",
        "ec2:DescribeVolumes",
        "ec2:DescribeVolumesModifications"
      ],
      "Resource": "*"
    },
    {
      "Effect": "Allow",
      "Action": [
        "ec2:CreateTags"
      ],
      "Resource": [
        "arn:aws:ec2:*:*:volume/*",
        "arn:aws:ec2:*:*:snapshot/*"
      ],
      "Condition": {
        "StringEquals": {
          "ec2:CreateAction": [
            "CreateVolume",
            "CreateSnapshot"
          ]
        }
      }
    },
    {
      "Effect": "Allow",
      "Action": [
        "ec2:DeleteTags"
      ],
      "Resource": [
        "arn:aws:ec2:*:*:volume/*",
        "arn:aws:ec2:*:*:snapshot/*"
      ]
    },
    {
      "Effect": "Allow",
      "Action": [
        "ec2:CreateVolume"
      ],
      "Resource": "*",
      "Condition": {
        "StringLike": {
          "aws:RequestTag/ebs.csi.aws.com/cluster": "true"
        }
      }
    },
    {
      "Effect": "Allow",
      "Action": [
        "ec2:CreateVolume"
      ],
      "Resource": "*",
      "Condition": {
        "StringLike": {
          "aws:RequestTag/CSIVolumeName": "*"
        }
      }
    },
    {
      "Effect": "Allow",
      "Action": [
        "ec2:DeleteVolume"
      ],
      "Resource": "*",
      "Condition": {
        "StringLike": {
          "ec2:ResourceTag/CSIVolumeName": "*"
        }
      }
    },
    {
      "Effect": "Allow",
      "Action": [
        "ec2:DeleteVolume"
      ],
      "Resource": "*",
      "Condition": {
        "StringLike": {
          "ec2:ResourceTag/ebs.csi.aws.com/cluster": "true"
        }
      }
    },
    {
      "Effect": "Allow",
      "Action": [
        "ec2:DeleteSnapshot"
      ],
      "Resource": "*",
      "Condition": {
        "StringLike": {
          "ec2:ResourceTag/CSIVolumeSnapshotName": "*"
        }
      }
    },
    {
      "Effect": "Allow",
      "Action": [
        "ec2:DeleteSnapshot"
      ],
      "Resource": "*",
      "Condition": {
        "StringLike": {
          "ec2:ResourceTag/ebs.csi.aws.com/cluster": "true"
        }
      }
    }
  ]
}
try:
    response = iam_client.create_policy(
        PolicyName=amazon_eks_ebs_csi_driver_policy_name,
        PolicyDocument=json.dumps(amazon_eks_ebs_csi_driver_policy)
    )
    amazon_eks_ebs_csi_driver_policy_arn = response['Policy']['Arn']
except iam_client.exceptions.EntityAlreadyExistsException:
    amazon_eks_ebs_csi_driver_policy_arn = f'arn:aws:iam::{aws_account_id}:policy/{amazon_eks_ebs_csi_driver_policy_name}'
    response = iam_client.get_policy(PolicyArn=amazon_eks_ebs_csi_driver_policy_arn)

    # TODO: If policy body is not the same, update. Code to update follows.
    # iam_client.create_policy_version(
    #     PolicyArn=policy_arn,
    #     PolicyDocument=policy_document_json,
    #     SetAsDefault=True
    # )

amazon_eks_ebs_csi_driver_policy_arn

'arn:aws:iam::275678099358:policy/AmazonEKS_EBS_CSI_Driver_Policy'

In [35]:
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Federated": f"arn:aws:iam::{aws_account_id}:oidc-provider/oidc.eks.{REGION}.amazonaws.com/id/{oidc_id}"
            },
            "Action": "sts:AssumeRoleWithWebIdentity",
            "Condition": {
                "StringEquals": {
                    f"oidc.eks.{REGION}.amazonaws.com/id/{oidc_id}:aud": "sts.amazonaws.com",
                    f"oidc.eks.{REGION}.amazonaws.com/id/{oidc_id}:sub": "system:serviceaccount:kube-system:ebs-csi-controller-sa"
                }
            }
        }
    ]
}

try:
    create_role_response = iam_client.create_role(
        RoleName='AmazonEKS_EBS_CSI_DriverRole',
        
        AssumeRolePolicyDocument=json.dumps(trust_policy),
        Description='Role for Amazon EKS EBS CSI Driver'
    )
    print(f"Role created successfully: {create_role_response['Role']['Arn']}")
except iam_client.exceptions.EntityAlreadyExistsException:
    print("Role already exists. Updating the trust policy.")
    iam_client.update_assume_role_policy(
        RoleName='AmazonEKS_EBS_CSI_DriverRole',
        PolicyDocument=json.dumps(trust_policy)
    )

attach_policy_response = iam_client.attach_role_policy(
    RoleName='AmazonEKS_EBS_CSI_DriverRole',
    PolicyArn='arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy'
)

attach_policy_response = iam_client.attach_role_policy(
    RoleName='AmazonEKS_EBS_CSI_DriverRole',
    PolicyArn=amazon_eks_ebs_csi_driver_policy_arn
)

Role already exists. Updating the trust policy.


# Install CSI Driver

In [36]:
!kubectl create sa ebs-csi-controller-sa -n kube-system

error: failed to create serviceaccount: serviceaccounts "ebs-csi-controller-sa" already exists


In [37]:
!kubectl annotate sa ebs-csi-controller-sa -n kube-system eks.amazonaws.com/role-arn=$ebs_csi_driver_role_arn


serviceaccount/ebs-csi-controller-sa annotated


In [37]:
!kubectl annotate sa ebs-csi-controller-sa -n kube-system meta.helm.sh/release-namespace=kube-system

serviceaccount/ebs-csi-controller-sa annotated


In [38]:
!kubectl annotate sa ebs-csi-controller-sa -n kube-system meta.helm.sh/release-name=aws-ebs-csi-driver

serviceaccount/ebs-csi-controller-sa annotated


In [39]:
!kubectl label sa ebs-csi-controller-sa -n kube-system app.kubernetes.io/managed-by=Helm

serviceaccount/ebs-csi-controller-sa labeled


In [40]:
!helm upgrade --install aws-ebs-csi-driver \
    --namespace kube-system \
    aws-ebs-csi-driver/aws-ebs-csi-driver

Release "aws-ebs-csi-driver" does not exist. Installing it now.
NAME: aws-ebs-csi-driver
LAST DEPLOYED: Sat Feb 22 19:09:49 2025
NAMESPACE: kube-system
STATUS: deployed
REVISION: 1
NOTES:
To verify that aws-ebs-csi-driver has started, run:

    kubectl get pod -n kube-system -l "app.kubernetes.io/name=aws-ebs-csi-driver,app.kubernetes.io/instance=aws-ebs-csi-driver"

[ACTION REQUIRED] Update to the EBS CSI Driver IAM Policy

Due to an upcoming change in handling of IAM polices for the CreateVolume API when creating a volume from an EBS snapshot, a change to your EBS CSI Driver policy may be needed. For more information and remediation steps, see GitHub issue #2190 (https://github.com/kubernetes-sigs/aws-ebs-csi-driver/issues/2190). This change affects all versions of the EBS CSI Driver and action may be required even on clusters where the driver is not upgraded.


### == End of Code Specific to AWS ==

# Install Nvidia Device Plugin

In [41]:
!kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/refs/heads/main/deployments/static/nvidia-device-plugin.yml

daemonset.apps/nvidia-device-plugin-daemonset created


# Apply Helm Charts

In [42]:
volume_id

'vol-021e0e2f73fe34170'

In [43]:
!helm install storage /helm/storage/ --set volumeID=$volume_id

NAME: storage
LAST DEPLOYED: Sat Feb 22 19:52:35 2025
NAMESPACE: default
STATUS: deployed
REVISION: 1
TEST SUITE: None


In [44]:
!helm install ingress-nginx ingress-nginx/ingress-nginx --namespace ingress-nginx --create-namespace

NAME: ingress-nginx
LAST DEPLOYED: Sat Feb 22 19:52:40 2025
NAMESPACE: ingress-nginx
STATUS: deployed
REVISION: 1
TEST SUITE: None
NOTES:
The ingress-nginx controller has been installed.
It may take a few minutes for the load balancer IP to be available.
You can watch the status by running 'kubectl get service --namespace ingress-nginx ingress-nginx-controller --output wide --watch'

An example Ingress that makes use of the controller:
  apiVersion: networking.k8s.io/v1
  kind: Ingress
  metadata:
    name: example
    namespace: foo
  spec:
    ingressClassName: nginx
    rules:
      - host: www.example.com
        http:
          paths:
            - pathType: Prefix
              backend:
                service:
                  name: exampleService
                  port:
                    number: 80
              path: /
    # This section is only required if TLS is to be enabled for the Ingress
    tls:
      - hosts:
        - www.example.com
        secretName: example-tls

In [45]:
# !helm upgrade --install lb /helm/lb-ingress/


Release "lb" does not exist. Installing it now.
NAME: lb
LAST DEPLOYED: Sat Feb 22 19:53:00 2025
NAMESPACE: default
STATUS: deployed
REVISION: 1
TEST SUITE: None


In [62]:
# !helm delete lb

Error: uninstall: Release not loaded: lb: release: not found


In [96]:
!helm upgrade --install ingress /helm/eberron-agent-ingress/


Release "ingress" has been upgraded. Happy Helming!
NAME: ingress
LAST DEPLOYED: Sun Feb 23 00:35:47 2025
NAMESPACE: default
STATUS: deployed
REVISION: 18
TEST SUITE: None


In [39]:
!helm upgrade --install fastapi /helm/eberron-agent-server/ --set awsAccountId=$aws_account_id --set region=$REGION


Release "fastapi" has been upgraded. Happy Helming!
NAME: fastapi
LAST DEPLOYED: Sat Feb 22 23:59:59 2025
NAMESPACE: default
STATUS: deployed
REVISION: 2
TEST SUITE: None


In [40]:
!helm upgrade --install frontend /helm/eberron-agent-frontend/ --set awsAccountId=$aws_account_id --set region=$REGION


Release "frontend" has been upgraded. Happy Helming!
NAME: frontend
LAST DEPLOYED: Sun Feb 23 00:00:01 2025
NAMESPACE: default
STATUS: deployed
REVISION: 2
TEST SUITE: None


# Get URL

In [41]:
# TODO: Update get_one_running_pod to use staring with names.
wait_until(get_one_running_pod, {'prefix': 'fastapi-'}, lambda x: x is not None, timeout=8 * 60)
pod_name = get_one_running_pod()
assert pod_name is not None
pod_name

'fastapi-677965765c-qkfn4'

In [42]:
# output = !kubectl get service lb --no-headers
# fields = re.split(r'\s+', output[0])
# external_url = fields[3]
# port = fields[4].split('/')[0].split(':')[0]
# external_url, port

In [78]:

output = !kubectl get service ingress-nginx-controller --namespace ingress-nginx --no-headers
fields = re.split(r'\s+', output[0])
external_url = fields[3]
port = fields[4].split('/')[0].split(':')[0]
external_url, port

('aa703202c6c1849ce9814d2c1fbe6a9c-697758537.ca-central-1.elb.amazonaws.com',
 '80')

In [79]:
url = f'http://{external_url}:{port}'
url

'http://aa703202c6c1849ce9814d2c1fbe6a9c-697758537.ca-central-1.elb.amazonaws.com:80'

# Test

In [45]:
# Defaulted container "kubyterlab-llm" out of: kubyterlab-llm, init-container (init)
# True
# 1
!kubectl exec $pod_name bash -- python3 -c 'import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())'

True
1


In [46]:
!kubectl exec $pod_name bash -- bash -c 'nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv'

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 4087 MiB, 11009 MiB


In [97]:
import requests

from IPython.display import clear_output, display, HTML, Markdown

response = requests.post(f"{url}/respond", 
                         headers={"Content-Type": "application/json", 
                                  "Accept": "text/event-stream"}, 
                         json={"content": "What are the languages in Eberron?"}, 
                         stream=True)
md = ""
for chunk in response:
    md += chunk.decode("utf-8")
    clear_output()
    display(Markdown(md.replace('\\n', '\n')))

The languages in Eberron include, but are not limited to:

1. Abyssal: Common tongue of all fiends, also known as "Khyber’s Speech."
2. Argon: Spoken by barbarians of Argonnessen.
3. Auran: Spoken by air-based creatures like elves.
4. Common: The language of the Five Nations and the language of trade in Khorvaire, known by most of its people.
5. Daan: Spoken by formians, lawful outsiders, and other daelkyr aberrations.
6. Daelkyr: Spoken by the daelkyr, mind flayers, and other creatures of Xoriat.
7. Draconic: Spoken by kobolds, troglodytes, lizardfolk, dragons, and others.
8. Druidic: Spoken by druids (only).
9. Dwarven: Spoken by dwarves.
10. Elven: Spoken by elves and drow.
11. Giant: Spoken by ogres, giants, and drow.
12. Gnoll: Spoken by gnolls.
13. Goblin: Spoken by goblins, hobgoblins, and bugbears.
14. Halfling: Spoken by halflings.
15. Ignan: Spoken by fire-based creatures.
16. Infernal: Spoken by devils of Shavarath.
17. Irial: Spoken by Ravids, positive energy users.
18. Kythric: Spoken by Slaadi, chaotic outsiders.
19. Mabran: Spoken by Nightshades, shadows, and draconic creatures of Mabar.
20. Ore: Spoken by orcs.
21. Quori: Spoken by the Inspired, kalashtar.
22. Riedran: Spoken by the lower classes of Sarlona.
23. Risian: Spoken by ice-based creatures.
24. Sylvan: Spoken by dryads, eladrins, and creatures of Thelanis.
25. Syranian: Spoken by angels of Syrania.
26. Terran: Spoken by xorns and other earth-based creatures.
27. Undercommon: Spoken by chokers and underground Daelkyr denizens.

This list is not exhaustive, and there may be other languages in Eberron. The use of a language often reflects the culture and geography of the character or creature. For example, an orc from Droaam likely speaks Goblin instead of Giant.

# == End of Procedure ==

## == Do Not Continue: rest of the code is only for reference for troubleshooting ==

# Troubleshoot

In [None]:
# curl -X POST -H "Content-Type: application/json" -H "Accept: text/event-stream" -d '{"key":"value"}' http://your-api-endpoint

In [None]:
# Note that for the following to work, you need to attach the policy AmazonSSMManagedInstanceCore, in the relevant cell above, during the deployment.

In [51]:
output = !aws ssm send-command \
    --document-name "AWS-RunShellScript" \
    --targets "Key=instanceIds,Values=i-02f3c1f8a118eb352" \
    --parameters 'commands=["nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv"]' \
    --region $REGION
command_id = json.loads(''.join(output))['Command']['CommandId']
command_id


In [None]:
output = !aws ssm list-command-invocations \
    --command-id $command_id \
    --details
output = json.loads(''.join(output))['CommandInvocations'][0]['CommandPlugins'][0]['Output']
for line in output.split('\n'):
    print(line)

In [68]:
!kubectl get events --sort-by='.lastTimestamp'


LAST SEEN   TYPE      REASON                    OBJECT                                             MESSAGE
59m         Normal    Starting                  node/ip-10-0-1-161.ca-central-1.compute.internal   
58m         Normal    Starting                  node/ip-10-0-1-137.ca-central-1.compute.internal   
58m         Normal    Starting                  node/ip-10-0-1-15.ca-central-1.compute.internal    
59m         Normal    Starting                  node/ip-10-0-1-161.ca-central-1.compute.internal   Starting kubelet.
59m         Normal    NodeAllocatableEnforced   node/ip-10-0-1-161.ca-central-1.compute.internal   Updated Node Allocatable limit across pods
59m         Normal    NodeHasSufficientPID      node/ip-10-0-1-161.ca-central-1.compute.internal   Node ip-10-0-1-161.ca-central-1.compute.internal status is now: NodeHasSufficientPID
59m         Normal    NodeHasNoDiskPressure     node/ip-10-0-1-161.ca-central-1.compute.internal   Node ip-10-0-1-161.ca-central-1.compute.internal st

In [52]:
!kubectl get pod -n kube-system -l "app.kubernetes.io/name=aws-ebs-csi-driver,app.kubernetes.io/instance=aws-ebs-csi-driver"

NAME                                 READY   STATUS    RESTARTS   AGE
ebs-csi-controller-fcb84d9bc-g9pdj   5/5     Running   0          4h52m
ebs-csi-controller-fcb84d9bc-hp5lz   5/5     Running   0          4h52m
ebs-csi-node-frpqr                   3/3     Running   0          4h52m
ebs-csi-node-rh8fz                   3/3     Running   0          4h52m
ebs-csi-node-zvvhd                   3/3     Running   0          4h52m


In [53]:
# NAME                                          GPU
# ip-10-0-1-129.ca-central-1.compute.internal   <none>
# ip-10-0-1-223.ca-central-1.compute.internal   <none>
# ip-10-0-1-8.ca-central-1.compute.internal     1z
!kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu"


NAME                                          GPU
ip-10-0-1-137.ca-central-1.compute.internal   <none>
ip-10-0-1-15.ca-central-1.compute.internal    <none>
ip-10-0-1-161.ca-central-1.compute.internal   1


In [54]:
# NAME                                          TAINTS
# ip-10-0-1-107.ca-central-1.compute.internal   <none>
# ip-10-0-1-175.ca-central-1.compute.internal   [map[effect:NoSchedule key:nvidia.com/gpu value:true]]
# ip-10-0-1-90.ca-central-1.compute.internal    <none>
!kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints

NAME                                          TAINTS
ip-10-0-1-137.ca-central-1.compute.internal   <none>
ip-10-0-1-15.ca-central-1.compute.internal    <none>
ip-10-0-1-161.ca-central-1.compute.internal   [map[effect:NoSchedule key:nvidia.com/gpu value:true]]


In [55]:
# NAME                     TYPE           CLUSTER-IP     EXTERNAL-IP                                                                  PORT(S)          AGE
# kubernetes               ClusterIP      172.20.0.1     <none>                                                                       443/TCP          37m
# kubyterlab-llm-service   LoadBalancer   172.20.129.8   a4740e3e56bfe40ac81121bd46071903-1377611187.ca-central-1.elb.amazonaws.com   8888:31434/TCP   13s
!kubectl get service

NAME         TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)    AGE
fastapi      ClusterIP   172.20.157.106   <none>        8000/TCP   4h8m
frontend     ClusterIP   172.20.163.190   <none>        4200/TCP   4h8m
kubernetes   ClusterIP   172.20.0.1       <none>        443/TCP    4h55m


In [56]:
# NAME               READY   UP-TO-DATE   AVAILABLE   AGE
# kubyterlab-llm-pod   1/1     1            1           31m
!kubectl get deployments

NAME       READY   UP-TO-DATE   AVAILABLE   AGE
fastapi    1/1     1            1           4h8m
frontend   1/1     1            1           4h8m


In [57]:
!kubectl get ingress

NAME      CLASS   HOSTS   ADDRESS                                                                     PORTS   AGE
ingress   nginx   *       aa703202c6c1849ce9814d2c1fbe6a9c-697758537.ca-central-1.elb.amazonaws.com   80      3h24m


In [58]:
!kubectl get namespace

NAME              STATUS   AGE
default           Active   4h55m
ingress-nginx     Active   4h9m
kube-node-lease   Active   4h55m
kube-public       Active   4h55m
kube-system       Active   4h55m


In [59]:
!kubectl get ingress --show-labels

NAME      CLASS   HOSTS   ADDRESS                                                                     PORTS   AGE     LABELS
ingress   nginx   *       aa703202c6c1849ce9814d2c1fbe6a9c-697758537.ca-central-1.elb.amazonaws.com   80      3h24m   app.kubernetes.io/managed-by=Helm


In [60]:
!kubectl get svc -n ingress-nginx


NAME                                 TYPE           CLUSTER-IP       EXTERNAL-IP                                                                 PORT(S)                      AGE
ingress-nginx-controller             LoadBalancer   172.20.108.128   aa703202c6c1849ce9814d2c1fbe6a9c-697758537.ca-central-1.elb.amazonaws.com   80:32121/TCP,443:30845/TCP   4h9m
ingress-nginx-controller-admission   ClusterIP      172.20.2.80      <none>                                                                      443/TCP                      4h9m


In [62]:
!kubectl logs -l app.kubernetes.io/name=ingress-nginx -n ingress-nginx


10.0.1.161 - - [23/Feb/2025:00:01:16 +0000] "{\x22params\x22:[],\x22id\x22:\x22echo\x22,\x22method\x22:\x22echo\x22}" 400 150 "-" "-" 0 0.014 [] [] - - - - d417d63eb079e0181b7c9bf24f4ae483
I0223 00:01:41.207939       7 main.go:107] "successfully validated configuration, accepting" ingress="default/ingress"
I0223 00:01:41.216036       7 controller.go:196] "Configuration changes detected, backend reload required"
I0223 00:01:41.216226       7 event.go:377] Event(v1.ObjectReference{Kind:"Ingress", Namespace:"default", Name:"ingress", UID:"32b88182-7182-4606-90af-267f226b32e2", APIVersion:"networking.k8s.io/v1", ResourceVersion:"64151", FieldPath:""}): type: 'Normal' reason: 'Sync' Scheduled for sync
I0223 00:01:41.255519       7 controller.go:216] "Backend successfully reloaded"
I0223 00:01:41.255960       7 event.go:377] Event(v1.ObjectReference{Kind:"Pod", Namespace:"ingress-nginx", Name:"ingress-nginx-controller-7657f6db5f-s8bmd", UID:"bf3c1154-788c-4528-8a34-b580223c8e50", APIVersion:

In [63]:
!helm list -A


NAME              	NAMESPACE    	REVISION	UPDATED                                	STATUS  	CHART                    	APP VERSION
aws-ebs-csi-driver	kube-system  	1       	2025-02-22 19:09:49.193714285 +0000 UTC	deployed	aws-ebs-csi-driver-2.40.0	1.40.0     
fastapi           	default      	2       	2025-02-22 23:59:59.878444676 +0000 UTC	deployed	fastapi-0.1.0            	1.0        
frontend          	default      	2       	2025-02-23 00:00:01.859164008 +0000 UTC	deployed	frontend-0.1.0           	1.0        
ingress           	default      	10      	2025-02-23 00:01:41.147457177 +0000 UTC	deployed	ingress-0.1.0            	1.0.0      
ingress-nginx     	ingress-nginx	1       	2025-02-22 19:52:40.900849212 +0000 UTC	deployed	ingress-nginx-4.12.0     	1.12.0     
storage           	default      	1       	2025-02-22 19:52:35.81714641 +0000 UTC 	deployed	storage-0.1.0            	           


In [64]:
!helm get values ingress-nginx -n ingress-nginx

USER-SUPPLIED VALUES:
null


In [65]:
!kubectl get ingressclass

NAME    CONTROLLER             PARAMETERS   AGE
nginx   k8s.io/ingress-nginx   <none>       4h9m


In [66]:
!kubectl get ingress 

NAME      CLASS   HOSTS   ADDRESS                                                                     PORTS   AGE
ingress   nginx   *       aa703202c6c1849ce9814d2c1fbe6a9c-697758537.ca-central-1.elb.amazonaws.com   80      3h25m


In [67]:
!kubectl describe ingress ingress

Name:             ingress
Labels:           app.kubernetes.io/managed-by=Helm
Namespace:        default
Address:          aa703202c6c1849ce9814d2c1fbe6a9c-697758537.ca-central-1.elb.amazonaws.com
Ingress Class:    nginx
Default backend:  <default>
Rules:
  Host        Path  Backends
  ----        ----  --------
  *           
              /respond   fastapi:8000 (10.0.1.49:8000)
              /          frontend:4200 (10.0.1.208:4200)
Annotations:  meta.helm.sh/release-name: ingress
              meta.helm.sh/release-namespace: default
              nginx.ingress.kubernetes.io/cors-allow-methods: POST,GET,PUT,DELETE
              nginx.ingress.kubernetes.io/enable-methods: POST,GET,PUT,DELETE
              nginx.ingress.kubernetes.io/proxy-read-timeout: 3600
              nginx.ingress.kubernetes.io/proxy-send-timeout: 3600
              nginx.ingress.kubernetes.io/rewrite-target: /
Events:
  Type    Reason  Age                  From                      Message
  ----    ------  ----

In [82]:
output = !kubectl get pods -n ingress-nginx --no-headers
fields = re.split(r'\s+', output[0])
ingress_controller_pod = fields[0]
ingress_controller_pod

'ingress-nginx-controller-7657f6db5f-s8bmd'

In [71]:
!kubectl exec -it $ingress_controller_pod -n ingress-nginx -- curl -v frontend.default.svc.cluster.local:4200

* Host frontend.default.svc.cluster.local:4200 was resolved.
* IPv6: (none)
* IPv4: 172.20.163.190
*   Trying 172.20.163.190:4200...
* connect to 172.20.163.190 port 4200 from 10.0.1.58 port 41446 failed: Connection refused
* Failed to connect to frontend.default.svc.cluster.local port 4200 after 4 ms: Could not connect to server
* closing connection #0
curl: (7) Failed to connect to frontend.default.svc.cluster.local port 4200 after 4 ms: Could not connect to server
command terminated with exit code 7


In [69]:
!kubectl exec -it $ingress_controller_pod -n ingress-nginx -- curl -v fastapi.default.svc.cluster.local:8000

* Host fastapi.default.svc.cluster.local:8000 was resolved.
* IPv6: (none)
* IPv4: 172.20.157.106
*   Trying 172.20.157.106:8000...
* Connected to fastapi.default.svc.cluster.local (172.20.157.106) port 8000
* using HTTP/1.x
> GET / HTTP/1.1
> Host: fastapi.default.svc.cluster.local:8000
> User-Agent: curl/8.11.1
> Accept: */*
> 
* Request completely sent off
< HTTP/1.1 200 OK
< date: Sun, 23 Feb 2025 00:02:47 GMT
< server: uvicorn
< content-length: 27
< content-type: application/json
< 
* Connection #0 to host fastapi.default.svc.cluster.local left intact
{"message":"Hello, world!"}

In [70]:
!kubectl exec -it $ingress_controller_pod -n ingress-nginx -- curl -v -X POST fastapi.default.svc.cluster.local:8000/respond --no-buffer

* Host fastapi.default.svc.cluster.local:8000 was resolved.
* IPv6: (none)
* IPv4: 172.20.157.106
*   Trying 172.20.157.106:8000...
* Connected to fastapi.default.svc.cluster.local (172.20.157.106) port 8000
* using HTTP/1.x
> POST /respond HTTP/1.1
> Host: fastapi.default.svc.cluster.local:8000
> User-Agent: curl/8.11.1
> Accept: */*
> 
* Request completely sent off
< HTTP/1.1 422 Unprocessable Entity
< date: Sun, 23 Feb 2025 00:02:53 GMT
< server: uvicorn
< content-length: 82
< content-type: application/json
< 
{"detail":[{"type":"missing","loc":["body"],"msg":"Field required","input":null}]}* Connection #0 to host fastapi.default.svc.cluster.local left intact


In [203]:
!kubectl exec -it $ingress_controller_pod -n ingress-nginx -- curl -v -X POST fastapi.default.svc.cluster.local:8000/respond --no-buffer -H "Content-Type: application/json" -d '{"content": "What are the languages in Eberron?"}'

error: pod, type/name or --filename must be specified


In [200]:
!kubectl exec -it $ingress_controller_pod -n ingress-nginx -- curl -v 10.0.1.49:8000

*   Trying 10.0.1.49:8000...
* Connected to 10.0.1.49 (10.0.1.49) port 8000
* using HTTP/1.x
> GET / HTTP/1.1
> Host: 10.0.1.49:8000
> User-Agent: curl/8.11.1
> Accept: */*
> 
* Request completely sent off
< HTTP/1.1 200 OK
< date: Sat, 22 Feb 2025 21:27:40 GMT
< server: uvicorn
< content-length: 27
< content-type: application/json
< 
* Connection #0 to host 10.0.1.49 left intact
{"message":"Hello, world!"}

In [94]:
# !kubectl exec $ingress_controller_pod -n ingress-nginx -- cat /etc/nginx/nginx.conf

In [192]:
!kubectl exec -it $ingress_controller_pod -n ingress-nginx -- curl -v -x POST 10.0.1.49:8000/respond --no-buffer -H "Content-Type: application/json" -d '{"content": "What are the languages in Eberron?"}'

* Could not resolve proxy: POST
* shutting down connection #0
curl: (5) Could not resolve proxy: POST
command terminated with exit code 5


In [177]:
!kubectl exec -it $ingress_controller_pod -n ingress-nginx -- nslookup fastapi.default.svc.cluster.local

Server:		172.20.0.10
Address:	172.20.0.10:53


Name:	fastapi.default.svc.cluster.local
Address: 172.20.157.106



In [178]:
!kubectl exec -it $ingress_controller_pod -n ingress-nginx -- nslookup fastapi.default.svc.cluster.local

Server:		172.20.0.10
Address:	172.20.0.10:53


Name:	fastapi.default.svc.cluster.local
Address: 172.20.157.106



In [179]:
!kubectl get svc fastapi -n default

NAME      TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)    AGE
fastapi   ClusterIP   172.20.157.106   <none>        8000/TCP   82m


In [181]:
!kubectl exec -it $ingress_controller_pod -n ingress-nginx -- curl -v 172.20.157.106:8000

*   Trying 172.20.157.106:8000...
* Connected to 172.20.157.106 (172.20.157.106) port 8000
* using HTTP/1.x
> GET / HTTP/1.1
> Host: 172.20.157.106:8000
> User-Agent: curl/8.11.1
> Accept: */*
> 
* Request completely sent off
< HTTP/1.1 200 OK
< date: Sat, 22 Feb 2025 21:16:29 GMT
< server: uvicorn
< content-length: 27
< content-type: application/json
< 
* Connection #0 to host 172.20.157.106 left intact
{"message":"Hello, world!"}

In [193]:
!kubectl exec -it $pod_name -n default -- curl -v fastapi:8000

*   Trying 172.20.157.106:8000...
* Connected to fastapi (172.20.157.106) port 8000 (#0)
> GET / HTTP/1.1
> Host: fastapi:8000
> User-Agent: curl/7.81.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< date: Sat, 22 Feb 2025 21:25:46 GMT
< server: uvicorn
< content-length: 27
< content-type: application/json
< 
* Connection #0 to host fastapi left intact
{"message":"Hello, world!"}

In [196]:
!kubectl exec -it $pod_name -n default -- curl -v -X POST fastapi:8000/respond --no-buffer 

*   Trying 172.20.157.106:8000...
* Connected to fastapi (172.20.157.106) port 8000 (#0)
> POST /respond HTTP/1.1
> Host: fastapi:8000
> User-Agent: curl/7.81.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 422 Unprocessable Entity
< date: Sat, 22 Feb 2025 21:26:46 GMT
< server: uvicorn
< content-length: 82
< content-type: application/json
< 
{"detail":[{"type":"missing","loc":["body"],"msg":"Field required","input":null}]}* Connection #0 to host fastapi left intact


In [None]:
!kubectl exec -it $pod_name -n default -- curl -v -X POST fastapi:8000/respond --no-buffer -H "Content-Type: application/json" -d '{"content": "What are the languages in Eberron?"}'

In [183]:
!kubectl exec -it $ingress_controller_pod -n ingress-nginx -- curl -v fastapi.default.svc.cluster.local:8000

* Host fastapi.default.svc.cluster.local:8000 was resolved.
* IPv6: (none)
* IPv4: 172.20.157.106
*   Trying 172.20.157.106:8000...
* Connected to fastapi.default.svc.cluster.local (172.20.157.106) port 8000
* using HTTP/1.x
> GET / HTTP/1.1
> Host: fastapi.default.svc.cluster.local:8000
> User-Agent: curl/8.11.1
> Accept: */*
> 
* Request completely sent off
< HTTP/1.1 200 OK
< date: Sat, 22 Feb 2025 21:19:01 GMT
< server: uvicorn
< content-length: 27
< content-type: application/json
< 
* Connection #0 to host fastapi.default.svc.cluster.local left intact
{"message":"Hello, world!"}

In [162]:
!kubectl get endpoints fastapi -n default

NAME      ENDPOINTS        AGE
fastapi   10.0.1.49:8000   77m


In [None]:
# !kubectl describe deployment kubyterlab-llm-pod

In [71]:
# !kubectl describe pod frontend-75669c8cd8-5kkmk

In [126]:
!kubectl get pods

NAME                        READY   STATUS    RESTARTS   AGE
fastapi-677965765c-z5kc6    1/1     Running   0          77m
frontend-74b7bcf65b-lknld   1/1     Running   0          33m


In [60]:
!kubectl get pvc

NAME      STATUS   VOLUME   CAPACITY   ACCESS MODES   STORAGECLASS   VOLUMEATTRIBUTESCLASS   AGE
pvc-llm   Bound    pv-llm   500Gi      RWO            manual         <unset>                 98m


In [61]:
!kubectl get pv

NAME     CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS   CLAIM             STORAGECLASS   VOLUMEATTRIBUTESCLASS   REASON   AGE
pv-llm   500Gi      RWO            Retain           Bound    default/pvc-llm   manual         <unset>                          98m


In [95]:
!kubectl logs $pod_name


== CUDA ==

CUDA Version 12.6.1

Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license

A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.

2025-02-22 19:57:40.261589: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-22 19:57:40.277019: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been regist

In [64]:
!kubectl describe pod $pod_name

Name:             fastapi-698c4dc749-l5qsk
Namespace:        default
Priority:         0
Service Account:  default
Node:             ip-10-0-1-228.ca-central-1.compute.internal/10.0.1.228
Start Time:       Sun, 09 Feb 2025 15:09:55 +0000
Labels:           app=fastapi
                  pod-template-hash=698c4dc749
Annotations:      <none>
Status:           Running
IP:               10.0.1.238
IPs:
  IP:           10.0.1.238
Controlled By:  ReplicaSet/fastapi-698c4dc749
Containers:
  fastapi:
    Container ID:   containerd://a15615861206a39f38823254d93970ca29766b92277c793c2856fb57bc0cfd11
    Image:          275678099358.dkr.ecr.ca-central-1.amazonaws.com/multi-agent/eberron-agent-server:25.02
    Image ID:       275678099358.dkr.ecr.ca-central-1.amazonaws.com/multi-agent/eberron-agent-server@sha256:0a7ec392240e2f7a4ba9ce05b279df68209e085074bbe4fb8f465e047156b2ff
    Port:           80/TCP
    Host Port:      0/TCP
    State:          Running
      Started:      Sun, 09 Feb 2025 15:14:49

In [None]:
attachdetach-controller  AttachVolume.Attach failed for volume "pv-llm" : rpc error: code = Internal desc = Could not attach volume "volume_id" to node "i-055aa2853ecdeac3b": could not attach volume "volume_id" to node "i-055aa2853ecdeac3b": operation error EC2: AttachVolume, https response error StatusCode: 400, RequestID: a7250de3-c285-4963-84eb-5c46b28c2b96, api error InvalidParameterValue: The volume ID 'volume_id' is malformed

In [None]:
!kubectl exec $pod_name bash -- nvidia-smi --version

In [63]:
!kubectl exec $pod_name bash -- nvidia-smi

Defaulted container "kubyterlab-llm" out of: kubyterlab-llm, init-container (init)
Mon Jan 13 22:15:38 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       On  |   00000000:00:1E.0 Off |                    0 |
| N/A   25C    P8             11W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+--------------

In [62]:
!kubectl exec $pod_name bash -- nvidia-smi -L

Defaulted container "kubyterlab-llm" out of: kubyterlab-llm, init-container (init)
GPU 0: Tesla T4 (UUID: GPU-fe1acad6-41f9-c441-44ed-d4ead6db6dc2)


In [61]:
!kubectl exec $pod_name bash -- nvcc --version

Defaulted container "kubyterlab-llm" out of: kubyterlab-llm, init-container (init)
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Wed_Aug_14_10:10:22_PDT_2024
Cuda compilation tools, release 12.6, V12.6.68
Build cuda_12.6.r12.6/compiler.34714021_0


In [None]:
!kubectl describe pod $pod_name

In [113]:
!kubectl get pod -n kube-system

NAME                                   READY   STATUS    RESTARTS   AGE
aws-node-cvzp2                         2/2     Running   0          4m59s
aws-node-kmnjj                         2/2     Running   0          4m57s
aws-node-zlmn8                         2/2     Running   0          6m35s
coredns-749d5dbdd9-cdd8v               1/1     Running   0          8m29s
coredns-749d5dbdd9-xtrc8               1/1     Running   0          8m29s
ebs-csi-controller-59b6797bf-fhcph     5/5     Running   0          4m6s
ebs-csi-controller-59b6797bf-w9mns     5/5     Running   0          4m6s
ebs-csi-node-mp9sr                     3/3     Running   0          4m6s
ebs-csi-node-mqt55                     3/3     Running   0          4m6s
ebs-csi-node-q8dfl                     3/3     Running   0          4m6s
kube-proxy-ctjcd                       1/1     Running   0          6m35s
kube-proxy-nv8tx                       1/1     Running   0          4m57s
kube-proxy-qmwmh                       1/1   

In [177]:
!kubectl describe pvc pvc-llm

Name:          pvc-llm
Namespace:     default
StorageClass:  manual
Status:        Bound
Volume:        pv-llm
Labels:        <none>
Annotations:   pv.kubernetes.io/bind-completed: yes
               pv.kubernetes.io/bound-by-controller: yes
Finalizers:    [kubernetes.io/pvc-protection]
Capacity:      10Gi
Access Modes:  RWO
VolumeMode:    Filesystem
Used By:       kubyterlab-llm-pod-67f5cf95dc-s2mrn
Events:        <none>


In [178]:
!kubectl describe pv pv-llm

Name:            pv-llm
Labels:          <none>
Annotations:     pv.kubernetes.io/bound-by-controller: yes
Finalizers:      [kubernetes.io/pv-protection]
StorageClass:    manual
Status:          Bound
Claim:           default/pvc-llm
Reclaim Policy:  Retain
Access Modes:    RWO
VolumeMode:      Filesystem
Capacity:        10Gi
Node Affinity:   <none>
Message:         
Source:
    Type:       AWSElasticBlockStore (a Persistent Disk resource in AWS)
    VolumeID:   vol-04ce08a2c05b8e1da
    FSType:     ext4
    Partition:  0
    ReadOnly:   false
Events:         <none>


In [169]:
!kubectl describe deployment kubyterlab-llm-pod

Name:                   kubyterlab-llm-pod
Namespace:              default
CreationTimestamp:      Mon, 02 Dec 2024 20:05:56 +0000
Labels:                 <none>
Annotations:            deployment.kubernetes.io/revision: 1
Selector:               app=kubyterlab-llm
Replicas:               1 desired | 1 updated | 1 total | 0 available | 1 unavailable
StrategyType:           RollingUpdate
MinReadySeconds:        0
RollingUpdateStrategy:  25% max unavailable, 25% max surge
Pod Template:
  Labels:  app=kubyterlab-llm
  Containers:
   kubyterlab-llm:
    Image:      kubyterlab:24.10
    Port:       8888/TCP
    Host Port:  0/TCP
    Limits:
      cpu:             1
      memory:          8Gi
      nvidia.com/gpu:  1
    Requests:
      cpu:             1
      memory:          8Gi
      nvidia.com/gpu:  1
    Environment:
      JUPYTERLAB_SETTINGS_DIR:  /jupyterlab/config
      MISTRAL_MODEL:            /models/mistral
    Mounts:
      /corpus from pv-llm (rw)
      /jupyterlab from pv-llm

In [59]:
!kubectl get nodes

NAME                                          STATUS   ROLES    AGE   VERSION
ip-10-0-1-100.ca-central-1.compute.internal   Ready    <none>   47m   v1.30.7-eks-59bf375
ip-10-0-1-25.ca-central-1.compute.internal    Ready    <none>   47m   v1.30.7-eks-59bf375
ip-10-0-1-50.ca-central-1.compute.internal    Ready    <none>   48m   v1.30.7-eks-59bf375


In [58]:
!kubectl get configmap

NAME               DATA   AGE
kube-root-ca.crt   1      50m


In [57]:
!kubectl get all -l app.kubernetes.io/name=aws-ebs-csi-driver -n kube-system

NAME                                     READY   STATUS    RESTARTS   AGE
pod/ebs-csi-controller-59b6797bf-vfvs2   5/5     Running   0          47m
pod/ebs-csi-controller-59b6797bf-xt7fv   5/5     Running   0          47m
pod/ebs-csi-node-dwzz7                   3/3     Running   0          47m
pod/ebs-csi-node-gplqp                   3/3     Running   0          47m
pod/ebs-csi-node-l5nh6                   3/3     Running   0          47m

NAME                          DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR            AGE
daemonset.apps/ebs-csi-node   3         3         3       3            3           kubernetes.io/os=linux   47m

NAME                                 READY   UP-TO-DATE   AVAILABLE   AGE
deployment.apps/ebs-csi-controller   2/2     2            2           47m

NAME                                           DESIRED   CURRENT   READY   AGE
replicaset.apps/ebs-csi-controller-59b6797bf   2         2         2       47m


In [56]:
!aws iam list-open-id-connect-providers

{
    "OpenIDConnectProviderList": [
        {
            "Arn": "arn:aws:iam::275678099358:oidc-provider/oidc.eks.ca-central-1.amazonaws.com/id/8D7619520212428BD59C46B20BF19338"
        }
    ]
}


In [51]:
!kubectl logs deployment/ebs-csi-controller -n kube-system -c ebs-plugin

Found 2 pods, using pod/ebs-csi-controller-59b6797bf-vfvs2
I0113 21:28:06.936951       1 main.go:153] "Initializing metadata"
I0113 21:28:06.943481       1 metadata.go:48] "Retrieved metadata from IMDS"
I0113 21:28:06.944585       1 driver.go:69] "Driver Information" Driver="ebs.csi.aws.com" Version="v1.38.1"
I0113 22:01:16.236299       1 controller.go:410] "ControllerPublishVolume: attaching" volumeID="vol-0ed5d3cc8cb5a989e" nodeID="i-0fd8ede93d6ac59a4"
I0113 22:01:17.882498       1 controller.go:419] "ControllerPublishVolume: attached" volumeID="vol-0ed5d3cc8cb5a989e" nodeID="i-0fd8ede93d6ac59a4" devicePath="/dev/xvdaa"


In [52]:
!kubectl logs daemonset/ebs-csi-node -n kube-system -c ebs-plugin

Found 3 pods, using pod/ebs-csi-node-l5nh6
I0113 21:27:55.828513       1 main.go:153] "Initializing metadata"
I0113 21:27:55.832956       1 metadata.go:48] "Retrieved metadata from IMDS"
I0113 21:27:55.833450       1 driver.go:69] "Driver Information" Driver="ebs.csi.aws.com" Version="v1.38.1"
E0113 21:27:56.896438       1 node.go:856] "Unexpected failure when attempting to remove node taint(s)" err="isAllocatableSet: driver not found on node ip-10-0-1-50.ca-central-1.compute.internal"
I0113 21:27:57.410789       1 node.go:936] "CSINode Allocatable value is set" nodeName="ip-10-0-1-50.ca-central-1.compute.internal" count=24
I0113 22:01:18.817732       1 node.go:204] "NodeStageVolume: invalid partition config, will ignore." partition="0"
I0113 22:01:18.980008       1 mount_linux.go:295] Detected OS without systemd


In [53]:
!kubectl logs deployment/ebs-csi-controller -n kube-system -c csi-provisioner

Found 2 pods, using pod/ebs-csi-controller-59b6797bf-vfvs2
W0113 21:28:09.566495       1 feature_gate.go:354] Setting GA feature gate Topology=true. It will be removed in a future release.
I0113 21:28:09.566789       1 feature_gate.go:387] feature gates: {map[Topology:true]}
I0113 21:28:09.566840       1 csi-provisioner.go:154] Version: v5.1.0
I0113 21:28:09.566851       1 csi-provisioner.go:177] Building kube configs for running in cluster...
I0113 21:28:09.568886       1 common.go:143] "Probing CSI driver for readiness"
I0113 21:28:09.598446       1 csi-provisioner.go:230] Detected CSI driver ebs.csi.aws.com
I0113 21:28:09.598619       1 csi-provisioner.go:240] Supports migration from in-tree plugin: kubernetes.io/aws-ebs
I0113 21:28:09.600672       1 common.go:143] "Probing CSI driver for readiness"
I0113 21:28:09.603385       1 csi-provisioner.go:299] CSI driver supports PUBLISH_UNPUBLISH_VOLUME, watching VolumeAttachments
I0113 21:28:09.604750       1 controller.go:744] "Using sav

In [54]:
!kubectl logs deployment/ebs-csi-controller -n kube-system -c csi-attacher

Found 2 pods, using pod/ebs-csi-controller-59b6797bf-vfvs2
I0113 21:28:12.211572       1 main.go:109] "Version" version="v4.7.0"
I0113 21:28:12.215865       1 common.go:143] "Probing CSI driver for readiness"
I0113 21:28:12.238251       1 main.go:169] "CSI driver name" driver="ebs.csi.aws.com"
I0113 21:28:12.240448       1 common.go:143] "Probing CSI driver for readiness"
I0113 21:28:12.242734       1 main.go:249] "CSI driver supports ControllerPublishUnpublish, using real CSI handler" driver="ebs.csi.aws.com"
I0113 21:28:12.243624       1 leaderelection.go:254] attempting to acquire leader lease kube-system/external-attacher-leader-ebs-csi-aws-com...
I0113 21:28:12.265555       1 leaderelection.go:268] successfully acquired lease kube-system/external-attacher-leader-ebs-csi-aws-com
I0113 21:28:12.266787       1 leader_election.go:184] "became leader, starting"
I0113 21:28:12.266811       1 controller.go:129] "Starting CSI attacher"
I0113 21:28:12.267257       1 envvar.go:172] "Feature

In [55]:
!kubectl get sa ebs-csi-controller-sa -n kube-system -o jsonpath='{.metadata.annotations}'

{"eks.amazonaws.com/role-arn":"","meta.helm.sh/release-name":"aws-ebs-csi-driver","meta.helm.sh/release-namespace":"kube-system"}