In [4]:
import boto3
import pandas as pd
import logging
import json
import warnings
import concurrent.futures

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger()

# Suppress boto3 DeprecationWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
def list_resources(region):
    logger.info(f"Listing resources in region {region}...")
    resourcegroupstaggingapi_client = boto3.client('resourcegroupstaggingapi', region_name=region)
    response = resourcegroupstaggingapi_client.get_resources()
    return response['ResourceTagMappingList']

# Get a list of all AWS regions
ec2_client = boto3.client('ec2')
regions = [region['RegionName'] for region in ec2_client.describe_regions()['Regions']]

INFO: Found credentials in shared credentials file: ~/.aws/credentials


In [6]:
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit tasks for each region
    futures = [executor.submit(list_resources, region) for region in regions]

    # Wait for all tasks to complete and get the results
    all_resources = []
    for future in concurrent.futures.as_completed(futures):
        all_resources.extend(future.result())

# Create a DataFrame from the resource data
df = pd.DataFrame(all_resources)



INFO: Listing resources in region ap-south-1...
INFO: Listing resources in region eu-north-1...
INFO: Listing resources in region eu-west-3...
INFO: Listing resources in region eu-west-2...
INFO: Listing resources in region eu-west-1...
INFO: Listing resources in region ap-northeast-3...
INFO: Listing resources in region ap-northeast-2...
INFO: Listing resources in region ap-northeast-1...
INFO: Listing resources in region ca-central-1...
INFO: Listing resources in region sa-east-1...
INFO: Listing resources in region ap-southeast-1...
INFO: Listing resources in region ap-southeast-2...
INFO: Listing resources in region eu-central-1...
INFO: Listing resources in region us-east-1...
INFO: Listing resources in region us-east-2...
INFO: Listing resources in region us-west-1...
INFO: Listing resources in region us-west-2...


In [7]:
df

Unnamed: 0,ResourceARN,Tags
0,arn:aws:acm:ap-south-1:331061725593:certificat...,"[{'Key': 'Info', 'Value': 'For Autograder'}, {..."
1,arn:aws:acm:ap-south-1:331061725593:certificat...,"[{'Key': 'createdBy', 'Value': 'sadhak-team@sa..."
2,arn:aws:apigateway:ap-south-1::/apis/tsazuw5m8...,"[{'Key': 'ghr:environment', 'Value': 'sadhak-c..."
3,arn:aws:apigateway:ap-south-1::/apis/wtwybkii0...,"[{'Key': 'ghr:environment', 'Value': 'gh-ci'},..."
4,arn:aws:apigateway:ap-south-1::/apis/nd63lcppv...,"[{'Key': 'ghr:environment', 'Value': 'gh-ci'},..."
...,...,...
461,arn:aws:kms:ap-northeast-2:331061725593:key/7c...,"[{'Key': 'createdBy', 'Value': 'mahesh'}, {'Ke..."
462,arn:aws:logs:ap-northeast-2:331061725593:log-g...,"[{'Key': 'UsedBy', 'Value': 'Sadhak'}, {'Key':..."
463,arn:aws:cloudformation:ap-northeast-2:33106172...,"[{'Key': 'displayName', 'Value': 'clr'}]"
464,arn:aws:ec2:ap-northeast-2:331061725593:intern...,"[{'Key': 'UsedBy', 'Value': 'Sadhak'}, {'Key':..."


In [8]:
from pandas import json_normalize

tags_df = pd.DataFrame(df['Tags'].apply(lambda tags: {tag['Key']: tag['Value'] for tag in tags}).tolist())
df = pd.concat([df['ResourceARN'], tags_df], axis=1)


In [9]:
df

Unnamed: 0,ResourceARN,Info,Contact,createdBy,project,ghr:environment,CreatedBy,Purpose,owner,environment,...,ingress.k8s.aws/resource,elbv2.k8s.aws/cluster,service.k8s.aws/stack,service.k8s.aws/resource,kubernetes.io/cluster/cloud-enabled-8cn,kubernetes.io/cluster/humming-bird-yov,HummingBird,kubernetes.io/cluster/cloud-enabled-jra,kubernetes.io/cluster/humming-bird-cp-301,kubernetes.io/cluster/c-m-n9kc4b7x
0,arn:aws:acm:ap-south-1:331061725593:certificat...,For Autograder,Kartikay/Naman,,,,,,,,...,,,,,,,,,,
1,arn:aws:acm:ap-south-1:331061725593:certificat...,,,sadhak-team@sahaj.ai,sadhak,,,,,,...,,,,,,,,,,
2,arn:aws:apigateway:ap-south-1::/apis/tsazuw5m8...,,,,,sadhak-ci,Dnyaneshwar Ware,Sadhak Github Actions Runner,,,...,,,,,,,,,,
3,arn:aws:apigateway:ap-south-1::/apis/wtwybkii0...,,,,,gh-ci,Dnyaneshwar Ware,Sadhak Github Runner,,,...,,,,,,,,,,
4,arn:aws:apigateway:ap-south-1::/apis/nd63lcppv...,,,,,gh-ci,Dnyaneshwar Ware,Sadhak Github Runner,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,arn:aws:kms:ap-northeast-2:331061725593:key/7c...,,,mahesh,hummingbird,,,,,,...,,,,,,,,,,
462,arn:aws:logs:ap-northeast-2:331061725593:log-g...,,,,,,,,,,...,,,,,,,,,,
463,arn:aws:cloudformation:ap-northeast-2:33106172...,,,,,,,,,,...,,,,,,,,,,
464,arn:aws:ec2:ap-northeast-2:331061725593:intern...,,,,,,,,,,...,,,,,,,,,,


In [10]:
df.columns.unique()

Index(['ResourceARN', 'Info', 'Contact', 'createdBy', 'project',
       'ghr:environment', 'CreatedBy', 'Purpose', 'owner', 'environment',
       ...
       'ingress.k8s.aws/resource', 'elbv2.k8s.aws/cluster',
       'service.k8s.aws/stack', 'service.k8s.aws/resource',
       'kubernetes.io/cluster/cloud-enabled-8cn',
       'kubernetes.io/cluster/humming-bird-yov', 'HummingBird',
       'kubernetes.io/cluster/cloud-enabled-jra',
       'kubernetes.io/cluster/humming-bird-cp-301',
       'kubernetes.io/cluster/c-m-n9kc4b7x'],
      dtype='object', length=129)

In [11]:
# Define the list of column names to check
columns_to_match = ['Name', 'Owner', 'Project']
columns_to_match_lower = [col.lower() for col in columns_to_match]

# Get unique columns that case-insensitively match the list
matched_columns = [col for col in df.columns if col.lower() in columns_to_match_lower]
matched_columns

['project', 'owner', 'Name', 'Owner', 'Project']

In [12]:
# Define the list of columns to check

# Check if any of the columns don't exist or are falsy
filter_mask = df.apply(lambda x: any(col not in x.index or pd.isnull(x[col]) for col in matched_columns), axis = 1)

# Filter the DataFrame
filtered_df = df[filter_mask]

filtered_df['NullValues'] = filtered_df.apply(lambda x: ', '.join(col for col in matched_columns if col not in x.index or pd.isnull(x[col]) or x[col] == ''), axis=1)

filtered_df


Unnamed: 0,ResourceARN,Info,Contact,createdBy,project,ghr:environment,CreatedBy,Purpose,owner,environment,...,elbv2.k8s.aws/cluster,service.k8s.aws/stack,service.k8s.aws/resource,kubernetes.io/cluster/cloud-enabled-8cn,kubernetes.io/cluster/humming-bird-yov,HummingBird,kubernetes.io/cluster/cloud-enabled-jra,kubernetes.io/cluster/humming-bird-cp-301,kubernetes.io/cluster/c-m-n9kc4b7x,NullValues
0,arn:aws:acm:ap-south-1:331061725593:certificat...,For Autograder,Kartikay/Naman,,,,,,,,...,,,,,,,,,,"project, owner, Name, Owner, Project"
1,arn:aws:acm:ap-south-1:331061725593:certificat...,,,sadhak-team@sahaj.ai,sadhak,,,,,,...,,,,,,,,,,"owner, Name, Owner, Project"
2,arn:aws:apigateway:ap-south-1::/apis/tsazuw5m8...,,,,,sadhak-ci,Dnyaneshwar Ware,Sadhak Github Actions Runner,,,...,,,,,,,,,,"project, owner, Name, Owner, Project"
3,arn:aws:apigateway:ap-south-1::/apis/wtwybkii0...,,,,,gh-ci,Dnyaneshwar Ware,Sadhak Github Runner,,,...,,,,,,,,,,"project, owner, Name, Owner, Project"
4,arn:aws:apigateway:ap-south-1::/apis/nd63lcppv...,,,,,gh-ci,Dnyaneshwar Ware,Sadhak Github Runner,,,...,,,,,,,,,,"project, owner, Name, Owner, Project"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,arn:aws:kms:ap-northeast-2:331061725593:key/7c...,,,mahesh,hummingbird,,,,,,...,,,,,,,,,,"owner, Name, Owner, Project"
462,arn:aws:logs:ap-northeast-2:331061725593:log-g...,,,,,,,,,,...,,,,,,,,,,"project, owner, Owner, Project"
463,arn:aws:cloudformation:ap-northeast-2:33106172...,,,,,,,,,,...,,,,,,,,,,"project, owner, Name, Owner, Project"
464,arn:aws:ec2:ap-northeast-2:331061725593:intern...,,,,,,,,,,...,,,,,,,,,,"project, owner, Owner, Project"


In [13]:
filtered_df[['ResourceARN', 'NullValues']].to_csv('missing-tags.csv', index=False)
