In [3]:
import configparser
import boto3
import pandas as pd

# Load AWS credentials from the local `aws.cfg` file.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file is missing or
# unreadable. Fail here with a clear message instead of a bare KeyError below.
config = configparser.ConfigParser()
if not config.read('aws.cfg'):
    raise FileNotFoundError(
        "Could not read 'aws.cfg'; expected an [AWS] section with "
        "aws_access_key_id and aws_secret_access_key"
    )

aws_access_key = config['AWS']['aws_access_key_id']
aws_secret_key = config['AWS']['aws_secret_access_key']



In [4]:
# Build the S3 client, authenticating with the key pair read from the config file.
credentials = {
    'aws_access_key_id': aws_access_key,
    'aws_secret_access_key': aws_secret_key,
}
s3 = boto3.client('s3', **credentials)

In [5]:
# Ask S3 for the buckets visible to this client; the response dict is inspected in the next cell.
response = s3.list_buckets()

In [6]:
# 'Buckets' holds one dict per bucket (with 'Name' and 'CreationDate' keys, as the output shows).
print(response['Buckets'])

[{'Name': 'alina-week6-bucket', 'CreationDate': datetime.datetime(2024, 7, 23, 14, 59, 20, tzinfo=tzlocal())}, {'Name': 'amsu-aws-s3-assessment', 'CreationDate': datetime.datetime(2023, 9, 8, 16, 49, 14, tzinfo=tzlocal())}, {'Name': 'amsu-s3-cf-website', 'CreationDate': datetime.datetime(2023, 9, 7, 13, 50, 19, tzinfo=tzlocal())}, {'Name': 'andrea-lm-code-bucket', 'CreationDate': datetime.datetime(2020, 2, 6, 17, 40, 27, tzinfo=tzlocal())}, {'Name': 'andy-wrangler-bucket', 'CreationDate': datetime.datetime(2024, 7, 23, 15, 21, 45, tzinfo=tzlocal())}, {'Name': 'apiproject-build-bucket-su05297', 'CreationDate': datetime.datetime(2024, 7, 12, 18, 26, 43, tzinfo=tzlocal())}, {'Name': 'aryan-techcatalyst-awswrangler-lab', 'CreationDate': datetime.datetime(2024, 7, 23, 15, 8, 47, tzinfo=tzlocal())}, {'Name': 'aryan-techcatalyst-lab', 'CreationDate': datetime.datetime(2024, 7, 22, 16, 9, 42, tzinfo=tzlocal())}, {'Name': 'austin-lambda-s3', 'CreationDate': datetime.datetime(2023, 9, 19, 16, 14

In [7]:
# Look up the region that hosts a specific bucket.
response = s3.get_bucket_location(Bucket='techcatalyst-public')
# S3 reports a null LocationConstraint for buckets in us-east-1, so map that
# case to the region name instead of printing "None".
print(response['LocationConstraint'] or 'us-east-1')

us-west-2


In [8]:
# Create the bucket that the upload cells below write to, and show the raw API response.
# NOTE(review): no CreateBucketConfiguration/LocationConstraint is passed — presumably
# this relies on the client resolving to us-east-1; confirm before running elsewhere.
bucket_name = 'sriya-awswrangler-bucket'
response = s3.create_bucket(Bucket=bucket_name)
print(response)

{'ResponseMetadata': {'RequestId': '6ZPW3WJS4D6RBYTZ', 'HostId': 'IHo3masgwx12yvaUeKPQJ0nZAqUN8wQQXZCa8PuXZt+PUgfqvaMbwhtsShbW/mtAPvyAfpsEypE=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'IHo3masgwx12yvaUeKPQJ0nZAqUN8wQQXZCa8PuXZt+PUgfqvaMbwhtsShbW/mtAPvyAfpsEypE=', 'x-amz-request-id': '6ZPW3WJS4D6RBYTZ', 'date': 'Tue, 23 Jul 2024 21:16:51 GMT', 'location': '/sriya-awswrangler-bucket', 'server': 'AmazonS3', 'content-length': '0'}, 'RetryAttempts': 0}, 'Location': '/sriya-awswrangler-bucket'}


In [11]:
# Upload a local image to the bucket with the client's `upload_file` helper.
filename = 'cute_dogs.jpg'  # path of the local file to send
key = 'cute_dogs.jpg'  # object key the file is stored under in S3

# Keyword arguments make the (local path, bucket, object key) roles explicit.
s3.upload_file(Filename=filename, Bucket=bucket_name, Key=key)

In [12]:
# Upload a second image via `put_object`, passing the open binary file as the request body.
key = 'cute_koala.jpg'  # serves as both the local path and the S3 object key
with open(key, mode='rb') as image_file:
    s3.put_object(Body=image_file, Bucket=bucket_name, Key=key)

# Report what was written and where.
print(f'File uploaded to bucket {bucket_name} with key {key}')

File uploaded to bucket sriya-awswrangler-bucket with key cute_koala.jpg


In [9]:
import awswrangler as wr

# Configure boto3's default session with the same key pair and a fixed region
# (presumably so the awswrangler calls below pick these settings up — confirm).
default_session_options = {
    'aws_access_key_id': aws_access_key,
    'aws_secret_access_key': aws_secret_key,
    'region_name': 'us-east-1',
}
boto3.setup_default_session(**default_session_options)

In [42]:
# Upgrade numpy inside the kernel's environment via the %pip magic.
# NOTE(review): the version is unpinned, and pip's output says a kernel restart may be
# needed before the upgraded package is used — consider pinning and running this first.
%pip install numpy --upgrade

Note: you may need to restart the kernel to use updated packages.


In [11]:
# Read the yellow-taxi Parquet file from S3 into a pandas DataFrame.
try:
    df = wr.s3.read_parquet("s3://techcatalyst-raw/yellow_tripdata_2024-01.parquet")
except Exception as e:
    print('error')
    print(e)
    # Re-raise: every later cell needs `df`, so swallowing the failure here would
    # only resurface as a confusing NameError (or a stale `df` from an earlier run).
    raise

In [20]:
# Summarize the loaded frame: index range, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2964624 entries, 0 to 2964623
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               Int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        Int64         
 4   trip_distance          float64       
 5   RatecodeID             Int64         
 6   store_and_fwd_flag     string        
 7   PULocationID           Int32         
 8   DOLocationID           Int32         
 9   payment_type           Int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [24]:
# Write the DataFrame to the bucket as a Parquet dataset (no catalog table is registered here).
# NOTE(review): the target is the bucket root and the mode is 'overwrite' — this presumably
# clears existing objects under that path, including the images uploaded earlier; confirm.
target_path = "s3://sriya-awswrangler-bucket/"
wr.s3.to_parquet(df=df, path=target_path, dataset=True, mode='overwrite')

{'paths': ['s3://sriya-awswrangler-bucket/af315604b05648f981d826e9f722e8c0.snappy.parquet'],
 'partitions_values': {}}

In [25]:
# List the catalog databases visible to these credentials (a table with
# 'Database' and 'Description' columns, per the output) and print it.
databases = wr.catalog.databases()
print(databases)

                    Database            Description
0     alina-awswrangler_test                       
1           awswrangler_test                       
2                    default  Default Hive database
3      gina-awswrangler_test                       
4        jc-awswrangler_test                       
5   kaitlyn-awswrangler_test                       
6   nyctaxi-gluestudio-samee                       
7     peter_awswrangler_test                       
8    smalik-awswrangler-test                       
9                  tatwan-db                       
10                taxi-atwan                       


In [26]:
# Create the catalog database only when it is not already listed.
# The name is compared without backticks and against the 'Database' column only:
# the previous backtick-wrapped literal could never match, so the cell always
# tried to create the database and failed when re-run.
database_name = 'sriya-awswrangler-test'
if database_name not in databases['Database'].values:
    wr.catalog.create_database(
        name=database_name
    )
else:
    print(f"Database {database_name} already exists")

In [28]:
# Fetch the catalog's table listing for the database (empty in the captured output, before any table is registered).
tables = wr.catalog.tables(database='sriya-awswrangler-test')

Empty DataFrame
Columns: [Database, Table, Description, TableType, Columns, Partitions]
Index: []


In [1]:
# Metadata attached to the catalog table: a table description, free-form
# table parameters, and per-column comments keyed by lower-case column name.
desc = "This the Taxi table for January, 2024."
param = {"source": "NYC Taxi Web Service https://www.nyc.gov", "class": "e-commerce"}

# Each column appears exactly once. The earlier version repeated five keys
# (one with garbled text); Python silently kept only the last value for each,
# so this literal spells out that effective mapping directly.
comments = {
    "vendorid": "A code indicating the TPEP provider that provided the record",
    "tpep_pickup_datetime": "The date and time when the meter was engaged.",
    "pulocationid": "TLC Taxi Zone in which the taximeter was engaged",
    "payment_type": "A numeric code signifying how the passenger paid for the trip",
    "fare_amount": "The time-and-distance fare calculated by the meter.",
    "passenger_count": "The number of passengers in the vehicle ",
    "extra": "Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges",
    "tip_amount": "Tip amount - This field is automatically populated for credit card tips. Cash tips are not included .",
    "tolls_amount": "Total amount of all tolls paid in trip.",
    "airport_fee": "$1.25 for pickup only at LaGuardia and John F.Kennedy Airports",
}

In [13]:

# Bundle the table description, parameters and column comments defined above.
table_settings = wr.typing.GlueTableSettings(
    description=desc,
    parameters=param,
    columns_comments=comments,
)

# Rewrite the dataset and register it in the catalog database as table `rides`.
res = wr.s3.to_parquet(
    df=df,
    path="s3://sriya-awswrangler-bucket/",
    dataset=True,
    mode="overwrite",
    database='sriya-awswrangler-test',
    table="rides",
    glue_table_settings=table_settings,
)

In [15]:
# Pull ten rows of the registered table back through Athena and print them.
sample_sql = "SELECT * FROM RIDES LIMIT 10"
df_athena = wr.athena.read_sql_query(
    sql=sample_sql,
    database="sriya-awswrangler-test",
    ctas_approach=True,
)
print(df_athena)

   vendorid tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2024-01-24 15:17:12   2024-01-24 15:34:53                1   
1         2  2024-01-24 15:42:55   2024-01-24 15:51:35                1   
2         2  2024-01-24 15:52:23   2024-01-24 16:12:53                1   
3         1  2024-01-24 15:30:55   2024-01-24 16:38:46                1   
4         2  2024-01-24 15:21:48   2024-01-24 15:59:06                2   
5         2  2024-01-24 15:52:24   2024-01-24 16:01:39                1   
6         2  2024-01-24 15:08:55   2024-01-24 15:31:35                1   
7         2  2024-01-24 15:47:59   2024-01-24 16:12:38                1   
8         2  2024-01-24 15:55:32   2024-01-24 16:23:01                1   
9         1  2024-01-24 15:02:22   2024-01-24 15:13:11                1   

   trip_distance  ratecodeid store_and_fwd_flag  pulocationid  dolocationid  \
0           3.33           1                  N           239           246   
1           0.95