In [0]:
%sql
-- WARNING: destructive reset. CASCADE removes every schema and table inside the
-- catalog, including enterprise_modernization.default.sap_data, which a later cell
-- reads as its fallback source. Re-upload that table after running this cell.
-- IF EXISTS keeps the cell from failing when the catalog has not been created yet.
DROP CATALOG IF EXISTS enterprise_modernization CASCADE;

In [0]:
%sql
-- Create the project catalog when it is missing; safe to re-run.
-- Optional clause, currently disabled (check the CREATE CATALOG reference for where it
-- must be placed relative to COMMENT before enabling it):
--   MANAGED LOCATION 's3://smart-enterprise-modernization-data/'
-- The comment text is a single-quoted string literal so it is never parsed as an
-- identifier when ANSI double-quoted identifiers are enabled.
CREATE CATALOG IF NOT EXISTS enterprise_modernization
  COMMENT 'Central catalog for Smart Enterprise Modernization Hackathon prototype';


In [0]:
# --- AWS credentials (demo/prototype only) ---
# `access_key` and `secret_key` are NOT defined in this cell: they are assigned by the
# cell further down that reads them from the uploaded credentials table. Never paste
# literal keys here.
# aws_region = 'ap-south-1'

# Fail with an actionable message instead of a bare NameError (or silently configuring
# empty credentials) when this cell is run before the credentials have been loaded.
_missing_creds = [name for name in ("access_key", "secret_key") if not globals().get(name)]
if _missing_creds:
    raise RuntimeError(
        "Missing AWS credential variable(s): "
        + ", ".join(_missing_creds)
        + ". Run the cell that loads the credentials before configuring S3 access."
    )

# --- Spark S3 configuration ---
spark.conf.set("fs.s3a.access.key", access_key)
spark.conf.set("fs.s3a.secret.key", secret_key)
# spark.conf.set("fs.s3a.endpoint", f"s3.{aws_region}.amazonaws.com")
spark.conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# --- Test read of an S3 file (CSV example) ---
# s3_path = "s3a://YOUR_BUCKET_NAME/path/to/yourfile.csv"

# df = spark.read.option("header", True).csv(s3_path)
# df.show(n=5)


In [0]:
# Download the CSV over HTTPS and stage it in DBFS.
# dbutils.fs.cp copies between file systems (dbfs:/, file:/, s3a://, ...); it cannot
# fetch an http(s) URL, so the file is first downloaded to the driver's local disk and
# then copied from file:/ to dbfs:/.
import shutil
import urllib.request

# Replace the following URL with your actual public CSV URL.
# NOTE(review): this looks like a SharePoint sharing link, which may return an HTML
# page rather than the raw CSV -- confirm that the downloaded file is really a CSV.
# NOTE(review): the target file name suggests it holds access keys; a publicly
# reachable URL is not a safe place for credentials.
public_url = "https://tatamotors-my.sharepoint.com/:x:/p/s0001223_ttl/Ed031G6Im8dGgsv4L6dHzOEB-QQAnPNHtDBlWKywlgPDHA?e=uSGHjy"
dbfs_path = "dbfs:/FileStore/tables/Shruti_Hackathon_accessKeys.csv"
local_path = "/tmp/Shruti_Hackathon_accessKeys.csv"

# The timeout keeps the cell from hanging indefinitely when outbound access is blocked.
with urllib.request.urlopen(public_url, timeout=30) as http_response, open(local_path, "wb") as local_file:
    shutil.copyfileobj(http_response, local_file)

dbutils.fs.cp(f"file:{local_path}", dbfs_path)

# Confirm the file is where you expect
display(dbutils.fs.ls("dbfs:/FileStore/tables/"))


In [0]:
# Read the credentials from the table you uploaded
creds_df = spark.table("delta_lake.default.shruti_hackathon_access_keys")

# Show only the structure for debugging. Displaying the rows would write the secret
# key into the notebook output, where it is kept with the notebook.
creds_df.printSchema()

# Extract credentials as local Python variables
creds_row = creds_df.first()
if creds_row is None:
    # first() returns None for an empty table; indexing it would raise a confusing TypeError.
    raise ValueError("Credentials table delta_lake.default.shruti_hackathon_access_keys is empty.")

access_key = creds_row['Access key ID']
secret_key = creds_row['Secret access key']
if not access_key or not secret_key:
    raise ValueError("Credentials table row has an empty 'Access key ID' or 'Secret access key'.")

# Never print secrets: show just enough of the key ID to tell which key was loaded.
print("Access Key: ****" + access_key[-4:])
print("Secret Key: loaded (value hidden)")


In [0]:
import boto3

# Create a session with the extracted credentials
session = boto3.Session(
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)

s3 = session.resource('s3')
bucket = s3.Bucket('smart-enterprise-modernization-data')

# Connectivity check only: cap the listing so a large bucket does not flood the cell
# output or page through every object just to prove that access works.
MAX_KEYS_TO_PRINT = 20
for obj in bucket.objects.limit(MAX_KEYS_TO_PRINT):
    print(obj.key)


In [0]:
import requests
import pandas as pd

api_url = "https://fakestoreapi.com/products"

# A blocked or unreachable network makes requests.get raise (connection error or
# timeout) rather than return a non-OK response, so the call is wrapped: any request
# failure is treated the same as a non-OK status and takes the fallback path.
try:
    response = requests.get(api_url, timeout=10)
    api_ok = response.ok
except requests.RequestException as exc:
    print(f"API request raised {type(exc).__name__}: {exc}")
    api_ok = False

if api_ok:
    data = response.json()
    api_df = pd.DataFrame(data)
    # ... (as before)
    # TODO(review): `sap_df` is only assigned in the fallback branch below, but the
    # following cell reads it. Decide what should feed `sap_df` when the API call
    # succeeds; it is deliberately not wired up here because the next cell overwrites
    # the bronze_sap table with whatever `sap_df` holds.
else:
    print("Request failed due to network restrictions in CE. Using uploaded table as a stand-in.")
    # Fallback: Read from uploaded source
    sap_df = spark.table("enterprise_modernization.default.sap_data")
    sap_df.show(5)



In [0]:
from pyspark.sql.functions import col

# Replace spaces and every other non-word character with underscores for all columns
def clean_colname(name):
    """Return `name` as a column name made only of word characters.

    Leading and trailing whitespace is stripped first. Every remaining character that
    is not a letter, digit or underscore is replaced with a single underscore. This
    covers the space, '-', '/' and '.' characters handled previously, and also
    characters such as parentheses, commas, ';', '=' and braces.

    Args:
        name: Original column name.

    Returns:
        The cleaned column name. Distinct inputs can map to the same output
        (for example 'a-b' and 'a.b'); callers must handle such collisions.
    """
    import re  # local import keeps this cell self-contained

    return re.sub(r"\W", "_", name.strip())

# The catalog is dropped and recreated at the top of this notebook and no cell shown
# here creates the `bronze` schema, so make sure it exists before writing into it.
# IF NOT EXISTS makes this a no-op when the schema is already there.
spark.sql("CREATE SCHEMA IF NOT EXISTS enterprise_modernization.bronze")

# Rename every column through clean_colname before saving the table.
new_columns = [clean_colname(c) for c in sap_df.columns]

df_clean = sap_df.toDF(*new_columns)

# "overwrite" replaces the table contents on every run, so re-running is repeatable.
df_clean.write.mode("overwrite").saveAsTable("enterprise_modernization.bronze.bronze_sap")


In [0]:
# Read the bronze table back and print its column names, row count and first rows.
df = spark.read.table("enterprise_modernization.bronze.bronze_sap")

column_names = df.columns
print(column_names)

row_count = df.count()
print(row_count)

df.show(n=5)