In [None]:
# Install necessary libraries
!pip install -q boto3 matplotlib pandas seaborn

In [None]:
import io
import os
import warnings

import boto3
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Define a function to list bucket contents
def list_bucket_contents():
    """Print the key and size of every object in the configured S3 bucket.

    Connection settings are read from the environment variables
    AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION,
    AWS_S3_ENDPOINT and AWS_S3_BUCKET.

    This is a best-effort diagnostic helper: it never raises for a missing
    setting or a failed listing, it prints a message instead.

    Returns:
        None. All results are written to stdout.
    """
    # Load environment variables
    key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    region = os.getenv("AWS_DEFAULT_REGION")
    endpoint = os.getenv("AWS_S3_ENDPOINT")
    bucket_name = os.getenv("AWS_S3_BUCKET")

    if not all([key_id, secret_key, region, endpoint, bucket_name]):
        print("One or more environment variables are missing. Please check your setup.")
        return

    # Create an S3 client
    s3 = boto3.client(
        's3',
        aws_access_key_id=key_id,
        aws_secret_access_key=secret_key,
        region_name=region,
        endpoint_url=endpoint
    )

    try:
        print(f"Contents of bucket '{bucket_name}':")

        # A single list_objects_v2 call returns at most 1000 keys, so walk the
        # paginator to make sure large buckets are listed completely.
        paginator = s3.get_paginator('list_objects_v2')
        found_any = False
        for page in paginator.paginate(Bucket=bucket_name):
            # Pages without matching objects carry no 'Contents' entry.
            for obj in page.get('Contents', []):
                found_any = True
                print(f" - {obj['Key']} ({obj['Size']} bytes)")

        if not found_any:
            print("The bucket is empty or does not exist.")

    except Exception as e:
        # Deliberately broad: this helper only reports connectivity problems.
        print(f"An error occurred: {e}")


def load_csv_from_s3(file_key: str) -> pd.DataFrame:
    """Download a CSV object from the configured S3 bucket into a DataFrame.

    Connection settings are read from the environment variables
    AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION,
    AWS_S3_ENDPOINT and AWS_S3_BUCKET.

    Args:
        file_key: path to the file in the S3 bucket, e.g., 'input/sales_data.csv'.

    Returns:
        The parsed CSV content as a pandas DataFrame.

    Raises:
        ValueError: if any of the environment variables is unset or empty, or
            if the object cannot be fetched or parsed. In the latter case the
            original exception is attached as ``__cause__``.
    """
    key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    region = os.getenv("AWS_DEFAULT_REGION")
    endpoint = os.getenv("AWS_S3_ENDPOINT")
    bucket_name = os.getenv("AWS_S3_BUCKET")

    if not all([key_id, secret_key, region, endpoint, bucket_name]):
        raise ValueError("One or more S3-related environment variables are not set.")

    s3 = boto3.client(
        's3',
        aws_access_key_id=key_id,
        aws_secret_access_key=secret_key,
        region_name=region,
        endpoint_url=endpoint
    )

    try:
        # Retrieve the file object
        obj = s3.get_object(Bucket=bucket_name, Key=file_key)
        # Read the whole body into memory and let pandas parse it as CSV
        return pd.read_csv(io.BytesIO(obj['Body'].read()))
    except Exception as e:
        # Chain explicitly so the traceback marks the underlying S3/parse error
        # as the direct cause of this ValueError.
        raise ValueError(f"Error reading {file_key} from bucket {bucket_name}: {e}") from e

In [None]:
# List the bucket first: a quick check that the credentials and endpoint work
# and that the expected object is present before attempting the download.
list_bucket_contents()

# Object key (path inside the bucket) of the sales extract.
file_key = "input/sales_data.csv"

# Download the CSV and parse it into the DataFrame used by the cells below.
df = load_csv_from_s3(file_key)

# Preview the first rows. As the cell's last expression the frame is rendered
# with Jupyter's rich table display instead of a plain-text print.
df.head()

In [None]:
# Total sales per region, largest first, drawn as a horizontal bar chart.
sales_by_region = (
    df.groupby('Region')['Sales_Amount']
    .sum()
    .reset_index()
    .sort_values(by='Sales_Amount', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=sales_by_region, x='Sales_Amount', y='Region', palette="Blues_r", ax=ax)
ax.set_title('Total Sales by Region', fontsize=16)
ax.set_xlabel('Sales Amount', fontsize=14)
ax.set_ylabel('Region', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Total sales per product category, largest first, as a horizontal bar chart.
sales_by_category = (
    df.groupby('Product_Category')['Sales_Amount']
    .sum()
    .reset_index()
    .sort_values(by='Sales_Amount', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=sales_by_category,
    x='Sales_Amount',
    y='Product_Category',
    palette="Blues_r",
    ax=ax,
)
ax.set_title('Sales Amount by Product Category', fontsize=16)
ax.set_xlabel('Sales Amount', fontsize=14)
ax.set_ylabel('Product Category', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Silence FutureWarning for the rest of the kernel session.
# NOTE(review): this is a process-wide filter, so it also hides deprecation
# notices raised by later cells — consider narrowing it once the source of the
# warning is confirmed.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Clean the shared frame in place: turn +/-inf into NaN, coerce Sales_Amount to
# a numeric dtype (unparseable values become NaN), then drop every row that
# still contains a NaN in any column.
df.replace([float('inf'), float('-inf')], float('nan'), inplace=True)
df['Sales_Amount'] = pd.to_numeric(df['Sales_Amount'], errors='coerce')
df.dropna(inplace=True)

# Bucket each sale into its calendar month (adds a 'Sale_Month' column to df).
df['Sale_Month'] = pd.to_datetime(df['Sale_Date']).dt.to_period('M')

# Monthly totals. The input was cleaned above, so the aggregated frame needs no
# second clean-up pass. The period is rendered as text for the x axis.
sales_trend = df.groupby('Sale_Month')['Sales_Amount'].sum().reset_index()
sales_trend['Sale_Month'] = sales_trend['Sale_Month'].astype(str)

# Plot the monthly trend as a line chart.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=sales_trend, x='Sale_Month', y='Sales_Amount', marker='o', ax=ax)
plt.xticks(rotation=45)
ax.set_title('Sales Trend Over Time', fontsize=16)
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Sales Amount', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Clean the shared frame in place: +/-inf become NaN, Sales_Amount is coerced
# to numeric (unparseable values become NaN), and rows containing any NaN are
# dropped.
df.replace([float('inf'), float('-inf')], float('nan'), inplace=True)
df['Sales_Amount'] = pd.to_numeric(df['Sales_Amount'], errors='coerce')
df.dropna(inplace=True)

# Total sales per representative, highest first, as a horizontal bar chart.
fig, ax = plt.subplots(figsize=(10, 6))
sales_by_rep = (
    df.groupby('Sales_Rep')['Sales_Amount']
    .sum()
    .reset_index()
    .sort_values(by='Sales_Amount', ascending=False)
)
sns.barplot(data=sales_by_rep, x='Sales_Amount', y='Sales_Rep', palette="Blues_r", ax=ax)
ax.set_title('Sales by Sales Representative', fontsize=16)
ax.set_xlabel('Sales Amount', fontsize=14)
ax.set_ylabel('Sales Representative', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Clean the shared frame in place: +/-inf become NaN, Sales_Amount is coerced
# to numeric (unparseable values become NaN), and rows containing any NaN are
# dropped.
df.replace([float('inf'), float('-inf')], float('nan'), inplace=True)
df['Sales_Amount'] = pd.to_numeric(df['Sales_Amount'], errors='coerce')
df.dropna(inplace=True)

# Number of sales recorded per channel, as a horizontal bar chart.
fig, ax = plt.subplots(figsize=(8, 6))
sales_channel_dist = (
    df['Sales_Channel']
    .value_counts()
    .reset_index()
    .set_axis(['Sales_Channel', 'Count'], axis=1)
)
sns.barplot(data=sales_channel_dist, x='Count', y='Sales_Channel', palette="Blues_r", ax=ax)
ax.set_title('Sales Channel Distribution', fontsize=16)
ax.set_xlabel('Count', fontsize=14)
ax.set_ylabel('Sales Channel', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# 1. Box plot: spread of Sales_Amount within each product category, with one
#    box per sales channel.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df,
    x='Product_Category',
    y='Sales_Amount',
    hue='Sales_Channel',
    palette='Blues',
    ax=ax,
)
ax.set_title('Sales Amount Distribution by Product Category and Sales Channel', fontsize=16)
ax.set_xlabel('Product Category', fontsize=14)
ax.set_ylabel('Sales Amount', fontsize=14)
plt.xticks(rotation=45)  # tilt the category labels
fig.tight_layout()
plt.show()

In [None]:
# 2. Heatmap: summed Sales_Amount for every (Region, Sales_Rep) pair.
#    Combinations with no sales are shown as 0.
pivot_table = df.pivot_table(
    index='Region',
    columns='Sales_Rep',
    values='Sales_Amount',
    aggfunc='sum',
    fill_value=0,
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot_table, annot=True, fmt='.0f', cmap='Blues', ax=ax)
ax.set_title('Heatmap of Sales Amount by Region and Sales Representative', fontsize=16)
ax.set_xlabel('Sales Representative', fontsize=14)
ax.set_ylabel('Region', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# 3. Scatter plot: Sales_Amount against Quantity_Sold, one colour per product
#    category; points are semi-transparent.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Quantity_Sold',
    y='Sales_Amount',
    hue='Product_Category',
    palette='bright',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Sales Amount vs. Quantity Sold by Product Category', fontsize=16)
ax.set_xlabel('Quantity Sold', fontsize=14)
ax.set_ylabel('Sales Amount', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# 4. Pairplot to Visualize Relationships Between Numeric Variables
# pairplot builds its own figure and returns the grid object, so keep a handle
# on it instead of relying on pyplot's "current figure".
pair_grid = sns.pairplot(
    df,
    vars=['Sales_Amount', 'Quantity_Sold', 'Unit_Cost', 'Unit_Price'],
    hue='Product_Category',
    palette='bright',
)
# y > 1 places the title just above the top row of panels.
pair_grid.figure.suptitle('Pairplot of Key Metrics by Product Category', fontsize=16, y=1.02)
# Use the grid's own tight_layout: it lays the panels out inside the area that
# excludes the legend, whereas plt.tight_layout() would spread the panels over
# the whole figure and under the legend.
pair_grid.tight_layout()
plt.show()

In [None]:
# 5. Violin plot: distribution of Sales_Amount for each customer type.
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(data=df, x='Customer_Type', y='Sales_Amount', palette='Blues_r', ax=ax)
ax.set_title('Sales Amount Distribution by Customer Type', fontsize=16)
ax.set_xlabel('Customer Type', fontsize=14)
ax.set_ylabel('Sales Amount', fontsize=14)
fig.tight_layout()
plt.show()