In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide presentation defaults.
# Lift the column cap so wide DataFrames are rendered without truncation.
pd.options.display.max_columns = None
# Use seaborn's white-grid theme for every figure drawn below.
sns.set_theme(style='whitegrid')


In [None]:
# Read the sales CSV (path is relative to the notebook's working directory)
# into `data`, the DataFrame every later cell works on.
data = pd.read_csv('AusApparalSales4thQrt2020.csv')

# Report (rows, columns), then leave a preview of the first rows as the
# cell's displayed output.
print(f"Dataset shape: {data.shape}")
data.head()


In [None]:
# --- Data-quality report (taken before anything is modified) ---
print("Missing values:\n", data.isna().sum())
print("\nData types:\n", data.dtypes)
print("\nDuplicate entries:", data.duplicated().sum())

# Harmonize the unit-count column to 'Units'. The normalization,
# descriptive-statistics and box-plot cells all index data['Units'], so the
# column must carry that name; renaming in the other direction ('Units' ->
# 'Unit') made each of those cells fail with KeyError.
if 'Unit' in data.columns and 'Units' not in data.columns:
    data = data.rename(columns={'Unit': 'Units'})

# Cleaning pipeline:
#   1. drop fully duplicated rows,
#   2. drop rows where either analysed measure (Sales, Units) is missing,
#   3. renumber the index so it is contiguous again.
# Chained (rather than inplace=True) so the cell reads as one step from input
# to cleaned output.
data = (
    data
    .drop_duplicates()
    .dropna(subset=['Sales', 'Units'])
    .reset_index(drop=True)
)


In [None]:
from sklearn.preprocessing import MinMaxScaler

# The unit-count column may be named 'Units' or 'Unit' depending on the source
# file and on the cleaning step. Look it up instead of hard-coding 'Units',
# which raised KeyError whenever only 'Unit' was present.
units_col = 'Units' if 'Units' in data.columns else 'Unit'
numeric_cols = ['Sales', units_col]

# Min-max scale both measures (MinMaxScaler's default output range is [0, 1]).
# NOTE: the scaled values overwrite the raw columns in `data`, so every
# aggregate computed in later cells (sums, means, "revenue" rankings) is on
# the normalized scale, not in the original currency/unit counts.
scaler = MinMaxScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

print("Normalized Data Sample:")
data.head()


In [None]:
def _sales_total_by(column):
    """Sum the 'Sales' column per distinct value of `column`, largest total first."""
    totals = data.groupby(column)['Sales'].sum()
    return totals.sort_values(ascending=False)


# The customer-group column is looked up as 'Group'; when that name is absent
# the code falls back to 'CustomerGroup'.
group_column = 'CustomerGroup' if 'Group' not in data.columns else 'Group'

state_sales = _sales_total_by('State')
group_sales = _sales_total_by(group_column)

print("State-wise Sales:\n", state_sales)
print("\nCustomer Group-wise Sales:\n", group_sales)


In [None]:
# The unit-count column may be named 'Units' or 'Unit' depending on the source
# file and on the cleaning step. Look it up instead of hard-coding 'Units',
# which raised KeyError whenever only 'Unit' was present.
units_col = 'Units' if 'Units' in data.columns else 'Unit'

# Summary table (count, mean, std, min, quartiles, max) for both measures.
print("Descriptive Statistics:\n", data[['Sales', units_col]].describe())

# Individual central-tendency / spread figures for Sales.
sales_values = data['Sales']
print("\nMean Sales:", sales_values.mean())
print("Median Sales:", sales_values.median())
# mode() returns every tied most-frequent value in sorted order; report the first.
print("Mode Sales:", sales_values.mode()[0])
print("Std Dev Sales:", sales_values.std())


In [None]:
# Index labels of the largest and smallest totals in each summary Series.
# These four names are reused by the recommendations cell at the end.
highest_state, lowest_state = state_sales.idxmax(), state_sales.idxmin()
highest_group, lowest_group = group_sales.idxmax(), group_sales.idxmin()

print(
    f"Highest Revenue State: {highest_state}",
    f"Lowest Revenue State: {lowest_state}",
    f"Highest Revenue Group: {highest_group}",
    f"Lowest Revenue Group: {lowest_group}",
    sep="\n",
)


In [None]:
# Weekly / monthly / quarterly Sales roll-ups; skipped when there is no Date column.
if 'Date' not in data.columns:
    print("⚠️ No Date column found for time-based analysis.")
else:
    # Parse the column once, then derive the calendar buckets from it.
    data['Date'] = pd.to_datetime(data['Date'])
    date_parts = data['Date'].dt
    data['Week'] = date_parts.isocalendar().week  # ISO week number
    data['Month'] = date_parts.month
    data['Quarter'] = date_parts.quarter

    # One Sales total per bucket value, for each of the three granularities.
    weekly, monthly, quarterly = (
        data.groupby(bucket)['Sales'].sum()
        for bucket in ('Week', 'Month', 'Quarter')
    )

    print("Weekly Summary:\n", weekly)
    print("\nMonthly Summary:\n", monthly)
    print("\nQuarterly Summary:\n", quarterly)


In [None]:
def _plot_sales_bars(totals, title, figsize, xtick_rotation=None):
    """Draw a bar chart with one bar per entry of a totals Series.

    Parameters
    ----------
    totals : pandas.Series
        Values to plot; the index supplies the category on the x-axis.
    title : str
        Figure title.
    figsize : tuple
        (width, height) passed to ``plt.figure``.
    xtick_rotation : number, optional
        Rotation in degrees for the x tick labels; left untouched when None.
    """
    plt.figure(figsize=figsize)
    ax = sns.barplot(x=totals.index, y=totals.values)
    ax.set_title(title)
    # `totals.values` is a bare array, so seaborn has no name to label the
    # y-axis with; set it explicitly (same wording as the hourly trend plot).
    ax.set_ylabel('Total Sales')
    if xtick_rotation is not None:
        plt.xticks(rotation=xtick_rotation)
    plt.show()


# State labels are rotated so they do not overlap.
_plot_sales_bars(state_sales, 'State-wise Total Sales', (12, 6), xtick_rotation=45)
_plot_sales_bars(group_sales, 'Customer Group-wise Sales', (8, 5))


In [None]:
# The unit-count column may be named 'Units' or 'Unit' depending on the source
# file and on the cleaning step. Look it up instead of hard-coding 'Units',
# which raised KeyError whenever only 'Unit' was present.
units_col = 'Units' if 'Units' in data.columns else 'Unit'

# Side-by-side box plots of the two measures (one box per column).
plt.figure(figsize=(8,5))
sns.boxplot(data=data[['Sales', units_col]])
plt.title('Descriptive Statistics - Box Plot')
plt.show()

# Histogram of Sales with a kernel-density overlay. displot is figure-level
# and opens its own figure, so no plt.figure() call precedes it; plt.title
# then applies to that figure's current axes.
sns.displot(data['Sales'], kde=True)
plt.title('Sales Distribution')
plt.show()


In [None]:
# Sales by hour of day; skipped when there is no Time column.
if 'Time' not in data.columns:
    print("⚠️ No Time column found for hourly sales analysis.")
else:
    # NOTE(review): the strict format means any Time value that is not an
    # 'HH:MM:SS' string makes to_datetime raise. Confirm the CSV stores
    # clock times here.
    clock_times = pd.to_datetime(data['Time'], format='%H:%M:%S')
    data['Hour'] = clock_times.dt.hour
    hourly_sales = data.groupby('Hour')['Sales'].sum()

    plt.figure(figsize=(10,5))
    trend_ax = sns.lineplot(x=hourly_sales.index, y=hourly_sales.values, marker='o')
    trend_ax.set_title('Time-of-Day Sales Trend')
    trend_ax.set_xlabel('Hour of Day')
    trend_ax.set_ylabel('Total Sales')
    plt.show()


In [None]:
# Closing recommendations, filled in with the best/worst state and the best
# customer group found by the revenue-extremes cell. The leading and trailing
# empty entries reproduce the blank line before and after the list.
recommendation_lines = [
    "",
    "Recommendations:",
    f"1. Focus more marketing efforts on low-revenue states like {lowest_state}.",
    f"2. Expand successful programs from {highest_state} to weaker regions.",
    f'3. Target group "{highest_group}" for future promotions as they generate the highest revenue.',
    "4. Identify peak sales hours from time analysis for personalized offers.",
    "5. Maintain data normalization for consistency in future analysis.",
    "",
]
print("\n".join(recommendation_lines))
