In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Select seaborn's "whitegrid" style up front, before any figure is created
# further down in this script.
sns.set_style("whitegrid")

# Task 1: Load and Explore the Dataset
def load_and_clean_data(url):
    """Read the CSV at *url* into a DataFrame.

    ``pd.read_csv`` is asked to parse the 'Order Date' column as dates.
    Despite the function's name, no cleaning happens here: the frame is
    returned exactly as ``pd.read_csv`` produced it.

    Args:
        url: Location of the CSV, passed straight through as the first
            argument of ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when reading raised an exception.
        In that case the error is printed instead of being propagated.
    """
    try:
        frame = pd.read_csv(url, parse_dates=['Order Date'])
    except Exception as err:
        # Best-effort loader: report the problem and let the caller decide
        # what to do with the None result.
        print(f"Error loading dataset: {err}")
        return None

    print("Dataset loaded successfully!\n")
    return frame

# Location of the sales CSV that the rest of this script analyses
url = "https://raw.githubusercontent.com/KeithGalli/pandas/master/pandas%20video%20datasets/sales_data.csv"

# Load the dataset. load_and_clean_data() reports failures by printing the
# error and returning None, so stop here if there is nothing to analyse.
df = load_and_clean_data(url)
if df is None:
    raise SystemExit("Exiting due to data loading error.")

# Display the first few rows
print("First 5 rows of the dataset:")
print(df.head())

# Explore dataset structure.
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own; wrapping it in print() would add a stray "None"
# line after the report.
print("\nDataset structure:")
df.info()

# Count the missing values in each column
print("\nMissing values per column:")
print(df.isnull().sum())

# Clean the data by dropping every row that has at least one missing value;
# the per-column counts printed afterwards confirm nothing is left missing.
df_clean = df.dropna()
print("\nMissing values after cleaning:")
print(df_clean.isnull().sum())

# Task 2: Basic Data Analysis
# Summary statistics as reported by DataFrame.describe() on the cleaned data
print("\nBasic statistics for numerical columns:")
summary_stats = df_clean.describe()
print(summary_stats)

# Mean of 'Sales' within each 'Product', ordered from highest to lowest
sales_by_product = df_clean.groupby('Product')['Sales']
product_sales = sales_by_product.mean().sort_values(ascending=False)
print("\nAverage sales per product:")
print(product_sales)

# Task 3: Data Visualization
# Visualization 1: Line chart (Monthly Sales Trend)
# NOTE(review): this relies on 'Month' and 'Sales' columns already being
# present in the data; nothing visible in this script derives them. Confirm
# against the CSV.
trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
monthly_sales = df_clean.groupby('Month')['Sales'].sum()
monthly_sales.plot(kind='line', marker='o', color='b', ax=trend_ax)
trend_ax.set_title('Monthly Sales Trend in 2019')
trend_ax.set_xlabel('Month')
trend_ax.set_ylabel('Total Sales ($)')
# One tick per calendar month (1 through 12)
trend_ax.set_xticks(range(1, 13))
trend_ax.grid(True)
plt.show()

# Visualization 2: Bar chart (Average Sales by Product)
bar_fig, bar_ax = plt.subplots(figsize=(12, 6))
product_sales.plot(kind='bar', color='skyblue', ax=bar_ax)
bar_ax.set_title('Average Sales by Product')
bar_ax.set_xlabel('Product')
bar_ax.set_ylabel('Average Sales ($)')
# Slant the product names and right-align them under their bars
plt.setp(bar_ax.get_xticklabels(), rotation=45, ha='right')
# Re-fit the layout so the rotated labels are not clipped
bar_fig.tight_layout()
plt.show()

# Visualization 3: Histogram (Quantity Ordered Distribution)
hist_fig, hist_ax = plt.subplots(figsize=(12, 6))
quantities = df_clean['Quantity Ordered']
# 20 bins with a kernel-density curve overlaid
sns.histplot(quantities, bins=20, kde=True, color='green', ax=hist_ax)
hist_ax.set_title('Distribution of Quantity Ordered')
hist_ax.set_xlabel('Quantity Ordered')
hist_ax.set_ylabel('Frequency')
plt.show()

# Visualization 4: Scatter plot (Price vs. Quantity Ordered)
scatter_fig, scatter_ax = plt.subplots(figsize=(12, 6))
# Points are drawn semi-transparent (alpha=0.6)
sns.scatterplot(
    data=df_clean,
    x='Price Each',
    y='Quantity Ordered',
    alpha=0.6,
    color='purple',
    ax=scatter_ax,
)
scatter_ax.set_title('Price vs. Quantity Ordered')
scatter_ax.set_xlabel('Price Each ($)')
scatter_ax.set_ylabel('Quantity Ordered')
plt.show()

Matplotlib is building the font cache; this may take a moment.
