### **Data Visualization**

#### Example of Data Visualization

In [1]:
"""
Execute this cell before continuing
""" 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Monthly revenue shared by both panels
months = ['January', 'February', 'March']
revenue = [5000, 7000, 6500]

# One row, two columns: stripped-down chart on the left, labelled chart on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel (misleading): the line is drawn, but tick marks and tick
# values are removed on both axes, so no month or amount can be read off
axes[0].plot(months, revenue, color='orange', marker='o')
axes[0].tick_params(left=False, bottom=False)
axes[0].set(xticks=[], yticks=[])

# Right panel (correct): same data with axis labels and a grid
axes[1].plot(months, revenue, color='green', marker='o')
axes[1].set(xlabel="Month", ylabel="Revenue ($)")
axes[1].grid(True)

# Tidy the spacing between panels, then render
plt.tight_layout()
plt.show()


In [None]:
# Four products and their sales figures (the values add up to 100)
labels = ['iPhone 15', 'Samsung Galaxy S23', 'Google Pixel 8', 'Xiaomi 13']
sales = [35, 30, 20, 15]

# Side-by-side panels for the two chart types
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Left panel: the figures drawn as pie slices, one colour per product
axes[0].pie(sales, labels=labels, colors=['#66b3ff', '#99ff99', '#ffcc99', '#ff9999'], startangle=90)

# Right panel: the same figures as bars, with dashed horizontal guide lines
axes[1].bar(labels, sales, color='#66b3ff')
axes[1].grid(axis='y', linestyle='--', alpha=0.6)
axes[1].set_ylabel("Units Sold")

# Tidy the spacing between panels, then render
plt.tight_layout()
plt.show()


In [None]:
# Data: two sales figures that differ by only about 4%
products = ['iPhone 14', 'iPhone 15']
sales = [980, 1020]

# Create figure and axes
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Misleading chart (left): the y-axis starts at 970, which visually
# exaggerates the small gap between the two bars
axes[0].bar(products, sales, color='orange')
axes[0].set_ylim(970, 1030)  # Truncated y-axis
axes[0].set_ylabel("Units Sold")

# Correct chart (right): the y-axis starts at zero, so bar heights are
# proportional to the values they represent
axes[1].bar(products, sales, color='green')
axes[1].set_ylim(0, max(sales) + 50)  # Starts at 0
axes[1].set_ylabel("Units Sold")  # label this panel too so it can be read on its own

# Add overall title and layout
fig.suptitle("Smartphone Sales Visualizations", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Four products and their sales figures, drawn twice with different palettes
labels = ['iPhone 15', 'Samsung Galaxy S23', 'Google Pixel 8', 'Xiaomi 13']
sales = [35, 30, 20, 15]

# Side-by-side panels so the two colour schemes can be compared directly
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Left panel: fully saturated red / green / blue / yellow
axes[0].pie(sales, labels=labels, colors=['#ff0000', '#00ff00', '#0000ff', '#ffff00'], startangle=90)

# Right panel: a softer pastel palette for the same slices
axes[1].pie(sales, labels=labels, colors=['#66b3ff', '#99ff99', '#ffcc99', '#ff9999'], startangle=90)

# Tidy the spacing between panels, then render
plt.tight_layout()
plt.show()


In [None]:
# Toy data: 1..10 cups of coffee paired with scores 50, 55, ..., 95
coffee_consumption = np.arange(1, 11)
academic_performance = np.arange(50, 100, 5)

# Single-panel figure
fig, ax = plt.subplots(figsize=(8, 6))

# One point per (cups, score) pair
ax.scatter(
    coffee_consumption,
    academic_performance,
    label='Coffee consumption vs Academic Performance',
    color='blue',
)

# Title, axis labels and legend
ax.set_title('Coffee Consumption and Academic Performance')
ax.set_xlabel('Cups of Coffee Consumed')
ax.set_ylabel('Academic Performance (Grade)')
ax.legend()

ax.grid(True)
plt.show()


In [None]:
# Simulating a large number of students and their grades vs. study hours
np.random.seed(42)  # Seed the generator so this cell draws the same sample on every run
study_hours = np.random.normal(5, 2, 1000)  # 1000 draws from a normal distribution (mean 5, sd 2)
grades = np.random.normal(75, 10, 1000)  # 1000 draws (mean 75, sd 10), generated independently of study_hours

# Create a scatter plot
fig, ax = plt.subplots(figsize=(8, 6))

ax.scatter(study_hours, grades, color='blue', alpha=0.3)  # Using alpha to make points semi-transparent

# Add axis labels
ax.set_xlabel('Study Hours')
ax.set_ylabel('Grades')

plt.grid(True)
plt.show()

In [None]:
# Fix the RNG so the noisy series is identical on every run
np.random.seed(42)

# Build 60 monthly points: 55 months sliding from 1000 down to 700,
# followed by 5 months climbing from 705 to 740
months = np.arange(1, 61)
trend = np.linspace(1000, 700, 55)
uptick = np.linspace(705, 740, 5)

# Join the two segments, then add Gaussian noise (mean 0, sd 10) to each month
sales = np.concatenate([trend, uptick])
sales = sales + np.random.normal(0, 10, size=sales.shape)

# --- CHART 1: Misleading View (only the final 5 months are shown) ---
plt.figure(figsize=(12, 6))
plt.plot(
    months[-5:],
    sales[-5:],
    marker='o',
    color='green',
    label='Recent Sales Growth (Last 5 Months)',
)
plt.ylabel('Sales ($)')
plt.xlabel('Month')
plt.grid(True)
plt.legend()
plt.show()

# --- CHART 2: Full View (all 60 months, decline included) ---
plt.figure(figsize=(12, 6))
plt.plot(
    months,
    sales,
    marker='o',
    color='red',
    label='Full Sales Data (5-Year Trend)',
)
plt.ylabel('Sales ($)')
plt.xlabel('Month')
plt.grid(True)
plt.legend()
plt.show()


#### Data Visualization Principles
1. Tell the Truth
2. Choose the Right Chart Type
3. Label and Context
4. Simplify
5. Use Proper Scale and Axis
6. Use Color Carefully
7. Test Readability


#### Choosing the Right Chart

In [None]:
# Sample Data Setup
np.random.seed(42)  # fixed seed so the simulated series below are reproducible

# Categorical data: four categories with one value each
categories = ['A', 'B', 'C', 'D']
values = [23, 45, 12, 36]

# Twelve month-end timestamps starting in January 2020.
# An explicit MonthEnd offset is used instead of the string alias 'M',
# which pandas deprecated in 2.2 in favour of 'ME'; the offset object
# produces the same dates on both older and newer pandas versions.
time = pd.date_range(start='2020-01', periods=12, freq=pd.offsets.MonthEnd())
trend = np.cumsum(np.random.randn(12) * 10 + 50)  # simulate trend: running total of noisy monthly steps

# Two related numeric variables: y is 2 * x plus standard-normal noise
x = np.random.randn(100)
y = x * 2 + np.random.randn(100)

In [None]:
# 1. Bar Chart – one bar per category, heights taken from `values`
# Suited to comparing quantities across discrete categories
plt.figure()
plt.bar(categories, values)
plt.ylabel('Value')
plt.xlabel('Category')
plt.title('Bar Chart - Category Comparison')
plt.show()

In [None]:
# 2. Line Chart – the simulated trend plotted against the monthly timestamps
# Suited to continuous time-based series (monthly sales, stock prices)
plt.figure()
plt.plot(time, trend, marker='o')
plt.ylabel('Sales')
plt.xlabel('Time')
plt.title('Line Chart - Time Series Trend')
plt.xticks(rotation=45)  # tilt the date labels
plt.tight_layout()
plt.show()

In [None]:
# 3. Pie Chart – each category as a share of the total
# Best kept to a handful of slices (max 4–5), where the proportions are the message
plt.figure()
plt.pie(
    values,
    labels=categories,
    autopct='%1.1f%%',  # print each slice's percentage with one decimal
)
plt.title('Pie Chart - Proportion of Categories')
plt.show()

In [None]:
# 4. Histogram – how the values of `x` are distributed across 10 bins
# Suited to inspecting the spread or shape of a variable (e.g., exam scores)
plt.figure()
plt.hist(x, bins=10, edgecolor='black', color='skyblue')
plt.ylabel('Frequency')
plt.xlabel('Value')
plt.title('Histogram - Distribution of a Variable')
plt.show()

In [None]:
# 5. Box Plot – one box for `x` and one for `y`
# Suited to comparing distributions between groups and spotting outliers
plt.figure()
sns.boxplot(data=[x, y])
plt.xticks([0, 1], ['X', 'Y'])  # name the two boxes at positions 0 and 1
plt.title('Box Plot - Summary Stats + Outliers')
plt.show()

In [None]:
# 6. Scatter Plot – `y` against `x`, one semi-transparent point per observation
# Suited to exploring correlation and spotting clusters or outliers
plt.figure()
plt.scatter(x, y, alpha=0.6)
plt.ylabel('Y')
plt.xlabel('X')
plt.title('Scatter Plot - Correlation Between Variables')
plt.show()

#### Data Visualization Types
1. Univariate Visualization     
    Purpose: Understand distribution, central tendency, or frequency.       
    Common Chart Types:     
    - Histogram – distribution of numeric values      
    - Box Plot – summary stats + outliers     
    - Bar Chart – frequency of categories     
    - Pie Chart – proportion of categories        

2. Bivariate Visualization      
    Purpose: Explore relationships, comparisons, or trends.     
    Common Chart Types:
    - Scatter Plot – correlation between two numeric variables
    - Line Chart – trend over time (time vs metric)
    - Grouped Bar Chart – comparison between groups
    - Box Plot (by category) – distribution comparison

3. Multivariate Visualization       
    Purpose: Show interaction or pattern among multiple features.       
    Common Chart Types:
    - Bubble Chart – scatter plot with 3rd variable as size
    - Colored Scatter Plot – 3rd variable as color
    - Stacked Bar/Area Chart – parts of whole over time
    - Facet Grid (Seaborn) – small multiples by category

#### Data Visualization Practice

In [None]:
# Fetch seaborn's bundled "iris" example dataset as a DataFrame
df = sns.load_dataset("iris")

# Display the first five rows as the cell output
df.head(5)

In [None]:
""" 
Objective: Points show the relationship between length and width of petals for all species.
"""

# Plain scatter of petal width against petal length, all rows in one colour
plt.scatter(x=df['petal_length'], y=df['petal_width'])
plt.ylabel("Petal Width (cm)")
plt.xlabel("Petal Length (cm)")
plt.title("Petal Length vs Petal Width")
plt.show()

In [None]:
""" 
Objective: Color shows species differentiation.
"""

# Turn each species name into an integer category code to drive the colormap
species_codes = df['species'].astype('category').cat.codes

# Same scatter as before, now coloured by species code
plt.scatter(
    df['petal_length'],
    df['petal_width'],
    c=species_codes,
    cmap='viridis',
)
plt.ylabel("Petal Width (cm)")
plt.xlabel("Petal Length (cm)")
plt.title("Petal Length vs Width (by Species)")
plt.colorbar(label='Species Code')  # colour scale for the integer codes
plt.show()


In [None]:
""" 
Objective: Improve visibility of overlapping points
"""

plt.figure(figsize=(8, 5))

# Larger, semi-transparent markers with black outlines so that points
# lying on top of each other stay distinguishable.
# `species_codes` is the integer species encoding built in the previous cell.
plt.scatter(df['petal_length'], df['petal_width'],
            c=species_codes, cmap='Set1',
            s=60, edgecolor='black', alpha=0.5)

plt.title("Iris Dataset: Petal Length vs Width", fontsize=14)
plt.xlabel("Petal Length (cm)")
plt.ylabel("Petal Width (cm)")
plt.grid(True)
plt.colorbar(label="Species Code")
# Run tight_layout only after the colorbar exists, so the colorbar axis and
# its label are taken into account when the margins are computed
plt.tight_layout()
plt.show()

In [None]:
""" 
Objective: Automate using Seaborn
"""

# Let seaborn colour the points by the `species` column and build the legend
sns.scatterplot(
    data=df,
    x='petal_length',
    y='petal_width',
    hue='species',
    palette='Set2',
    alpha=0.7,
)
plt.ylabel("Petal Width (cm)")
plt.xlabel("Petal Length (cm)")
plt.title("Petal Length vs Width by Species")
plt.grid(True)
plt.show()


In [None]:
"""
Class Activity: Visualizing the data
"""

# Simulated student records: 100 rows, each column drawn independently
data = pd.DataFrame({
    'Score': np.random.normal(loc=75, scale=10, size=100),       # normal, mean 75, sd 10
    'StudyHours': np.random.normal(loc=5, scale=1.5, size=100),  # normal, mean 5, sd 1.5
    'Gender': np.random.choice(['Male', 'Female'], size=100),    # uniform pick per row
    'Grade': np.random.choice(['A', 'B', 'C', 'D'], size=100),   # uniform pick per row
})

# TODO: Answer the following questions using visualizations
# Question 1: What is the distribution of student scores?
# Question 2: Is there a relationship between study hours and scores?
# Question 3: How do study hours and scores vary by gender?


### **Reflection**
If you have spare time before presenting your data, which would you choose: improving the design (color, layout, composition, etc.), testing readability by asking for feedback, or reflecting more deeply on the data yourself? Explain why.

(answer here)

### **Exploration**
A data dashboard allows you to display key insights, metrics, and visualizations in an interactive and user-friendly format. Tools such as Streamlit and Plotly Dash make it easy to turn Python code into fully functional, shareable web apps for visualizing data.
- https://streamlit.io/
- https://dash.plotly.com/