# Module 2, Lesson 3: Data Sources, APIs, and Data Quality

## Setup

In [None]:
import pandas as pd
import numpy as np
import json
import requests
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Configure how pandas renders DataFrames for the rest of the notebook
display_options = {
    'display.max_columns': None,    # do not cap the number of columns shown
    'display.width': None,          # None lets pandas auto-detect the width where it can
    'display.max_colwidth': 50,     # cap the text shown per cell at 50 characters
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

print("Libraries loaded successfully!")

---

## Part 1: Understanding Data Sources
**Internal vs External Data**

### Example 1: Internal Data Sources
Internal data comes from within your organization. This example simulates different internal systems that a company might have.

In [None]:
# Simulated internal company systems, each represented as one DataFrame.

# Source 1: Sales Database - ten transactions on consecutive days; product,
# quantity and revenue are drawn at random, so they change on every run.
sales_columns = {
    'transaction_id': range(1000, 1010),
    'date': pd.date_range('2024-01-01', periods=10),
    'product_id': np.random.choice(['P001', 'P002', 'P003'], 10),
    'quantity': np.random.randint(1, 5, 10),
    'revenue': np.random.uniform(100, 1000, 10).round(2),
}
sales_data = pd.DataFrame(sales_columns)

# Source 2: HR System - one row per employee
hr_columns = {
    'employee_id': ['E001', 'E002', 'E003', 'E004', 'E005'],
    'name': ['Alice Smith', 'Bob Jones', 'Carol White', 'David Brown', 'Eve Davis'],
    'department': ['Sales', 'IT', 'Sales', 'HR', 'IT'],
    'hire_date': pd.to_datetime(['2020-01-15', '2019-06-01', '2021-03-20', '2018-11-10', '2022-02-01']),
    'salary': [65000, 85000, 62000, 70000, 90000],
}
hr_data = pd.DataFrame(hr_columns)

# Source 3: Customer CRM - one row per customer account
crm_columns = {
    'customer_id': ['C001', 'C002', 'C003', 'C004'],
    'company': ['TechCorp', 'RetailMax', 'FinanceHub', 'HealthPlus'],
    'contact_email': ['contact@techcorp.com', 'info@retailmax.com', 'hello@financehub.com', 'support@healthplus.com'],
    'account_status': ['Active', 'Active', 'Pending', 'Inactive'],
    'last_contact': pd.to_datetime(['2024-01-10', '2024-01-08', '2024-01-05', '2023-12-15']),
}
crm_data = pd.DataFrame(crm_columns)

# Each entry: heading to print, the rows to show, and the frame whose shape is reported.
# Only the sales table is truncated to its first rows; the others are shown in full.
internal_sources = [
    ("\n1. Sales Database (Transactional System):", sales_data.head(), sales_data),
    ("\n2. HR System (Employee Data):", hr_data, hr_data),
    ("\n3. CRM System (Customer Data):", crm_data, crm_data),
]

print("INTERNAL DATA SOURCES")
print("=" * 50)
for heading, preview, frame in internal_sources:
    print(heading)
    print(preview)
    print(f"\nShape: {frame.shape}")
    if frame is not crm_data:
        print()

### Example 2: External Data Sources
External data comes from outside your organization. These are public or purchased datasets that provide additional context.

In [None]:
# Simulating external data sources

# Use the MonthEnd offset object rather than the 'M' string alias: pandas 2.2
# deprecated 'M' in favour of 'ME', while older releases do not recognise 'ME'.
# The offset object yields the same month-end dates without relying on either alias.
month_end = pd.offsets.MonthEnd()

# External Source 1: Government Economic Data (one row per month-end)
economic_data = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=6, freq=month_end),
    'unemployment_rate': [3.7, 3.8, 3.6, 3.5, 3.6, 3.7],
    'inflation_rate': [2.1, 2.2, 2.0, 2.3, 2.4, 2.2],
    'gdp_growth': [2.5, 2.4, 2.6, 2.7, 2.5, 2.8]
})

# External Source 2: Industry Benchmarks (our figures next to the industry's)
benchmark_data = pd.DataFrame({
    'metric': ['Customer Acquisition Cost', 'Churn Rate', 'Average Order Value', 'Conversion Rate'],
    'industry_average': [150, 0.05, 225, 0.03],
    'top_performer': [100, 0.02, 350, 0.05],
    'our_company': [175, 0.04, 245, 0.035]
})

# External Source 3: Social Media Sentiment (one row per week; the three
# sentiment shares in each row add up to 1.0)
social_data = pd.DataFrame({
    'week': pd.date_range('2024-01-01', periods=4, freq='W'),
    'mentions': [234, 189, 267, 312],
    'positive_sentiment': [0.65, 0.72, 0.58, 0.70],
    'negative_sentiment': [0.20, 0.15, 0.25, 0.18],
    'neutral_sentiment': [0.15, 0.13, 0.17, 0.12]
})

print("EXTERNAL DATA SOURCES")
print("=" * 50)
print("\n1. Government Economic Data:")
print(economic_data)

print("\n2. Industry Benchmarks:")
print(benchmark_data)

print("\n3. Social Media Analytics:")
print(social_data)

---

## Part 2: Working with Different File Formats
**Common Data File Types**

### Example 3: CSV Files
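CSV is the most universal format. It's human-readable, but it stores everything as plain text: data types are not saved (dates come back as strings unless you ask pandas to parse them), and values that contain commas, quotes, or line breaks need careful quoting.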

In [None]:
# Round-trip a small table through CSV to see what happens to the column types
csv_path = 'sample_data.csv'

sample_columns = {
    'date': pd.date_range('2024-01-01', periods=5),
    'product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Cable'],
    'price': [999.99, 29.99, 79.99, 299.99, 9.99],
    'quantity': [2, 10, 5, 3, 20],
    'in_stock': [True, True, False, True, True],
}
sample_data = pd.DataFrame(sample_columns)

# Write the table out without the row index
sample_data.to_csv(csv_path, index=False)
print("Original data types:")
print(sample_data.dtypes)
print("\nData preview:")
print(sample_data)

# Plain read: pandas has to infer every type from the text in the file
csv_data = pd.read_csv(csv_path)
print("\n" + "=" * 50)
print("After reading from CSV:")
print(csv_data.dtypes)
print("\nNotice: Date became string, boolean became bool (lucky!)")

# Telling pandas which column holds dates restores the datetime type
csv_data_correct = pd.read_csv(csv_path, parse_dates=['date'])
print("\nWith parse_dates parameter:")
print(csv_data_correct.dtypes)

### Example 4: JSON Files
JSON is perfect for nested/hierarchical data and APIs. It preserves structure but can be verbose.

In [None]:
# Nested company record: a list of employee objects plus a metrics object
employee_records = [
    {
        "id": 1,
        "name": "Alice Johnson",
        "role": "Data Scientist",
        "skills": ["Python", "SQL", "Machine Learning"],
        "projects": {"current": "Customer Segmentation", "completed": 5},
    },
    {
        "id": 2,
        "name": "Bob Smith",
        "role": "Data Engineer",
        "skills": ["SQL", "Spark", "AWS"],
        "projects": {"current": "Data Pipeline", "completed": 8},
    },
]
json_data = {
    "company": "DataCorp",
    "founded": 2020,
    "employees": employee_records,
    "metrics": {
        "revenue": 1500000,
        "growth_rate": 0.25,
        "client_count": 42,
    },
}

# Serialize once, then reuse the same text for the file and for the printout
json_text = json.dumps(json_data, indent=2)
with open('company_data.json', 'w') as f:
    f.write(json_text)

print("JSON Structure:")
print(json_text)

# Load the file back into Python objects
with open('company_data.json', 'r') as f:
    loaded_json = json.loads(f.read())

# One row per employee; nested values ("skills", "projects") stay as objects in their cells
loaded_employees = loaded_json['employees']
employees_df = pd.DataFrame(loaded_employees)
print("\n" + "=" * 50)
print("Employees extracted to DataFrame:")
print(employees_df)

# json_normalize expands the nested "projects" dict into dotted column names
employees_normalized = pd.json_normalize(loaded_employees)
print("\nNormalized (flattened) employee data:")
print(employees_normalized)

### Example 5: Excel Files
Excel files can have multiple sheets, formatting, and formulas. Great for business users but slower to process.

In [None]:
# Build the three report tables first, then write each one to its own sheet.

# Sheet 1: Sales data, one row per month-end.
# MonthEnd() replaces the 'M' string alias, which pandas 2.2 deprecated in
# favour of 'ME'; the offset object gives the same dates on old and new releases.
sales = pd.DataFrame({
    'Month': pd.date_range('2024-01-01', periods=6, freq=pd.offsets.MonthEnd()),
    'Revenue': [45000, 52000, 48000, 61000, 58000, 65000],
    'Costs': [30000, 35000, 32000, 38000, 36000, 40000]
})
sales['Profit'] = sales['Revenue'] - sales['Costs']

# Sheet 2: Employee data
employees = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Department': ['Sales', 'IT', 'Finance'],
    'Performance': [92, 88, 95]
})

# Sheet 3: Summary figures derived from the two tables above
summary = pd.DataFrame({
    'Metric': ['Total Revenue', 'Total Costs', 'Total Profit', 'Avg Performance'],
    'Value': [sales['Revenue'].sum(), sales['Costs'].sum(),
              sales['Profit'].sum(), employees['Performance'].mean()]
})

# The context manager saves and closes the workbook when the block ends
with pd.ExcelWriter('company_report.xlsx', engine='openpyxl') as writer:
    sales.to_excel(writer, sheet_name='Sales', index=False)
    employees.to_excel(writer, sheet_name='Employees', index=False)
    summary.to_excel(writer, sheet_name='Summary', index=False)

print("Excel file created with multiple sheets")

# Read every sheet at once: sheet_name=None returns a dict of {sheet name: DataFrame}
all_sheets = pd.read_excel('company_report.xlsx', sheet_name=None)
print("\nSheets in Excel file:", list(all_sheets.keys()))

# Read one specific sheet by name.
# NOTE: this rebinds `sales_data`, which an earlier cell used for the simulated sales database.
sales_data = pd.read_excel('company_report.xlsx', sheet_name='Sales')
print("\nSales Sheet:")
print(sales_data)

# Sheets already loaded into `all_sheets` can be looked up without re-reading the file
print("\nSummary Sheet:")
print(all_sheets['Summary'])

---

## Part 3: Working with APIs
**Getting Data from Web Services**

### Example 6: Understanding API Responses
APIs return structured data (usually JSON) that we can parse and analyze. This simulates a typical API response.

In [None]:
# Hand-built stand-in for the JSON body a real API might return

# Recent user actions, one dict per event
recent_activity = [
    {"user_id": user_id, "action": action, "time": time_of_day}
    for user_id, action, time_of_day in [
        ("U001", "login", "10:25:00"),
        ("U002", "purchase", "10:26:00"),
        ("U003", "view_product", "10:27:00"),
        ("U004", "logout", "10:28:00"),
        ("U005", "add_to_cart", "10:29:00"),
    ]
]

api_response = {
    "status": "success",
    "timestamp": "2024-01-15T10:30:00Z",
    "data": {
        "user_stats": {
            "total_users": 15234,
            "active_users": 8921,
            "new_users_today": 127,
        },
        "recent_activity": recent_activity,
        "performance_metrics": {
            "response_time_ms": 245,
            "uptime_percentage": 99.95,
            "error_rate": 0.002,
        },
    },
    "pagination": {
        "page": 1,
        "per_page": 5,
        "total_pages": 10,
    },
}

print("RAW API RESPONSE:")
print(json.dumps(api_response, indent=2))

# Pull individual values out of the nested structure
payload = api_response['data']
user_stats = payload['user_stats']

print("\n" + "=" * 50)
print("EXTRACTED DATA:")
print(f"\nStatus: {api_response['status']}")
print(f"Total Users: {user_stats['total_users']:,}")
print(f"Active Users: {user_stats['active_users']:,}")

# A list of flat dicts converts directly into a table: one row per event
activity_df = pd.DataFrame(payload['recent_activity'])
print("\nRecent Activity (as DataFrame):")
print(activity_df)

### Example 7: Real API Call (Public API)
Let's make a real API call to a public service that doesn't require authentication.

In [None]:
# Using a public API that doesn't require authentication
# JSONPlaceholder - Fake REST API for testing

try:
    # Always pass a timeout: without one, requests waits indefinitely when the
    # server never answers, and the cell would appear to hang.
    response = requests.get('https://jsonplaceholder.typicode.com/posts?userId=1', timeout=10)

    # Check if request was successful
    if response.status_code != 200:
        print(f"API request failed with status code: {response.status_code}")
    else:
        posts = response.json()
        print(f"Successfully retrieved {len(posts)} posts")

        if not posts:
            # Nothing to index or tabulate; posts[0] below would raise IndexError
            print("The API returned no posts to analyze")
        else:
            print("\nFirst post (raw JSON):")
            print(json.dumps(posts[0], indent=2))

            # Convert to DataFrame
            posts_df = pd.DataFrame(posts)
            print("\n" + "=" * 50)
            print("Posts as DataFrame:")
            print(posts_df[['id', 'title']].head())

            # Basic analysis
            print("\nPost Statistics:")
            print(f"Total posts: {len(posts_df)}")
            print(f"Avg title length: {posts_df['title'].str.len().mean():.1f} characters")
            print(f"Avg body length: {posts_df['body'].str.len().mean():.1f} characters")

except requests.exceptions.RequestException as e:
    print(f"Error making API request: {e}")
    print("This is normal if you're offline or the API is down")
except ValueError as e:
    # Older requests releases raise a plain ValueError when the body is not valid JSON
    print(f"Error decoding API response as JSON: {e}")

### Example 8: API Best Practices
When working with APIs, you need to handle errors, rate limits, and authentication properly.

In [None]:
def safe_api_call(url, max_retries=3, timeout=5):
    """
    GET a URL and report the outcome as a dict instead of raising.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts. Only timeouts trigger another
            attempt; every other outcome returns immediately.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        ``{'success': True, 'data': <decoded JSON>}`` for an HTTP 200 response,
        otherwise ``{'success': False, 'error': <message>}``.
    """
    # Status codes that get a friendlier message than the generic "HTTP <code>"
    failure_messages = {
        429: 'Rate limited. Try again later.',
        404: 'Resource not found',
    }
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            status = response.status_code

            if status == 200:
                return {'success': True, 'data': response.json()}

            return {'success': False, 'error': failure_messages.get(status, f'HTTP {status}')}

        except requests.exceptions.Timeout:
            # Try again unless this was the last permitted attempt
            if attempt < final_attempt:
                continue
            return {'success': False, 'error': 'Request timed out'}

        except requests.exceptions.ConnectionError:
            return {'success': False, 'error': 'Connection failed'}

        except Exception as e:
            # Anything else (for example a body that is not valid JSON) is reported, not raised
            return {'success': False, 'error': str(e)}

    # Reached only when the loop body never ran (max_retries <= 0)
    return {'success': False, 'error': 'Max retries exceeded'}

# Exercise safe_api_call against three different situations
print("Testing safe API call function:")
print("=" * 50)

# 1) A resource that exists: show the start of its title on success
result = safe_api_call('https://jsonplaceholder.typicode.com/posts/1')
if not result['success']:
    print(f"✗ Failed: {result['error']}")
else:
    print("✓ Successful call:")
    print(f"  Retrieved post with title: {result['data']['title'][:50]}...")

# 2) A resource that does not exist: print the raw result dict
missing_post_url = 'https://jsonplaceholder.typicode.com/posts/99999'
result = safe_api_call(missing_post_url)
print(f"\n404 Test: {result}")

# 3) An address expected to be unreachable, with a 1-second timeout per attempt
unreachable_url = 'https://10.255.255.1'
result = safe_api_call(unreachable_url, timeout=1)
print(f"\nTimeout Test: {result}")

---

## Part 4: Data Quality Assessment
**Identifying and Measuring Data Quality Issues**

### Example 9: Creating Messy Data
Let's create a dataset with common quality issues to learn how to identify and fix them.

In [None]:
# Build a deliberately flawed order table for the quality checks that follow
np.random.seed(42)

messy_columns = {
    # order 1002 appears twice
    'order_id': list(range(1001, 1011)) + [1002, 1012, 1013, 1014, 1015],

    # mixed capitalization, stray whitespace, one None and one empty string
    'customer_name': [
        'John Smith', 'jane doe', 'Bob JONES', '  Alice Brown  ', 'Charlie Wilson',
        'Diana Prince', 'Edward Norton', None, 'Fiona Green', 'George Harris',
        'Jane Doe', 'Helen Troy', 'Ian Malcolm', '', 'Kate Johnson',
    ],

    # several malformed addresses, a placeholder text and one None
    'email': [
        'john@email.com', 'jane@email', 'bob@email.com', 'alice@email.com', 'NOT PROVIDED',
        'diana@email.com', 'edward@', None, 'fiona@email.com', 'george@email.com',
        'jane@email.com', 'helen@email.com', 'ian@invalid', 'kate@email.com', 'kate@email.com',
    ],

    # strings mixed with numbers, a negative value, an extreme outlier and one None
    'order_amount': [
        '100.50', '200', 150.75, '99.99', -50,
        300, '0', 425.50, 180.25, 999999,
        200.00, 175.50, None, 225.75, 150.00,
    ],

    # three different date layouts, plus 2025-12-31 and 1900-01-01 as suspicious dates
    'order_date': [
        '2024-01-15', '2024-01-16', '2024/01/17', '01-18-2024', '2024-01-19',
        '2024-01-20', '2024-01-21', '2024-01-22', '2025-12-31', '2024-01-24',
        '2024-01-16', '2024-01-26', '2024-01-27', '1900-01-01', '2024-01-29',
    ],

    # the same categories written in different cases, one None and an 'Unknown'
    'product_category': [
        'Electronics', 'electronics', 'ELECTRONICS', 'Clothing', 'clothing',
        'Food', 'food', 'Electronics', 'Books', 'books',
        'Electronics', 'Clothing', None, 'Unknown', 'Food',
    ],
}
messy_data = pd.DataFrame(messy_columns)

print("Dataset with Quality Issues:")
print(messy_data)
print("\nData types:")
print(messy_data.dtypes)

### Example 10: Completeness Check
Completeness measures how much of your required data is actually present.

In [None]:
def check_completeness(df):
    """
    Assess the completeness of each column in the dataset.

    A cell counts as incomplete when it is missing (NaN/None) or holds an
    empty string. The per-column figures are taken from the same frame-wide
    counts as the overall figure, so the two always agree regardless of a
    column's dtype.

    Args:
        df: DataFrame to assess.

    Returns:
        Overall completeness as a percentage of all cells. Prints the overall
        and per-column breakdown as a side effect.
    """
    print("COMPLETENESS ASSESSMENT")
    print("=" * 60)

    # Count once per column; every figure below is derived from these two Series
    missing_by_column = df.isna().sum()
    empty_by_column = (df == '').sum()

    # Overall completeness
    total_cells = df.size
    missing_cells = missing_by_column.sum()
    empty_strings = empty_by_column.sum()
    total_missing = missing_cells + empty_strings

    completeness = ((total_cells - total_missing) / total_cells) * 100

    print(f"Overall Completeness: {completeness:.1f}%")
    print(f"Total cells: {total_cells}")
    print(f"Missing values: {missing_cells}")
    print(f"Empty strings: {empty_strings}")

    print("\nPer-Column Completeness:")
    print("-" * 40)

    row_count = len(df)
    # Pair the counts with the columns by position rather than by label lookup
    for col, missing, empty in zip(df.columns, missing_by_column, empty_by_column):
        total_issues = missing + empty
        pct_complete = ((row_count - total_issues) / row_count) * 100

        status = "✓" if pct_complete == 100 else "✗"
        print(f"{status} {col:20} {pct_complete:5.1f}% complete")
        if total_issues > 0:
            print(f"    → {missing} missing, {empty} empty")

    return completeness

# Run the completeness assessment on the messy dataset and keep the returned percentage
completeness_score = check_completeness(messy_data)

### Example 11: Validity Check
Validity checks whether data values are within acceptable ranges and formats.

In [None]:
def check_validity(df):
    """
    Report values that break basic format or range rules.

    Three optional columns are examined; any that are absent are skipped:
    ``email`` must match a simple address pattern, ``order_amount`` must not
    be negative once converted to a number, and ``order_date`` must be
    neither later than the current time nor earlier than 2000-01-01. Amounts
    and dates that cannot be converted are coerced to NaN/NaT and therefore
    are not counted by those two checks.

    Prints its findings and returns the validity score: 100 minus 10 for each
    kind of problem found, never below 0.
    """
    print("\nVALIDITY ASSESSMENT")
    print("=" * 60)

    found = []
    available = df.columns

    # Email format: non-missing values that do not look like name@domain.tld
    if 'email' in available:
        email_pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
        emails = df['email']
        malformed = emails[emails.notna() & ~emails.str.match(email_pattern, na=False)]
        if len(malformed) > 0:
            print(f"Invalid emails found: {len(malformed)}")
            print(f"  Examples: {malformed.head(3).tolist()}")
            found.append(f"{len(malformed)} invalid emails")

    # Amounts: convert to numbers first, then look for values below zero
    if 'order_amount' in available:
        numeric_amounts = pd.to_numeric(df['order_amount'], errors='coerce')
        below_zero = numeric_amounts[numeric_amounts < 0]
        if len(below_zero) > 0:
            print(f"\nNegative amounts found: {len(below_zero)}")
            print(f"  Values: {below_zero.tolist()}")
            found.append(f"{len(below_zero)} negative amounts")

    # Dates: flag anything after "now" and anything before the year 2000
    if 'order_date' in available:
        parsed = pd.to_datetime(df['order_date'], errors='coerce')
        date_rules = (
            (parsed > pd.Timestamp.now(), "Future dates found", "future dates"),
            (parsed < pd.Timestamp('2000-01-01'), "Suspiciously old dates found", "old dates"),
        )
        for mask, heading, label in date_rules:
            flagged = parsed[mask]
            if len(flagged) > 0:
                print(f"\n{heading}: {len(flagged)}")
                print(f"  Dates: {flagged.dt.date.tolist()}")
                found.append(f"{len(flagged)} {label}")

    validity_score = max(0, 100 - (len(found) * 10))
    print(f"\nValidity Score: {validity_score}%")
    print(f"Total issues found: {len(found)}")

    return validity_score

# Run the validity assessment on the messy dataset and keep the returned score
validity_score = check_validity(messy_data)

### Example 12: Consistency Check
Consistency ensures that similar data is represented the same way throughout the dataset.

In [None]:
def check_consistency(df):
    """
    Check for inconsistent data representations.

    Two checks are made:

    * Capitalization - in every object-typed column, string values that are
      equal when lower-cased but spelled differently (e.g. 'Food' / 'food').
      Only actual strings are compared, so numbers stored in a mixed column
      are never reported as a capitalization problem.
    * Date layout - if an ``order_date`` column exists, its values are
      classified by separator as 'YYYY/MM/DD', 'YYYY-MM-DD' or 'MM-DD-YYYY',
      and more than one layout counts as an issue.

    Args:
        df: DataFrame to assess.

    Returns:
        Consistency score: 100 minus 15 per issue found, never below 0.
        Findings are printed as a side effect.
    """
    print("\nCONSISTENCY ASSESSMENT")
    print("=" * 60)

    issues = []

    # Check text columns for inconsistent capitalization
    for col in df.select_dtypes(include=['object']).columns:
        # Map each lower-cased value to the distinct spellings seen for it,
        # in order of first appearance. Non-string values are skipped.
        spellings = {}
        for value in df[col].dropna():
            if not isinstance(value, str):
                continue
            seen = spellings.setdefault(value.lower(), [])
            if value not in seen:
                seen.append(value)

        mixed = {key: seen for key, seen in spellings.items() if len(seen) > 1}
        if mixed:
            print(f"\nInconsistent capitalization in '{col}':")
            # Show at most three values that really do have several spellings
            for key, seen in list(mixed.items())[:3]:
                print(f"  '{key}' appears as: {seen}")
            issues.append(f"Inconsistent capitalization in {col}")

    # Check date formats
    if 'order_date' in df.columns:
        unique_formats = set()
        for raw_date in df['order_date'].dropna():
            text = str(raw_date)
            if '/' in text:
                unique_formats.add('YYYY/MM/DD')
            elif text.count('-') == 2:
                # A first dash at position 4 means the value starts with a 4-digit year
                if text.index('-') == 4:
                    unique_formats.add('YYYY-MM-DD')
                else:
                    unique_formats.add('MM-DD-YYYY')

        if len(unique_formats) > 1:
            # Sorted so the message reads the same on every run
            print(f"\nMultiple date formats detected: {sorted(unique_formats)}")
            issues.append("Multiple date formats")

    consistency_score = max(0, 100 - (len(issues) * 15))
    print(f"\nConsistency Score: {consistency_score}%")

    return consistency_score

# Run the consistency assessment on the messy dataset and keep the returned score
consistency_score = check_consistency(messy_data)

### Example 13: Uniqueness Check
A uniqueness check finds records that appear more than once when each should appear only once, such as two rows sharing the same order ID.

In [None]:
def check_uniqueness(df, key_column='order_id'):
    """
    Check for duplicate records in the dataset.

    Args:
        df: DataFrame to assess.
        key_column: Column expected to hold a unique identifier per row.

    Returns:
        Uniqueness score as a percentage: distinct key values divided by the
        number of rows. An empty DataFrame scores 100.0 (there is nothing to
        duplicate) instead of raising ZeroDivisionError. A missing key column
        scores 0. Findings are printed as a side effect.

    Note:
        Distinct keys are counted with ``nunique()``, which leaves out missing
        values by default, so rows with a missing key lower the score.
    """
    print("\nUNIQUENESS ASSESSMENT")
    print("=" * 60)

    # Check primary key uniqueness
    if key_column in df.columns:
        total_records = len(df)
        unique_records = df[key_column].nunique()
        duplicate_records = total_records - unique_records

        print(f"Primary Key: {key_column}")
        print(f"Total records: {total_records}")
        print(f"Unique values: {unique_records}")
        print(f"Duplicate records: {duplicate_records}")

        if duplicate_records > 0:
            # keep=False marks every row of a repeated key, not just the later copies
            duplicates = df[df.duplicated(subset=[key_column], keep=False)]
            print(f"\nDuplicate {key_column} values:")
            duplicate_ids = duplicates[key_column].value_counts()
            for id_val, count in duplicate_ids.items():
                print(f"  {id_val}: appears {count} times")

        if total_records == 0:
            # Avoid dividing by zero: with no rows there are no duplicates
            uniqueness_score = 100.0
        else:
            uniqueness_score = (unique_records / total_records) * 100
    else:
        print(f"Key column '{key_column}' not found")
        uniqueness_score = 0

    # Check for complete row duplicates (every column identical)
    complete_duplicates = df.duplicated().sum()
    if complete_duplicates > 0:
        print(f"\nComplete row duplicates: {complete_duplicates}")

    print(f"\nUniqueness Score: {uniqueness_score:.1f}%")

    return uniqueness_score

# Run the uniqueness assessment on the messy dataset (default key column) and keep the returned score
uniqueness_score = check_uniqueness(messy_data)

### Example 14: Accuracy Check (Outliers)
Accuracy involves identifying values that might be correct but are suspicious outliers.

In [None]:
def check_accuracy(df):
    """
    Check for potential accuracy issues like outliers.

    If an ``order_amount`` column exists it is converted to numbers
    (unconvertible values become NaN) and examined with two methods:

    * IQR - values outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR.
    * Z-score - values more than three standard deviations from the mean
      (printed for information only).

    Args:
        df: DataFrame to assess.

    Returns:
        Accuracy score: 100 minus 5 per IQR outlier, never below 0. The score
        is 100 when there is no ``order_amount`` column. Findings are printed
        as a side effect.
    """
    print("\nACCURACY ASSESSMENT (Outlier Detection)")
    print("=" * 60)

    # Stays 0 when the column is absent, so the score needs no special case
    outlier_count = 0

    # Check numeric columns for outliers
    if 'order_amount' in df.columns:
        amounts = pd.to_numeric(df['order_amount'], errors='coerce')

        # Calculate statistics
        Q1 = amounts.quantile(0.25)
        Q3 = amounts.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Find outliers
        outliers = amounts[(amounts < lower_bound) | (amounts > upper_bound)]
        outlier_count = len(outliers)

        print(f"Order Amount Analysis:")
        print(f"  Mean: ${amounts.mean():.2f}")
        print(f"  Median: ${amounts.median():.2f}")
        print(f"  Std Dev: ${amounts.std():.2f}")
        print(f"\n  IQR Method:")
        print(f"    Q1 (25%): ${Q1:.2f}")
        print(f"    Q3 (75%): ${Q3:.2f}")
        print(f"    IQR: ${IQR:.2f}")
        print(f"    Normal range: ${lower_bound:.2f} to ${upper_bound:.2f}")

        if outlier_count > 0:
            print(f"\n  Outliers detected: {outlier_count}")
            print(f"    Values: {outliers.tolist()}")
        else:
            print("\n  No outliers detected")

        # Z-score method
        z_scores = np.abs((amounts - amounts.mean()) / amounts.std())
        z_outliers = amounts[z_scores > 3]

        print(f"\n  Z-Score Method (|z| > 3):")
        if len(z_outliers) > 0:
            print(f"    Extreme outliers: {len(z_outliers)}")
            print(f"    Values: {z_outliers.tolist()}")
        else:
            print("    No extreme outliers")

    accuracy_score = max(0, 100 - (outlier_count * 5))

    print(f"\nAccuracy Score: {accuracy_score}%")

    return accuracy_score

# Run the outlier-based accuracy assessment on the messy dataset and keep the returned score
accuracy_score = check_accuracy(messy_data)

### Example 15: Comprehensive Data Quality Report
Combine all quality dimensions into a single report card for the dataset.

In [None]:
def generate_quality_report(df):
    """
    Generate a comprehensive data quality report.

    All five dimension scores are computed from the ``df`` passed in, so the
    report always describes that dataset. Completeness is calculated inline;
    validity, consistency, uniqueness and accuracy come from
    ``check_validity``, ``check_consistency``, ``check_uniqueness`` (with its
    default key column) and ``check_accuracy``, whose own printed output is
    suppressed here.

    Args:
        df: DataFrame to report on.

    Returns:
        The overall score: the plain average of the five dimension scores.
        The report card is printed as a side effect.
    """
    import contextlib
    import io

    print("\n" + "=" * 70)
    print(" " * 20 + "DATA QUALITY REPORT CARD")
    print("=" * 70)
    print(f"Dataset: Order Data")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    print(f"Records: {len(df)}")
    print(f"Columns: {len(df.columns)}")
    print("-" * 70)

    # Completeness: share of cells that are neither missing nor an empty string
    total_missing = df.isna().sum().sum() + (df == '').sum().sum()
    scores = {'Completeness': ((df.size - total_missing) / df.size) * 100}

    # Run the detailed checks on this DataFrame and discard what they print,
    # so only the report card appears.
    with contextlib.redirect_stdout(io.StringIO()):
        scores['Validity'] = check_validity(df)
        scores['Consistency'] = check_consistency(df)
        scores['Uniqueness'] = check_uniqueness(df)
        scores['Accuracy'] = check_accuracy(df)

    # Calculate overall score
    overall_score = np.mean(list(scores.values()))

    def get_grade(score):
        """Translate a percentage into a letter grade."""
        if score >= 95: return 'A+ 🌟'
        elif score >= 90: return 'A'
        elif score >= 85: return 'B+'
        elif score >= 80: return 'B'
        elif score >= 75: return 'C+'
        elif score >= 70: return 'C'
        elif score >= 65: return 'D'
        else: return 'F ⚠️'

    def render_bar(score):
        """Draw a 20-character bar: one filled block per 5 percentage points."""
        filled = int(score / 5)
        return '█' * filled + '░' * (20 - filled)

    # Print scores
    print("\nQuality Dimensions:")
    print("-" * 40)

    for dimension, score in scores.items():
        print(f"{dimension:15} {render_bar(score)} {score:5.1f}% ({get_grade(score)})")

    print("-" * 40)
    print(f"\n{'OVERALL SCORE:':15} {render_bar(overall_score)} {overall_score:5.1f}% ({get_grade(overall_score)})")

    print("\n" + "=" * 70)

    print("\nTop Priority Issues to Address:")
    print("-" * 40)

    # (dimension, score below which it becomes a priority, recommended action)
    priority_rules = [
        ('Completeness', 90, "Fill missing values or remove incomplete records"),
        ('Validity', 90, "Fix invalid email formats and date values"),
        ('Consistency', 90, "Standardize text capitalization and date formats"),
        ('Uniqueness', 100, "Remove or merge duplicate records"),
        ('Accuracy', 90, "Investigate and handle outliers"),
    ]
    priorities = [
        action for dimension, threshold, action in priority_rules
        if scores[dimension] < threshold
    ]

    # Number the displayed items 1, 2, 3 whichever dimensions triggered them
    for rank, action in enumerate(priorities[:3], start=1):  # Show top 3 priorities
        print(f"  {rank}. {action}")

    return overall_score

# Print the report card for the messy dataset and keep the returned overall score
overall_quality_score = generate_quality_report(messy_data)

### Cleanup
Remove temporary files created during the lesson.

In [None]:
# Clean up files created during the lesson
import os

files_to_remove = ['sample_data.csv', 'company_data.json', 'company_report.xlsx']

for filename in files_to_remove:
    try:
        os.remove(filename)
    except FileNotFoundError:
        # Already gone (or never created because its cell was skipped): nothing to do
        continue
    except OSError as error:
        # E.g. the file is still open in another program; report it instead of hiding it
        print(f"Could not remove {filename}: {error}")
    else:
        print(f"Removed: {filename}")

print("\nCleanup complete!")