In [None]:
# =============================================================================
# STEP 1: IMPORT LIBRARIES AND SETUP
# =============================================================================
# Import necessary libraries for web scraping and data processing
import pandas as pd  # For data manipulation and analysis
import ssl           # For handling SSL certificates
import warnings      # For suppressing warnings

# Suppress warnings to keep output clean
# NOTE(review): with no category or module argument this silences every
# warning for the rest of the kernel session, including deprecation warnings
# that would otherwise flag upcoming breakage. Consider narrowing the filter.
warnings.filterwarnings('ignore')

# Handle SSL certificate issues (needed for some websites)
# This allows us to access HTTPS websites without certificate verification
# SECURITY NOTE(review): this swaps the standard library's default HTTPS
# context factory for the unverified one, so certificate and hostname checks
# are skipped for every HTTPS request made through that default in this
# process, not just the lottery site. It relies on private (underscore) names
# of the ssl module. Acceptable for a classroom demo; do not copy into code
# that handles sensitive data.
ssl._create_default_https_context = ssl._create_unverified_context

# Status banner for the reader; no functional effect
print("✅ Libraries imported successfully!")
print("📚 Ready to start web scraping lottery data...")

In [None]:
# =============================================================================
# STEP 2: DEFINE DATA CLEANING FUNCTIONS
# =============================================================================

def clean_lottery_data(df):
    """
    Clean raw lottery data by dropping monthly header rows and invalid entries.

    A row is treated as a monthly header when all of its non-null cells hold
    the same text and that text contains an English month name (e.g.
    "September 2025" repeated in every column). After the headers are removed,
    only rows whose 'Draw Number' contains a slash (e.g. "25/096") are kept,
    and the index is reset to 0..n-1.

    The original shape, the number of header rows removed and the cleaned
    shape are printed as a side effect.

    Parameters:
    df (DataFrame): Raw lottery data from web scraping. Must contain a
        'Draw Number' column. The input frame is not modified.

    Returns:
    DataFrame: New frame with header rows and rows lacking a valid draw
        number removed, indexed 0..n-1.

    Raises:
    KeyError: If df has no 'Draw Number' column.
    """
    month_names = ('January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December')

    def is_monthly_header(row):
        """Return True when every non-null cell repeats one month-bearing label."""
        labels = {str(val) for val in row if pd.notna(val)}
        if len(labels) != 1:
            # Empty rows and rows with differing values are never headers
            return False
        label = next(iter(labels))
        return any(month in label for month in month_names)

    print(f"📊 Original data shape: {df.shape}")

    # Build a positional boolean mask instead of collecting index labels:
    # dropping by label would also remove valid rows that happen to share a
    # label with a header row when the index is not unique.
    header_mask = pd.Series(
        [is_monthly_header(row) for _, row in df.iterrows()],
        index=df.index,
        dtype=bool,
    )
    df_clean = df[~header_mask]

    # Valid draw numbers look like "25/096". Coerce to str first so a column
    # parsed as numeric does not break the .str accessor; match '/' literally.
    has_slash = (
        df_clean['Draw Number']
        .astype(str)
        .str.contains('/', regex=False, na=False)
    )

    # Boolean indexing returns a new frame, so the caller's data is untouched
    df_clean = df_clean[has_slash].reset_index(drop=True)

    print(f"🗑️  Removed {int(header_mask.sum())} monthly header rows")
    print(f"📊 Cleaned data shape: {df_clean.shape}")

    return df_clean

def split_balls_drawn(balls_str, n_balls=7):
    """
    Split a balls-drawn string into a fixed-length list of number tokens.

    Numbers may be separated by whitespace, commas, or both
    ("5 18 23", "5, 18, 23" and "5,18,23" all yield the same tokens).

    Parameters:
    balls_str (str): String containing lottery numbers
        (e.g., "5 18 23 24 29 49 11"). Missing values (None/NaN) are accepted.
    n_balls (int): Length of the returned list. Defaults to 7.

    Returns:
    list: Exactly n_balls items. Each item is a number token as a string
        (no numeric conversion is performed); missing positions are None.
        Tokens beyond n_balls are discarded.
    """
    # Handle missing values
    if pd.isna(balls_str):
        return [None] * n_balls

    # Commas are separators: turn them into spaces rather than deleting them,
    # otherwise "5,18,23" would collapse into the single token "51823".
    tokens = str(balls_str).replace(',', ' ').split()

    # Truncate extras, then pad with None up to the requested length
    tokens = tokens[:n_balls]
    return tokens + [None] * (n_balls - len(tokens))

# Status banner only: printed once the two function definitions in this cell
# have executed; it does not validate the functions themselves.
print("✅ Data cleaning functions defined successfully!")
print("🔧 Functions ready: clean_lottery_data() and split_balls_drawn()")


In [None]:
# =============================================================================
# STEP 3: WEB SCRAPING - FETCH LOTTERY DATA
# =============================================================================

# Page holding the 2025 Mark Six results
url = "https://lottery.hk/en/mark-six/results/2025"

try:
    print("🌐 Fetching data from:", url)
    print("⏳ Please wait while we scrape the lottery results...")

    # pandas.read_html() turns each <table> element on the page into a DataFrame
    scraped = pd.read_html(url)
    print(f"✅ Successfully scraped {len(scraped)} tables from the website")

    if not scraped:
        # Nothing was extracted; later cells test df against None
        print("❌ No tables found on the webpage")
        df = None
    else:
        # The first table is taken as the lottery results table
        df = scraped[0]

        print("\n📊 Original table information:")
        print(f"   - Shape: {df.shape} (rows × columns)")
        print(f"   - Columns: {list(df.columns)}")

        print("\n👀 First 10 rows of raw data:")
        print(df.head(10))

except Exception as err:
    # Any failure (network, parsing, missing parser library) ends up here;
    # the notebook keeps going with df set to None.
    print(f"❌ Error occurred during web scraping: {err}")
    print("💡 Make sure you have the required dependencies installed:")
    print("   pip install pandas lxml html5lib")
    df = None


In [None]:
# =============================================================================
# STEP 4: DATA CLEANING - REMOVE MONTHLY HEADERS
# =============================================================================

# Cleaning needs the scraped table; bail out early when scraping failed
if df is None:
    print("❌ Cannot proceed with data cleaning - no data available")
    df_clean = None
else:
    print("🧹 Starting data cleaning process...")

    # clean_lottery_data() strips monthly header rows and rows without a
    # valid draw number, returning a new frame
    df_clean = clean_lottery_data(df)

    print("\n📊 After cleaning:")
    print(f"   - Shape: {df_clean.shape} (rows × columns)")
    print(f"   - Columns: {list(df_clean.columns)}")

    print("\n👀 First 10 rows of cleaned data:")
    print(df_clean.head(10))

    print("\n✅ Data cleaning completed successfully!")


In [None]:
# =============================================================================
# STEP 5: REMOVE UNNECESSARY COLUMNS
# =============================================================================

# Only proceed if we have cleaned data
if df_clean is not None:
    print("🗑️  Removing unnecessary columns...")

    # Drop the 'Draw Number' column as it's not needed for analysis
    # Draw numbers like "25/096", "25/095" are just identifiers
    # The membership check keeps this cell safe to re-run: without it a second
    # execution raises KeyError because the column is already gone.
    if 'Draw Number' in df_clean.columns:
        df_clean = df_clean.drop(columns=['Draw Number'])
        print("✅ Removed 'Draw Number' column")
    else:
        print("ℹ️  'Draw Number' column not present - nothing to remove")

    print(f"📊 New shape: {df_clean.shape} (rows × columns)")
    print(f"📋 Remaining columns: {list(df_clean.columns)}")

    print("\n👀 Data after removing 'Draw Number' column:")
    print(df_clean.head())

else:
    print("❌ Cannot proceed - no cleaned data available")


In [None]:
# =============================================================================
# STEP 6: FORMAT LOTTERY NUMBERS WITH COMMAS
# =============================================================================

def format_balls_drawn(balls_str):
    """
    Format balls drawn string to have commas between numbers

    Existing commas are treated as separators, so input that is already
    formatted comes back unchanged (the function is idempotent).

    Parameters:
    balls_str (str): String containing lottery numbers (e.g., "5 18 23 24 29 49 11")

    Returns:
    str: Formatted string with commas (e.g., "5, 18, 23, 24, 29, 49, 11").
        Missing values (None/NaN) are returned as-is.
    """
    # Handle missing values
    if pd.isna(balls_str):
        return balls_str

    # Normalise commas to spaces before splitting; splitting on whitespace
    # alone would keep the commas attached ("5," "18,") and a second run
    # would produce "5,, 18,, ...".
    numbers = str(balls_str).replace(',', ' ').split()

    # Join the numbers with commas and spaces
    return ', '.join(numbers)


# Only proceed if we have data
if df_clean is None:
    print("❌ Cannot proceed - no data available")
elif 'Balls Drawn' not in df_clean.columns:
    # Happens when the cell that splits and drops 'Balls Drawn' has already run
    print("ℹ️  'Balls Drawn' column not present - nothing to format")
else:
    print("🎯 Formatting lottery numbers with commas...")

    # Apply the formatting function to the 'Balls Drawn' column
    df_clean['Balls Drawn'] = df_clean['Balls Drawn'].apply(format_balls_drawn)

    print("✅ Formatted lottery numbers with commas")
    print(f"📊 Shape remains: {df_clean.shape}")

    print("\n👀 Data after formatting numbers with commas:")
    print(df_clean.head())


In [None]:
# =============================================================================
# STEP 7: SEPARATE LOTTERY NUMBERS INTO INDIVIDUAL COLUMNS
# =============================================================================

# Only proceed if we have data
if df_clean is None:
    print("❌ Cannot proceed - no data available")
elif 'Balls Drawn' not in df_clean.columns:
    # Guard for re-runs: this cell drops 'Balls Drawn' at the end, so a second
    # execution would otherwise fail with KeyError.
    print("ℹ️  'Balls Drawn' column not present - skipping (was this cell already run?)")
else:
    print("🔢 Separating lottery numbers into individual columns...")

    # Apply our split_balls_drawn function to create separate number columns
    # Each entry of balls_data is a list of 7 items (number tokens or None)
    balls_data = df_clean['Balls Drawn'].apply(split_balls_drawn)

    # Create 7 new columns (num1, num2, num3, num4, num5, num6, num7)
    # Each column will contain one lottery number
    for i in range(7):
        # Bind the position as a default argument so the lambda does not
        # depend on the loop variable's later value
        df_clean[f'num{i+1}'] = balls_data.apply(lambda balls, pos=i: balls[pos])
        print(f"   ✅ Created column 'num{i+1}'")

    # Remove the original 'Balls Drawn' column since we now have separate columns
    df_clean = df_clean.drop(columns=['Balls Drawn'])

    print("\n✅ Successfully separated lottery numbers into 7 individual columns")
    print(f"📊 New shape: {df_clean.shape} (rows × columns)")
    print(f"📋 New columns: {list(df_clean.columns)}")

    print("\n👀 Data after separating numbers:")
    print(df_clean.head())


In [None]:
# =============================================================================
# STEP 8: FINAL CLEANUP - REMOVE DETAILS COLUMN
# =============================================================================

# Only proceed if we have data
if df_clean is not None:
    print("🧹 Final cleanup - removing Details column...")

    # Drop the 'Details' column as it contains no useful information (all NaN values)
    # The membership check avoids a KeyError when the column is absent, e.g.
    # on a second run of this cell or if the scraped table has no such column.
    if 'Details' in df_clean.columns:
        df_clean = df_clean.drop(columns=['Details'])
        print("✅ Removed 'Details' column")
    else:
        print("ℹ️  'Details' column not present - nothing to remove")

    print(f"📊 Final shape: {df_clean.shape} (rows × columns)")
    print(f"📋 Final columns: {list(df_clean.columns)}")

    print("\n👀 Final cleaned data:")
    print(df_clean.head())

    print("\n🎉 Data processing completed successfully!")

else:
    print("❌ Cannot proceed - no data available")


In [None]:
# =============================================================================
# STEP 9: SAVE DATA AND DISPLAY SUMMARY
# =============================================================================

# Only proceed if we have final data
if df_clean is not None:
    print("💾 Saving final data to CSV file...")

    # Define the output filename
    output_file = 'lottery_results_2025_final.csv'

    # Save the cleaned data to CSV file (row index is not written)
    df_clean.to_csv(output_file, index=False)

    print(f"✅ Final clean data saved to '{output_file}'")

    # Display data types for each column
    print("\n📊 Data types:")
    print(df_clean.dtypes)

    # Display sample of final data
    print("\n👀 Sample of final clean data (first 10 rows):")
    print(df_clean.head(10))

    # Work out the date range chronologically. Calling min()/max() directly on
    # date strings compares them character by character, which gives the
    # wrong answer for formats such as dd/mm/yyyy.
    # NOTE(review): dayfirst=True assumes day-first dates (dd/mm/yyyy) - confirm
    # against the scraped 'Draw Date' values.
    draw_dates = df_clean['Draw Date']
    parsed_dates = pd.to_datetime(draw_dates, dayfirst=True, errors='coerce')
    if parsed_dates.notna().any():
        # Report the original text of the earliest/latest draw so the display
        # format matches the rest of the output
        first_draw = draw_dates[parsed_dates == parsed_dates.min()].iloc[0]
        last_draw = draw_dates[parsed_dates == parsed_dates.max()].iloc[0]
    else:
        # Nothing could be parsed as a date: fall back to plain min/max
        first_draw, last_draw = draw_dates.min(), draw_dates.max()

    # Display comprehensive summary
    print("\n📈 SUMMARY STATISTICS:")
    print(f"   🎯 Total lottery draws: {len(df_clean)}")
    print(f"   📅 Date range: {first_draw} to {last_draw}")
    print(f"   🔢 Number columns: {[col for col in df_clean.columns if col.startswith('num')]}")
    print(f"   📊 Total columns: {len(df_clean.columns)}")
    print(f"   📁 File saved as: {output_file}")

    print("\n🎉 WEB SCRAPING AND DATA PROCESSING COMPLETED SUCCESSFULLY!")
    print("📚 The data is now ready for analysis, machine learning, or further processing.")

else:
    print("❌ Cannot save data - no final data available")


In [2]:
# =============================================================================
# END OF NOTEBOOK
# =============================================================================
# This notebook demonstrates a complete web scraping and data processing pipeline
# for Hong Kong Mark Six lottery results. Each cell builds upon the previous one,
# making it easy for students to understand the step-by-step process.
