In [4]:
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os

print("="*60)
print("MyNeta Lok Sabha 2024 - FIXED Scraper")
print("="*60)

# Configuration
BASE_URL = "https://myneta.info/LokSabha2024/index.php"
DELAY_SECONDS = 3
OUTPUT_FILE = '../data/raw/lok_sabha_2024_full.xlsx'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

def scrape_page(page_num):
    """Scrape a single page - FIXED VERSION"""
    try:
        # Build URL
        params = {
            'action': 'summary',
            'subAction': 'candidates_analyzed',
            'sort': 'candidate',
            'page': page_num
        }
        
        # Make request
        response = requests.get(BASE_URL, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        
        # Parse with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the main data table (it has class 'mytable' or similar)
        # Let's find all tables and get the one with candidate data
        tables = soup.find_all('table')
        
        # The candidate table has headers: Sno, Candidate, Constituency, Party, etc.
        candidate_table = None
        for table in tables:
            # Check if this table has the right headers
            headers_row = table.find('tr')
            if headers_row:
                header_text = headers_row.get_text()
                if 'Candidate' in header_text and 'Constituency' in header_text and 'Party' in header_text:
                    candidate_table = table
                    break
        
        if candidate_table is None:
            print(f"  ‚úó Page {page_num}: Could not find candidate table")
            return None
        
        # Parse the table
        rows = []
        header_row = candidate_table.find('tr')
        headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]
        
        # Get all data rows (skip header)
        for row in candidate_table.find_all('tr')[1:]:
            cols = row.find_all('td')
            if len(cols) > 0:
                row_data = []
                for col in cols:
                    # Get text, clean it up
                    text = col.get_text(strip=True)
                    # Remove extra whitespace
                    text = ' '.join(text.split())
                    row_data.append(text)
                rows.append(row_data)
        
        if len(rows) > 0:
            df = pd.DataFrame(rows, columns=headers[:len(rows[0])])
            print(f"  ‚úì Page {page_num}: {len(df)} records")
            return df
        else:
            print(f"  ‚úó Page {page_num}: No data rows found")
            return None
            
    except Exception as e:
        print(f"  ‚úó Page {page_num}: Error - {str(e)}")
        return None

# Test with just page 1 first
print("\nTesting with page 1...")
df_test = scrape_page(1)

if df_test is not None and len(df_test) > 0:
    print("\n‚úì SUCCESS! Here's what we got:")
    print(f"\nShape: {df_test.shape}")
    print(f"\nColumns: {df_test.columns.tolist()}")
    print(f"\nFirst 3 rows:")
    display(df_test.head(3))
    
    # Ask if user wants to continue
    print("\n" + "="*60)
    proceed = input("Data looks good? Continue with all 84 pages? (y/n): ")
    
    if proceed.lower() == 'y':
        print("\nStarting full scrape...")
        all_data = [df_test]  # Start with page 1
        
        for page in range(2, 85):  # Pages 2-84
            print(f"\n[{page}/84] Scraping page {page}...")
            df = scrape_page(page)
            
            if df is not None and len(df) > 0:
                all_data.append(df)
            
            # Save progress every 10 pages
            if page % 10 == 0:
                temp_df = pd.concat(all_data, ignore_index=True)
                temp_df.to_excel('../data/raw/lok_sabha_temp.xlsx', index=False)
                print(f"  üíæ Progress: {len(temp_df)} records saved")
            
            # Delay
            if page < 84:
                time.sleep(DELAY_SECONDS)
        
        # Combine all
        final_df = pd.concat(all_data, ignore_index=True)
        final_df = final_df.drop_duplicates()
        final_df.to_excel(OUTPUT_FILE, index=False)
        
        print(f"\n‚úì COMPLETE! {len(final_df)} records saved to {OUTPUT_FILE}")
    else:
        print("\nScraping cancelled. Fix any issues and try again.")
else:
    print("\n‚úó FAILED! The scraper needs more debugging.")
    print("\nLet me try a different approach...")

MyNeta Lok Sabha 2024 - FIXED Scraper

Testing with page 1...
  ‚úó Page 1: Error - cannot access local variable 'headers' where it is not associated with a value

‚úó FAILED! The scraper needs more debugging.

Let me try a different approach...


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://myneta.info/LokSabha2024/index.php?action=summary&subAction=candidates_analyzed&sort=candidate&page=1"

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find all tables
tables = soup.find_all('table')
print(f"Found {len(tables)} tables on the page\n")

# Check each table
for i, table in enumerate(tables):
    print(f"="*60)
    print(f"TABLE {i+1}:")
    print("="*60)
    
    # Get headers
    try:
        header_row = table.find('tr')
        if header_row:
            headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]
            print(f"Headers: {headers[:10]}")  # First 10 headers
            
            # Count rows
            rows = table.find_all('tr')
            print(f"Total rows: {len(rows)}")
            
            # Show first data row
            if len(rows) > 1:
                first_data = [td.get_text(strip=True) for td in rows[1].find_all('td')]
                print(f"First row sample: {first_data[:5]}")
    except:
        print("Could not parse this table")
    
    print()

# Also try pd.read_html
print("\n" + "="*60)
print("PANDAS READ_HTML RESULTS:")
print("="*60)
dfs = pd.read_html(url)
print(f"Found {len(dfs)} dataframes")
for i, df in enumerate(dfs):
    print(f"\nDataFrame {i+1}: Shape {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    if len(df) > 0:
        print(df.head(2))

Found 8 tables on the page

TABLE 1:
Headers: ['', '']
Total rows: 1

TABLE 2:
Headers: ['', '']
Total rows: 1

TABLE 3:
Headers: ['HIGHLIGHTS OF CANDIDATES']
Total rows: 9
First row sample: []

TABLE 4:
Headers: ['HIGHLIGHTS OF WINNERS']
Total rows: 8
First row sample: ['Total winners analyzed by NEW', '543']

TABLE 5:
Headers: ['Sno', 'Candidate‚àá', 'Constituency', 'Party', 'Criminal Case', 'Education', 'Total Assets', 'Liabilities']
Total rows: 1

TABLE 6:
Headers: ['DONATE NOW√óShare On:', 'DONATE NOW√ó', 'Share On:', '', '', '', 'Download AppFollow us on', 'Download App', '', 'Follow us on']
Total rows: 3
First row sample: ['DONATE NOW√ó', 'Share On:', '', '', '']

TABLE 7:
Headers: ['DONATE NOW√ó', 'Share On:', '', '', '']
Total rows: 1

TABLE 8:
Headers: ['Download App', '', 'Follow us on', '', '', '', '', '', '']
Total rows: 1


PANDAS READ_HTML RESULTS:
Found 8 dataframes

DataFrame 1: Shape (1, 2)
Columns: [0, 1]
    0   1
0 NaN NaN

DataFrame 2: Shape (1, 2)
Columns: [0, 1]

In [6]:
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime

print("="*60)
print("MyNeta Scraper - Table 5 Extractor")
print("="*60)

def scrape_page(page_num):
    """Scrape page and get table 5 data"""
    try:
        url = f"https://myneta.info/LokSabha2024/index.php?action=summary&subAction=candidates_analyzed&sort=candidate&page={page_num}"
        
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        
        # Use pandas read_html to get all tables
        dfs = pd.read_html(response.text)
        
        # Table 5 is index 4 (0-indexed)
        if len(dfs) >= 5:
            df = dfs[4]  # 5th table (index 4)
            
            # Clean the data
            # Remove any empty rows
            df = df.dropna(how='all')
            
            print(f"  ‚úì Page {page_num}: {len(df)} records")
            return df
        else:
            print(f"  ‚úó Page {page_num}: Less than 5 tables found")
            return None
            
    except Exception as e:
        print(f"  ‚úó Page {page_num}: Error - {str(e)}")
        return None

# TEST with page 1
print("\nTesting page 1...")
df_test = scrape_page(1)

if df_test is not None and len(df_test) > 0:
    print("\n‚úì SUCCESS!\n")
    print(f"Shape: {df_test.shape}")
    print(f"Columns: {df_test.columns.tolist()}")
    print("\nFirst 5 rows:")
    display(df_test.head())
    
    print("\n" + "="*60)
    print("Sample data check:")
    print(f"- Candidates found: {len(df_test)}")
    print(f"- Sample candidate: {df_test['Candidate'].iloc[0] if 'Candidate' in df_test.columns else 'N/A'}")
    print(f"- Sample party: {df_test['Party'].iloc[0] if 'Party' in df_test.columns else 'N/A'}")
    print("="*60)
    
    proceed = input("\nLooks good? Scrape all 84 pages? (y/n): ")
    
    if proceed.lower() == 'y':
        print("\nüöÄ Starting full scrape (84 pages √ó 3 sec = ~4 mins)")
        print(f"Start time: {datetime.now().strftime('%H:%M:%S')}\n")
        
        all_data = [df_test]  # Start with page 1
        failed_pages = []
        
        for page in range(2, 85):
            print(f"[{page}/84] ", end="")
            
            df = scrape_page(page)
            
            if df is not None and len(df) > 0:
                all_data.append(df)
            else:
                failed_pages.append(page)
            
            # Save progress every 10 pages
            if page % 10 == 0:
                temp_df = pd.concat(all_data, ignore_index=True)
                temp_df.to_excel('../data/raw/lok_sabha_temp.xlsx', index=False)
                print(f"\n  üíæ Progress saved: {len(temp_df)} total records")
            
            # Respectful delay
            if page < 84:
                time.sleep(3)
        
        # Final combine
        print("\n" + "="*60)
        print("Combining all data...")
        final_df = pd.concat(all_data, ignore_index=True)
        
        # Remove duplicates
        initial = len(final_df)
        final_df = final_df.drop_duplicates()
        final = len(final_df)
        
        # Save
        final_df.to_excel('../data/raw/lok_sabha_2024_full.xlsx', index=False)
        
        print(f"\n‚úÖ SCRAPING COMPLETE!")
        print("="*60)
        print(f"Total records: {final}")
        print(f"Duplicates removed: {initial - final}")
        print(f"Failed pages: {len(failed_pages)}")
        if failed_pages:
            print(f"  Pages: {failed_pages}")
        print(f"Saved to: lok_sabha_2024_full.xlsx")
        print(f"End time: {datetime.now().strftime('%H:%M:%S')}")
        print("="*60)
        
        # Show final stats
        print("\nüìä DATASET SUMMARY:")
        print(f"Shape: {final_df.shape}")
        print(f"Columns: {final_df.columns.tolist()}")
        
        # Store in variable for next analysis
        df = final_df
        
    else:
        print("\nCancelled. Ready when you are!")
        df = df_test
else:
    print("\n‚ùå Failed to scrape page 1. Need to debug further.")

MyNeta Scraper - Table 5 Extractor

Testing page 1...
  ‚úì Page 1: 0 records

‚ùå Failed to scrape page 1. Need to debug further.


  dfs = pd.read_html(response.text)


In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://myneta.info/LokSabha2024/index.php?action=summary&subAction=candidates_analyzed&sort=candidate&page=1"

print("="*70)
print("COMPLETE DIAGNOSTIC")
print("="*70)

# Method 1: pandas read_html
print("\nüìä METHOD 1: pd.read_html()")
print("-"*70)
try:
    dfs = pd.read_html(url)
    print(f"Total tables found: {len(dfs)}\n")
    
    for i, df in enumerate(dfs):
        print(f"\nTable {i+1}:")
        print(f"  Shape: {df.shape}")
        print(f"  Columns: {df.columns.tolist()}")
        if len(df) > 0:
            print(f"  First row: {df.iloc[0].tolist()[:5]}")  # First 5 values
        print()
except Exception as e:
    print(f"Error: {e}")

# Method 2: BeautifulSoup
print("\nüîç METHOD 2: BeautifulSoup")
print("-"*70)
try:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    tables = soup.find_all('table')
    print(f"Total <table> tags found: {len(tables)}\n")
    
    for i, table in enumerate(tables):
        print(f"\nTable {i+1}:")
        
        # Count rows
        all_rows = table.find_all('tr')
        print(f"  Total <tr> rows: {len(all_rows)}")
        
        # Get first row
        if len(all_rows) > 0:
            first_row = all_rows[0]
            cells = first_row.find_all(['th', 'td'])
            headers = [cell.get_text(strip=True) for cell in cells]
            print(f"  First row: {headers[:8]}")
        
        # Get second row (actual data)
        if len(all_rows) > 1:
            second_row = all_rows[1]
            cells = second_row.find_all('td')
            data = [cell.get_text(strip=True) for cell in cells]
            print(f"  Second row: {data[:8]}")
        
        print()
        
except Exception as e:
    print(f"Error: {e}")

# Method 3: Check page source
print("\nüìÑ METHOD 3: Page Source Check")
print("-"*70)
try:
    response = requests.get(url)
    html = response.text
    
    # Search for candidate names we know exist
    if "Abu Bakar" in html:
        print("‚úì Found 'Abu Bakar' in HTML")
    else:
        print("‚úó 'Abu Bakar' NOT in HTML")
    
    if "Candidate" in html:
        print("‚úì Found 'Candidate' header in HTML")
    else:
        print("‚úó 'Candidate' header NOT in HTML")
    
    # Count table tags
    table_count = html.count('<table')
    print(f"‚úì HTML contains {table_count} <table> tags")
    
    # Check for tbody
    tbody_count = html.count('<tbody')
    print(f"‚úì HTML contains {tbody_count} <tbody> tags")
    
except Exception as e:
    print(f"Error: {e}")

print("\n" + "="*70)
print("DIAGNOSTIC COMPLETE")
print("="*70)

COMPLETE DIAGNOSTIC

üìä METHOD 1: pd.read_html()
----------------------------------------------------------------------
Total tables found: 8


Table 1:
  Shape: (1, 2)
  Columns: [0, 1]
  First row: [nan, nan]


Table 2:
  Shape: (1, 2)
  Columns: [0, 1]
  First row: [nan, nan]


Table 3:
  Shape: (7, 2)
  Columns: [('HIGHLIGHTS OF CANDIDATES', 'Total number of constituencies analyzed'), ('Unnamed: 1_level_0', '543')]
  First row: ['Total candidates analyzed by NEW', '8338']


Table 4:
  Shape: (7, 2)
  Columns: ['HIGHLIGHTS OF WINNERS', 'Unnamed: 1']
  First row: ['Total winners analyzed by NEW', '543']


Table 5:
  Shape: (0, 8)
  Columns: ['Sno', 'Candidate‚àá', 'Constituency', 'Party', 'Criminal Case', 'Education', 'Total Assets', 'Liabilities']


Table 6:
  Shape: (1, 2)
  Columns: [0, 1]
  First row: ["DONATE NOW √ó  donate_now.onclick = function()  { var modal_2 = document.getElementById('myModal');  var span_2 = document.getElementsByClassName('close_2')[0];  modal_2.style.d

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

print("="*60)
print("MyNeta Scraper - Selenium Version")
print("="*60)

def setup_driver():
    """Setup Chrome driver"""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in background
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )
    return driver

def scrape_page_selenium(driver, page_num):
    """Scrape a page using Selenium"""
    try:
        url = f"https://myneta.info/LokSabha2024/index.php?action=summary&subAction=candidates_analyzed&sort=candidate&page={page_num}"
        
        driver.get(url)
        
        # Wait for table to load (wait for rows to appear)
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
        
        # Give it extra time for JavaScript to populate
        time.sleep(2)
        
        # Get page source after JavaScript has loaded
        html = driver.page_source
        
        # Now use pandas to parse
        dfs = pd.read_html(html)
        
        # Table 5 should now have data
        if len(dfs) >= 5:
            df = dfs[4]  # 5th table (index 4)
            
            # Check if it has data
            if len(df) > 0:
                print(f"  ‚úì Page {page_num}: {len(df)} records")
                return df
            else:
                print(f"  ‚úó Page {page_num}: Table empty")
                return None
        else:
            print(f"  ‚úó Page {page_num}: Not enough tables")
            return None
            
    except Exception as e:
        print(f"  ‚úó Page {page_num}: Error - {str(e)}")
        return None

# Test with page 1
print("\nSetting up browser...")
driver = setup_driver()

print("Testing page 1...")
df_test = scrape_page_selenium(driver, 1)

if df_test is not None and len(df_test) > 0:
    print("\n‚úÖ SUCCESS!\n")
    print(f"Shape: {df_test.shape}")
    print(f"Columns: {df_test.columns.tolist()}")
    print("\nFirst 5 rows:")
    display(df_test.head())
    
    proceed = input("\n\nScrape all 84 pages? (y/n): ")
    
    if proceed.lower() == 'y':
        print("\nüöÄ Starting full scrape...")
        print("This will take ~10 minutes (84 pages √ó 7 seconds)")
        
        all_data = [df_test]
        
        for page in range(2, 85):
            print(f"\n[{page}/84] ", end="")
            df = scrape_page_selenium(driver, page)
            
            if df is not None and len(df) > 0:
                all_data.append(df)
            
            # Save progress every 10 pages
            if page % 10 == 0:
                temp = pd.concat(all_data, ignore_index=True)
                temp.to_excel('../data/raw/progress.xlsx', index=False)
                print(f"\n  üíæ Saved: {len(temp)} records")
            
            time.sleep(2)  # 2 seconds between pages
        
        # Close browser
        driver.quit()
        
        # Final save
        final_df = pd.concat(all_data, ignore_index=True)
        final_df = final_df.drop_duplicates()
        final_df.to_excel('../data/raw/lok_sabha_2024_full.xlsx', index=False)
        
        print(f"\n\n‚úÖ COMPLETE! {len(final_df)} records saved")
        df = final_df
        
    else:
        driver.quit()
        print("\nCancelled")
        df = df_test
else:
    driver.quit()
    print("\n‚ùå Failed. Need different approach.")

MyNeta Scraper - Selenium Version

Setting up browser...
Testing page 1...
  ‚úì Page 1: 100 records

‚úÖ SUCCESS!

Shape: (100, 8)
Columns: ['Sno', 'Candidate‚àá', 'Constituency', 'Party', 'Criminal Case', 'Education', 'Total Assets', 'Liabilities']

First 5 rows:


  dfs = pd.read_html(html)


Unnamed: 0,Sno,Candidate‚àá,Constituency,Party,Criminal Case,Education,Total Assets,Liabilities
0,1,Abu Bakar Rahmani,MADHUBANI,Country Citizen Party,0,Post Graduate,"Rs¬†13,58,312 ~ 13¬†Lacs+",Rs¬†0 ~
1,2,Adv Najib Shaikh,AKOLA,Indian National League,0,Graduate Professional,"Rs¬†25,87,782 ~ 25¬†Lacs+","Rs¬†18,00,000 ~ 18¬†Lacs+"
2,3,Advocate Balwinder Kumar,JALANDHAR (SC),BSP,1,Post Graduate,,
3,4,Anandswamy Gaddadevarmath,HAVERI,INC,1,Graduate,"Rs¬†56,81,54,912 ~ 56¬†Crore+","Rs¬†22,46,68,569 ~ 22¬†Crore+"
4,5,Bhagyaraj. J,VILUPPURAM (SC),AIADMK,0,12th Pass,"Rs¬†4,79,83,303 ~ 4¬†Crore+","Rs¬†1,43,64,469 ~ 1¬†Crore+"




Scrape all 84 pages? (y/n):  y



üöÄ Starting full scrape...
This will take ~10 minutes (84 pages √ó 7 seconds)

[2/84] 

  dfs = pd.read_html(html)


  ‚úì Page 2: 100 records

[3/84] 

  dfs = pd.read_html(html)


  ‚úì Page 3: 100 records

[4/84] 

  dfs = pd.read_html(html)


  ‚úì Page 4: 100 records

[5/84] 

  dfs = pd.read_html(html)


  ‚úì Page 5: 100 records

[6/84] 

  dfs = pd.read_html(html)


  ‚úì Page 6: 100 records

[7/84] 

  dfs = pd.read_html(html)


  ‚úì Page 7: 100 records

[8/84] 

  dfs = pd.read_html(html)


  ‚úì Page 8: 100 records

[9/84] 

  dfs = pd.read_html(html)


  ‚úì Page 9: 100 records

[10/84] 

  dfs = pd.read_html(html)


  ‚úì Page 10: 100 records

  üíæ Saved: 1000 records

[11/84] 

  dfs = pd.read_html(html)


  ‚úì Page 11: 100 records

[12/84] 

  dfs = pd.read_html(html)


  ‚úì Page 12: 100 records

[13/84] 

  dfs = pd.read_html(html)


  ‚úì Page 13: 100 records

[14/84] 

  dfs = pd.read_html(html)


  ‚úì Page 14: 100 records

[15/84] 

  dfs = pd.read_html(html)


  ‚úì Page 15: 100 records

[16/84] 

  dfs = pd.read_html(html)


  ‚úì Page 16: 100 records

[17/84] 

  dfs = pd.read_html(html)


  ‚úì Page 17: 100 records

[18/84] 

  dfs = pd.read_html(html)


  ‚úì Page 18: 100 records

[19/84] 

  dfs = pd.read_html(html)


  ‚úì Page 19: 100 records

[20/84] 

  dfs = pd.read_html(html)


  ‚úì Page 20: 100 records

  üíæ Saved: 2000 records

[21/84] 

  dfs = pd.read_html(html)


  ‚úì Page 21: 100 records

[22/84] 

  dfs = pd.read_html(html)


  ‚úì Page 22: 100 records

[23/84] 

  dfs = pd.read_html(html)


  ‚úì Page 23: 100 records

[24/84] 

  dfs = pd.read_html(html)


  ‚úì Page 24: 100 records

[25/84] 

  dfs = pd.read_html(html)


  ‚úì Page 25: 100 records

[26/84] 

  dfs = pd.read_html(html)


  ‚úì Page 26: 100 records

[27/84] 

  dfs = pd.read_html(html)


  ‚úì Page 27: 100 records

[28/84] 

  dfs = pd.read_html(html)


  ‚úì Page 28: 100 records

[29/84] 

  dfs = pd.read_html(html)


  ‚úì Page 29: 100 records

[30/84] 

  dfs = pd.read_html(html)


  ‚úì Page 30: 100 records

  üíæ Saved: 3000 records

[31/84] 

  dfs = pd.read_html(html)


  ‚úì Page 31: 100 records

[32/84] 

  dfs = pd.read_html(html)


  ‚úì Page 32: 100 records

[33/84] 

  dfs = pd.read_html(html)


  ‚úì Page 33: 100 records

[34/84] 

  dfs = pd.read_html(html)


  ‚úì Page 34: 100 records

[35/84] 

  dfs = pd.read_html(html)


  ‚úì Page 35: 100 records

[36/84] 

  dfs = pd.read_html(html)


  ‚úì Page 36: 100 records

[37/84] 

  dfs = pd.read_html(html)


  ‚úì Page 37: 100 records

[38/84] 

  dfs = pd.read_html(html)


  ‚úì Page 38: 100 records

[39/84] 

  dfs = pd.read_html(html)


  ‚úì Page 39: 100 records

[40/84] 

  dfs = pd.read_html(html)


  ‚úì Page 40: 100 records

  üíæ Saved: 4000 records

[41/84] 

  dfs = pd.read_html(html)


  ‚úì Page 41: 100 records

[42/84] 

  dfs = pd.read_html(html)


  ‚úì Page 42: 100 records

[43/84] 

  dfs = pd.read_html(html)


  ‚úì Page 43: 100 records

[44/84] 

  dfs = pd.read_html(html)


  ‚úì Page 44: 100 records

[45/84] 

  dfs = pd.read_html(html)


  ‚úì Page 45: 100 records

[46/84] 

  dfs = pd.read_html(html)


  ‚úì Page 46: 100 records

[47/84] 

  dfs = pd.read_html(html)


  ‚úì Page 47: 100 records

[48/84] 

  dfs = pd.read_html(html)


  ‚úì Page 48: 100 records

[49/84] 

  dfs = pd.read_html(html)


  ‚úì Page 49: 100 records

[50/84] 

  dfs = pd.read_html(html)


  ‚úì Page 50: 100 records

  üíæ Saved: 5000 records

[51/84] 

  dfs = pd.read_html(html)


  ‚úì Page 51: 100 records

[52/84] 

  dfs = pd.read_html(html)


  ‚úì Page 52: 100 records

[53/84] 

  dfs = pd.read_html(html)


  ‚úì Page 53: 100 records

[54/84] 

  dfs = pd.read_html(html)


  ‚úì Page 54: 100 records

[55/84] 

  dfs = pd.read_html(html)


  ‚úì Page 55: 100 records

[56/84] 

  dfs = pd.read_html(html)


  ‚úì Page 56: 100 records

[57/84] 

  dfs = pd.read_html(html)


  ‚úì Page 57: 100 records

[58/84] 

  dfs = pd.read_html(html)


  ‚úì Page 58: 100 records

[59/84] 

  dfs = pd.read_html(html)


  ‚úì Page 59: 100 records

[60/84] 

  dfs = pd.read_html(html)


  ‚úì Page 60: 100 records

  üíæ Saved: 6000 records

[61/84] 

  dfs = pd.read_html(html)


  ‚úì Page 61: 100 records

[62/84] 

  dfs = pd.read_html(html)


  ‚úì Page 62: 100 records

[63/84] 

  dfs = pd.read_html(html)


  ‚úì Page 63: 100 records

[64/84] 

  dfs = pd.read_html(html)


  ‚úì Page 64: 100 records

[65/84] 

  dfs = pd.read_html(html)


  ‚úì Page 65: 100 records

[66/84] 

  dfs = pd.read_html(html)


  ‚úì Page 66: 100 records

[67/84] 

  dfs = pd.read_html(html)


  ‚úì Page 67: 100 records

[68/84] 

  dfs = pd.read_html(html)


  ‚úì Page 68: 100 records

[69/84] 

  dfs = pd.read_html(html)


  ‚úì Page 69: 100 records

[70/84] 

  dfs = pd.read_html(html)


  ‚úì Page 70: 100 records

  üíæ Saved: 7000 records

[71/84] 

  dfs = pd.read_html(html)


  ‚úì Page 71: 100 records

[72/84] 

  dfs = pd.read_html(html)


  ‚úì Page 72: 100 records

[73/84] 

  dfs = pd.read_html(html)


  ‚úì Page 73: 100 records

[74/84] 

  dfs = pd.read_html(html)


  ‚úì Page 74: 100 records

[75/84] 

  dfs = pd.read_html(html)


  ‚úì Page 75: 100 records

[76/84] 

  dfs = pd.read_html(html)


  ‚úì Page 76: 100 records

[77/84] 

  dfs = pd.read_html(html)


  ‚úì Page 77: 100 records

[78/84] 

  dfs = pd.read_html(html)


  ‚úì Page 78: 100 records

[79/84] 

  dfs = pd.read_html(html)


  ‚úì Page 79: 100 records

[80/84] 

  dfs = pd.read_html(html)


  ‚úì Page 80: 100 records

  üíæ Saved: 8000 records

[81/84] 

  dfs = pd.read_html(html)


  ‚úì Page 81: 100 records

[82/84] 

  dfs = pd.read_html(html)


  ‚úì Page 82: 100 records

[83/84] 

  dfs = pd.read_html(html)


  ‚úì Page 83: 100 records

[84/84] 

  dfs = pd.read_html(html)


  ‚úì Page 84: 38 records


‚úÖ COMPLETE! 8338 records saved


In [11]:
final_df.head()

Unnamed: 0,Sno,Candidate‚àá,Constituency,Party,Criminal Case,Education,Total Assets,Liabilities
0,1,Abu Bakar Rahmani,MADHUBANI,Country Citizen Party,0,Post Graduate,"Rs¬†13,58,312 ~ 13¬†Lacs+",Rs¬†0 ~
1,2,Adv Najib Shaikh,AKOLA,Indian National League,0,Graduate Professional,"Rs¬†25,87,782 ~ 25¬†Lacs+","Rs¬†18,00,000 ~ 18¬†Lacs+"
2,3,Advocate Balwinder Kumar,JALANDHAR (SC),BSP,1,Post Graduate,,
3,4,Anandswamy Gaddadevarmath,HAVERI,INC,1,Graduate,"Rs¬†56,81,54,912 ~ 56¬†Crore+","Rs¬†22,46,68,569 ~ 22¬†Crore+"
4,5,Bhagyaraj. J,VILUPPURAM (SC),AIADMK,0,12th Pass,"Rs¬†4,79,83,303 ~ 4¬†Crore+","Rs¬†1,43,64,469 ~ 1¬†Crore+"


In [12]:
# Save the complete dataset
final_df.to_excel('../data/raw/lok_sabha_2024_full.xlsx', index=False)
final_df.to_csv('../data/raw/lok_sabha_2024_full.csv', index=False)  # Backup as CSV

print(f"‚úÖ Saved {len(final_df)} records!")
print(f"   Location: data/raw/lok_sabha_2024_full.xlsx")

‚úÖ Saved 8338 records!
   Location: data/raw/lok_sabha_2024_full.xlsx


In [13]:
import pandas as pd

df = final_df  # Your scraped data

print("="*60)
print("DATASET OVERVIEW")
print("="*60)
print(f"Total Records: {len(df)}")
print(f"Total Columns: {len(df.columns)}")
print(f"\nColumns: {df.columns.tolist()}")

print("\n" + "="*60)
print("DATA QUALITY CHECK")
print("="*60)

# Missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Duplicates
print(f"\nDuplicates: {df.duplicated().sum()}")

# Data types
print("\nData Types:")
print(df.dtypes)

print("\n" + "="*60)
print("QUICK STATISTICS")
print("="*60)

# Party distribution
print("\nTop 10 Parties:")
print(df['Party'].value_counts().head(10))

# Criminal cases
if 'Criminal Case' in df.columns:
    criminal = df['Criminal Case'].astype(str).str.extract('(\d+)')[0].astype(float)
    print(f"\nCandidates with criminal cases: {(criminal > 0).sum()}")
    print(f"Percentage: {(criminal > 0).sum() / len(df) * 100:.1f}%")

# Education
if 'Education' in df.columns:
    print("\nEducation Levels:")
    print(df['Education'].value_counts().head(5))

DATASET OVERVIEW
Total Records: 8338
Total Columns: 8

Columns: ['Sno', 'Candidate‚àá', 'Constituency', 'Party', 'Criminal Case', 'Education', 'Total Assets', 'Liabilities']

DATA QUALITY CHECK

Missing Values:
Sno                 0
Candidate‚àá          0
Constituency        0
Party               0
Criminal Case       0
Education           0
Total Assets     2779
Liabilities      2779
dtype: int64

Duplicates: 0

Data Types:
Sno               int64
Candidate‚àá       object
Constituency     object
Party            object
Criminal Case     int64
Education        object
Total Assets     object
Liabilities      object
dtype: object

QUICK STATISTICS

Top 10 Parties:
Party
IND                                    3907
BSP                                     488
BJP                                     440
INC                                     328
SUCI(C)                                 149
Peoples Party of India (Democratic)      79
SP                                       71
CPI(M)       

  criminal = df['Criminal Case'].astype(str).str.extract('(\d+)')[0].astype(float)
