In [8]:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
base_url = 'https://laxmikanthsummary.wordpress.com/'

all_links = []

for page_num in range(1, 100):  # Adjust max pages as needed
    url = base_url if page_num == 1 else f'{base_url}/page/{page_num}/'
    print(url)
    print(f"Page {page_num}: {url}")
    
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        raise Exception(f"Failed to load page {page_num}: Status code {response.status_code}")
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Get all h1.entry-title links
    for h1 in soup.find_all('h1', class_='entry-title'):
        a = h1.find('a', href=True)
        if a:
            all_links.append(a['href'])
            print(f"  {a.get_text(strip=True)}")
    
    
print(f"\n✅ Total: {len(all_links)} links")

# Print all URLs
for link in all_links:
    print(link)

https://laxmikanthsummary.wordpress.com/
Page 1: https://laxmikanthsummary.wordpress.com/
  Public Services (Ch-62)
  Official Languages (Ch-61)
  Cooperative Societies (Ch-60)
  Lokpal & Lokayuktas (Ch-59)
  Central Bureau of Investigation (Ch-58)
  Central Vigilance Commission (Ch-57)
  State Information Commissioner (Ch-56)
  Central Information Commission (Ch-55)
  State Human Rights Commission (Ch-54)
  National Human Rights Commission (Ch-53)
https://laxmikanthsummary.wordpress.com//page/2/
Page 2: https://laxmikanthsummary.wordpress.com//page/2/
  NITI Aayog (Ch-52)
  Advocate General of the State (Ch-51)
  Attorney General of India (Ch-50)
  Comptroller & Auditor General of India (Ch-49)
  Special Officer for Linguistic Minorities (Ch-48)
  National Commission for STs (Ch-47)
  National Commission for SCs (Ch-46)
  Finance Commission (Ch-45)
  State Public Service Commission (Ch-44)
  Union Public Service Commission (Ch-43)
https://laxmikanthsummary.wordpress.com//page/3/
Page 

Exception: Failed to load page 8: Status code 404

In [9]:
all_links

['https://laxmikanthsummary.wordpress.com/2019/04/19/public-services-ch-62/',
 'https://laxmikanthsummary.wordpress.com/2019/04/19/official-languages-ch-61/',
 'https://laxmikanthsummary.wordpress.com/2019/04/19/cooperative-societies-ch-60/',
 'https://laxmikanthsummary.wordpress.com/2019/04/19/lokpal-lokayuktas-ch-59/',
 'https://laxmikanthsummary.wordpress.com/2019/04/19/central-bureau-of-investigation-ch-58/',
 'https://laxmikanthsummary.wordpress.com/2019/04/19/central-vigilance-commission-ch-57/',
 'https://laxmikanthsummary.wordpress.com/2019/04/19/state-information-commissioner-ch-56/',
 'https://laxmikanthsummary.wordpress.com/2019/04/18/central-information-commission-ch-55/',
 'https://laxmikanthsummary.wordpress.com/2019/04/18/state-human-rights-commission-ch-54/',
 'https://laxmikanthsummary.wordpress.com/2019/04/18/national-human-rights-commission-ch-53/',
 'https://laxmikanthsummary.wordpress.com/2019/04/18/niti-aayog-ch-52/',
 'https://laxmikanthsummary.wordpress.com/2019

In [12]:
import requests
from bs4 import BeautifulSoup
import json
import re
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

# Process all links
chapters_data = []

for idx, url in enumerate(all_links, 1):
    print(f"[{idx}/{len(all_links)}] Processing: {url}")
    
    try:
        # Extract chapter name from URL
        # Example: 'making-of-the-constitution-ch-02' -> 'Making Of The Constitution'
        url_parts = url.rstrip('/').split('/')[-1]
        
        # Remove -ch-XX part
        chapter_name = re.sub(r'-ch-\d+.*$', '', url_parts)
        # Convert to title case
        chapter_name = chapter_name.replace('-', ' ').title()
        
        # Get page content
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        content_div = soup.find('div', class_='entry-content')
        
        if content_div:
            # Remove unwanted elements
            for p in content_div.find_all('p', class_='has-background'):
                p.decompose()
            for unwanted in content_div.find_all(['script', 'div', 'span', 'a']):
                unwanted.decompose()
            
            # Get plain text
            content_text = content_div.get_text(separator=' ', strip=True)
            
            # Clean up whitespace
            content_text = re.sub(r'\s+', ' ', content_text)
            
            # Store in dictionary
            chapters_data.append({"chapter": chapter_name, "content": content_text})
            
            print(f"  ✓ {chapter_name} ({len(content_text)} chars)")
        else:
            print(f"  ✗ No content found")
        
        time.sleep(1)  # Be polite to server
        
    except Exception as e:
        print(f"  ✗ Error: {str(e)}")
        continue

# Save to JSON
with open('laxmikanth_chapters.json', 'w', encoding='utf-8') as f:
    json.dump(chapters_data, f, ensure_ascii=False, indent=2)

print(f"\n✅ Saved {len(chapters_data)} chapters to laxmikanth_chapters.json")

# Display first 3 chapters
for i, chapter_data in enumerate(chapters_data[:3]):
    chapter = chapter_data["chapter"]
    content = chapter_data["content"]
    print(f"\n{chapter}:")
    print(f"{content[:200]}...")

[1/63] Processing: https://laxmikanthsummary.wordpress.com/2019/04/19/public-services-ch-62/
  ✓ Public Services (3536 chars)
[2/63] Processing: https://laxmikanthsummary.wordpress.com/2019/04/19/official-languages-ch-61/
  ✓ Official Languages (4366 chars)
[3/63] Processing: https://laxmikanthsummary.wordpress.com/2019/04/19/cooperative-societies-ch-60/
  ✓ Cooperative Societies (2484 chars)
[4/63] Processing: https://laxmikanthsummary.wordpress.com/2019/04/19/lokpal-lokayuktas-ch-59/
  ✓ Lokpal Lokayuktas (4985 chars)
[5/63] Processing: https://laxmikanthsummary.wordpress.com/2019/04/19/central-bureau-of-investigation-ch-58/
  ✓ Central Bureau Of Investigation (2025 chars)
[6/63] Processing: https://laxmikanthsummary.wordpress.com/2019/04/19/central-vigilance-commission-ch-57/
  ✓ Central Vigilance Commission (4594 chars)
[7/63] Processing: https://laxmikanthsummary.wordpress.com/2019/04/19/state-information-commissioner-ch-56/
  ✓ State Information Commissioner (3270 chars)
[8/63] P

In [5]:
from bs4 import BeautifulSoup
import requests
# Assuming you have the HTML in a variable or from a page
url = 'https://pwonlyias.com/upsc-notes/indian-economy-notes/'  # or the actual URL
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

chapters = []

for tr in soup.find_all('tr')[1:]:  # Skip header row
    tds = tr.find_all('td')
    if len(tds) >= 2:
        ch_number = tds[0].get_text(strip=True)
        a_tag = tds[1].find('a', href=True)
        
        if a_tag:
            chapters.append({
                'chapter_name': a_tag.get_text(strip=True),
                'url': a_tag['href']
            })
            print(f"{ch_number}. {a_tag.get_text(strip=True)}: {a_tag['href']}")



1. INDIAN ECONOMY- BEFORE AND AFTER INDEPENDENCE: https://pwonlyias.com/indian-economy-before-and-after-independence/
2. MONEY: https://pwonlyias.com/money/
3. BANKING IN INDIA: https://pwonlyias.com/banking-in-india/
4. FINANCIAL MARKETS: https://pwonlyias.com/financial-markets/
5. INSURANCE IN INDIA: https://pwonlyias.com/insurance-in-india/
6. FINANCIAL INCLUSION: https://pwonlyias.com/financial-inclusion/
7. BUDGET AND TAXATION: https://pwonlyias.com/budget-and-taxation/
8. BALANCE OF PAYMENT: https://pwonlyias.com/balance-of-payment/
9. EXCHANGE RATE SYSTEM: https://pwonlyias.com/exchage-rate-system/
10. INTERNATIONAL ORGANIZATIONS: https://pwonlyias.com/international-organizations/
11. AGRICULTURE: https://pwonlyias.com/agriculture/
12. MANUFACTURING AND INDUSTRIES: https://pwonlyias.com/manufacturing-and-industries/
13. ECONOMIC PLANNING: https://pwonlyias.com/economic-planning/
14. ECONOMIC INDICATORS: https://pwonlyias.com/economic-indicators/
15. INFRASTRUCTURE: https://pwonl

In [None]:
from bs4 import BeautifulSoup
import re
all_details = []
for chapter in chapters:    
    url = chapter['url']
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the main content div
    content_div = soup.find('div', class_='pf-content')

    if content_div:
        # Remove the printfriendly div (last div with download button)
        for div in content_div.find_all('div', class_='printfriendly'):
            div.decompose()
        
        # Also remove any other unwanted elements
        for element in content_div.find_all(['script', 'style']):
            element.decompose()
        
        # Extract text with space separator
        content_text = content_div.get_text(separator=' ', strip=True)
        
        # Clean up excessive whitespace
        content_text = re.sub(r'\s+', ' ', content_text)
        
        all_details.append({
            'chapter_name': chapter['chapter_name'],
            'content': content_text
        })
        print(f"✓ Fetched: {chapter['chapter_name']} ({len(content_text)} chars)")

✓ Fetched: INDIAN ECONOMY- BEFORE AND AFTER INDEPENDENCE (97213 chars)
✓ Fetched: MONEY (98254 chars)
✓ Fetched: FINANCIAL INCLUSION (37625 chars)
✓ Fetched: BUDGET AND TAXATION (182 chars)
✓ Fetched: BALANCE OF PAYMENT (12524 chars)
✓ Fetched: INTERNATIONAL ORGANIZATIONS (94664 chars)
✓ Fetched: AGRICULTURE (94255 chars)
✓ Fetched: MANUFACTURING AND INDUSTRIES (182 chars)
✓ Fetched: ECONOMIC PLANNING (20424 chars)
✓ Fetched: ECONOMIC INDICATORS (88820 chars)
✓ Fetched: INFRASTRUCTURE (182 chars)
✓ Fetched: INCLUSIVE GROWTH (17991 chars)


In [9]:
import json
with open('indian_economy_chapters.json', 'w', encoding='utf-8') as f:
    json.dump(all_details, f, ensure_ascii=False, indent=2)

#get all the content 

In [14]:
url = 'https://pwonlyias.com/upsc-notes/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

links = soup.find_all('a', class_='docs-cat-link-btn betterdocs-category-link-btn')

# Get URLs and filter out encoded ones
urls = [link.get('href') for link in links if link.get('href') and '%' not in link.get('href')]

for url in urls:
    print(url)

https://pwonlyias.com/upsc-notes-category/indian-polity/
https://pwonlyias.com/upsc-notes-category/governance/
https://pwonlyias.com/upsc-notes-category/international-relations/
https://pwonlyias.com/upsc-notes-category/indian-economy/
https://pwonlyias.com/upsc-notes-category/agriculture/
https://pwonlyias.com/upsc-notes-category/science-and-technology/
https://pwonlyias.com/upsc-notes-category/enivornment-and-ecology/
https://pwonlyias.com/upsc-notes-category/internal-security/
https://pwonlyias.com/upsc-notes-category/disaster-management/
https://pwonlyias.com/upsc-notes-category/ancient-indian-history/
https://pwonlyias.com/upsc-notes-category/medieval-indian-history/
https://pwonlyias.com/upsc-notes-category/modern-indian-history/
https://pwonlyias.com/upsc-notes-category/post-independence-history-of-india/
https://pwonlyias.com/upsc-notes-category/indian-society/
https://pwonlyias.com/upsc-notes-category/physical-geography/
https://pwonlyias.com/upsc-notes-category/human-and-econ

In [20]:
#modern-india

In [25]:
url = 'https://vajiramandravi.com/upsc-exam/modern-history-notes/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all <a> tags
all_links = soup.find_all('div', class_='audio')[0].find_all('a', href=True)

# Filter and extract data
data = []
for link in all_links:
    href = link['href']
    if '/upsc-exam/' in href:
        data.append({
            'text': link.get_text(strip=True),
            'url': href
        })

# Remove duplicates based on URL
seen = set()
unique_data = []
for item in data:
    if item['url'] not in seen:
        seen.add(item['url'])
        unique_data.append(item)

for item in unique_data:
    print(f"{item['text']}: {item['url']}")

print(f"\nTotal links: {len(unique_data)}")

Advent of Europeans in India: https://vajiramandravi.com/upsc-exam/advent-of-europeans-in-india/
Carnatic Wars: https://vajiramandravi.com/upsc-exam/carnatic-wars/
Establishment of British Rule in India - Colonialism in India: https://vajiramandravi.com/upsc-exam/british-rule-in-india/
Battle of Plassey: https://vajiramandravi.com/upsc-exam/battle-of-plassey/
Battle of Buxar: https://vajiramandravi.com/upsc-exam/battle-of-buxar/
Dual Government in Bengal: https://vajiramandravi.com/upsc-exam/dual-government-in-bengal/
Robert Clive: https://vajiramandravi.com/upsc-exam/robert-clive/
Anglo-Mysore Wars: https://vajiramandravi.com/upsc-exam/anglo-mysore-wars/
Tipu Sultan: https://vajiramandravi.com/upsc-exam/tipu-sultan/
Anglo Maratha Wars: https://vajiramandravi.com/upsc-exam/anglo-maratha-wars/
Maratha Empire: https://vajiramandravi.com/upsc-exam/maratha-empire/
Third Battle of Panipat: https://vajiramandravi.com/upsc-exam/third-battle-of-panipat/
Chhatrapati Shivaji Maharaj: https://vaj

In [26]:
all_data = []
for item in unique_data:
    url = item['url']
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    content_div = soup.find('div', class_='audio')

    if content_div:
        # First remove unwanted elements
        for element in content_div.find_all(['script', 'style', 'img', 'table']):
            element.decompose()
        for div in content_div.find_all('div', class_='printfriendly'):
            div.decompose()
        
        # Find the h2 with 'pyqs' in id and remove it and everything after it
        pyqs_h2 = None
        for h2 in content_div.find_all('h2'):
            if h2.get('id') and 'pyqs' in h2.get('id').lower():
                pyqs_h2 = h2
                break
        
        # Remove the h2 and all its following siblings
        if pyqs_h2:
            for sibling in list(pyqs_h2.next_siblings):
                if hasattr(sibling, 'decompose'):
                    sibling.decompose()
            pyqs_h2.decompose()
        
        # Now extract the text
        content_text = content_div.get_text(separator=' ', strip=True)
        content_text = re.sub(r'\s+', ' ', content_text)
        
        all_data.append({
            'chapter_name': item['text'],   
            'content': content_text
        })

        print(f"✓ Fetched: {item['text']} ({len(content_text)} chars)")


✓ Fetched: Advent of Europeans in India (13129 chars)
✓ Fetched: Carnatic Wars (10740 chars)
✓ Fetched: Establishment of British Rule in India - Colonialism in India (13756 chars)
✓ Fetched: Battle of Plassey (10844 chars)
✓ Fetched: Battle of Buxar (10344 chars)
✓ Fetched: Dual Government in Bengal (5571 chars)
✓ Fetched: Robert Clive (5177 chars)
✓ Fetched: Anglo-Mysore Wars (15928 chars)
✓ Fetched: Tipu Sultan (5664 chars)
✓ Fetched: Anglo Maratha Wars (18466 chars)
✓ Fetched: Maratha Empire (13248 chars)
✓ Fetched: Third Battle of Panipat (6726 chars)
✓ Fetched: Chhatrapati Shivaji Maharaj (12593 chars)
✓ Fetched: Chauth and Sardeshmukhi (3177 chars)
✓ Fetched: Battle of Bhima Koregaon (4671 chars)
✓ Fetched: Maharaja Ranjit Singh (9466 chars)
✓ Fetched: Anglo Nepal War (4484 chars)
✓ Fetched: Anglo Burmese War: First Anglo Burmese War, Second Anglo Burmese War (4307 chars)
✓ Fetched: Acts and Regulations under East India Company (11414 chars)
✓ Fetched: Regulating Act 1773 (7449 c

In [27]:
with open('modern_india_chapters.json', 'w', encoding='utf-8') as f:
    json.dump(all_data, f, ensure_ascii=False, indent=2)