In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

In [2]:
def driver_start(url, timeout=30):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection, which would freeze a crawl of thousands of pages.

    Returns:
        A ``BeautifulSoup`` tree built with the ``'html.parser'`` backend.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or (as ``HTTPError``) when the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as if it
    # were real content (which would silently yield empty results downstream).
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def extract_year_links(browse_url='https://indiankanoon.org/browse/chennai/'):
    """Collect the absolute URL of every year page linked from a browse page.

    Args:
        browse_url: Browse page whose first ``<table>`` holds one link per
            cell. Defaults to the Chennai (Madras High Court) browse page.

    Returns:
        List of absolute URLs (site root + each cell's ``href``), in the
        order the cells appear in the table.

    Raises:
        ValueError: If the page contains no ``<table>`` element.
    """
    root_url = 'https://indiankanoon.org'
    soup = driver_start(browse_url)
    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> found on {browse_url}")

    year_links = []
    for row in table.find_all('tr'):
        for td in row.find_all('td'):
            a_tag = td.find('a')
            # find() returns None for a cell without an anchor; indexing it
            # would raise TypeError, so such cells are skipped.
            if a_tag is None or not a_tag.get('href'):
                continue
            year_links.append(root_url + a_tag['href'])
    return year_links

# Scrape the browse page once and keep the per-year URLs for the later stages
year_links = extract_year_links()

# Show the collected URLs, then how many there are (one value per line)
print(year_links, len(year_links), sep='\n')

['https://indiankanoon.org/browse/chennai/202/', 'https://indiankanoon.org/browse/chennai/206/', 'https://indiankanoon.org/browse/chennai/680/', 'https://indiankanoon.org/browse/chennai/1017/', 'https://indiankanoon.org/browse/chennai/1800/', 'https://indiankanoon.org/browse/chennai/1867/', 'https://indiankanoon.org/browse/chennai/1868/', 'https://indiankanoon.org/browse/chennai/1871/', 'https://indiankanoon.org/browse/chennai/1879/', 'https://indiankanoon.org/browse/chennai/1880/', 'https://indiankanoon.org/browse/chennai/1881/', 'https://indiankanoon.org/browse/chennai/1882/', 'https://indiankanoon.org/browse/chennai/1883/', 'https://indiankanoon.org/browse/chennai/1884/', 'https://indiankanoon.org/browse/chennai/1885/', 'https://indiankanoon.org/browse/chennai/1886/', 'https://indiankanoon.org/browse/chennai/1887/', 'https://indiankanoon.org/browse/chennai/1888/', 'https://indiankanoon.org/browse/chennai/1889/', 'https://indiankanoon.org/browse/chennai/1890/', 'https://indiankanoon.

In [13]:
def extract_primary_page_links(year_links):
    """Resolve each year page to the URL of its first search-results page.

    For every year URL the page is fetched and the first anchor inside the
    first cell of the first table row is taken as the results link.

    Args:
        year_links: Iterable of year-page URLs.

    Returns:
        List of percent-encoded, absolute search URLs. Years whose page does
        not have the expected table/row/cell/anchor structure are reported
        on stdout and skipped.
    """
    root_url = 'https://indiankanoon.org'
    page_links = []
    for i in year_links:
        driver = driver_start(i)
        # Walk table -> tr -> td -> a, tolerating a missing element at any
        # level. Previously a failure was printed but execution continued
        # with an undefined (or stale, from the previous year) `row`.
        table = driver.find('table')
        row = table.find('tr') if table is not None else None
        td = row.find('td') if row is not None else None
        a_tag = td.find('a') if td is not None else None
        if a_tag is None or not a_tag.get('href'):
            print(i, " can't extract")
            continue
        link = root_url + a_tag['href']
        # BeautifulSoup returns the href with raw spaces, e.g.
        #   ".../search/?formInput=doctypes:chennai fromdate:1-1-202 todate: 31-12-202"
        # while the usable URL is percent-encoded:
        #   ".../search/?formInput=doctypes:chennai%20fromdate:1-1-202%20todate:%2031-12-202"
        # quote() encodes the spaces and leaves the URL delimiters listed in
        # `safe` untouched.
        page_links.append(quote(link, safe=":/?&="))
        print(f"{i} extracted")
    return page_links
        
# Resolve every year page to its first search-results page
primary_page_links = extract_primary_page_links(year_links)

# Print one link per line, each followed by a blank line
for page_link in primary_page_links:
    print(f"{page_link} \n")

print("Number of primary page links: ", len(primary_page_links))

https://indiankanoon.org/browse/chennai/202/ extracted
https://indiankanoon.org/browse/chennai/206/ extracted
https://indiankanoon.org/browse/chennai/680/ extracted
https://indiankanoon.org/browse/chennai/1017/ extracted
https://indiankanoon.org/browse/chennai/1800/ extracted
https://indiankanoon.org/browse/chennai/1867/ extracted
https://indiankanoon.org/browse/chennai/1868/ extracted
https://indiankanoon.org/browse/chennai/1871/ extracted
https://indiankanoon.org/browse/chennai/1879/ extracted
https://indiankanoon.org/browse/chennai/1880/ extracted
https://indiankanoon.org/browse/chennai/1881/ extracted
https://indiankanoon.org/browse/chennai/1882/ extracted
https://indiankanoon.org/browse/chennai/1883/ extracted
https://indiankanoon.org/browse/chennai/1884/ extracted
https://indiankanoon.org/browse/chennai/1885/ extracted
https://indiankanoon.org/browse/chennai/1886/ extracted
https://indiankanoon.org/browse/chennai/1887/ extracted
https://indiankanoon.org/browse/chennai/1888/ extra

In [37]:
def recursive_func(spans, sub_page, root_url, all_pages_links):
    """Follow a paginated result list and collect every page link it exposes.

    Each ``span`` contributes ``root_url`` + its anchor's ``href`` to
    ``all_pages_links``. The last collected link is then fetched, its
    ``pagenum`` spans are read, and if there are more than four of them the
    process repeats with the spans from the fifth onward.
    NOTE(review): presumably the first four spans of a follow-up page point
    at pages that were already collected - confirm against the site markup.

    The name is kept for compatibility with existing callers; the walk itself
    is a loop, so long paginations cannot hit Python's recursion limit.

    Args:
        spans: Pagination elements, each containing an ``<a href=...>``.
        sub_page: Counter used in the progress message for the first span.
        root_url: Site root prepended to every relative ``href``.
        all_pages_links: List that is extended in place.

    Returns:
        The same ``all_pages_links`` list object, after extension.
    """
    fetched = set()
    # An empty `spans` simply returns the list unchanged (it used to fail
    # with UnboundLocalError because `last_link` was never assigned).
    while spans:
        for span in spans:
            a_tag = span.find('a')
            last_link = root_url + a_tag['href']
            all_pages_links.append(last_link)
            print(f"\tsub_page {sub_page} executed\n")
            sub_page += 1
        # Re-fetching a page that was already followed would yield the same
        # spans again and never terminate, so stop on a repeat.
        if last_link in fetched:
            break
        fetched.add(last_link)
        driver = driver_start(last_link)
        spans = driver.find_all('span', class_='pagenum')
        if len(spans) <= 4:
            break
        spans = spans[4:]
    return all_pages_links # return the appended list in the original code
                
def extract_all_pages(primary_page_links):
    """Expand each primary results page into all of its result-page URLs.

    Every primary link is always kept. What follows depends on how many
    ``pagenum`` spans the fetched page contains:

    * none       - nothing else is added;
    * exactly 9  - the spans are handed to ``recursive_func``, which also
      follows further pages (presumably 9 spans means more pages exist
      beyond the visible ones - TODO confirm);
    * otherwise  - the link of every span is added as is.

    Args:
        primary_page_links: Iterable of first-page search URLs.

    Returns:
        List of result-page URLs in visiting order.
    """
    root_url = 'https://indiankanoon.org'
    all_pages_links = []
    for page, i in enumerate(primary_page_links, start=1):
        print(i, '\n')
        spans = driver_start(i).find_all('span', class_ = 'pagenum')
        all_pages_links.append(i)

        # No pagination at all: the primary page is the only page.
        if not spans:
            print(f"page {page} executed\n\n")
            continue

        print(f"page {page} executing\n")
        if len(spans) == 9:
            all_pages_links = recursive_func(spans, 1, root_url, all_pages_links)
        else:
            for sub_page, span in enumerate(spans, start=1):
                all_pages_links.append(root_url + span.find('a')['href'])
                print(f"\tsub_page {sub_page} executed\n")
    return all_pages_links
                
# Expand every primary results page into the full list of result-page URLs
all_pages_links = extract_all_pages(primary_page_links)

https://indiankanoon.org/search/?formInput=doctypes:chennai%20fromdate:1-1-202%20todate:%2031-12-202 

page 1 executed


https://indiankanoon.org/search/?formInput=doctypes:chennai%20fromdate:1-1-206%20todate:%2031-12-206 

page 2 executed


https://indiankanoon.org/search/?formInput=doctypes:chennai%20fromdate:1-1-680%20todate:%2031-12-680 

page 3 executed


https://indiankanoon.org/search/?formInput=doctypes:chennai%20fromdate:1-1-1017%20todate:%2031-12-1017 

page 4 executing

	sub_page 1 executed

https://indiankanoon.org/search/?formInput=doctypes:chennai%20fromdate:1-1-1800%20todate:%2031-12-1800 

page 5 executing

	sub_page 1 executed

	sub_page 2 executed

	sub_page 3 executed

	sub_page 4 executed

	sub_page 5 executed

	sub_page 6 executed

	sub_page 7 executed

	sub_page 8 executed

	sub_page 9 executed

	sub_page 10 executed

	sub_page 11 executed

	sub_page 12 executed

	sub_page 13 executed

	sub_page 14 executed

	sub_page 15 executed

	sub_page 16 executed

	sub_page 

In [38]:
# Number of result-page URLs returned by extract_all_pages
print(len(all_pages_links))

4853


In [46]:
def extract_all_cases_links(all_pages_links):
    """Collect the link of every case listed on the given result pages.

    Args:
        all_pages_links: Iterable of result-page URLs.

    Returns:
        List of absolute case URLs: the site root plus the ``href`` of the
        first anchor inside each ``div.result_title``, in page order.
    """
    # Defined locally, like in the other extractors: relying on a global
    # `root_url` raised NameError when the notebook ran on a fresh kernel.
    root_url = 'https://indiankanoon.org'
    case_links = []
    for page, i in enumerate(all_pages_links, start=1):
        driver = driver_start(i)
        for div in driver.find_all('div', class_ = 'result_title'):
            a_tag = div.find('a')
            # Skip a title block without a usable anchor instead of crashing
            # part-way through a crawl of thousands of pages.
            if a_tag is None or not a_tag.get('href'):
                continue
            case_links.append(root_url + a_tag['href'])
        print(f"page {page} extracted")
    return case_links
            
# Visit every result page and gather the individual case links
all_cases_links = extract_all_cases_links(all_pages_links)
total_cases = len(all_cases_links)
print("Total Number of Madras High Court Cases From 1867 to 2024: ", total_cases)

page 1 extracted
page 2 extracted
page 3 extracted
page 4 extracted
page 5 extracted
page 6 extracted
page 7 extracted
page 8 extracted
page 9 extracted
page 10 extracted
page 11 extracted
page 12 extracted
page 13 extracted
page 14 extracted
page 15 extracted
page 16 extracted
page 17 extracted
page 18 extracted
page 19 extracted
page 20 extracted
page 21 extracted
page 22 extracted
page 23 extracted
page 24 extracted
page 25 extracted
page 26 extracted
page 27 extracted
page 28 extracted
page 29 extracted
page 30 extracted
page 31 extracted
page 32 extracted
page 33 extracted
page 34 extracted
page 35 extracted
page 36 extracted
page 37 extracted
page 38 extracted
page 39 extracted
page 40 extracted
page 41 extracted
page 42 extracted
page 43 extracted
page 44 extracted
page 45 extracted
page 46 extracted
page 47 extracted
page 48 extracted
page 49 extracted
page 50 extracted
page 51 extracted
page 52 extracted
page 53 extracted
page 54 extracted
page 55 extracted
page 56 extracted
p

In [47]:
# Peek at the first five collected case links
all_cases_links[:5]

['https://indiankanoon.org/docfragment/52091592/?formInput=doctypes%3A%20chennai%20fromdate%3A%201-1-202%20todate%3A%2031-12-202',
 'https://indiankanoon.org/docfragment/23969784/?formInput=doctypes%3A%20chennai%20fromdate%3A%201-1-202%20todate%3A%2031-12-202',
 'https://indiankanoon.org/docfragment/102815189/?formInput=doctypes%3A%20chennai%20fromdate%3A%201-1-202%20todate%3A%2031-12-202',
 'https://indiankanoon.org/docfragment/152517345/?formInput=doctypes%3A%20chennai%20fromdate%3A%201-1-206%20todate%3A%2031-12-206',
 'https://indiankanoon.org/docfragment/128046475/?formInput=doctypes%3A%20chennai%20fromdate%3A%201-1-680%20todate%3A%2031-12-680']

In [48]:
# Persist the scraped case links to a CSV file, one link per row
import csv

# Define the file name
file_name = 'all_cases_links.csv'

# Write data to CSV file
with open(file_name, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Each link is written as a single field. Splitting on commas spread a
    # URL containing ',' over several columns; the csv writer quotes such
    # values itself, so the link stays intact in one cell.
    writer.writerows([link] for link in all_cases_links)

print(f"Data successfully written to {file_name}")

Data successfully written to all_cases_links.csv


In [56]:
# Load the saved case links from the CSV file back into a Python list
import csv

# Read data from CSV file. newline='' lets the csv module handle line endings
# itself, as its documentation requires for file objects passed to csv.reader.
with open(file_name, 'r', newline='', encoding='utf-8') as file:
    # Re-join the fields of each row with commas so a link is restored as one
    # string whether it was stored in a single column or split across several.
    data_list = [','.join(row) for row in csv.reader(file)]

# Spot-check one entry near the end of the list, then show the total count
print(data_list[-12])
print(len(data_list))


https://indiankanoon.org/docfragment/196703888/?formInput=doctypes%3A%20chennai%20fromdate%3A%201-1-2027%20todate%3A%2031-12-2027
46394


In [58]:
# Spot-check a single judgment page: fetch it, collect all <blockquote>
# elements, and print the first five of them.
url = 'https://indiankanoon.org/doc/129868410/'
driver = driver_start(url)
blockquote_tags = driver.find_all('blockquote')
print(blockquote_tags[:5])

[<blockquote id="blockquote_1">                            (i) <a href="/doc/30032725/" id="a_10">The Right of Children to Free and Compulsory Education Act, 2009</a>
<a href="/doc/30032725/" id="a_11">(RTE Act</a>) was enacted by the Government of India and it has come into force

                  from 01.04.2010. The object of the Act is to provide free and compulsory

                  education to all children of the age of six to fourteen years. As per the RTE

https://www.mhc.tn.gov.in/judis


<span class="hidden_text" id="span_1">                  20/189</span>
                                                                     WP Nos. 26084 of 2023 etc. cases

                  Act, schools defined under <a href="/doc/23643703/" id="a_12">Section 2 (n)</a> should recruit teachers only with

                  qualification as defined under <a href="/doc/12722275/" id="a_13">Section 23 (1)</a> of the RTE Act. The Government

                  of India appointed the National Co