## Scraping the data

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Upper bound on how many times the "Load More" button is clicked; the loop
# stops earlier as soon as the button can no longer be found or clicked.
MAX_LOAD_MORE_CLICKS = 106077
# Seconds to pause after each click before looking for the button again.
LOAD_MORE_PAUSE_SECONDS = 10

driver = webdriver.Chrome()
hrefs = []

# Everything that uses the browser runs inside try/finally so the Chrome
# process is shut down even when navigation or file writing raises.
try:
    driver.get("https://www.calcuttahighcourt.gov.in/Order-Judgments")

    wait = WebDriverWait(driver, 20)

    for i in range(MAX_LOAD_MORE_CLICKS):
        try:
            load_more_button = wait.until(EC.element_to_be_clickable((By.ID, "load_more")))
            load_more_button.click()
            print(f"Clicked 'Load More' {i + 1} times.")

            time.sleep(LOAD_MORE_PAUSE_SECONDS)

        except Exception as e:
            # Best effort: a failed wait or click ends the expansion phase and
            # whatever rows are already on the page are harvested below.
            print(f"Error clicking 'Load More' on iteration {i+1}: {e}")
            break

    time.sleep(5)

    try:
        table = wait.until(EC.presence_of_element_located((By.ID, "datatable-table")))
        # Keep only anchors that actually carry an href attribute.
        for link in table.find_elements(By.TAG_NAME, "a"):
            href = link.get_attribute("href")
            if href:
                hrefs.append(href)

    except Exception as e:
        print(f"Error extracting hrefs: {e}")

    # One URL per line; explicit encoding keeps the output platform independent.
    with open("collected_hrefs.txt", "w", encoding="utf-8") as file:
        file.writelines(href + "\n" for href in hrefs)

    print(f"Collected {len(hrefs)} hrefs. Saved to collected_hrefs.txt.")
finally:
    driver.quit()

## Differentiating and extracting the judgments

In [None]:
import pdfplumber
import pandas as pd
import os
import re

# Empty frame; the extraction loop further down attaches the meta_data,
# judgment and file_name columns before it is written to CSV.
df = pd.DataFrame()

def extract_text_between_headings(pdf_path, start_heading, end_heading):
    """Split a judgment PDF into header (metadata) text and judgment body text.

    The PDF is scanned line by line:

    * Header capture starts on a line containing ``end_heading``,
      "In The High" or "In the High".
    * A line starting with "The Court:" switches from header capture to body
      capture, and that line itself becomes part of the body.
    * A judgment heading line (exactly ``start_heading``, or a line ending in
      one of the " J. :" style suffixes, or starting with "THE COURT" /
      "THHEE   CCOOUURRT") ends header capture and starts body capture from
      the following line. The heading line itself is still recorded in the
      header if header capture was active.

    Lines of length 0 or 1 are never recorded.

    Args:
        pdf_path: Path of the PDF file to read.
        start_heading: Exact text of the line that introduces the judgment body.
        end_heading: Substring marking the start of the header section.

    Returns:
        A ``(header_text, judgment_text)`` tuple of stripped strings.
        ``("", "")`` is returned when the file cannot be opened or when no
        judgment heading line is found (a "The Court:" line alone does not
        count as a heading).
    """
    try:
        pdf = pdfplumber.open(pdf_path)
    except FileNotFoundError:
        print(f"The file {pdf_path} was not found.")
        return "", ""
    except Exception as e:
        print(f"An error occurred while opening the file: {e}")
        return "", ""

    header_markers = (end_heading, "In The High", "In the High")
    heading_suffixes = (" J.  : –", " J.:-", "J.:")
    heading_prefixes = ("THE COURT", "THHEE   CCOOUURRT")

    header_lines = []
    judgment_lines = []
    extracting = False   # currently inside the header section
    extract2 = False     # currently inside the judgment body
    heading_found = False

    # The context manager closes the underlying file handle even if a page
    # fails to parse.
    with pdf:
        for page in pdf.pages:
            # Guard against pages for which no text is returned
            # (extract_text() may yield None on some pdfplumber versions).
            page_text = page.extract_text() or ""

            for line in page_text.split('\n'):
                # Each marker must be tested against the line; a bare string
                # literal in the condition would always be truthy.
                if any(marker in line for marker in header_markers):
                    extracting = True
                if line.startswith('The Court:'):
                    extract2 = True
                    extracting = False

                if len(line) > 1:
                    if extract2:
                        judgment_lines.append(line)
                    if extracting:
                        header_lines.append(line)

                # Checked after recording so the heading line itself stays
                # out of the body and capture switches for the next line.
                if (
                    start_heading == line
                    or line.endswith(heading_suffixes)
                    or line.startswith(heading_prefixes)
                ):
                    print(line)
                    heading_found = True
                    extracting = False
                    extract2 = True

    if not heading_found:
        return '', ''

    return "\n".join(header_lines).strip(), "\n".join(judgment_lines).strip()


# Parallel result lists: entry k of judgment / meta_data belongs to
# extracted_files[k]. Files that yield no header text go to missing_files.
judgment = []
meta_data = []
missing_files = []
extracted_files = []

pdf_dir = 'missing/culcatta/'
start_heading = 'JUDGMENT'
end_heading = 'IN THE HIGH'

# List the input directory here so this cell does not depend on a `files`
# variable left behind by another cell; sorting keeps the CSV row order stable.
files = sorted(os.listdir(pdf_dir))

for i in files:
    pdf_path = os.path.join(pdf_dir, i)
    extracted_text, extracted_judg = extract_text_between_headings(pdf_path, start_heading, end_heading)
    if extracted_text:
        print(i)
        extracted_files.append(i)
        judgment.append(extracted_judg)
        meta_data.append(extracted_text)
    else:
        missing_files.append(i)
        print(f"Could not extract text from file {i}")

# Only successfully extracted files get a row in the output.
df['meta_data'] = meta_data
df['judgment'] = judgment
df['file_name'] = extracted_files
df.to_csv('culcatta_extract4.csv', index=False)

In [None]:
import shutil
import os

# Where the PDFs currently live and where the extracted ones should end up
source_folder = "missing/culcatta/"
destination_folder = "extracted/culcatta"

# Create the target directory up front (no error if it is already there)
os.makedirs(destination_folder, exist_ok=True)
files = os.listdir(source_folder)

# Relocate every file named in extracted_files, keeping its file name
for name in extracted_files:
    shutil.move(
        os.path.join(source_folder, name),
        os.path.join(destination_folder, name),
    )

print("Files moved successfully!")


In [None]:
import shutil
import os

# Where the PDFs are taken from and where the unextractable ones are parked
# NOTE(review): this source folder differs from the 'missing/culcatta/' path
# used by the extraction loop - confirm it is the intended location.
source_folder = "culcatta"
destination_folder = "/sml2/Judgments/missing/culcatta"

# Create the target directory up front (no error if it is already there)
os.makedirs(destination_folder, exist_ok=True)
files = os.listdir(source_folder)

# Relocate every file named in missing_files, keeping its file name
for name in missing_files:
    shutil.move(
        os.path.join(source_folder, name),
        os.path.join(destination_folder, name),
    )

print("Files moved successfully!")
