## Downloading the files

In [None]:
import os
import requests
import warnings

# Ignore warnings
# NOTE(review): this silences every warning category for the whole session,
# not only the one presumably raised because download_pdf passes
# `verify=False` — confirm that hiding all warnings is intended.
warnings.filterwarnings('ignore')


def download_pdf(url, folder, *, timeout=30, verify=False):
    """Download the document at *url* and save it as a PDF inside *folder*.

    The file name is the last ``/``-separated segment of *url* with ``.pdf``
    appended. Request failures (connection errors, timeouts, non-2xx status)
    are reported on stdout and swallowed; file-system errors while writing
    still propagate to the caller.

    Args:
        url: Address of the document to fetch.
        folder: Directory the file is written into; it must already exist.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block a bulk download
            loop indefinitely.
        verify: Forwarded to ``requests.get``. Defaults to ``False`` to keep
            the original behaviour of ignoring SSL certificate errors; pass
            ``True`` to enforce certificate validation.
    """
    # Get the file name from the URL
    file_name = url.split('/')[-1] + '.pdf'
    file_path = os.path.join(folder, file_name)

    try:
        # Send HTTP request to the URL
        response = requests.get(url, verify=verify, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so no error page is saved as a PDF
        response.raise_for_status()

        # Write the content of the response to a file
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f'Successfully downloaded: {file_name}')
    except requests.exceptions.RequestException as e:
        print(f'Failed to download: {file_name}. Error: {e}')

# Base URL of the PDF endpoint and the folder the downloads are saved into
url = 'https://mhc.tn.gov.in/judis/index.php/casestatus/viewpdf/'
folder = 'madras1'

# Create the folder if it doesn't exist; exist_ok avoids a separate
# check-then-create step that can race with another process
os.makedirs(folder, exist_ok=True)

# Download one PDF per document id in the half-open range [886002, 900000)
for doc_id in range(886002, 900000):
    download_pdf(url + str(doc_id), folder)


## listing the files in the folder

In [None]:
import os
# List the downloaded files, highest name first (plain string ordering)
folder_path = 'madras1'
files = sorted(os.listdir(folder_path), reverse=True)
num_files = len(files)
# Only show the first entry when there is one; an empty folder used to
# raise IndexError here
if files:
    print(files[0])
print(f"Number of files in folder: {num_files}")

In [None]:
import os
# List the extracted files, highest name first (plain string ordering)
folder_path = 'extracted/madras'
files1 = sorted(os.listdir(folder_path), reverse=True)
num_files = len(files1)
# NOTE(review): this prints the *second* entry (index 1), not the first —
# confirm that is intended. Guarded so a folder with fewer than two files
# no longer raises IndexError.
if num_files > 1:
    print(files1[1])
print(f"Number of files in folder: {num_files}")
last_file = files1[-1] if num_files > 0 else None
print(f"Last file name: {last_file}")

## file sizes

In [None]:
import os

def get_file_size(file_path):
    """Return the size of *file_path* in kilobytes (bytes / 1024).

    Any exception raised while querying the file is printed to stdout and
    turned into a ``None`` return value instead of propagating.
    """
    try:
        return os.stat(file_path).st_size / 1024
    except Exception as err:
        print(f"Error getting file size: {str(err)}")
        return None
# Collect the names of files in 'madras1' that are smaller than 1 KB.
# `li` is read later by the clean-up cell that deletes these files.
li = []
for name in files:
    size = get_file_size(os.path.join('madras1', name))
    # get_file_size returns None on failure; comparing None with an int
    # would raise TypeError, so such files are skipped
    if size is not None and size < 1:
        li.append(name)


## deleting the empty files

In [None]:
import os
import glob

def delete_specific_files(folder_path, pattern):
    """Delete files in *folder_path* matching *pattern* whose name is in ``li``.

    ``li`` is the module-level list of file names collected by the file-size
    cell. The number of glob matches is printed first; every deletion (or
    failed deletion) is reported on stdout.

    Args:
        folder_path: Directory to search.
        pattern: Glob pattern applied inside *folder_path*, e.g. ``'*.pdf'``.
    """
    files = glob.glob(os.path.join(folder_path, pattern))
    print(len(files))
    # Set gives O(1) membership tests instead of scanning the list per file
    targets = set(li)
    for file in files:
        # Compare the real base name; slicing the last 10 characters only
        # worked for names that happen to be exactly 10 characters long
        if os.path.basename(file) not in targets:
            continue
        try:
            os.remove(file)
            print(f"Deleted: {file}")
        except OSError as e:
            print(f"Failed to delete {file}. Reason: {e}")


# Remove the files flagged as too small from the download folder
folder_path, pattern = 'madras1', '*.pdf'
delete_specific_files(folder_path=folder_path, pattern=pattern)

## classifying as orders or Judgments

In [None]:
import pdfplumber
import pandas as pd
import os

# NOTE(review): `df` is not read anywhere in this cell; a fresh DataFrame is
# created again in the later extraction cell before it is filled and saved.
df=pd.DataFrame()
def extract_text_between_headings(pdf_path):
    """Report whether the PDF at *pdf_path* contains an order heading line.

    Every line of every page's extracted text is compared (exact match, no
    stripping) against a fixed list of "ORDER" heading variants.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        ``None`` as soon as an order heading is found (the matching line is
        printed); otherwise the empty string, including when the file cannot
        be opened.
    """
    try:
        pdf = pdfplumber.open(pdf_path)
    except FileNotFoundError:
        print(f"The file {pdf_path} was not found.")
        return ""
    except Exception as e:
        print(f"An error occurred while opening the file: {e}")
        return ""

    order_headings = {
        "ORDER",
        "O R D E R",
        "COMMON ORDER",
        "C O M M O N O R D E R",
        "COMMON O R D E R",
        "   RDER",
    }

    # `with` closes the PDF on every exit path, including the early return
    with pdf:
        for page in pdf.pages:
            # Guard against extract_text() yielding None for a page without
            # text (seen in some pdfplumber versions — TODO confirm)
            page_text = page.extract_text() or ""
            for line in page_text.split('\n'):
                if line in order_headings:
                    print(line)
                    return None
    return ""
# Split the downloaded PDFs into orders and everything else
files = os.listdir('madras1')
print(len(files))
l = []        # files without an order heading
orders = []   # files in which an order heading was found
for pdf_name in files:
    result = extract_text_between_headings('madras1/' + pdf_name)
    # None is the "order heading found" signal from the extractor
    if result is None:
        orders.append(pdf_name)
    else:
        l.append(pdf_name)


## moving judgments to judgments folder

In [None]:
import shutil
import os

# Move every name in `files` from 'madras' into 'madras_extracted3'.
# NOTE(review): `files` comes from whichever cell ran last; in this notebook
# it is listed from 'madras1' in the classification cell, which does not
# match the source folder used here — confirm before running.
source_folder, destination_folder = 'madras', 'madras_extracted3'

os.makedirs(destination_folder, exist_ok=True)

for filename in files:
    shutil.move(
        os.path.join(source_folder, filename),
        os.path.join(destination_folder, filename),
    )

print("Files moved successfully!")


## moving orders to orders folder

In [None]:
import shutil
import os

# Source and destination of the move
source_folder, destination_folder = 'madras_extracted3', 'extracted/madras'

# The destination is created on demand
os.makedirs(destination_folder, exist_ok=True)

# Everything currently in the source folder gets moved
files = os.listdir(source_folder)
print(len(files))
for filename in files:
    shutil.move(
        os.path.join(source_folder, filename),
        os.path.join(destination_folder, filename),
    )

print("Files moved successfully!")


## extracting the judgments

In [None]:
import pdfplumber
import pandas as pd
import os

# Results frame: the driver code further down adds the "meta data",
# "judgment" and "file_name" columns and writes it to CSV.
df=pd.DataFrame()
# Heading lines that mark a document as an order rather than a judgment.
_ORDER_HEADINGS = frozenset({
    "ORDER",
    "O R D E R",
    "COMMON ORDER",
    "C O M M O N O R D E R",
})

# Heading lines (besides the caller-supplied one) that start the judgment body.
_JUDGMENT_HEADINGS = frozenset({
    "C O M M O N J U D G M E N T",
    "COMMON JUDGMENT",
    "J U D G M E N T",
    "C O M M O N   J U D G M E N T",
    "J U D G E M E N T",
    "COMMON JUDGEMENT",
    "C O M M O N J U D G E M E N T",
    "JUDGEMENT",
    "COMMONJUDGMENT",
})

# Line that is dropped from both outputs wherever it appears.
_FOOTER_URL = "http://www.judis.nic.in"


def extract_text_between_headings(pdf_path, start_heading):
    """Split a judgment PDF into its metadata text and its judgment text.

    Lines are read page by page. Until a judgment heading is seen they are
    collected as metadata; from the line after the heading up to and
    including a ``"To"`` / ``"To:"`` line they are collected as judgment
    text; after that, collection switches back to metadata. Lines of length
    0 or 1 and the judis URL line are skipped. Headings are matched exactly,
    except that any line containing ``"Judgment was delivered by"`` also
    counts as a judgment heading.

    Args:
        pdf_path: Path of the PDF file to read.
        start_heading: Additional heading line that starts the judgment body.

    Returns:
        A ``(metadata, judgment)`` tuple of stripped strings.
        ``('order', 'order')`` is returned as soon as an order heading is
        met, and ``('', '')`` when the file cannot be opened or no judgment
        heading was found.
    """
    try:
        # Open the PDF file
        pdf = pdfplumber.open(pdf_path)
    except FileNotFoundError:
        print(f"The file {pdf_path} was not found.")
        return "", ""
    except Exception as e:
        print(f"An error occurred while opening the file: {e}")
        return "", ""

    meta_lines = []
    judgment_lines = []
    in_meta = True
    in_judgment = False
    heading_seen = False

    # `with` closes the PDF on every exit path, including the early return
    with pdf:
        for page in pdf.pages:
            # Guard against extract_text() yielding None for a page without
            # text (seen in some pdfplumber versions — TODO confirm)
            page_text = page.extract_text() or ""
            for line in page_text.split('\n'):
                keep = len(line) > 1 and line != _FOOTER_URL
                # Collection happens BEFORE the state switches below, so the
                # heading line itself lands in the metadata and the "To" line
                # lands in the judgment text.
                if keep and in_judgment:
                    judgment_lines.append(line)
                if keep and in_meta:
                    meta_lines.append(line)

                if line in _ORDER_HEADINGS:
                    return 'order', 'order'

                if (
                    line == start_heading
                    or line in _JUDGMENT_HEADINGS
                    or "Judgment was delivered by" in line
                ):
                    print(line)
                    heading_seen = True
                    in_meta = False
                    in_judgment = True

                if line in ("To:", "To"):
                    in_judgment = False
                    in_meta = True

    if not heading_seen:
        return '', ''

    return '\n'.join(meta_lines).strip(), '\n'.join(judgment_lines).strip()


# Run the extractor over every file in 'madras' and save the results as CSV.
judgment = []         # judgment text per successfully extracted file
meta_data = []        # metadata text per successfully extracted file
missing_files = []    # files where no judgment heading was found / unreadable
extracted_files = []  # names of the successfully extracted files
order_files = []      # files the extractor flagged as orders
files = os.listdir('madras')
print(len(files))
start_heading = 'JUDGMENT'
for i in files:
    pdf_path = os.path.join('madras', i)
    extracted_text, extracted_judg = extract_text_between_headings(pdf_path, start_heading)

    # NOTE(review): the original cell had an unfinished `if(extra)` line here
    # (a SyntaxError). It is assumed to be the start of a check for the
    # ('order', 'order') sentinel, so orders are kept out of the CSV and out
    # of missing_files — confirm this matches the intended workflow.
    if extracted_text == 'order' and extracted_judg == 'order':
        order_files.append(i)
        continue

    if extracted_text:
        print(i)
        extracted_files.append(i)
        judgment.append(extracted_judg)
        meta_data.append(extracted_text)
    else:
        missing_files.append(i)
        print(f"Could not extract text from file {i}")

df["meta data"] = meta_data
df["judgment"] = judgment
df["file_name"] = extracted_files
df.to_csv("madras_judgments4.csv", index=False)


In [None]:
import shutil
import os

# Move the files that could not be extracted into their own folder.
# NOTE(review): `missing_files` is built from a listing of 'madras' in the
# extraction cell, while this cell moves from 'madras1' — confirm the source.
source_folder, destination_folder = 'madras1', 'Judgments/missing/madras'

os.makedirs(destination_folder, exist_ok=True)
# `files` is refreshed here but the loop below iterates `missing_files`
files = os.listdir(source_folder)
move_pairs = (
    (os.path.join(source_folder, filename), os.path.join(destination_folder, filename))
    for filename in missing_files
)
for source_file, destination_file in move_pairs:
    shutil.move(source_file, destination_file)

print("Files moved successfully!")


## removing Dummy files

In [None]:
import os
import glob

def delete_specific_files(folder_path, pattern):
    """Delete files in *folder_path* matching *pattern* listed in ``missing_files``.

    ``missing_files`` is the module-level list of file names for which text
    extraction failed. Every deletion (or failed deletion) is reported on
    stdout.

    Args:
        folder_path: Directory to search.
        pattern: Glob pattern applied inside *folder_path*.
    """
    files = glob.glob(os.path.join(folder_path, pattern))
    # Set gives O(1) membership tests instead of scanning the list per file
    targets = set(missing_files)
    for file in files:
        # Compare the real base name; slicing the last 10 characters only
        # worked for names that happen to be exactly 10 characters long
        if os.path.basename(file) not in targets:
            continue
        try:
            os.remove(file)
            print(f"Deleted: {file}")
        except OSError as e:
            print(f"Failed to delete {file}. Reason: {e}")

# Remove the un-extractable files from the judgments folder.
# NOTE(review): the pattern is upper-case '*.PDF' while the download cell
# saves files with a lower-case '.pdf' suffix; on a case-sensitive file
# system this may match nothing — confirm.
folder_path, pattern = 'madras_judgments', '*.PDF'
delete_specific_files(folder_path=folder_path, pattern=pattern)


## moving the extracted judgments to extracted folder

In [None]:
import shutil
import os

# Move everything currently in 'madras' into 'orders/madras'
source_folder, destination_folder = 'madras', 'orders/madras'

os.makedirs(destination_folder, exist_ok=True)
files = os.listdir(source_folder)
for filename in files:
    shutil.move(
        os.path.join(source_folder, filename),
        os.path.join(destination_folder, filename),
    )

print("Files moved successfully!")
