In [1]:
import spacy
import PyPDF2
import re


nlp = spacy.load("en_core_web_sm")


nlp.max_length = 8000000  


def extract_text_from_pdf(pdf_path):
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            content = ""
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                content += page.extract_text()  
        return content
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return None


def extract_authors(text):
    doc = nlp(text)
    authors = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    return authors


def extract_publication_year(text):
    match = re.search(r"\b(19|20)\d{2}\b", text)
    return match.group(0) if match else None


def extract_methods_results(text):
   
    methods_match = re.search(r"(methods|methodology)(.*?)(results|conclusion|discussion)", text, re.IGNORECASE | re.DOTALL)
    results_match = re.search(r"(results)(.*?)(conclusion|discussion)", text, re.IGNORECASE | re.DOTALL)
    
    methods_section = methods_match.group(0) if methods_match else "Methods section not found"
    results_section = results_match.group(0) if results_match else "Results section not found"
    return methods_section, results_section


def process_paper(pdf_path):
    # Extract text from the paper
    text = extract_text_from_pdf(pdf_path)
    if not text:
        return None 
    
   
    authors = extract_authors(text)
    publication_year = extract_publication_year(text)
    methods, results = extract_methods_results(text)

   
    extracted_info = {
        "Authors": authors,
        "Publication Year": publication_year,
        "Methods": methods,
        "Results": results
    }

    return extracted_info

if __name__ == "__main__":
    pdf_paths = [
        "C:/Users/pruth/AppData/Roaming/Jupyter/runtime/MLpaper1.pdf",
        "C:/Users/pruth/AppData/Roaming/Jupyter/runtime/MLpapre2.pdf",
        "C:/Users/pruth/AppData/Roaming/Jupyter/runtime/MLpaper3.pdf"
    ]

    for pdf_path in pdf_paths:
        
        extracted_info = process_paper(pdf_path)
        
        if extracted_info:
            print(f"\nExtracted Information for {pdf_path}:")
            for key, value in extracted_info.items():
                print(f"{key}: {value if value else 'Not found'}")
        else:
            print(f"\nFailed to process {pdf_path}")

  


Extracted Information for C:/Users/pruth/AppData/Roaming/Jupyter/runtime/MLpaper1.pdf:
Authors: ['SUPRIYA V. MAHADEVKAR1', 'ABDELKAREIM GABRALLA', 'Nourah Bint', 'PNURSP2022R178', 'Zhongyi Guo\n ', 'https://creativecommons.org/licenses/by/4.0/107293S. V. Mahadevkar', 'Scipy', 'Matplotlib', 'Keras', 'Kadane', 'Tensor Flow', 'V. Mahadevkar', 'CV', 'Spam', 'V. Mahadevkar', 'CV', 'V. Mahadevkar', 'Denny M.R', 'Vanderplas', 'Pu', 'Chen', 'Chen', 'Vygotsky', 'James Britton', 'Seltzer Donald S.', 'Littlestone Nick', 'Vladimir Vapnik', 'Primary', 'Supervised', 'V. Mahadevkar', 'Supervised', 'V. Mahadevkar', 'V. Mahadevkar', 'MIL', 'V. Mahadevkar', 'V. Mahadevkar', 'V. Mahadevkar', 'AwA', 'V. Mahadevkar', 'V. Mahadevkar', 'Deep', 'V. Mahadevkar', 'V. Mahadevkar', '107309S. V. Mahadevkar', 'ed\ndatasets', 'Markov', 'V. Mahadevkar', 'Areas', 'V. Mahadevkar', 'V. Mahadevkar', 'V. Mahadevkar', 'Transfer', 'V. Mahadevkar', 'V. Mahadevkar', 'Talking Heads', 'Generative Model-Based', 'Accuracy', 'V. 