In [1]:
import pandas as pd
import regex as re
import PyPDF2
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [9]:
def extract_text_from_pdf(pdf_path):
    """Read a PDF file and return the extracted text of each page.

    Parameters
    ----------
    pdf_path : str or path-like
        Location of the PDF; the file is opened in binary mode.

    Returns
    -------
    list or None
        One entry per page, in page order, holding the result of
        ``page.extract_text()`` for that page. ``None`` is returned when any
        exception is raised while opening or reading the file; in that case
        the error message is printed rather than propagated.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Collect the text of every page, preserving page order
            return [pdf_page.extract_text() for pdf_page in reader.pages]
    except Exception as e:
        # Best-effort: report the problem and signal failure with None
        print(f"An error occurred: {str(e)}")
        return None


In [10]:
# Example usage: extract the PDF and tabulate one row per page
pdf_path = r'D:\Proyectos\nllp_antitrust.v1\SHS. Casenote Legal Briefs - Casenote Legal Briefs_ Antitrust Keyed to Sullivan, Hovencamp & S.pdf'
extracted_text_list = extract_text_from_pdf(pdf_path)

if not extracted_text_list:
    # Reached for a None result (an error was caught) and for an empty list
    print("Failed to extract text from the PDF.")
else:
    # 'Page' is 1-based and follows the order of the extracted pages
    df = pd.DataFrame(
        {
            'Page': range(1, len(extracted_text_list) + 1),
            'Text': extracted_text_list,
        }
    )

    # Show the resulting frame
    print(df)

    # The frame can also be written to CSV if needed, e.g.
    # df.to_csv('output.csv', index=False)


     Page                                               Text
0       1                                                   
1       2                                                  2
2       3  This publication is designed to provide accura...
3       4  About Wolters Kluwer Law & Business\nWolters K...
4       5                                                  5
..    ...                                                ...
248   249  Common Latin Words and Phrases Encountered in ...
249   250  LEX LOCI:\n The law of the place; the notion t...
250   251  Casenote Legal Briefs\nAdministrative Law\nBre...
251   252  Contracts\nDawson, Harvey, Henderson & Baird\n...
252   253  International Law\nDamrosch, Henkin, Murphy & ...

[253 rows x 2 columns]


In [11]:
# Persist the raw per-page extraction to CSV (without the index column);
# the next phase reads this file back.
df.to_csv(path_or_buf=r'D:\Proyectos\nllp_antitrust.v1\book_raw.csv', index=False)

Phase 2

We will clean the raw CSV: drop pages with no extracted text and keep only pages 17–248.

In [13]:
# Reload the raw per-page extraction saved at the end of the previous phase
df_raw = pd.read_csv(filepath_or_buffer=r'D:\Proyectos\nllp_antitrust.v1\book_raw.csv')

In [27]:
# Drop the rows whose 'Text' is NaN (pages for which no text was read back
# from the CSV).
# The result is rebound to df_raw instead of using inplace=True: dropna then
# returns a new frame and the object loaded by read_csv is not mutated, which
# keeps the cell free of hidden in-place state.
df_raw = df_raw.dropna(subset=['Text'])

In [28]:
# Display df_raw as the cell output to inspect its current rows
df_raw

Unnamed: 0,Page,Text
1,2,2
2,3,This publication is designed to provide accura...
3,4,About Wolters Kluwer Law & Business\nWolters K...
4,5,5
5,6,Aspen Publishers is proud to offer \nCasenote ...
...,...,...
248,249,Common Latin Words and Phrases Encountered in ...
249,250,LEX LOCI:\n The law of the place; the notion t...
250,251,Casenote Legal Briefs\nAdministrative Law\nBre...
251,252,"Contracts\nDawson, Harvey, Henderson & Baird\n..."


In [37]:
# Display the current contents of df_raw as the cell output
df_raw

Unnamed: 0,Page,Text
0,1,
1,2,2
2,3,This publication is designed to provide accura...
3,4,About Wolters Kluwer Law & Business\nWolters K...
4,5,5
...,...,...
248,249,Common Latin Words and Phrases Encountered in ...
249,250,LEX LOCI:\n The law of the place; the notion t...
250,251,Casenote Legal Briefs\nAdministrative Law\nBre...
251,252,"Contracts\nDawson, Harvey, Henderson & Baird\n..."


In [44]:
# Keep only pages 17 through 248 (both bounds included), then write the
# subset to book.csv without the index column.
df_raw = df_raw[df_raw['Page'].between(17, 248)]
df_raw.to_csv(path_or_buf=r'D:\Proyectos\nllp_antitrust.v1\book.csv', index=False)

Phase 3

In this section we will split each page's text into columns that match the sections of a case brief: case name, nature of case, fact summary, and rule of law.

In [2]:
# Load the page-filtered CSV written in the previous phase and display it
book = pd.read_csv(filepath_or_buffer=r'D:\Proyectos\nllp_antitrust.v1\book.csv')
book

Unnamed: 0,Page,Text
0,17,United States v. Trans-Missouri Freight Assn.\...
1,18,United States v. Addyston Pipe & Steel Co.\nFe...
2,19,19
3,20,Quick Reference Rules of Law\n \n1.\nIntersta...
4,21,conduct. (Concord Boat Corp. v. Brunswick Corp...
...,...,...
227,244,Hallie v. City of Eau Claire\nUnincorporated t...
228,245,Quicknotes\nINJUNCTIVE RELIEF\n A court order ...
229,246,City of Columbia & Columbia Outdoor Advertisin...
230,247,247


In [3]:
# Case heading: everything from the start of the page text up to (but not
# including) the 'NATURE OF CASE:' marker. DOTALL lets '.' match newlines, so
# a heading spread over several lines is captured whole. Pages without the
# marker get NaN.
# NOTE(review): `re` is the third-party `regex` module here (see the import
# cell) while the flag is passed to pandas — confirm it is interpreted as
# intended.
pattern = r'^(.*?)(?=NATURE OF CASE:)'
book['case_name'] = book['Text'].str.extract(
    pat=pattern,
    flags=re.DOTALL,
    expand=False,  # single capture group -> a Series, assigned as one column
)


In [4]:
# Nature of the case: the text between the 'NATURE OF CASE:' marker and the
# following 'FACT SUMMARY:' marker (markers excluded). DOTALL lets '.' match
# newlines; pages lacking either marker get NaN.
pattern = r'NATURE OF CASE:(.*?)(?=FACT SUMMARY:)'
book['nature_of_case'] = book['Text'].str.extract(
    pat=pattern,
    flags=re.DOTALL,
    expand=False,  # single capture group -> a Series, assigned as one column
)


In [5]:
# Fact summary: the text between the 'FACT SUMMARY:' marker and the following
# 'RULE OF LAW' marker (markers excluded). DOTALL lets '.' match newlines;
# pages lacking either marker get NaN.
pattern = r'FACT SUMMARY:(.*?)(?=RULE OF LAW)'
book['fact_summary'] = book['Text'].str.extract(
    pat=pattern,
    flags=re.DOTALL,
    expand=False,  # single capture group -> a Series, assigned as one column
)

In [6]:
# Rule of law: the text that follows the 'RULE OF LAW' line (the marker must
# be immediately followed by a newline) up to the 'FACTS:' marker, markers
# excluded. DOTALL lets '.' match newlines; pages lacking either marker get NaN.
pattern = r'RULE OF LAW\n(.*?)(?=FACTS:)'
book['rule_of_law'] = book['Text'].str.extract(
    pat=pattern,
    flags=re.DOTALL,
    expand=False,  # single capture group -> a Series, assigned as one column
)

In [7]:
book

Unnamed: 0,Page,Text,case_name,nature_of_case,fact_summary,rule_of_law
0,17,United States v. Trans-Missouri Freight Assn.\...,United States v. Trans-Missouri Freight Assn.\...,\n Review of dismissal of antitrust enforcemen...,\n Trans-Missouri Freight Association (D) cont...,The Sherman Act’s prohibition on agreements in...
1,18,United States v. Addyston Pipe & Steel Co.\nFe...,United States v. Addyston Pipe & Steel Co.\nFe...,\n Action to dissolve a combination which soug...,\n Addyston Pipe & Steel Co. (D) and other pip...,A combination with the sole purpose to regulat...
2,19,19,,,,
3,20,Quick Reference Rules of Law\n \n1.\nIntersta...,,,,
4,21,conduct. (Concord Boat Corp. v. Brunswick Corp...,,,,
...,...,...,...,...,...,...
227,244,Hallie v. City of Eau Claire\nUnincorporated t...,Hallie v. City of Eau Claire\nUnincorporated t...,\n Appeal from denial of injunctive relief.\n,\n The Towns (P) appealed from a court of appe...,A municipality’s anticompetitive activities ar...
228,245,Quicknotes\nINJUNCTIVE RELIEF\n A court order ...,,,,
229,246,City of Columbia & Columbia Outdoor Advertisin...,City of Columbia & Columbia Outdoor Advertisin...,"\n Appeal of reversal of judgment n.o.v., rein...","\n Omni Outdoor Advertising, Inc. (P) contende...","A city may, through regulation, give a busines..."
230,247,247,,,,


In [180]:
# Write the frame, including the extracted section columns, to corpus.csv
# without the index column.
book.to_csv(path_or_buf=r'D:\Proyectos\nllp_antitrust.v1\corpus.csv', index=False)

In [181]:
# Print a summary of book: index range, columns, non-null counts and dtypes
book.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Page            232 non-null    int64 
 1   Text            232 non-null    object
 2   case_name       109 non-null    object
 3   nature_of_case  109 non-null    object
 4   fact_summary    109 non-null    object
 5   rule_of_law     109 non-null    object
dtypes: int64(1), object(5)
memory usage: 11.0+ KB


In [170]:
# Pull the full, untruncated page text of the row at position 231 and show it
# as the cell output.
observacion = book['Text'].iat[231]
observacion

'FTC v. Ticor Title Insurance Co.\nFederal agency (P) v. Insurance Co. (D)\n504 U.S. 621 (1992).\nNATURE OF CASE:\n Appeal from judgment dismissing administrative complaint filed by the Federal Trade\nCommission (FTC).\nFACT SUMMARY:\n After Ticor Title Insurance Co. (D) had its rates set by a title insurance rating bureau\nlicensed by the state the FTC (P) filed a complaint alleging that the rating system constituted price-fixing.\nRULE OF LAW\nIn order to receive immunity from the application of antitrust laws, a state price regulatory system\nmust be under the active supervision of the state.\nFACTS:\n Four states established title insurance rating bureaus which jointly filed insurance rates for Ticor\nTitle Insurance Co. (Ticor) (D) and other insurance companies (D). Under the system in question, the bureau\nwould file the rates, and, if the state did not reject them within thirty days, the rates would go into effect.\nAlthough the mechanisms for review of rates were in place, any 