# RAG Algorithm

In [1]:
import re
import spacy
import pdfplumber 

def find_citations(pdf_file_path):
    """Extract citation strings from the text of a PDF, in document order.

    The PDF text is split into sentences with spaCy's ``en_core_web_sm``
    pipeline. Sentences containing ``"Name:"`` are skipped. For every other
    sentence:

    * if it contains the section symbol ``§``, the whole stripped sentence is
      recorded;
    * case-style citations ("X v. Y (...)") are collected with
      ``citation_pattern_1``; only when that pattern finds nothing is the
      upper-case pattern ``citation_pattern_2`` ("X V. Y") tried.

    Args:
        pdf_file_path: Path of the PDF file to read (passed to
            ``pdfplumber.open``).

    Returns:
        list[str]: The extracted citations, ordered by the sentence in which
        they were found.
    """
    nlp = spacy.load("en_core_web_sm")

    with pdfplumber.open(pdf_file_path) as pdf:
        # `or ""` guards against pages for which extract_text() yields
        # None/empty (e.g. pages with no extractable text), which would
        # otherwise break string concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next.
    document_text = "\n".join(page_texts)

    citation_pattern_1 = re.compile(r"[A-Za-z\s.'’&,]+v\.?\s[A-Za-z\s.'’&,]+\(.*?\)", re.DOTALL)
    citation_pattern_2 = re.compile(r"[A-Z\s.,]+[,\s]?[Vv]\.[A-Z\s.,]+")
    section_symbol = "§"

    doc = nlp(document_text)

    # Sentences are visited in document order, so appending directly keeps
    # the citations ordered; no index bookkeeping or sort is needed.
    citations = []
    for sentence in doc.sents:
        sentence_text = sentence.text
        if "Name:" in sentence_text:
            continue

        if section_symbol in sentence_text:
            citations.append(sentence_text.strip())

        matches = citation_pattern_1.findall(sentence_text)
        if matches:
            for match in matches:
                citation = match.strip()
                # The non-greedy `\(.*?\)` stops at the first ")", so a nested
                # group such as "(472 U.S. 509 (1983))" is cut one ")" short.
                # Re-add only the parentheses that are actually missing, so a
                # flat "(1990)" does not gain a stray ")".
                missing = citation.count("(") - citation.count(")")
                citations.append(citation + ")" * max(missing, 0))
        else:
            citations.extend(
                match.strip() for match in citation_pattern_2.findall(sentence_text)
            )

    return citations


In [2]:
# Run the extractor on the sample document and print a numbered list.
pdf_file_path = "RAG-Assignment.pdf"
extracted_citations = find_citations(pdf_file_path)
print("Extracted Citations:")
# enumerate(start=1) replaces the hand-maintained counter.
for k, citation in enumerate(extracted_citations, start=1):
    print(f"{k}. {citation}")

Extracted Citations:
1. IRC § 501(c)(3)
2. Bob Jones University v. Simon (472 U.S. 509 (1983))
3. Reynolds v. United States (98 F.3d 1127 (9th Cir. 1996))
4. Eastern Montana College of Education v. Helena (924 F.2d 1322 (9th Cir. 1991))
5. Glock v. Commissioner (79 T.C. 449 (1982))
6. Speakman v. Commissioner (823 F.2d 1168 (6th Cir. 1987))
7. Murdoch v. Commissioner (704 F.2d 1002 (9th Cir. 1983))
8. Texas Heart Hospital of St. Luke's Episcopal Health Charities, Inc. v. United States (978
F.2d 280 (5th Cir. 1992))
9. United States v. The Sanctuary (49 F.3d 572 (9th Cir. 1995))
10. .Hermitage Ministries, Inc. v. Commissioner (73 T.C. 1106 (1979))


In [3]:
# Run the extractor on a second PDF and print a numbered list.
# Raw string: "\D" is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning; the path value itself is unchanged.
pdf_file_path = r"C:\Datasets/012224zor_4gcj.pdf"
extracted_citations = find_citations(pdf_file_path)
print("Extracted Citations:")
# enumerate(start=1) replaces the hand-maintained counter.
for k, citation in enumerate(extracted_citations, start=1):
    print(f"{k}. {citation}")

Extracted Citations:
1. U.S. STEEL CORP. V. EPA, ET AL.
2. TEXAS V. NEW MEXICO, ET AL.
3. JORDAN, CHRISTINA V. HOWELL, KARLA
4. ERWIN, DANIEL V. HOWELL, KARLA
5. HUGHES, SAMUEL T. V. UNITED STATES
T
6. GLOSSIP, RICHARD E. V. OKLAHOMA
7. PORTER, STEPHEN R. V. BOARD OF TRUSTEES OF NC, ET AL.
8. ROSS, KRISTY V. FTC
9. ARCHER, DEVON V. UNITED STATES
10. YOAST, TERRENCE R. V. POTTSTOWN BOROUGH, PA, ET AL.
11. OTOH, PETER V. U.S. BANK TRUST, ET AL.
12. MERLO, KENDALL V. WARREN, INGRID
13. DE BOTTON, RAYMOND V. QUALITY LOAN SERV.
14. BAPTISTE, HAROLD V. DEPT.
15. SUPERIOR WELL SERVICES, INC. V. AMERICAN HOME ASSURANCE, ET AL.
16. ABDULLA, SOHAIL M. V. SOUTHERN BANK
17. KENNO, YOSEPH Y. V. CO GOVERNOR
18. SMITH, CALEB A. C. V. UNITED STATES
19. MILLER, DAISY V. UNITED STATES
20. DIAMOND J. WHOLESALE, LLC V. TOP TOBACCO, L.P., ET AL.
21. JOHNSON, DAQUAIL R. V. VIRGINIA
22. MOODY, JEREMY V. GEORGIA
23. LANGSTON, RICHARD V. CONNECTICUT
24. SIMMONS, KIRK A. V. SCARANTINO, THOMAS
25. AUSTIN, MARIO 

In [4]:
# Extracted 90 citations from a U.S. federal court document PDF.

## EVALUATION

#### On Sample Document

In [5]:
from fuzzywuzzy import fuzz

def evaluate_citations(extracted_citations, actual_citations, threshold=95, similarity=None):
    """Score extracted citations against ground truth using fuzzy matching.

    Two citations are considered the same when ``similarity(a, b)`` is at
    least ``threshold``.

    * Precision is the fraction of extracted citations that match at least
      one actual citation.
    * Recall is the fraction of actual citations matched by at least one
      extracted citation. It is computed from the actual citations
      themselves, so duplicate extractions of one citation cannot inflate it.

    Args:
        extracted_citations: Iterable of citation strings produced by the
            extractor.
        actual_citations: Iterable of ground-truth citation strings.
        threshold: Minimum similarity score for two citations to count as a
            match. Defaults to 95.
        similarity: Optional callable ``(str, str) -> number`` used to score
            a pair. Defaults to ``fuzz.ratio``.

    Returns:
        tuple: ``(precision, recall, f1_score)``. Each value is 0 when its
        denominator is zero.
    """
    if similarity is None:
        similarity = fuzz.ratio

    # Materialize once: both collections are traversed several times.
    extracted_citations = list(extracted_citations)
    actual_citations = list(actual_citations)

    matched_extracted = sum(
        1
        for extracted_citation in extracted_citations
        if any(
            similarity(extracted_citation, actual_citation) >= threshold
            for actual_citation in actual_citations
        )
    )
    matched_actual = sum(
        1
        for actual_citation in actual_citations
        if any(
            similarity(actual_citation, extracted_citation) >= threshold
            for extracted_citation in extracted_citations
        )
    )

    total_extracted = len(extracted_citations)
    total_actual = len(actual_citations)

    precision = matched_extracted / total_extracted if total_extracted > 0 else 0
    recall = matched_actual / total_actual if total_actual > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score



# Ground-truth citations for the sample document, one per line.
actual_citation = [
    "IRC § 501(c)(3)",
    "Bob Jones University v. Simon (472 U.S. 509 (1983))",
    "Reynolds v. United States (98 F.3d 1127 (9th Cir. 1996))",
    "Eastern Montana College of Education v. Helena (924 F.2d 1322 (9th Cir. 1991))",
    "Glock v. Commissioner (79 T.C. 449 (1982))",
    "Speakman v. Commissioner (823 F.2d 1168 (6th Cir. 1987))",
    "Murdoch v. Commissioner (704 F.2d 1002 (9th Cir. 1983))",
    "Texas Heart Hospital of St. Luke's Episcopal Health Charities, Inc. v. United States (978 F.2d 280 (5th Cir. 1992))",
    "United States v. The Sanctuary (49 F.3d 572 (9th Cir. 1995))",
    "Hermitage Ministries, Inc. v. Commissioner (73 T.C. 1106 (1979))",
]

# Re-extract from the sample PDF and score the result against the ground truth.
extracted = find_citations("RAG-Assignment.pdf")
precision, recall, f1_score = evaluate_citations(extracted, actual_citation)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)



Precision: 1.0
Recall: 1.0
F1-score: 1.0


##### The code above uses the fuzz ratio to count true positives, false positives and false negatives: an extracted citation is treated as matching an actual citation when their fuzz ratio is at least 95 on a 0–100 scale (high similarity). This ensures that slight formatting differences between the extracted citations and the actual citations don't distort the precision, recall and F1-score calculation.

### Simple Evaluation

In [6]:
def evaluate_citations(extracted_citations, actual_citations):
    """Score extracted citations against ground truth using exact matching.

    Both inputs are reduced to sets, so duplicates are ignored and only
    string-identical citations count as matches.

    Args:
        extracted_citations: Iterable of extracted citation strings.
        actual_citations: Iterable of ground-truth citation strings.

    Returns:
        tuple: ``(precision, recall, f1_score)``; each value is 0 when its
        denominator is zero.
    """
    extracted_set = set(extracted_citations)
    actual_set = set(actual_citations)

    true_pos = len(extracted_set & actual_set)
    false_pos = len(extracted_set - actual_set)
    false_neg = len(actual_set - extracted_set)

    predicted_total = true_pos + false_pos
    relevant_total = true_pos + false_neg

    precision = 0
    if predicted_total > 0:
        precision = true_pos / predicted_total

    recall = 0
    if relevant_total > 0:
        recall = true_pos / relevant_total

    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    return precision, recall, f1_score

# Ground-truth citations for the sample document, one per line.
actual_citation = [
    "IRC § 501(c)(3)",
    "Bob Jones University v. Simon (472 U.S. 509 (1983))",
    "Reynolds v. United States (98 F.3d 1127 (9th Cir. 1996))",
    "Eastern Montana College of Education v. Helena (924 F.2d 1322 (9th Cir. 1991))",
    "Glock v. Commissioner (79 T.C. 449 (1982))",
    "Speakman v. Commissioner (823 F.2d 1168 (6th Cir. 1987))",
    "Murdoch v. Commissioner (704 F.2d 1002 (9th Cir. 1983))",
    "Texas Heart Hospital of St. Luke's Episcopal Health Charities, Inc. v. United States (978 F.2d 280 (5th Cir. 1992))",
    "United States v. The Sanctuary (49 F.3d 572 (9th Cir. 1995))",
    "Hermitage Ministries, Inc. v. Commissioner (73 T.C. 1106 (1979))",
]

# Re-extract from the sample PDF and score it with the exact-match evaluator.
extracted = find_citations("RAG-Assignment.pdf")
precision, recall, f1_score = evaluate_citations(extracted, actual_citation)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)

Precision: 0.8
Recall: 0.8
F1-score: 0.8000000000000002


##### This one gives 0.8 because the exact-match code treats "Texas Heart Hospital of St. Luke's Episcopal Health Charities, Inc. v. United States (978 F.2d 280 (5th Cir. 1992))" (extracted with a line break after "(978") and ".Hermitage Ministries, Inc. v. Commissioner (73 T.C. 1106 (1979))" (extracted with a leading period) in the extracted citations as false positives.