<a href="https://colab.research.google.com/github/shahab-f/proj-x-content-machine-formerly-blogger/blob/main/Cross_Document_Analysis_rev01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries
!pip install transformers accelerate bitsandbytes python-docx PyPDF2

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import os
from docx import Document
import PyPDF2
from google.colab import files

# Global variables
model_loaded = False
MODEL_NAME = "codellama/CodeLlama-7b-hf"
MAX_LENGTH = 500
MAX_NEW_TOKENS = 300
TEMPERATURE = 0.7
TOP_P = 0.95

def load_model_if_needed():
    global model_loaded, tokenizer, model, pipe
    if not model_loaded:
        print("Loading model...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        if torch.cuda.is_available():
            print("GPU is available. Using GPU.")
            model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
        else:
            print("GPU is not available. Using CPU. This might be very slow and may not work due to memory constraints.")
            model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", low_cpu_mem_usage=True)
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=MAX_LENGTH, device_map="auto")
        model_loaded = True
        print("Model loaded successfully.")
    else:
        print("Model already loaded. Skipping loading process.")

def analyze_document(document_text):
    load_model_if_needed()
    prompt = f"Analyze the following legal document and extract key information:\n\n{document_text[:2000]}\n\nKey information:"
    result = pipe(prompt, max_new_tokens=MAX_NEW_TOKENS, do_sample=True, temperature=TEMPERATURE, top_p=TOP_P, num_return_sequences=1)
    return result[0]['generated_text']

def compare_documents(analyses):
    load_model_if_needed()
    comparison_prompt = f"Compare the following document analyses and identify common themes, trends, and conflicting information:\n\n{analyses}\n\nComparison results:"
    result = pipe(comparison_prompt, max_new_tokens=MAX_NEW_TOKENS, do_sample=True, temperature=TEMPERATURE, top_p=TOP_P, num_return_sequences=1)
    return result[0]['generated_text']

def read_document(file_path):
    if file_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    elif file_path.endswith('.docx'):
        doc = Document(file_path)
        return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
    elif file_path.endswith('.pdf'):
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            return '\n'.join([page.extract_text() for page in pdf_reader.pages])
    else:
        raise ValueError(f"Unsupported file type: {file_path}")

# Main execution
if __name__ == "__main__":
    use_upload = input("Do you want to upload files? (yes/no): ").lower().strip() == 'yes'
    document_paths = []

    if use_upload:
        print("Please upload your files. Upload them one by one.")
        while True:
            uploaded = files.upload()
            if not uploaded:
                break
            document_paths.extend(uploaded.keys())
            more = input("Do you want to upload another file? (yes/no): ").lower().strip()
            if more != 'yes':
                break
    else:
        while True:
            path = input("Enter the path to your document (or press Enter to finish): ")
            if not path:
                break
            if os.path.exists(path):
                document_paths.append(path)
            else:
                print("The specified path does not exist. Please try again.")

        if not document_paths:
            print("No valid paths entered. Exiting.")
            exit()

    analyses = []
    for document_path in document_paths:
        try:
            if os.path.isfile(document_path):
                document_text = read_document(document_path)
                analysis = analyze_document(document_text)
                analyses.append(analysis)
                print(f"Analyzed {os.path.basename(document_path)}")
            elif os.path.isdir(document_path):
                for filename in os.listdir(document_path):
                    file_path = os.path.join(document_path, filename)
                    if file_path.endswith((".txt", ".docx", ".pdf")):
                        try:
                            document_text = read_document(file_path)
                            analysis = analyze_document(document_text)
                            analyses.append(analysis)
                            print(f"Analyzed {filename}")
                        except Exception as e:
                            print(f"Error processing {filename}: {str(e)}")
            else:
                print(f"The path {document_path} is neither a file nor a directory.")
        except Exception as e:
            print(f"Error processing {document_path}: {str(e)}")

    if analyses:
        if len(analyses) > 1:
            comparison_result = compare_documents("\n\n".join(analyses))
            print("\nComparison Result:")
            print(comparison_result)
        else:
            print("\nAnalysis Result:")
            print(analyses[0])
    else:
        print("No documents were successfully analyzed.")

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12

Saving on_en_sct_Mar2015.pdf to on_en_sct_Mar2015.pdf
Do you want to upload another file? (yes/no): yes


Saving Conv_mtg_security_info_en_Jan.pdf to Conv_mtg_security_info_en_Jan.pdf
Do you want to upload another file? (yes/no): no
Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

GPU is available. Using GPU.


config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=300) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Model loaded successfully.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=300) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Analyzed on_en_sct_Mar2015.pdf
Model already loaded. Skipping loading process.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=300) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Analyzed Conv_mtg_security_info_en_Jan.pdf
Model already loaded. Skipping loading process.

Comparison Result:
Compare the following document analyses and identify common themes, trends, and conflicting information:

Analyze the following legal document and extract key information:

  Page 1  
 
Ontario Standard Charge Terms  
March 2015   
 
Residential Mortgage  
Ontario  
 
Filing No. 201506 
 
 
 
Set of Standard Charge Terms  
Land Registration Reform Act  
Filed by Bank of Montreal  
The following set of standard charge terms shall be deemed to be included in every charge in which the 
set is referre d to by its filing number, as provided in section 9 of the Act.  
 
General Terms  
By entering into the mortgage with us, you promise to repay a loan and you give 
us security over property.  The security gives us a right to have the property used 
to pay what i s owed.  
 
The mortgage includes these General Terms . 
The General Terms show:  
 How you must pay the loan and what co