<a href="https://colab.research.google.com/github/poseidon-rust2/extract_income_statements/blob/main/extract_income_statements.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -Uqqq pip --progress-bar off
!pip install      pinecone-client
!pip install -qqq torch==2.0.1 --progress-bar off
!pip install -qqq transformers==4.32.1 --progress-bar off
!pip install -qqq datasets==2.14.4 --progress-bar off
!pip install -qqq peft==0.5.0 --progress-bar off
!pip install -qqq langchain==0.0.299 --progress-bar off
!pip install -qqq bitsandbytes==0.41.1 --progress-bar off
!pip install -qqq trl==0.7.1 --progress-bar off
!pip install -qqq xformers==0.0.21 --progress-bar off
!pip install -qqq sentence_transformers==2.2.2 --progress-bar off
!pip install -qqq tokenizers==0.14.0 --progress-bar off
!pip install -qqq optimum==1.13.1 --progress-bar off
!pip install -qqq auto-gptq==0.4.2 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ --progress-bar off
!pip install -qqq unstructured==0.10.16 --progress-bar off
!pip install pypdf

In [None]:
import json
import re
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)

from trl import SFTTrainer

# Use the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"


In [None]:
# Hub id of the base checkpoint. It is used to load the model, the tokenizer and the
# generation config further down.
# Alternative checkpoints, left commented out for reference:
#MODEL_NAME = "meta-llama/Llama-2-7b-hf"
#MODEL_NAME = "NousResearch/Llama-2-7b-hf"
#MODEL_NAME = "TheBloke/Llama-2-13b-Chat-GPTQ"
MODEL_NAME = "TheBloke/Llama-2-7B-GPTQ"

In [None]:
# Load the fine-tuning dataset; its "train" split is what the SFTTrainer cell consumes.
dataset = load_dataset("poseidon-rust2/income_statements_apple")
# Bare expression so the notebook displays the loaded dataset object.
dataset

In [None]:
# Authenticate this notebook session with the Hugging Face Hub
# (helper imported from `huggingface_hub`).
notebook_login()

In [None]:
from transformers import GPTQConfig
from peft import prepare_model_for_kbit_training
def create_model_and_tokenizer():
    """Load the quantized base model and its tokenizer for adapter training.

    Both are loaded from the module-level ``MODEL_NAME``. The model is loaded with a
    4-bit ``GPTQConfig`` (exllama disabled) and then passed through
    ``prepare_model_for_kbit_training``. The tokenizer reuses its EOS token as the
    padding token and pads on the right.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    gptq_config = GPTQConfig(bits=4, disable_exllama=True)
    load_kwargs = dict(
        use_safetensors=True,
        quantization_config=gptq_config,
        trust_remote_code=True,
        device_map="auto",
    )
    base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # The padding token is set to the EOS token, and padding goes on the right.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return prepare_model_for_kbit_training(base_model), tokenizer

In [None]:
# Build the quantized model and its tokenizer.
model, tokenizer = create_model_and_tokenizer()
# Turn off `use_cache` on the model config.
# NOTE(review): presumably because the cache is not wanted during training. The flag
# is never switched back on before the generation pipeline cell; confirm that is intended.
model.config.use_cache = False

In [None]:
# Display the quantization settings attached to the loaded model's config, as a dict.
model.config.quantization_config.to_dict()

In [None]:
# Directory used as `output_dir` for TrainingArguments, and later as the path the
# trained adapter model is reloaded from.
OUTPUT_DIR = "experiments"

In [None]:
# Load the TensorBoard notebook extension and open it on the training log directory.
# NOTE(review): the path is hard-coded; it presumably mirrors OUTPUT_DIR + "/runs".
# Keep the two in sync if OUTPUT_DIR changes.
%load_ext tensorboard
%tensorboard --logdir experiments/runs

In [None]:
# LoRA hyperparameters, kept as named values so they can be tuned in one place.
lora_r = 16
lora_alpha = 32
lora_dropout = 0.05

# Adapter configuration handed to SFTTrainer through `peft_config`.
# No `target_modules` is passed; an explicit list is kept commented out for reference.
# NOTE(review): presumably PEFT then falls back to its own default module selection
# for this architecture. Confirm.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    #target_modules=["k_proj","o_proj","q_proj","v_proj"],
)

In [None]:
training_arguments = TrainingArguments(
    # Output location, logging and checkpointing.
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    logging_steps=1,
    save_strategy="epoch",
    save_safetensors=True,
    # Batch geometry: one sequence per device step, gradients accumulated over 4 steps.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=0.3,
    num_train_epochs=13,
    fp16=True,
    # Evaluation is switched off.
    # NOTE(review): `eval_steps` presumably has no effect while the strategy is "no".
    evaluation_strategy="no",
    eval_steps=0.2,
    # Fixed seed for reproducibility.
    seed=42,
)

In [None]:
# Supervised fine-tuning trainer: it receives the quantized model and the LoRA config,
# and trains on the "text" field of the dataset's train split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=config,
    args=training_arguments,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    max_seq_length=3000,
)

In [None]:
# Release cached GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning with the arguments configured in `training_arguments`.
trainer.train()

In [None]:
# Save the trained model. No path is passed here.
# NOTE(review): presumably it is written to the trainer's `output_dir` (OUTPUT_DIR),
# which is where the reload cell below reads from. Confirm.
trainer.save_model()

In [None]:
# Bare expression so the notebook displays the trainer's model.
trainer.model

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the saved adapter model from OUTPUT_DIR onto the GPU.
# NOTE(review): none of the later active cells reference `trained_model`; the
# generation pipeline is built from `model`. Confirm which one is meant to be used.
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    low_cpu_mem_usage=True,
    device_map="cuda",
)

In [None]:
#inputs = tokenizer(dataset["train"][0]["text"], return_tensors="pt").to(DEVICE)
#outputs = trained_model.generate(**inputs, max_new_tokens=3000, temperature=0.001)
#print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
from langchain import HuggingFacePipeline
from transformers import GenerationConfig, pipeline

# Start from the generation defaults published with the base checkpoint, then override
# the sampling settings used for extraction.
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.do_sample = True
generation_config.temperature = 0.0001
generation_config.top_p = 0.95
generation_config.repetition_penalty = 1.15
generation_config.max_new_tokens = 1024

# NOTE(review): the pipeline is built from `model`, not from the `trained_model`
# reloaded from OUTPUT_DIR. Confirm this is the intended model object.
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
)

# Wrap the transformers pipeline so LangChain chains can use it as an LLM.
llm = HuggingFacePipeline(
    pipeline=text_pipeline,
    model_kwargs={"temperature": 0},
)

In [None]:
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer

import pinecone
import os

!gdown "https://drive.google.com/uc?id=1qu8vT4BH2UyVrfWyqCcQ6Ujj8Xu4CXhH"
loader = PyPDFLoader("/content/temp.pdf")
data = loader.load()

In [None]:
# Splitter configuration: chunks of up to 1400 characters, with 300 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1400,
    chunk_overlap=300,
)

In [None]:
# Split the loaded PDF documents into chunks; these are what gets embedded and indexed.
# Note: `docs` is rebound later by the similarity-search cell, so re-running the
# indexing cell after that point would index the search results instead of these chunks.
docs=text_splitter.split_documents(data)

In [None]:
from getpass import getpass

# Secrets are never stored in the notebook: each one is read from the environment and,
# if missing, requested interactively without echoing it to the output.
# Any credential that was ever committed in this cell must be treated as compromised
# and rotated with the provider.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY') or getpass("Pinecone API key: ")
# The Pinecone environment name is not a secret, so a default is acceptable here.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'asia-southeast1-gcp-free')

In [None]:
# Sentence-embedding model used to vectorize both the PDF chunks and the search query.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [None]:
# Name of the Pinecone index that receives the chunk embeddings.
index_name = "langchainpinecone"

# Configure the Pinecone client with the credentials resolved in the previous cell.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

In [None]:
# Embed the text of every chunk and store it in the Pinecone index.
# Only `page_content` is sent, so the chunks' metadata is not passed along.
docsearch = Pinecone.from_texts(
    [chunk.page_content for chunk in docs],
    embeddings,
    index_name=index_name,
)

In [None]:
# Retrieve the chunks most similar to the income-statement query.
# Note: this rebinds `docs` from the full list of PDF chunks to the search results,
# which are then passed to the QA chain.
query = "consolidated statements of operations, operating expenses, operating income"
docs = docsearch.similarity_search(query)

In [None]:
from langchain.chains.question_answering import load_qa_chain
# Build a question-answering chain around the local LLM, using LangChain's "stuff" chain type.
chain=load_qa_chain(llm, chain_type="stuff")

In [None]:
# Instruction given to the QA chain, together with the retrieved chunks.
information_to_extract = "Extract consolidated statements of operations over the years in a tsv formatted text."

# The call is the last expression of the cell, so the notebook displays the chain's answer.
chain.run(
    input_documents=docs,
    question=information_to_extract,
)