## Get Access Token for Service Principal

In [18]:
import os
import adal
import requests
from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient
import pandas as pd
from pypdf import PdfReader, PdfWriter
import io
import time
import openai
from openai import OpenAI
import json





In [2]:
# Load deployment settings from the local `env` module and copy every entry into
# the process environment (presumably so the `!az ...` shell cells can read them
# as environment variables -- confirm).
from env import env_vars
import env
for k,v in env_vars.items():
    os.environ[k] = v
    
    
# Azure resource identifiers used by the cells below.
subscription_id = env_vars['subscription_id']
resource_group  = env_vars['resource_group']
storage  = env_vars['storage']


# Service principal that has been granted a Blob Storage contributor role.
service_principal_id  = env_vars['service_principal_id']
service_principal_tenant_id  = env_vars['service_principal_tenant_id']
service_principal_client_id  = env_vars['service_principal_client_id']
service_principal_client_secret  = env_vars['service_principal_client_secret']

# Local PDF file name (also used as the blob name) and the target container.
blob_name = env_vars["blob_name"]
container_name = env_vars["container_name"]



# Module-level OpenAI settings for an Azure OpenAI resource.
# NOTE(review): `api_base` and `azure_endpoint` are both set to the same value,
# presumably to cover different openai SDK versions -- confirm which attribute
# the installed SDK actually reads.
openai.api_type = "azure"
openai.api_version = "2023-07-01-preview"
openai.api_base = env_vars["openai_api_base"]
openai.azure_endpoint = env_vars["openai_api_base"]
openai.api_key = env_vars["openai_api_key"]



## Create an App Registration and give it the appropriate permissions on the subscription

In [4]:
class RAG_Auth:
    """Service-principal credentials in the two forms the notebook needs.

    Constructing an instance acquires a bearer token for
    ``https://management.azure.com/`` through ADAL's client-credentials flow
    and builds a ``ClientSecretCredential`` from the same tenant id, client id
    and client secret. The token is fetched once, at construction time.
    """

    def __init__(self, service_principal_tenant_id, service_principal_client_id, service_principal_client_secret):
        auth_context = adal.AuthenticationContext(
            f'https://login.microsoftonline.com/{service_principal_tenant_id}'
        )
        token_response = auth_context.acquire_token_with_client_credentials(
            'https://management.azure.com/',
            service_principal_client_id,
            service_principal_client_secret,
        )
        # Bearer token string used for management-plane REST calls.
        self.access_token = token_response['accessToken']
        # Credential object handed to the Azure SDK clients.
        self.credential = ClientSecretCredential(
            service_principal_tenant_id,
            service_principal_client_id,
            service_principal_client_secret,
        )

    def get_access_token(self):
        """Return the access token string acquired in ``__init__``."""
        return self.access_token

    def get_credential(self):
        """Return the ``ClientSecretCredential`` created in ``__init__``."""
        return self.credential
# Extract the access token



In [5]:
class RAG_Req:
    """Small wrapper around ``requests`` for bearer-authenticated JSON calls.

    Args:
        access_token: Token string placed in the ``Authorization: Bearer`` header
            of every request sent through this object.
    """

    def __init__(self, access_token):
        self.headers = {
            'Authorization': 'Bearer ' + access_token,
            'Content-Type': 'application/json'
        }

    def proc_response(self, response):
        """Decode a response.

        Returns:
            The parsed JSON body when the status code is 200. For any other
            status the code and body are printed and ``None`` is returned.
        """
        if response.status_code == 200:
            return response.json()

        print("Request failed with status code:", response.status_code)
        print("Response content:", response.text)
        return None

    def get_req(self, url):
        """Send a GET to ``url`` and return ``proc_response`` of the reply."""
        # The headers live on the instance; there is no module-level `headers`.
        response = requests.get(url, headers=self.headers)
        return self.proc_response(response)

    def post_req(self, url, body=""):
        """Send a POST to ``url`` with ``body`` as the request payload.

        Returns ``proc_response`` of the reply.
        """
        # `requests.post` takes the payload as `data=`; it has no `body=` keyword.
        response = requests.post(url, headers=self.headers, data=body)
        return self.proc_response(response)

In [6]:
from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient, BlobClient


class RAG_Blob:
    """Blob Storage helper for uploading a PDF, splitting it into pages and reading it back.

    Args:
        service_principal_tenant_id: Tenant id of the service principal.
        service_principal_client_id: Client (application) id of the service principal.
        service_principal_client_secret: Client secret of the service principal.
        storage: Storage account name; the account URL is
            ``https://{storage}.blob.core.windows.net``.
    """

    def __init__(self, service_principal_tenant_id, service_principal_client_id, service_principal_client_secret, storage):
        rag_auth = RAG_Auth(service_principal_tenant_id, service_principal_client_id, service_principal_client_secret)

        self.credential = rag_auth.get_credential()
        self.blob_service_client = BlobServiceClient(account_url=f"https://{storage}.blob.core.windows.net", credential=self.credential)

        # REST helper for management-plane calls, authenticated with the bearer token.
        self.rag_req = RAG_Req(rag_auth.get_access_token())

    def create_container(self, container_name):
        """Create ``container_name`` if it does not exist yet.

        The outcome (created / already exists / error) is printed; creation
        errors are not raised.
        """
        container_client = self.blob_service_client.get_container_client(container_name)
        if container_client.exists():
            print(f"Container '{container_name}' already exists.")
            return

        try:
            container_client.create_container()
            print(f"Container '{container_name}' created successfully.")
        except Exception as e:
            print(f"An error occurred: {e}")

    def upload_blob(self, container_name, blob_name):
        """Upload the local file at path ``blob_name`` to a blob of the same name.

        An existing blob is overwritten. Errors (including a missing local
        file) are printed, not raised.
        """
        container_client = self.blob_service_client.get_container_client(container_name)
        blob_client = container_client.get_blob_client(blob_name)
        try:
            with open(blob_name, "rb") as data:
                blob_client.upload_blob(data, overwrite=True)

            print(f"Blob '{blob_name}' uploaded successfully.")
        except Exception as e:
            print(f"An error occurred: {e}")

    def list_files(self, in_container_name):
        """Return the names of all blobs in the container as a JSON array string."""
        container_client = self.blob_service_client.get_container_client(in_container_name)
        return json.dumps([blob.name for blob in container_client.list_blobs()])

    def get_blob_key(self, subscription_id, resource_group, form_recognizer):
        """Return ``key1`` of the Cognitive Services account named ``form_recognizer``.

        Despite the method name, the URL targets
        ``Microsoft.CognitiveServices/accounts/{form_recognizer}/listKeys``.

        Raises:
            TypeError: if the request fails, because the request helper
                returns ``None`` for non-200 responses.
        """
        url = f'https://management.azure.com/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.CognitiveServices/accounts/{form_recognizer}/listKeys?api-version=2023-05-01'
        # `listKeys` is an ARM action and is documented as a POST operation.
        response = self.rag_req.post_req(url)
        return response["key1"]

    def blob_name_from_file_page(self, filename, page=0):
        """Build the chunk blob name for one page of ``filename``.

        For a ``.pdf`` (case-insensitive) the result is ``<basename>-<page>.pdf``;
        for any other extension it is just the base name of ``filename``.
        """
        stem, extension = os.path.splitext(os.path.basename(filename))
        if extension.lower() == ".pdf":
            return stem + f"-{page}" + ".pdf"
        return os.path.basename(filename)

    def upload_blob_data(self, container_name, blob_name, io_blob_data):
        """Upload ``io_blob_data`` to ``blob_name``, overwriting any existing blob."""
        container_client = self.blob_service_client.get_container_client(container_name)
        container_client.upload_blob(blob_name, io_blob_data, overwrite=True)

    def blob_exists(self, container_name, blob_name):
        """Return whether ``blob_name`` exists in ``container_name``."""
        blob_client = self.blob_service_client.get_blob_client(container_name, blob_name)
        return blob_client.exists()

    def create_upload_page_chunks(self, chunk_container_name, blob_name):
        """Split the local PDF ``blob_name`` into single-page PDFs and upload them.

        Each page becomes a blob named by ``blob_name_from_file_page``. Pages
        whose chunk blob already exists are skipped.
        """
        # Parse the source PDF once and reuse its pages for every chunk.
        reader = PdfReader(blob_name)
        for i, page in enumerate(reader.pages):
            chunk_blob_name = self.blob_name_from_file_page(blob_name, i)
            if self.blob_exists(chunk_container_name, chunk_blob_name):
                continue

            f = io.BytesIO()
            writer = PdfWriter()
            writer.add_page(page)
            writer.write(f)
            f.seek(0)  # rewind so the upload reads from the start of the buffer
            print(chunk_container_name, ":", chunk_blob_name)
            self.upload_blob_data(chunk_container_name, chunk_blob_name, f)

    def get_pdf_content(self, in_container_name, in_file_name):
        """Download a PDF blob and return the text of its pages joined by newlines."""
        container_client = self.blob_service_client.get_container_client(in_container_name)
        blob_client = container_client.get_blob_client(in_file_name)
        blob_data = blob_client.download_blob().readall()

        pdf_reader = PdfReader(io.BytesIO(blob_data))
        page_text = [page.extract_text() for page in pdf_reader.pages]
        return "\n".join(page_text)
    

In [7]:
def get_merged_summary(user_content, model="gpt432k"):
    """Ask the chat model to fold a new section into a running summary.

    Args:
        user_content: Prompt text containing the summary of earlier sections
            and the content of the current section.
        model: Name passed as ``model`` to the chat completion call.

    Returns:
        The revised summary text. If the call fails, the error is printed and
        an empty string is returned, so the result is always a ``str`` and can
        safely be interpolated into the next prompt or joined.
    """
    system_content = """You are a GPT designed to summarize large documents efficiently. 
You will receive two types of inputs: 1. A summary of earlier sections of the document, 
and 2. The content of the current section of the document. 
Your primary task is to create a revised summary by effectively combining the summary of earlier sections with the content of the current section. 
The output you provide, which is the revised summary, should be concise and focused solely on the content provided, without any additional comments or remarks. 
This output will then be used as the 'Summary of earlier sections' for the subsequent sections of the document."""
    message_text = [{"role": "system", "content": system_content},
                    {"role": "user", "content": user_content}]
    try:
        completion = openai.chat.completions.create(
            model=model,
            messages=message_text,
            temperature=0.6,
            max_tokens=1000,
            top_p=0.95,
            frequency_penalty=0.5,
            presence_penalty=0.5,
            stop=None
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Best effort: report the failure and fall back to an empty summary.
        print(e)
        return ""
    
def get_doc_summary(user_content, model="gpt432k"):
    """Ask the chat model for a concise summary of ``user_content``.

    Args:
        user_content: Document text to summarize; it is embedded in the user
            message as ``Document Content: ... Summary:``.
        model: Name passed as ``model`` to the chat completion call.

    Returns:
        The summary text. If the call fails, the error is printed and an empty
        string is returned, so the result is always a ``str`` and can safely be
        joined with other summaries.
    """
    system_content = """You are an assistant designed to summarize large documents efficiently. 
The output you provide, should be concise and focused solely on the content provided, without any additional comments or remarks. 
"""
    message_text = [{"role": "system", "content": system_content},
                    {"role": "user",
                     "content": f"Document Content: {user_content} \nSummary:"}]
    try:
        completion = openai.chat.completions.create(
            model=model,
            messages=message_text,
            temperature=0.6,
            max_tokens=1000,
            top_p=0.95,
            frequency_penalty=0.5,
            presence_penalty=0.5,
            stop=None
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Best effort: report the failure and fall back to an empty summary.
        print(e)
        return ""

In [8]:
def revise_file_name(fname):
    """Build a sort key that puts the zero-padded page number first.

    ``"report-7.pdf"`` becomes ``"file_007_report.pdf"``. The page number is
    taken from after the LAST hyphen, so base names that themselves contain
    hyphens are handled. A name without a ``-<page>`` marker is treated as
    page 0 instead of raising ``IndexError``.

    Args:
        fname: Chunk blob name, normally ``<base>-<page>.<suffix>``.

    Returns:
        ``file_<page padded to 3 digits>_<base>.<suffix>``.
    """
    base, hyphen, tail = fname.rpartition("-")
    if hyphen:
        page_no, dot, suffix = tail.partition(".")
    else:
        # No page marker at all: keep the name intact and sort it as page 0.
        page_no = "0"
        base, dot, suffix = fname.partition(".")
    return "file" + "_" + page_no.zfill(3) + "_" + base + dot + suffix

def get_sorted_file_list(file_list):
    """Return the file names ordered by the key built by ``revise_file_name``.

    Args:
        file_list: Either a JSON array string of file names or an
            already-parsed iterable of names.

    Returns:
        A NumPy array of the original file names in sorted order.
    """
    # Parse with json.loads rather than eval: the input is data, and eval would
    # execute arbitrary expressions found in it.
    if isinstance(file_list, str):
        names = json.loads(file_list)
    else:
        names = list(file_list)

    df = pd.DataFrame({
        "fname": names,
        "revised_fname": [revise_file_name(name) for name in names],
    })
    return df.sort_values("revised_fname")["fname"].values

## Create the following resources:
1. Cognitive Search resource
2. Storage Account resource
3. Form Recognizer resource
4. Cognitive Services resource

In [9]:
# Sign in with the Azure CLI; the command's stdout is written to the local file `x`.
!az login > x



In [28]:
# Select the active subscription by a hard-coded name; stdout is appended to `x`.
# NOTE(review): the `subscription_id` read in the config cell is not used here --
# confirm both refer to the same subscription.
!az account set --subscription "MCAPS-Hybrid-REQ-66361-2023-shagrawal" >> x

In [29]:
# Append the details of the currently selected subscription to `x`.
!az account show >> x

In [30]:
# Create the resource group. `%resource_group%` and `%location%` are written in
# Windows cmd environment-variable syntax; the config cell copies every
# `env_vars` entry into `os.environ`.
# NOTE(review): this needs a `location` entry in `env_vars` and a Windows shell
# to expand the variables -- confirm.
!az group create --name %resource_group% --location %location% >> x

## Give the Service Principal the Storage Blob Data Contributor role
Note: the role assignment took some time to take effect.
https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles


## Upload documents to Storage Account Resource

In [13]:
# Base URL of the storage account's blob endpoint.
# NOTE(review): no other cell visible in this notebook reads this variable.
storage_blob_url = f"https://{storage}.blob.core.windows.net/"

In [14]:
blob_name

'informationmemorandumjfsl.pdf'

In [15]:
# Authenticate as the service principal, make sure the target container exists,
# then upload the local file named `blob_name` to a blob of the same name.
rag_blob = RAG_Blob(service_principal_tenant_id, service_principal_client_id, service_principal_client_secret, storage)
rag_blob.create_container(container_name)
rag_blob.upload_blob(container_name, blob_name)

Container 'rildata' already exists.
Blob 'informationmemorandumjfsl.pdf' uploaded successfully.


## Create a pipeline to Chunk documents
https://github.com/Azure-Samples/azure-search-power-skills/blob/main/Vector/EmbeddingGenerator/chunker/text_chunker.py

In [16]:
# Page chunks go to a sibling container named "<container_name>pagechunks".
# The local PDF is split into single-page PDFs; pages whose chunk blob already
# exists are skipped, so re-running this cell only uploads missing pages.
chunk_container_name = container_name + "pagechunks"
rag_blob.create_container(chunk_container_name)
rag_blob.create_upload_page_chunks(chunk_container_name, blob_name)

Container 'rildatapagechunks' already exists.


In [36]:

# `list_files` returns a JSON string of blob names; `get_sorted_file_list` turns
# it into an array ordered by the zero-padded page-number key. Note that
# `file_list` changes type between these two lines (str -> array).
file_list = rag_blob.list_files(chunk_container_name)
file_list = get_sorted_file_list(file_list)

In [37]:
# Disabled debugging switch: the condition is always False, so the list is never
# truncated. Make it true to limit a trial run to the first 20 chunks.
if 1==2:
    file_list = file_list[0:20]

In [25]:
# Rolling summary: each page chunk is sent to the model together with the
# summary produced so far, and the reply becomes the summary for the next chunk.
summary = ""

for file in file_list:
    print(file)
    file_content = rag_blob.get_pdf_content(in_container_name=chunk_container_name, in_file_name=file)
        
    summary = get_merged_summary(f"Summary of earlier sections:{summary} \nContent of current section:{file_content} \nRevised Summary:")
    # Pause 10 seconds between calls -- presumably to stay under the service's
    # rate limit; confirm.
    time.sleep(10)
    print(summary)

informationmemorandumjfsl-0.pdf
Jio Financial Services Limited, originally incorporated as Reliance Strategic Investments Private Limited in 1999, later converted to a public limited company in 2002. It was renamed as Jio Financial Services Limited following a Scheme of Arrangement in 2023. The company is headquartered in Mumbai and its promoter is Mukesh D. Ambani. It aims to list 635,32,84,188 equity shares with a face value of ₹10 each, allotted by the company pursuant to the Scheme of Arrangement. The shares are expected to be listed on both BSE and NSE stock exchanges. The Information Memorandum for this listing contains all material information about the company and its investment risks. Kfin Technologies Limited is the registrar and share transfer agent for the company's shares.

informationmemorandumjfsl-1.pdf
Jio Financial Services Limited, formerly Reliance Strategic Investments Private Limited, was incorporated in 1999 and converted to a public limited company in 2002. Follo

Jio Financial Services Limited, established in 1999 and renamed in 2023, plans to list over 635 billion equity shares on BSE and NSE stock exchanges. Key definitions include JFSL's subsidiaries, joint ventures; RIL; audit committee; statutory auditors; Depository registered with SEBI under Securities regulations (1996), Equity Shares as fully paid-up equity shares allotted by the company, "NBFC -ND-SI” or “Systemically Important NBFCs”, a non-banking financial company registered with RBI, "NCLT" or "Tribunal" referring to The National Company Law Tribunal, Mumbai Bench.

Additional terminologies include Net Worth as per Regulation 2(1)(hh ) of the SEBI ICDR Regulations, promoter of the Company being Shri Mukesh D. Ambani, Record Date as July 20, 2023 for determining shareholders of RIL for issue of new equity shares. The Registered Office location and Registrar and Share Transfer Agent is identified as Kfin Technologies Limited. Definitions also cover Reliance GDRs and Risk Management 

In [40]:
# Batch summarisation: concatenate the text of PAGES_PER_BATCH consecutive page
# chunks, summarise each batch once, and collect the batch summaries in
# `arr_summary`.
PAGES_PER_BATCH = 10

arr_summary = []
summary = ""
tot_file_content = ""
import time
ctr = 0
for file in file_list:
    print(file)
    file_content = rag_blob.get_pdf_content(in_container_name=chunk_container_name, in_file_name=file)
    tot_file_content = tot_file_content + file_content
    ctr = ctr + 1
    if ctr == PAGES_PER_BATCH:
        summary = get_doc_summary(tot_file_content)
        arr_summary.append(summary)
        ctr = 0
        tot_file_content = ""

# Summarise the final partial batch; without this, the trailing pages are
# silently dropped whenever the page count is not a multiple of PAGES_PER_BATCH.
if ctr > 0:
    summary = get_doc_summary(tot_file_content)
    arr_summary.append(summary)
    ctr = 0
    tot_file_content = ""
    

informationmemorandumjfsl-0.pdf
informationmemorandumjfsl-1.pdf
informationmemorandumjfsl-2.pdf
informationmemorandumjfsl-3.pdf
informationmemorandumjfsl-4.pdf
informationmemorandumjfsl-5.pdf
informationmemorandumjfsl-6.pdf
informationmemorandumjfsl-7.pdf
informationmemorandumjfsl-8.pdf
informationmemorandumjfsl-9.pdf
informationmemorandumjfsl-10.pdf
informationmemorandumjfsl-11.pdf
informationmemorandumjfsl-12.pdf
informationmemorandumjfsl-13.pdf
informationmemorandumjfsl-14.pdf
informationmemorandumjfsl-15.pdf
informationmemorandumjfsl-16.pdf
informationmemorandumjfsl-17.pdf
informationmemorandumjfsl-18.pdf
informationmemorandumjfsl-19.pdf


In [41]:
# Split the batch summaries into the first one and all the rest, print the size
# of each part, then summarise each part and the two parts combined.
# NOTE(review): `summary1` and `summary2` are not read by any cell visible here;
# only `summary` is printed below. If they are unused, these are two extra model calls.
content1 = "\n".join(arr_summary[0:1])
content2 = "\n".join(arr_summary[1:])
print(len(content1))
print(len(content2))
summary1 = get_doc_summary(content1)
summary2 = get_doc_summary(content2)
summary = get_doc_summary(content1 + "\n" + content2)

785
1499


In [42]:
print(summary)

Jio Financial Services Limited, formerly Reliance Strategic Investments Limited, is planning to list 635,32,84,188 equity shares on the BSE Limited and National Stock Exchange of India Limited. The company operates as a non-banking financial institution registered with the Reserve Bank of India and serves individuals and SMEs. It warns potential investors about risks associated with investing in its shares including economic downturns affecting its customer base, interest rate volatility, disruption in debt funding sources, and non-compliance with future borrowing agreements. It also mentions reliance on dividends from subsidiaries for revenue. The document includes information about shareholding structure, business summary, industry overview, financial details including outstanding litigation and contingent liabilities, risk factors related to the business and related party transactions.

