In [2]:
import re
import pandas as pd
from html_workflow.parser import ExtractItems
from utils.text_cleaner import clsCleaner
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import os
from  bs4 import BeautifulSoup
import requests
import tqdm
import re
import unicodedata
import gradio as gr
import json

In [3]:
# Embedding model handed to FAISS when the saved index is loaded in `inference`.
# NOTE(review): presumably this must be the same model the index was built
# with -- confirm against the indexing notebook/script.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


# Hosted inference endpoint that `query` posts summarisation requests to.
API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"

# SECURITY: API tokens must never be committed in a notebook. The token is read
# from the HF_TOKEN environment variable instead; the token that used to be
# hardcoded here has been exposed and should be revoked.
# When HF_TOKEN is unset no Authorization header is sent at all (rather than a
# malformed "Bearer " header), so the request goes out unauthenticated.
_hf_token = os.environ.get("HF_TOKEN", "").strip()
headers = {"Authorization": f"Bearer {_hf_token}"} if _hf_token else {}

def query(payload, timeout=120):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds ``requests`` waits on the server before raising a
            timeout exception. Defaults to 120.

    Returns:
        The decoded JSON body of the response, whatever its shape. The HTTP
        status code is not checked here, so an error payload is returned to
        the caller like any other body.
    """
    # Without an explicit timeout `requests` waits indefinitely on a stalled
    # connection, which would hang the notebook / Gradio worker.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

def process_file(file_path):
    """Read a UTF-8 text file and pass its contents to the module-level extractor.

    Args:
        file_path: Path of the file to read.

    Returns:
        Whatever ``extraction.extract_items`` returns for the file's contents.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_content = handle.read()
    extracted_items = extraction.extract_items(content=raw_content)
    return extracted_items

def generate_summary_or_document(cik, company_name, json_section, is_inference=False):
    """Summarise extracted sections and return either the text or a Document.

    Args:
        cik: Company identifier stored under the ``"CIK"`` metadata key.
        company_name: Stored under the ``"company_name"`` metadata key.
        json_section: Extracted sections to summarise; a falsy value
            short-circuits to ``None``.
        is_inference: When true, return the bare summary string; otherwise
            wrap it with ``create_document``.

    Returns:
        ``None`` if ``json_section`` is falsy, the summary string when
        ``is_inference`` is true, else the result of ``create_document``.
    """
    # Nothing extracted -> nothing to summarise.
    if not json_section:
        return None

    summary_text = get_summarise_text(json_section)
    if is_inference:
        return summary_text

    metadata = {"CIK": cik, "company_name": company_name}
    return create_document(metadata, summary_text)

def process_record(record, raw_fillings_dir, is_inference=False):
    """Locate a company's ``<CIK>.html`` file and summarise it.

    Args:
        record: Mapping providing ``'CIK'`` and ``'Company_Name'``. The CIK is
            concatenated with ``'.html'``, so it has to be a string.
        raw_fillings_dir: Directory that is searched for ``<CIK>.html``.
        is_inference: Forwarded unchanged to ``generate_summary_or_document``.

    Returns:
        ``None`` when the file does not exist; otherwise whatever
        ``generate_summary_or_document`` returns for the extracted content.
    """
    company_cik = record['CIK']
    name = record['Company_Name']
    html_path = os.path.join(raw_fillings_dir, company_cik + '.html')

    # No file on disk for this company -> nothing to process.
    if not os.path.exists(html_path):
        return None

    sections = process_file(html_path)
    return generate_summary_or_document(company_cik, name, sections, is_inference)

def create_document(company_metadata, summary):
    """Wrap a summary and its company metadata in a ``Document``.

    Args:
        company_metadata: Stored as the document's ``metadata``.
        summary: Stored as the document's ``page_content``.

    Returns:
        The newly constructed ``Document``.
    """
    document = Document(metadata=company_metadata, page_content=summary)
    return document

def get_summarise_text(content_json):
    """Summarise selected sections via the remote model and concatenate them.

    The values for ``item_1``, ``item_2`` and ``item_5`` are pulled out of
    ``content_json`` with ``clsCleaner.extract_values``; each one is sent to
    ``query`` and the returned ``summary_text`` values are joined with no
    separator, in the order the texts were returned.

    Args:
        content_json: Extracted sections, as accepted by
            ``clsCleaner.extract_values``.

    Returns:
        The concatenated summaries (``''`` when there is nothing to summarise
        or every response is empty).

    Raises:
        RuntimeError: If the service answers with anything other than a list
            whose first element carries ``summary_text`` -- e.g. an error
            payload such as ``{"error": ...}``. Previously such a payload
            passed the ``len(output) > 0`` check and then failed with an
            uninformative ``KeyError: 0``.
    """
    text_list = clsCleaner.extract_values(content_json, ['item_1', 'item_2', 'item_5'])
    summaries = []
    for text in text_list:
        output = query({"inputs": f" {text}",})
        # An empty response means "no summary for this section": skip it.
        if not output:
            continue
        first = output[0] if isinstance(output, list) else None
        if not isinstance(first, dict) or 'summary_text' not in first:
            # Surface the API's own message instead of an opaque KeyError.
            raise RuntimeError(f"Unexpected summarisation response: {output!r}")
        summaries.append(first['summary_text'])
    # Single join instead of repeated string concatenation.
    return ''.join(summaries)



In [4]:
# Shared extractor instance used by `process_file`.
# NOTE(review): the item ids look like filing section numbers -- confirm
# against html_workflow.parser.ExtractItems.
_ITEMS_TO_EXTRACT = ["1", "1A", "1B", "2", "5", "7"]

extraction = ExtractItems(remove_tables=True, items_to_extract=_ITEMS_TO_EXTRACT)

In [5]:
def inference(cik):
    """Summarise one company's filing and return its nearest indexed companies.

    The company is looked up by CIK in ``comapny_mapping.csv``, its filing
    under ``./data`` is summarised via ``process_record``, and the summary is
    used to query the FAISS index saved in ``faiss_index`` for the 3 closest
    documents.

    Args:
        cik: Company identifier; converted to ``str`` and stripped of
            surrounding whitespace before the lookup.

    Returns:
        A JSON string. It always contains ``cluster_companies`` (parallel
        ``company_name`` / ``CIK`` / ``score`` lists). Once a summary exists
        it also contains ``inference_company_name``, ``inference_CIK`` and
        ``summary``. If anything fails, an ``error`` key describes the
        failure and whatever was collected so far is still returned, so the
        function never raises.
    """
    # Built before the `try` so the error path can always serialise it.
    output_json = {"cluster_companies": {"company_name": [], "CIK": [], "score": []}}
    try:
        cik = str(cik).strip()
        company_csv = pd.read_csv("comapny_mapping.csv", dtype={'CIK': str})
        matches = company_csv[company_csv['CIK'] == cik].to_dict(orient='records')
        if not matches:
            raise LookupError(f"CIK {cik!r} not found in comapny_mapping.csv")
        record = matches[0]

        summary = process_record(record, raw_fillings_dir='./data', is_inference=True)
        if not summary:
            # process_record yields None when the filing file is missing or
            # nothing could be extracted; searching with that would fail.
            raise ValueError(f"No summary could be produced for CIK {cik!r}")

        # Set once, independent of how many neighbours the search returns.
        output_json['inference_company_name'] = record['Company_Name']
        output_json['inference_CIK'] = record['CIK']
        output_json['summary'] = summary

        # NOTE(review): FAISS.load_local deserialises pickled data -- only load
        # an index you built yourself. Newer LangChain releases reportedly
        # require allow_dangerous_deserialization=True here; confirm against
        # the installed version.
        new_db = FAISS.load_local('faiss_index', embeddings)
        clusters = output_json['cluster_companies']
        for document, score in new_db.similarity_search_with_score(summary, k=3):
            clusters['company_name'].append(document.metadata['company_name'])
            clusters['CIK'].append(document.metadata['CIK'])
            clusters['score'].append(str(score))
    except Exception as e:
        # Best-effort endpoint for the UI: report the failure in the payload
        # instead of silently returning partial output.
        output_json['error'] = f"{type(e).__name__}: {e}"
    return json.dumps(output_json)

In [7]:
# Build the Gradio UI around `inference`, then start it with a public share link.
demo = gr.Interface(
    fn=inference,
    inputs="text",
    outputs=gr.Json(),
    title="Inscope Home Assignment",
    description="Summarising and Clustering",
)
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://686abad61fc6766193.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


