In [None]:
# !pip install python-docx
# !pip install pdfminer.six
# !pip install openai
# !pip install typing-extensions


In [2]:
from docx import Document
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

In [3]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

nltk.download('punkt')  # Download the Punkt tokenizer for sentence splitting

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
import os
import pandas as pd

import openai
import time
# Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = '********************'


## Managing Documents In Various Formats.


In [21]:
def read_pdf(file_path) :
    pdf_text = []

    for page in extract_pages(file_path):
        page_elements = [(element.y1, element) for element in page._objs]
        page_elements.sort(key=lambda a: a[0], reverse=True)
        for _ , element in page_elements:
            # Check if the element is text element
            if isinstance(element, LTTextContainer):
                # Use the function to extract the text for each text element
                line_text = element.get_text()
                pdf_text.append(line_text)
    return pdf_text

In [22]:
def read_document(file_path):
    """
    Iam Assuming That csv or xlsx Has Column called Text contains paragraph
    Row = 1 paragraph
    """

    if file_path.endswith('.csv'):
        return  pd.read_csv(file_path)

    elif file_path.endswith('.xlsx'):
        return  pd.read_excel(file_path)

    elif file_path.endswith('.txt'):
        with open(file_path) as file:
            file_text = file.read()
        paragraphs = file_text.split('\n\n')
        return pd.DataFrame(paragraphs, columns=['Text'])

    elif file_path.endswith('.doc'):
        doc = Document(file_path)
        paragraphs = doc.paragraphs
        paragraphs_as_strings = [p.text for p in paragraphs]
        return  pd.DataFrame(paragraphs_as_strings, columns=['Text'])

    elif file_path.endswith('.docx'):
        doc = Document(file_path)
        paragraphs = doc.paragraphs
        paragraphs_as_strings = [p.text for p in paragraphs]
        return  pd.DataFrame(paragraphs_as_strings, columns=['Text'])

    elif file_path.endswith('.pdf'):
        pdf_text = read_pdf(file_path)
        return  pd.DataFrame(pdf_text, columns=['Text'])

    else :
        return None


## Bert Paragraph-Based Retrieval

In [23]:
def retrieve_paragraphs_deep_learning(user_topic, paragraphs):
    # Load pre-trained BERT model and tokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    user_topic_embedding = model(**tokenizer(user_topic, return_tensors='pt', max_length=512, truncation=True))['last_hidden_state'][0]

    document_embeddings = [model(**tokenizer(p, return_tensors='pt', max_length=512, truncation=True))['last_hidden_state'][0] for p in
                           paragraphs]


    similarities = []

    similarities = [
    cosine_similarity(user_topic_embedding.detach().numpy(), sentence_embedding.detach().numpy())[0][0].item() for
    sentence_embedding in document_embeddings]

    relevant_paragraphs = []

    for idx, similarity in enumerate(similarities):
        if similarity > 0.55:  # Adjust the similarity threshold as needed
            relevant_paragraphs.append(paragraphs[idx])

    return relevant_paragraphs





## Generating Headlines

In [48]:
def headline_generator(relevant_paragraphs):

    client = openai.Client()

    paragraphs = []
    headlines = []

    # Define your prompt (the paragraph)
    for paragraph in relevant_paragraphs:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                  "role": "system",
                  "content": "Craft a reflective Headline that highlights the broader significance of the upcoming paargraph."
                },
                {
                    "role": "user",
                    "content": paragraph
                }
            ],
            temperature=0.8,
            max_tokens=32,
            top_p=0.8
        )
        assistant_response = response.choices[0].message.content
        paragraphs.append(f'Paragraph: {paragraph}')
        headlines.append(f'Headline: {assistant_response}')
        # Rate limit reached for gpt-3.5-turbo then iam delaying the request
        time.sleep(20)

    return paragraphs, headlines


## Main Flow

In [49]:
df =  pd.read_csv("/content/BBC News Train.csv")

In [50]:
df.head(2)


Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business


In [51]:
document = df.Text[:10].to_list()
specified_topic = "Movie Director"

In [52]:
relevant_paragraphs  = retrieve_paragraphs_deep_learning(specified_topic, document)


In [53]:
len(relevant_paragraphs)

4

In [54]:
relevant , headlines = headline_generator(relevant_paragraphs)

### Printing Predicted Paragraphs and Predicted Headline

In [62]:
for idx , (paragraph , headline) in enumerate(zip(relevant, headlines)):
  print(f'\n===================================================\n')
  print(f"Paragraph: {idx}\n")
  print(paragraph.replace(".","\n"))
  print("\n")
  print(headline)




Paragraph: 0

Paragraph: worldcom ex-boss launches defence lawyers defending former worldcom chief bernie ebbers against a battery of fraud charges have called a company whistleblower as their first witness
  cynthia cooper  worldcom s ex-head of internal accounting  alerted directors to irregular accounting practices at the us telecoms giant in 2002
7bn) accounting fraud
 mr ebbers has pleaded not guilty to charges of fraud and conspiracy
  prosecution lawyers have argued that mr ebbers orchestrated a series of accounting tricks at worldcom  ordering employees to hide expenses and inflate revenues to meet wall street earnings estimates
 but ms cooper  who now runs her own consulting business  told a jury in new york on wednesday that external auditors arthur andersen had approved worldcom s accounting in early 2001 and 2002
 she said andersen had given a  green light  to the procedures and practices used by worldcom
 mr ebber s lawyers have said he was unaware of the fraud  arguing 