In [1]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx

def preprocess_text(text):
    # Tokenize the text into paragraphs
    paragraphs = text.strip().split("\n\n")

    # Optionally, you can perform cleaning and normalization here

    return paragraphs

def textrank_best_paragraph(document, query):
    # Preprocess the document and get the paragraphs
    paragraphs = preprocess_text(document)

    # Initialize the TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Compute the TF-IDF matrix for the paragraphs
    tfidf_matrix = tfidf_vectorizer.fit_transform(paragraphs)

    # Compute the cosine similarity between paragraphs and the query
    query_vector = tfidf_vectorizer.transform([query])
    paragraph_similarity = (tfidf_matrix * query_vector.T).A.flatten()

    # Convert the similarity scores to a graph
    graph = nx.Graph()
    for i, sim_score in enumerate(paragraph_similarity):
        graph.add_node(i, weight=sim_score)
    for i in range(len(paragraphs)):
        for j in range(i + 1, len(paragraphs)):
            similarity = paragraph_similarity[i] * paragraph_similarity[j]
            if similarity > 0:
                graph.add_edge(i, j, weight=similarity)

    # Rank the paragraphs using the PageRank algorithm
    scores = nx.pagerank(graph, weight='weight')

    # Get the index of the top-ranked paragraph
    best_paragraph_index = max(scores, key=scores.get)

    # Get the best paragraph
    best_paragraph = paragraphs[best_paragraph_index]

    return best_paragraph

# Example usage with a longer document
document = """
    This is the first paragraph. It contains some relevant information. 
    The second paragraph has additional details. 
    Paragraph three is not very relevant. 
    The last paragraph summarizes the document's content. 
    
    In recent years, natural language processing (NLP) has made significant advancements. 
    It enables machines to understand and process human language effectively. 
    NLP techniques are used in various applications, including chatbots, machine translation, 
    sentiment analysis, and text summarization.
    
    The field of NLP has seen tremendous growth due to the availability of large datasets and 
    powerful transformer-based models like BERT and GPT. These models can learn complex 
    linguistic patterns and have been fine-tuned for various tasks. As a result, NLP 
    applications have become more accurate and useful in real-world scenarios.
    
    TextRank is a popular algorithm for extractive text summarization and keyword extraction. 
    It is based on the PageRank algorithm used by Google for ranking web pages. TextRank 
    treats sentences or words in the text as nodes in a graph and uses their semantic similarity 
    to calculate scores. The algorithm then selects the most important sentences or keywords 
    based on these scores.
    
    However, when dealing with longer documents, traditional TextRank may not be sufficient. 
    In such cases, using a combination of semantic similarity and graph-based ranking can be 
    more effective. Additionally, transformer-based models can be used to calculate semantic 
    similarity, taking into account the context and meaning of the words and sentences in the 
    document and the query. This enables better selection of relevant paragraphs, sentences, 
    or keywords from the text.
"""

query = "natural language processing"

best_paragraph = textrank_best_paragraph(document, query)
print("Best Paragraph:", best_paragraph)


Best Paragraph: This is the first paragraph. It contains some relevant information. 
    The second paragraph has additional details. 
    Paragraph three is not very relevant. 
    The last paragraph summarizes the document's content. 
    
    In recent years, natural language processing (NLP) has made significant advancements. 
    It enables machines to understand and process human language effectively. 
    NLP techniques are used in various applications, including chatbots, machine translation, 
    sentiment analysis, and text summarization.
    
    The field of NLP has seen tremendous growth due to the availability of large datasets and 
    powerful transformer-based models like BERT and GPT. These models can learn complex 
    linguistic patterns and have been fine-tuned for various tasks. As a result, NLP 
    applications have become more accurate and useful in real-world scenarios.
    
    TextRank is a popular algorithm for extractive text summarization and keyword extr

In [3]:
import spacy
import numpy as np

def find_best_paragraph(text, query):
  """Finds the best paragraph in a text given a query.

  Args:
    text: The text to search.
    query: The query to search for.

  Returns:
    The paragraph with the highest similarity to the query.
  """

  # Preprocess the text.
  nlp = spacy.load("en_core_web_sm")
  doc = nlp(text)

  # Create a vector representation of each paragraph.
  paragraph_vectors = []
  for paragraph in doc.paragraphs:
    words = [token.lemma_ for token in paragraph]
    word_vectors = np.array([nlp.vocab[word].vector for word in words])
    paragraph_vectors.append(np.mean(word_vectors, axis=0))

  # Calculate the similarity between each paragraph and the query.
  similarities = np.dot(paragraph_vectors, query)

  # Rank the paragraphs based on their similarity to the query.
  best_paragraph = doc.paragraphs[np.argmax(similarities)]

  return best_paragraph

if __name__ == "__main__":
  text = """
    In recent years, natural language processing (NLP) has made significant advancements. 
    It enables machines to understand and process human language effectively. 
    NLP techniques are used in various applications, including chatbots, machine translation, 
    sentiment analysis, and text summarization.
    
    The field of NLP has seen tremendous growth due to the availability of large datasets and 
    powerful transformer-based models like BERT and GPT. These models can learn complex 
    linguistic patterns and have been fine-tuned for various tasks. As a result, NLP 
    applications have become more accurate and useful in real-world scenarios.
    
    TextRank is a popular algorithm for extractive text summarization and keyword extraction. 
    It is based on the PageRank algorithm used by Google for ranking web pages. TextRank 
    treats sentences or words in the text as nodes in a graph and uses their semantic similarity 
    to calculate scores. The algorithm then selects the most important sentences or keywords 
    based on these scores.
    
    However, when dealing with longer documents, traditional TextRank may not be sufficient. 
    In such cases, using a combination of semantic similarity and graph-based ranking can be 
    more effective. Additionally, transformer-based models can be used to calculate semantic 
    similarity, taking into account the context and meaning of the words and sentences in the 
    document and the query. This enables better selection of relevant paragraphs, sentences, 
    or keywords from the text.
  """

  query = "Google"

  best_paragraph = find_best_paragraph(text, query)

  print(best_paragraph)


AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'paragraphs'

In [1]:
from transformers import pipeline, RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def zero_shot_text_classification(text, candidate_labels):
    # Load the pre-trained Roberta model and tokenizer
    model_name = "roberta-base"
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaModel.from_pretrained(model_name)

    # Encode the input text and candidate labels
    text_encoding = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    label_encodings = tokenizer(candidate_labels, return_tensors="pt", padding=True, truncation=True)

    # Get the embeddings for the text and candidate labels
    with torch.no_grad():
        text_embeddings = model(**text_encoding).last_hidden_state.mean(dim=1)
        label_embeddings = model(**label_encodings).last_hidden_state.mean(dim=1)

    # Calculate cosine similarities between the text and candidate labels
    similarities = cosine_similarity(text_embeddings, label_embeddings)

    # Convert the similarities to probabilities
    probabilities = torch.softmax(torch.tensor(similarities), dim=1)

    # Prepare the results
    results = []
    for i, label in enumerate(candidate_labels):
        results.append({"label": label, "score": probabilities[0, i].item()})

    return results

# Example usage
input_text = """
Developed process flow diagram (PFD) of the MIBK process. 
The fresh acetone feed (stream 1) enters the process at 70 °C and 1.8 atm in a liquid 
phase and is mixed with the recycled acetone (stream 14), which enters at 74 °C and 1.8 
atm. Before entering the reactor R-101, the mixed stream is heated to 300 °C. The hydrogen 
feed is mixed with recycled hydrogen (stream 11) before being heated to 300 °C and fed 
into reactor R-101. Both reactor feeds pass through valves, V-101 and V-102, to reduce the 
pressure in the reactor. The hydrogen–acetone molar feed ratio is maintained at 2:1, as 
recommended [40,41]. The reaction proceeds isothermally in the gas phase at 300 °C and 
1 atm in a fixed bed catalytic reactor (R-101). The conversion of acetone is 66%, with a 
selectivity of 69.4% to methyl isobutyl ketone (MIBK). Other products, such as isopropa-
nol (IPA) and diiasobutyl ketone (DIBK), are also produced. The selectivity of each of 
these substances was determined based on the experimental work, which is shown in Ta-
ble 1. The reactor effluent will then be compressed to 6.5 atm to compensate for pressure 
losses through the pipelines and to enhance the separation of hydrogen without major 
product losses from the process stream at low pressure and high temperature, preventing 
the use of cryogenic conditions. MIBK losses at 1 atm are approximately 6 kmol/h. Conse-
quently, compression is mandatory to avoid cryogenic conditions. At 6.5 atm, the losses 
drop to about 0.7 kmol/h of MIBK. After compression, the stream is cooled to 35 °C in E-
Figure 1. Developed process ﬂow diagram (PFD) of the MIBK process.
"""

candidate_labels = ["Introduction", "Process", "Result","literature"]

classification_results = zero_shot_text_classification(input_text, candidate_labels)
print(classification_results)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'label': 'Introduction', 'score': 0.24966977536678314}, {'label': 'Process', 'score': 0.24896663427352905}, {'label': 'Result', 'score': 0.24797721207141876}, {'label': 'literature', 'score': 0.2533864378929138}]
