In [17]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load pre-trained model tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def preprocess_text(text):
    sentences = text.split('. ')
    return sentences

def get_sentence_embeddings(sentences, tokenizer, model):
    embeddings = []
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return embeddings

def rank_sentences(sentences, embeddings):
    sentence_scores = []
    for i, emb in enumerate(embeddings):
        score = cosine_similarity([emb], embeddings).mean()
        sentence_scores.append((score, i))
    ranked_sentences = sorted(sentence_scores, reverse=True, key=lambda x: x[0])
    return [sentences[i] for _, i in ranked_sentences]

def keyword_relevant_sentences(sentences, keywords):
    relevant_sentences = []
    for sentence in sentences:
        if any(keyword.lower() in sentence.lower() for keyword in keywords):
            relevant_sentences.append(sentence)
    return relevant_sentences

def summarize(text, keywords, num_sentences=5):
    sentences = preprocess_text(text)
    embeddings = get_sentence_embeddings(sentences, tokenizer, model)
    ranked_sentences = rank_sentences(sentences, embeddings)
    
    summary = ranked_sentences[:num_sentences]
    relevant_sentences = keyword_relevant_sentences(sentences, keywords)
    
    # Ensure unique sentences in the final summary
    final_summary = list(set(summary + relevant_sentences))
    
    return '. '.join(final_summary)

# Example usage
text = """Your input text goes here. It should be a long paragraph with multiple sentences."""
keywords = ["keyword1", "keyword2"]

summary = summarize(text, keywords)
print("Summary:", summary)


TypeError: 'list' object is not callable

In [14]:
list1 = summary.split(".")

In [15]:
list2 = text.split(".")

In [16]:
list2 

['Your input text goes here',
 ' It should be a long paragraph with multiple sentences',
 '']

In [5]:
list

['Climate change is one of the most pressing issues of our time',
 ' Rising global temperatures have led to a variety of environmental impacts, including more frequent and severe weather events',
 ' The polar ice caps are melting at an alarming rate, causing sea levels to rise and threatening coastal communities',
 ' Governments and organizations around the world are working to address climate change through various measures',
 ' Renewable energy sources, such as solar and wind power, are being developed to reduce reliance on fossil fuels',
 ' International agreements, like the Paris Agreement, aim to unite countries in the fight against climate change',
 ' It is essential that everyone takes part in efforts to mitigate the effects of climate change to ensure a sustainable future for generations to come',
 '\n ']

In [3]:
import spacy

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

def compound_to_simple(sentence):
    doc = nlp(sentence)
    simple_sentences = []

    # Identify root and dependent clauses
    for sent in doc.sents:
        root = [token for token in sent if token.dep_ == "ROOT"][0]
        clause_1 = " ".join([token.text for token in root.subtree])
        simple_sentences.append(clause_1)

        # Handle conjunctions
        for token in sent:
            if token.dep_ == "cc":  # coordinating conjunction
                conj_head = token.head
                clause_2 = " ".join([child.text for child in conj_head.subtree if child != token])
                simple_sentences.append(clause_2)

    return simple_sentences

# Example usage
compound_sentence = "A paragraph is a series of sentences that are organized and coherent."
simple_sentences = compound_to_simple(compound_sentence)
for simple_sentence in simple_sentences:
    print(simple_sentence)

A paragraph is a series of sentences that are organized and coherent .
that are organized coherent


In [8]:
import spacy

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

def compound_to_simple(sentence):
    doc = nlp(sentence)
    simple_sentences = []

    for sent in doc.sents:
        # Get the part of the sentence before the conjunction or relative clause
        before_clause = []
        clause_parts = []
        in_clause = False

        for token in sent:
            if token.dep_ in {"nsubj", "nsubjpass"} and not in_clause:
                before_clause.append(token.text)
            elif token.dep_ == "cc" or token.dep_ == "conj":
                in_clause = True
                clause_parts.append(token.text)
            elif in_clause:
                clause_parts.append(token.text)
            else:
                before_clause.append(token.text)
        
        # Remove the conjunction (e.g., 'and') from the clause parts
        if clause_parts and clause_parts[0] in {"and", "or", "but"}:
            clause_parts.pop(0)

        before_clause_text = " ".join(before_clause)
        clause_text = " ".join(clause_parts)

        if clause_text:
            clause_1 = f"{before_clause_text}"
            clause_2 = f"{before_clause_text}"
            simple_sentences.append(clause_1)
            simple_sentences.append(clause_2)

    return simple_sentences

# Example usage
compound_sentence = "A paragraph is a series of sentences that are organized and coherent."
simple_sentences = compound_to_simple(compound_sentence)

print(f"input - \"{compound_sentence}\"")
for simple_sentence in simple_sentences:
    print(f"output - \"{simple_sentence}\"")


input - "A paragraph is a series of sentences that are organized and coherent."
output - "A paragraph is a series of sentences that are organized"
output - "A paragraph is a series of sentences that are organized"


In [13]:
import nltk
import spacy

# Download the punkt tokenizer for sentence splitting
nltk.download('punkt')

# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

def compound_to_simple(sentence):
    # Tokenize the sentence into words
    doc = nlp(sentence)
    
    # Extract the conjunctions and split the sentence based on them
    simple_sentences = []
    temp_sentence = []

    for token in doc:
        temp_sentence.append(token.text)
        if token.dep_ == 'cc' or token.dep_ == 'punct':
            simple_sentences.append(' '.join(temp_sentence).strip())
            temp_sentence = []
    
    if temp_sentence:
        simple_sentences.append(' '.join(temp_sentence).strip())
    
    # Further split based on commas and semicolons if needed
    final_sentences = []
    for sent in simple_sentences:
        sub_sentences = nltk.sent_tokenize(sent.replace(',', '.').replace(';', '.'))
        final_sentences.extend(sub_sentences)
    
    return final_sentences

# Example usage
compound_sentence = "It is raining like cats and dogs"
simple_sentences = compound_to_simple(compound_sentence)

for idx, sent in enumerate(simple_sentences):
    print(f"Simple Sentence {idx+1}: {sent}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sathi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Simple Sentence 1: It is raining like cats and
Simple Sentence 2: dogs


In [15]:
import nltk
import spacy

# Download the punkt tokenizer for sentence splitting
nltk.download('punkt')

# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

def split_by_conjunctions(sentence):
    """
    Split a sentence by its conjunctions and return the part before the conjunction
    and the phrases/words after the conjunction.
    """
    doc = nlp(sentence)
    before_conjunction = []
    after_conjunctions = []
    temp_sentence = []

    for token in doc:
        if token.dep_ == 'cc':  # Coordinating conjunction
            before_conjunction.append(' '.join(temp_sentence).strip())
            temp_sentence = []
        else:
            temp_sentence.append(token.text)
    
    if temp_sentence:
        after_conjunctions.append(' '.join(temp_sentence).strip())
    
    return before_conjunction, after_conjunctions

def expand_phrases(before_conjunction, after_conjunctions):
    """
    Expand phrases to form new simple sentences based on the parts before and after conjunctions.
    """
    expanded_sentences = []
    for part in before_conjunction:
        if 'like' in part:
            parts = part.split('like')
            if len(parts) > 1:
                for after_part in after_conjunctions:
                    subjects = after_part.split('and')
                    for subject in subjects:
                        expanded_sentences.append(parts[0].strip() + ' like ' + subject.strip())
            else:
                expanded_sentences.append(part)
        else:
            for after_part in after_conjunctions:
                expanded_sentences.append(part + ' ' + after_part)
    return expanded_sentences

def compound_to_simple(sentence):
    # Split the sentence by conjunctions
    before_conjunction, after_conjunctions = split_by_conjunctions(sentence)
    
    # Expand phrases based on the parts before and after conjunctions
    final_sentences = expand_phrases(before_conjunction, after_conjunctions)
    
    return final_sentences

# Example usage
compound_sentence = "It is raining like cats and dogs."
simple_sentences = compound_to_simple(compound_sentence)

for idx, sent in enumerate(simple_sentences):
    print(f"Simple Sentence {idx+1}: {sent}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sathi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Simple Sentence 1: It is raining like dogs .


In [18]:
import nltk
import spacy

# Download the punkt tokenizer for sentence splitting
nltk.download('punkt')

# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

def split_by_conjunctions(sentence):
    """
    Split a sentence by its conjunctions and return the part before the conjunction
    and the phrases/words after the conjunction.
    """
    doc = nlp(sentence)
    before_conjunction = []
    after_conjunctions = []
    temp_sentence = []

    for token in doc:
        if token.dep_ == 'cc':  # Coordinating conjunction
            before_conjunction.append(' '.join(temp_sentence).strip())
            temp_sentence = []
        else:
            temp_sentence.append(token.text)
    
    if temp_sentence:
        after_conjunctions.append(' '.join(temp_sentence).strip())
    
    return before_conjunction, after_conjunctions

def expand_phrases(before_conjunction, after_conjunctions):
    """
    Expand phrases to form new simple sentences based on the parts before and after conjunctions.
    """
    expanded_sentences = []
    for part in before_conjunction:
        if 'like' in part:
            parts = part.split('like')
            if len(parts) > 1:
                for after_part in after_conjunctions:
                    subjects = after_part.split('and')
                    for subject in subjects:
                        expanded_sentences.append(parts[0].strip() + ' like ' + subject.strip())
            else:
                expanded_sentences.append(part)
        else:
            for after_part in after_conjunctions:
                expanded_sentences.append(part + ' ' + after_part)
    return expanded_sentences

def compound_to_simple(sentence):
    # Split the sentence by conjunctions
    before_conjunction, after_conjunctions = split_by_conjunctions(sentence)
    
    # Print parts before and after conjunctions
    print("Before conjunction(s):", before_conjunction)
    print("After conjunction(s):", after_conjunctions)
    
    # Expand phrases based on the parts before and after conjunctions
    final_sentences = expand_phrases(before_conjunction, after_conjunctions)
    
    return final_sentences

# Example usage
compound_sentence = "A paragraph is a series of sentences that are organized and coherent."
simple_sentences = compound_to_simple(compound_sentence)

for idx, sent in enumerate(simple_sentences):
    print(f"Simple Sentence {idx+1}: {sent}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sathi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Before conjunction(s): ['A paragraph is a series of sentences that are organized']
After conjunction(s): ['coherent .']
Simple Sentence 1: A paragraph is a series of sentences that are organized coherent .


In [22]:
import spacy

def split_compound_sentence(sentence):
    # Load the spaCy model
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sentence)
    
    # Identify the conjunction and split the sentence
    for token in doc:
        if token.pos_ == "CCONJ":
            # Find the start and end of the first part
            start = token.head.left_edge.i
            end = token.head.i
            
            first_part = doc[start:end+1].text
            
            # Create two new sentences
            first_sentence = first_part + "."
            second_sentence = first_part.rsplit(maxsplit=1)[0] + " " + doc[end+2:].text
            
            return first_sentence, second_sentence

# Example usage
input_sentence = "It is raining like cats and dogs"
output_sentences = split_compound_sentence(input_sentence)
print("Output:")
print(output_sentences[0])
print(output_sentences[1])


Output:
cats.
cats dogs
