# Run the code below once as a setup

In [3]:
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Modifiable version of the working code:

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Function to find common phrases in two texts
inp = int(input("How many words should the sentences in common contain?"))

def find_common_words(sentence1, sentence2):
    # Tokenize the sentences and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+\'?\w*')  # Modified regular expression
    words1 = [word.lower() for word in tokenizer.tokenize(sentence1) if word.lower() not in stop_words]
    words2 = [word.lower() for word in tokenizer.tokenize(sentence2) if word.lower() not in stop_words]

    # Find common words
    common_words = set(words1) & set(words2)

    return common_words

def find_common_phrases(text1, text2, min_phrase_length=inp):
    # Tokenize the texts into sentences
    tokenizer = RegexpTokenizer(r'\w+\'?\w*')  # Modified regular expression
    tokens1 = tokenizer.tokenize(text1)
    tokens2 = tokenizer.tokenize(text2)

    common_phrases = set()

    for phrase_length in range(min(len(tokens1), len(tokens2)), min_phrase_length - 1, -1):
        ngrams1 = [' '.join(tokens1[i:i+phrase_length]) for i in range(len(tokens1) - phrase_length + 1)]
        ngrams2 = [' '.join(tokens2[i:i+phrase_length]) for i in range(len(tokens2) - phrase_length + 1)]

        common_phrases.update(set(ngrams1) & set(ngrams2))

    # Return the common phrases that meet the minimum length criteria
    common_phrases = [phrase for phrase in common_phrases if len(phrase.split()) >= min_phrase_length]

    return common_phrases

## Example texts
sentence1 = "In a quiet meadow, a lone firefly flickered. It danced, illuminating the night with its soft glow. A tiny owl watched, enchanted by the shimmering light. Nature's symphony played on as the firefly's dance continued, a secret performance in the moonlight."
sentence2 = "In a quiet meadow, a lone firefly flickered. The child smiled, feeling the magic of the night. It was a happy gathering. Nature's symphony played on as the firefly's dance continued."

## Have the user input their desired texts

# sentence1 = input("Enter the first text: ")
# sentence2 = input("Enter the second text: ")


common_words = find_common_words(sentence1, sentence2)
common_phrases = find_common_phrases(text1, text2, min_phrase_length=inp)

print("Common Words:", common_words)
print("Common Phrases:")
for i, phrase in enumerate(common_phrases, 1):
    print(f"{i}. {phrase}")

How many words should the sentences in common contain? 7


Common Words: {'lone', 'night', 'quiet', 'firefly', 'symphony', 'meadow', 'dance', "firefly's", "nature's", 'played', 'flickered', 'continued'}
Common Phrases:
1. In a quiet meadow a lone firefly
2. Nature's symphony played on as the firefly's
3. In a quiet meadow a lone firefly flickered
4. a quiet meadow a lone firefly flickered
5. played on as the firefly's dance continued
6. Nature's symphony played on as the firefly's dance
7. Nature's symphony played on as the firefly's dance continued
8. symphony played on as the firefly's dance
9. symphony played on as the firefly's dance continued
