In [29]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import defaultdict

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def read_text_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

def preprocess_text(text):
    # Tokenize the text into words
    words = word_tokenize(text.lower())

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalnum() and word not in stop_words]

    return words

def calculate_pos_probabilities(ngram_list):
    pos_counts = defaultdict(lambda: defaultdict(int))

    # Tag each n-gram with its POS
    for ngram in ngram_list:
        pos_tags = nltk.pos_tag(ngram)
        pos_sequence = ' '.join([tag[1] for tag in pos_tags])
        pos_counts[ngram][pos_sequence] += 1

    # Calculate probabilities
    pos_probabilities = {}
    for ngram, counts in pos_counts.items():
        total_count = sum(counts.values())
        noun_count = counts.get('NN', 0) + counts.get('NNS', 0) + counts.get('NNP', 0) + counts.get('NNPS', 0)
        verb_count = counts.get('VB', 0) + counts.get('VBD', 0) + counts.get('VBG', 0) + counts.get('VBN', 0) + counts.get('VBP', 0) + counts.get('VBZ', 0)
        noun_prob = noun_count / total_count
        verb_prob = verb_count / total_count
        pos_probabilities[ngram] = {'Noun': noun_prob, 'Verb': verb_prob}

    return pos_probabilities

def most_likely_pos(pos_probabilities):
    most_likely_pos_tags = {}
    for ngram, probabilities in pos_probabilities.items():
        most_likely_pos_tags[ngram] = max(probabilities, key=probabilities.get)
    return most_likely_pos_tags

def main():
    filename = 'shakes.txt'
    text = read_text_file(filename)
    words = preprocess_text(text)

    # Set the value of n for n-grams
    n = 3

    # Generate n-grams
    ngram_list = list(ngrams(words, n))

    # Calculate POS probabilities for each n-gram
    pos_probabilities = calculate_pos_probabilities(ngram_list)

    # Find the most likely POS for each n-gram
    most_likely_pos_tags = most_likely_pos(pos_probabilities)

    # Print the results
    for ngram, pos_tag in most_likely_pos_tags.items():
        print(f"{ngram}: Most likely POS tag - {pos_tag}")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


('project', 'gutenberg', 'ebook'): Most likely POS tag - Noun
('gutenberg', 'ebook', 'complete'): Most likely POS tag - Noun
('ebook', 'complete', 'works'): Most likely POS tag - Noun
('complete', 'works', 'william'): Most likely POS tag - Noun
('works', 'william', 'shakespeare'): Most likely POS tag - Noun
('william', 'shakespeare', 'william'): Most likely POS tag - Noun
('shakespeare', 'william', 'shakespeare'): Most likely POS tag - Noun
('william', 'shakespeare', 'ebook'): Most likely POS tag - Noun
('shakespeare', 'ebook', 'use'): Most likely POS tag - Noun
('ebook', 'use', 'anyone'): Most likely POS tag - Noun
('use', 'anyone', 'anywhere'): Most likely POS tag - Noun
('anyone', 'anywhere', 'cost'): Most likely POS tag - Noun
('anywhere', 'cost', 'almost'): Most likely POS tag - Noun
('cost', 'almost', 'restrictions'): Most likely POS tag - Noun
('almost', 'restrictions', 'whatsoever'): Most likely POS tag - Noun
('restrictions', 'whatsoever', 'may'): Most likely POS tag - Noun
('

In [32]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import defaultdict

# Load the input text file
with open('shakes.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text into words
words = word_tokenize(text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]

# Function to generate n-grams
def generate_ngrams(tokens, n):
    ngrams = []
    for i in range(len(tokens) - n + 1):
        ngrams.append(tuple(tokens[i:i+n]))
    return ngrams

# Function to determine POS tags of n-grams
def tag_ngrams(ngrams):
    tagged_ngrams = []
    for ngram in ngrams:
        tagged_ngram = pos_tag(ngram)
        tagged_ngrams.append(tagged_ngram)
    return tagged_ngrams

# Function to count occurrences of nouns and verbs in n-grams
def count_pos(tagged_ngrams):
    noun_counts = defaultdict(int)
    verb_counts = defaultdict(int)
    for tagged_ngram in tagged_ngrams:
        # Convert the list to a tuple to make it hashable
        tagged_ngram = tuple(tagged_ngram)
        for word, pos in tagged_ngram:
            if pos.startswith('N'):
                noun_counts[tagged_ngram] += 1
            elif pos.startswith('V'):
                verb_counts[tagged_ngram] += 1
    return noun_counts, verb_counts

# Function to calculate probabilities
def calculate_probabilities(noun_counts, verb_counts):
    all_ngrams = set(noun_counts.keys()) | set(verb_counts.keys())
    
    total_nouns = sum(noun_counts.values())
    total_verbs = sum(verb_counts.values())
    
    noun_probabilities = defaultdict(float)
    verb_probabilities = defaultdict(float)
    
    for ngram in all_ngrams:
        noun_probabilities[ngram] = noun_counts[ngram] / total_nouns if total_nouns != 0 else 0.0
        verb_probabilities[ngram] = verb_counts[ngram] / total_verbs if total_verbs != 0 else 0.0
        
    return noun_probabilities, verb_probabilities

# Function to determine most likely POS tag
def most_likely_pos(noun_prob, verb_prob):
    most_likely = {}
    for ngram in noun_prob:
        if noun_prob[ngram] > verb_prob[ngram]:
            most_likely[ngram] = 'Noun'
        else:
            most_likely[ngram] = 'Verb'
    return most_likely

# Main function
def main():
    n = 2
    ngrams = generate_ngrams(filtered_words, n)
    tagged_ngrams = tag_ngrams(ngrams)
    noun_counts, verb_counts = count_pos(tagged_ngrams)
    noun_prob, verb_prob = calculate_probabilities(noun_counts, verb_counts)
    most_likely_tags = most_likely_pos(noun_prob, verb_prob)

    # Print results
    for ngram, pos in most_likely_tags.items():
        print(f"{ngram}: {pos}")

if __name__ == "__main__":
    main()


(('death', 'NN'), ('dead', 'NN')): Noun
(('mated', 'VBN'), ('lion', 'NN')): Verb
(('live', 'JJ'), ('quoth', 'NN')): Noun
(('old', 'JJ'), ('woes', 'NNS')): Noun
(('ay', 'NN'), ('would', 'MD')): Noun
(('image', 'NN'), ('dies', 'NNS')): Noun
(('last', 'JJ'), ('come', 'NN')): Noun
(('doubly', 'RB'), ('bertram', 'NN')): Noun
(('king', 'VBG'), ('tell', 'NN')): Verb
(('end', 'NN'), ('mother', 'NN')): Noun
(('faith', 'RB'), ('honour', 'NN')): Noun
(('night', 'NN'), ('end', 'NN')): Noun
(('known', 'VBN'), ('countess', 'NN')): Verb
(('yet', 'RB'), ('men', 'NNS')): Noun
(('live', 'JJ'), ('twice', 'NN')): Noun
(('need', 'NN'), ('stop', 'VB')): Verb
(('things', 'NNS'), ('disdain', 'VBP')): Verb
(('breath', 'NN'), ('sings', 'NNS')): Noun
(('old', 'JJ'), ('courtier', 'NN')): Noun
(('data', 'NNS'), ('alphabetical', 'JJ')): Noun
(('insufficiency', 'NN'), ('heart', 'NN')): Noun
(('afford', 'NN'), ('praise', 'NN')): Noun
(('thee', 'NNS'), ('like', 'IN')): Noun
(('nature', 'NN'), ('subdued', 'VBD')): Verb

In [33]:
import nltk
from collections import defaultdict

def load_data(file_path):
    data = []
    with open(file_path) as file:
        for line in file:
            sentence = nltk.word_tokenize(line)
            tagged_sentence = nltk.pos_tag(sentence)
            data.append(tagged_sentence)
    return data

def get_ngram_counts(data, n):
    ngram_counts = defaultdict(int)
    ngram_tag_counts = defaultdict(lambda: defaultdict(int))
    for sentence in data:
        ngrams = nltk.ngrams(sentence, n)
        for ngram in ngrams:
            words = tuple(word for word, _ in ngram)
            tags = tuple(tag for _, tag in ngram)
            ngram_counts[words] += 1
            ngram_tag_counts[words][tags] += 1
    return ngram_counts, ngram_tag_counts

def compute_probabilities(ngram_counts, ngram_tag_counts, n):
    probabilities = defaultdict(lambda: {'NOUN': 0.0, 'VERB': 0.0})
    for ngram, ngram_count in ngram_counts.items():
        noun_count = ngram_tag_counts[ngram][('NOUN',) * n]
        verb_count = ngram_tag_counts[ngram][('VERB',) * n]
        probabilities[ngram]['NOUN'] = noun_count / ngram_count
        probabilities[ngram]['VERB'] = verb_count / ngram_count
    return probabilities

In [34]:
def main(file_path, n):
    data = load_data(file_path)
    ngram_counts, ngram_tag_counts = get_ngram_counts(data, n)
    probabilities = compute_probabilities(ngram_counts, ngram_tag_counts, n)
    results = []
    for ngram, probs in probabilities.items():
        result = {
            'ngram': ' '.join(ngram),
            'noun_prob': probs['NOUN'],
            'verb_prob': probs['VERB'],
            'likely_tag': max(probs, key=probs.get)
        }
        results.append(result)
    return results

In [35]:
file_path = 'shakes.txt'
n = 2
results = main(file_path, n)

for result in results:
    print(f"For '{result['ngram']}': Noun={result['noun_prob']}, Verb={result['verb_prob']}, Likely={result['likely_tag']}")

For 'ï »': Noun=0.0, Verb=0.0, Likely=NOUN
For '» ¿The': Noun=0.0, Verb=0.0, Likely=NOUN
For '¿The Project': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Project Gutenberg': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Gutenberg EBook': Noun=0.0, Verb=0.0, Likely=NOUN
For 'EBook of': Noun=0.0, Verb=0.0, Likely=NOUN
For 'of The': Noun=0.0, Verb=0.0, Likely=NOUN
For 'The Complete': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Complete Works': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Works of': Noun=0.0, Verb=0.0, Likely=NOUN
For 'of William': Noun=0.0, Verb=0.0, Likely=NOUN
For 'William Shakespeare': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Shakespeare ,': Noun=0.0, Verb=0.0, Likely=NOUN
For ', by': Noun=0.0, Verb=0.0, Likely=NOUN
For 'This eBook': Noun=0.0, Verb=0.0, Likely=NOUN
For 'eBook is': Noun=0.0, Verb=0.0, Likely=NOUN
For 'is for': Noun=0.0, Verb=0.0, Likely=NOUN
For 'for the': Noun=0.0, Verb=0.0, Likely=NOUN
For 'the use': Noun=0.0, Verb=0.0, Likely=NOUN
For 'use of': Noun=0.0, Verb=0.0, Likely=NOUN
For 'of a

In [36]:
import nltk
from collections import defaultdict

def load_data(file_path):
    data = []
    with open(file_path) as file:
        for line in file:
            sentence = nltk.word_tokenize(line)
            tagged_sentence = nltk.pos_tag(sentence)
            data.append(tagged_sentence)
    return data

def get_ngram_counts(data, n):
    ngram_counts = defaultdict(int)
    ngram_tag_counts = defaultdict(lambda: defaultdict(int))
    for sentence in data:
        ngrams = list(nltk.ngrams(sentence, n))
        for ngram in ngrams:
            words = tuple(word for word, _ in ngram)
            tags = tuple(tag for _, tag in ngram)
            ngram_counts[words] += 1
            ngram_tag_counts[words][tags] += 1
    return ngram_counts, ngram_tag_counts

def compute_probabilities(ngram_counts, ngram_tag_counts, n):
    probabilities = defaultdict(lambda: {'NOUN': 0.0, 'VERB': 0.0})
    for ngram, ngram_count in ngram_counts.items():
        noun_count = ngram_tag_counts[ngram][('NN',) * n]
        verb_count = ngram_tag_counts[ngram][('VB',) * n]
        probabilities[ngram]['NOUN'] = noun_count / ngram_count
        probabilities[ngram]['VERB'] = verb_count / ngram_count
    return probabilities

def main(file_path, n):
    data = load_data(file_path)
    ngram_counts, ngram_tag_counts = get_ngram_counts(data, n)
    probabilities = compute_probabilities(ngram_counts, ngram_tag_counts, n)
    results = []
    for ngram, probs in probabilities.items():
        result = {
            'ngram': ' '.join(ngram),
            'noun_prob': probs['NOUN'],
            'verb_prob': probs['VERB'],
            'likely_tag': max(probs, key=probs.get)
        }
        results.append(result)
    return results

file_path = 'shakes.txt'
n = 2
results = main(file_path, n)

for result in results:
    print(f"For '{result['ngram']}': Noun={result['noun_prob']}, Verb={result['verb_prob']}, Likely={result['likely_tag']}")


For 'ï »': Noun=0.0, Verb=0.0, Likely=NOUN
For '» ¿The': Noun=0.0, Verb=0.0, Likely=NOUN
For '¿The Project': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Project Gutenberg': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Gutenberg EBook': Noun=0.0, Verb=0.0, Likely=NOUN
For 'EBook of': Noun=0.0, Verb=0.0, Likely=NOUN
For 'of The': Noun=0.0, Verb=0.0, Likely=NOUN
For 'The Complete': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Complete Works': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Works of': Noun=0.0, Verb=0.0, Likely=NOUN
For 'of William': Noun=0.0, Verb=0.0, Likely=NOUN
For 'William Shakespeare': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Shakespeare ,': Noun=0.0, Verb=0.0, Likely=NOUN
For ', by': Noun=0.0, Verb=0.0, Likely=NOUN
For 'This eBook': Noun=0.0, Verb=0.0, Likely=NOUN
For 'eBook is': Noun=0.0, Verb=0.0, Likely=NOUN
For 'is for': Noun=0.0, Verb=0.0, Likely=NOUN
For 'for the': Noun=0.0, Verb=0.0, Likely=NOUN
For 'the use': Noun=0.0, Verb=0.0, Likely=NOUN
For 'use of': Noun=0.0, Verb=0.0, Likely=NOUN
For 'of a

In [37]:
import nltk
from collections import defaultdict

def load_data(file_path):
    data = []
    with open(file_path) as file:
        for line in file:
            sentence = nltk.word_tokenize(line)
            tagged_sentence = nltk.pos_tag(sentence)
            data.append(tagged_sentence)
    return data

def get_ngram_counts(data, n):
    ngram_counts = defaultdict(int)
    ngram_tag_counts = defaultdict(lambda: defaultdict(int))
    for sentence in data:
        ngrams = list(nltk.ngrams(sentence, n))
        for ngram in ngrams:
            words = tuple(word for word, _ in ngram)
            tags = tuple(tag for _, tag in ngram)
            ngram_counts[words] += 1
            ngram_tag_counts[words][tags] += 1
    return ngram_counts, ngram_tag_counts

def compute_probabilities(ngram_counts, ngram_tag_counts, n):
    probabilities = defaultdict(lambda: {'NOUN': 0.0, 'VERB': 0.0})
    for ngram, ngram_count in ngram_counts.items():
        noun_count = ngram_tag_counts[ngram][('NN',) * n]
        verb_count = ngram_tag_counts[ngram][('VB',) * n]
        if ngram_count > 0:
            probabilities[ngram]['NOUN'] = noun_count / ngram_count
            probabilities[ngram]['VERB'] = verb_count / ngram_count
    return probabilities

def main(file_path, n):
    data = load_data(file_path)
    ngram_counts, ngram_tag_counts = get_ngram_counts(data, n)
    probabilities = compute_probabilities(ngram_counts, ngram_tag_counts, n)
    results = []
    for ngram, probs in probabilities.items():
        result = {
            'ngram': ' '.join(ngram),
            'noun_prob': probs['NOUN'],
            'verb_prob': probs['VERB'],
            'likely_tag': max(probs, key=probs.get)
        }
        results.append(result)
    return results

file_path = 'shakes.txt'
n = 2
results = main(file_path, n)

for result in results:
    print(f"For '{result['ngram']}': Noun={result['noun_prob']}, Verb={result['verb_prob']}, Likely={result['likely_tag']}")


For 'ï »': Noun=0.0, Verb=0.0, Likely=NOUN
For '» ¿The': Noun=0.0, Verb=0.0, Likely=NOUN
For '¿The Project': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Project Gutenberg': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Gutenberg EBook': Noun=0.0, Verb=0.0, Likely=NOUN
For 'EBook of': Noun=0.0, Verb=0.0, Likely=NOUN
For 'of The': Noun=0.0, Verb=0.0, Likely=NOUN
For 'The Complete': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Complete Works': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Works of': Noun=0.0, Verb=0.0, Likely=NOUN
For 'of William': Noun=0.0, Verb=0.0, Likely=NOUN
For 'William Shakespeare': Noun=0.0, Verb=0.0, Likely=NOUN
For 'Shakespeare ,': Noun=0.0, Verb=0.0, Likely=NOUN
For ', by': Noun=0.0, Verb=0.0, Likely=NOUN
For 'This eBook': Noun=0.0, Verb=0.0, Likely=NOUN
For 'eBook is': Noun=0.0, Verb=0.0, Likely=NOUN
For 'is for': Noun=0.0, Verb=0.0, Likely=NOUN
For 'for the': Noun=0.0, Verb=0.0, Likely=NOUN
For 'the use': Noun=0.0, Verb=0.0, Likely=NOUN
For 'use of': Noun=0.0, Verb=0.0, Likely=NOUN
For 'of a

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import defaultdict

# Function to preprocess text
def preprocess_text(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    return filtered_words

# Function to generate n-grams
def generate_ngrams(tokens, n):
    ngrams = []
    for i in range(len(tokens) - n + 1):
        ngrams.append(tuple(tokens[i:i+n]))
    return ngrams

# Function to determine POS tags of n-grams
def tag_ngrams(ngrams):
    tagged_ngrams = []
    for ngram in ngrams:
        tagged_ngram = pos_tag(ngram)
        tagged_ngrams.append(tagged_ngram)
    return tagged_ngrams

# Function to count occurrences of nouns and verbs in n-grams
def count_pos(tagged_ngrams):
    noun_counts = defaultdict(int)
    verb_counts = defaultdict(int)
    for tagged_ngram in tagged_ngrams:
        # Convert the list to a tuple to make it hashable
        tagged_ngram = tuple(tagged_ngram)
        for word, pos in tagged_ngram:
            if pos.startswith('N'):
                noun_counts[tagged_ngram] += 1
            elif pos.startswith('V'):
                verb_counts[tagged_ngram] += 1
    return noun_counts, verb_counts

# Function to calculate probabilities
def calculate_probabilities(noun_counts, verb_counts):
    all_ngrams = set(noun_counts.keys()) | set(verb_counts.keys())
    
    total_nouns = sum(noun_counts.values())
    total_verbs = sum(verb_counts.values())
    
    noun_probabilities = defaultdict(float)
    verb_probabilities = defaultdict(float)
    
    for ngram in all_ngrams:
        noun_probabilities[ngram] = noun_counts[ngram] / total_nouns if total_nouns != 0 else 0.0
        verb_probabilities[ngram] = verb_counts[ngram] / total_verbs if total_verbs != 0 else 0.0
        
    print("Noun count:",total_nouns, "Verb count:",total_verbs)
    print("Noun probability:",noun_probabilities, "Verb probability:",verb_probabilities)
    return noun_probabilities, verb_probabilities
    

# Function to determine most likely POS tag
def most_likely_pos(noun_prob, verb_prob):
    most_likely = {}
    for ngram in noun_prob:
        if noun_prob[ngram] > verb_prob[ngram]:
            most_likely[ngram] = 'Noun'
        else:
            most_likely[ngram] = 'Verb'
    return most_likely

# Main function
def main(input_text):
    filtered_words = preprocess_text(input_text)
    n = 2
    ngrams = generate_ngrams(filtered_words, n)
    tagged_ngrams = tag_ngrams(ngrams)
    noun_counts, verb_counts = count_pos(tagged_ngrams)
    noun_prob, verb_prob = calculate_probabilities(noun_counts, verb_counts)
    most_likely_tags = most_likely_pos(noun_prob, verb_prob)

    return most_likely_tags


input_text = """
Shakespeare was a great playwright who wrote many famous plays. His works include tragedies, comedies, and sonnets.
He used language skillfully to explore human emotions and relationships.
His plays continue to be performed and studied worldwide.
"""

# Call the main function with the input text
results = main(input_text)

# Print results
for ngram, pos in results.items():
    print(f"{ngram}: {pos}")
    



Noun count: 23 Verb count: 10
Noun probability: defaultdict(<class 'float'>, {(('famous', 'JJ'), ('plays', 'NNS')): 0.043478260869565216, (('plays', 'NNS'), ('works', 'NNS')): 0.08695652173913043, (('great', 'JJ'), ('playwright', 'NN')): 0.043478260869565216, (('works', 'NNS'), ('include', 'VBP')): 0.043478260869565216, (('used', 'VBN'), ('language', 'NN')): 0.043478260869565216, (('relationships', 'NNS'), ('plays', 'NNS')): 0.08695652173913043, (('tragedies', 'NNS'), ('comedies', 'NNS')): 0.08695652173913043, (('wrote', 'VBD'), ('many', 'JJ')): 0.0, (('continue', 'NN'), ('performed', 'VBD')): 0.043478260869565216, (('shakespeare', 'NN'), ('great', 'JJ')): 0.043478260869565216, (('playwright', 'JJ'), ('wrote', 'VBD')): 0.0, (('plays', 'NNS'), ('continue', 'VBP')): 0.043478260869565216, (('include', 'NN'), ('tragedies', 'NNS')): 0.08695652173913043, (('language', 'NN'), ('skillfully', 'RB')): 0.043478260869565216, (('studied', 'VBN'), ('worldwide', 'NN')): 0.043478260869565216, (('perfo