# **Vanakam** 🤝

1.	Derive a top-down, depth-first, left-to-right parse tree for the given sentence:
“The angry bear chased the frightened little squirrel”
Use the following grammar rules to create the parse tree:
S → NP VP
NP → Det Nom
VP → V NP
Nom → Adj Nom | N
Det → the
Adj → little | angry | frightened
N → squirrel | bear
V → chased


In [None]:
import nltk

# Context-free grammar from the exercise statement. Terminals are quoted and
# lower-case, so the input sentence below is written in lower case as well.
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det Nom
VP -> V NP
Nom -> Adj Nom | N
Det -> 'the'
Adj -> 'little' | 'angry' | 'frightened'
N -> 'squirrel' | 'bear'
V -> 'chased' """)
sentence = 'the angry bear chased the frightened little squirrel'.split()


def parse(sent):
    """Return the first parse tree of ``sent`` under ``grammar``, or None.

    The exercise asks for a top-down, depth-first, left-to-right derivation,
    which is the strategy implemented by nltk.RecursiveDescentParser (the
    grammar has no left-recursive rule, so that parser terminates on it).

    Args:
        sent: list of word tokens.

    Returns:
        The first tree yielded by the parser, or None when the parser
        yields no tree for ``sent``.
    """
    parser = nltk.RecursiveDescentParser(grammar)
    return next(iter(parser.parse(sent)), None)


# Parse once and reuse the result; guard against "no parse" so that
# pretty_print() is never called on None.
tree = parse(sentence)
if tree is None:
    print("No parse found for:", ' '.join(sentence))
else:
    print(tree)
    tree.pretty_print()


2.	Create the grammar rules for the sentence “a restaurant serves dosa” and implement them to generate the corresponding parse tree.

In [None]:
import nltk

# Grammar for the sentence "a restaurant serves dosa".
# NOTE: no production has Adj on its right-hand side, so the Adj rule is
# never used while parsing; the grammar text is left exactly as written.
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det Nom | N
VP -> V NP
Nom ->  N
Det -> 'a'
Adj -> 'little' | 'angry' | 'frightened'
N -> 'restaurant' | 'dosa'
V -> 'serves' """)
sentence = 'a restaurant serves dosa'.split()


def parse(sent):
    """Return the first tree nltk.ChartParser yields for ``sent``, or None.

    Args:
        sent: list of word tokens.

    Returns:
        The first parse tree, or None when the parser yields no tree.
    """
    parser = nltk.ChartParser(grammar)
    return next(iter(parser.parse(sent)), None)


# Parse once and reuse the result; guard against "no parse" so that
# pretty_print() is never called on None.
tree = parse(sentence)
if tree is None:
    print("No parse found for:", ' '.join(sentence))
else:
    print(tree)
    tree.pretty_print()

3.	Implement the Minimum edit distance algorithm and find the minimum number of operations required to convert string1 to string2.

str1 = 'Saturday' 

str2 = 'Sunday'

In [None]:
def edit_distance(str1, str2, a=None, b=None):
    """Return the minimum edit distance between ``str1[:a]`` and ``str2[:b]``.

    Insertion, deletion and substitution each cost 1.

    Args:
        str1: source string.
        str2: target string.
        a: number of leading characters of ``str1`` to consider.
            Defaults to ``len(str1)``.
        b: number of leading characters of ``str2`` to consider.
            Defaults to ``len(str2)``.

    Returns:
        The minimum number of single-character edits as an int.
    """
    if a is None:
        a = len(str1)
    if b is None:
        b = len(str2)

    # string_matrix[i][j] holds the distance between str1[:i] and str2[:j].
    string_matrix = [[0] * (b + 1) for _ in range(a + 1)]

    for i in range(a + 1):
        for j in range(b + 1):
            if i == 0:
                # Empty source prefix: insert the first j target characters.
                string_matrix[i][j] = j
            elif j == 0:
                # Empty target prefix: delete the first i source characters.
                string_matrix[i][j] = i
            elif str1[i - 1] == str2[j - 1]:
                # Last characters agree: no extra cost.
                string_matrix[i][j] = string_matrix[i - 1][j - 1]
            else:
                string_matrix[i][j] = 1 + min(
                    string_matrix[i][j - 1],      # insert
                    string_matrix[i - 1][j],      # delete
                    string_matrix[i - 1][j - 1],  # substitute
                )

    return string_matrix[a][b]

# Strings from the exercise statement.
str1, str2 = 'Saturday', 'Sunday'

# Compute the distance over the full length of both strings, then report it.
operations = edit_distance(str1, str2, len(str1), len(str2))
print('No. of Operations required :', operations)


4.	Given the corpus below, find the most probable next word following a given two-word sequence (for example, “I like”).

In [None]:
#N gram
import nltk
from nltk.util import ngrams
from collections import defaultdict, Counter
from nltk.corpus import reuters
nltk.download('punkt')


# (w1, w2) -> Counter of the words observed immediately after that pair.
trigram_freq = defaultdict(Counter)
# (w1, w2) -> {w3: relative frequency of w3 after (w1, w2)}.
trigram_prob = defaultdict(Counter)


def process_sentences(sentences):
    """Accumulate trigram counts from ``sentences`` and refresh the probabilities.

    Updates the module-level ``trigram_freq`` and ``trigram_prob`` tables in
    place; nothing is returned.

    Args:
        sentences: iterable of whitespace-delimited sentences, optionally
            wrapped in ``<S>`` / ``</S>`` boundary markers.
    """
    for sentence in sentences:
        # The corpus is already tokenised by whitespace. nltk.word_tokenize
        # would split the <S> / </S> markers into '<', 'S', '>' pieces and
        # corrupt the sentence-boundary contexts, so split on whitespace.
        tokens = sentence.split()

        # Sliding window of three consecutive tokens.
        for t1, t2, t3 in zip(tokens, tokens[1:], tokens[2:]):
            trigram_freq[(t1, t2)][t3] += 1

    # Recompute every context from the accumulated counts.
    for context, t3_freq in trigram_freq.items():
        total_count = sum(t3_freq.values())
        for t3, count in t3_freq.items():
            trigram_prob[context][t3] = count / total_count

#N gram
def find_next_word(keyword1, keyword2):
    """Return the most probable word following ``keyword1 keyword2``.

    Looks the pair up in the module-level ``trigram_prob`` table.

    Args:
        keyword1: first word of the context.
        keyword2: second word of the context.

    Returns:
        The follower with the highest probability, or None when the pair
        has no recorded followers.
    """
    # Direct key lookup instead of scanning every entry. .get() rather than
    # [] so that an unseen context is not inserted into the defaultdict.
    candidates = trigram_prob.get((keyword1, keyword2))
    if not candidates:
        return None
    # max() returns the first of equally probable words in insertion order,
    # which is what a strict "greater than" scan would keep as well.
    return max(candidates, key=candidates.get)


# Training corpus: one sentence per entry, wrapped in <S> ... </S> markers.
# Every entry needs its own trailing comma; without one, Python joins
# adjacent string literals into a single sentence.
input_sentences = [
    "<S> I am Henry </S>",
    "<S> I like college </S>",
    "<S> Do Henry like college </S>",
    "<S> Henry I am </S>",
    "<S> Do I like Henry </S>",
    "<S> Do I like college </S>",
    "<S> I do like Henry </S>",
]

# Build the trigram tables from the corpus before querying them.
process_sentences(input_sentences)

# Two-word context whose most likely continuation is reported.
keyword1, keyword2 = "I", "like"
next_word = find_next_word(keyword1, keyword2)
print(f"The most probable next word after '{keyword1} {keyword2}' is '{next_word}'")


The most probable next word after 'I like' is 'college'


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


5.	Implement stemming and lemmatization algorithms and find the output for the following words.

-	Programming
-	Loving
-	Lovely
-	Kind


In [None]:
#stemming
import nltk
nltk.download('punkt')  # tokenizer data used by nltk.word_tokenize
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
# Split the word list into tokens, then print the Porter stem of each one.
words = nltk.word_tokenize("Programming Loving Lovely Kind")
report = "Stemming for {} is {}"
for word in words:
    print(report.format(word, stemmer.stem(word)))

In [None]:
#lemmatization
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
ans = WordNetLemmatizer()
text = "Programming Loving Lovely Kind"
token = nltk.word_tokenize(text)
for i in token:
    # Passing the capitalised word with the default part of speech (noun)
    # returned every word unchanged. Lower-case the word and lemmatize it as
    # a verb so that the -ing forms are reduced to their base form; words
    # that are not verb forms are returned as they are.
    lemma = ans.lemmatize(i.lower(), pos='v')
    print("Lemma for {} is {}".format(i, lemma))

Lemma for Programming is Programming
Lemma for Loving is Loving
Lemma for Lovely is Lovely
Lemma for Kind is Kind


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


6.	Implement part-of-speech tagging for the following sentences.
-	I need a flight from Atlanta.
-	Everything to permit us.
-	I would like to address the public on this issue.
-	We need your shipping address.


In [None]:
#parts of speech
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Sentences to tag, one per entry. Every entry needs its own trailing comma;
# without one, Python joins adjacent string literals into a single sentence.
sentences = [
    "I need a flight from Atlanta.",
    "Everything to permit us.",
    "I would like to address the public on this issue.",
    "We need your shipping address.",
]
# Tokenize each sentence, tag the tokens, and print the (word, tag) pairs.
for sentence in sentences:
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    print(tagged)

7.	Implement extraction- or abstraction-based text summarization on a paragraph of your own. (The paragraph must have at least 10 lines.)

In [None]:
# Text summarization
# !pip install sumy
import nltk
nltk.download('punkt')
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

# Read the text to summarize from standard input (a single input() call).
print('ENTER YOUR TEXT HERE : ')
text = input()

# Parse the raw text into a sumy document using the English tokenizer.
document = PlaintextParser.from_string(text, Tokenizer("english")).document

# LSA summarizer configured with the English stop-word list.
summarizer = LsaSummarizer()
summarizer.stop_words = get_stop_words("english")

# Request three sentences and join them into one string.
selected = summarizer(document, 3)
summary = " ".join(str(sentence) for sentence in selected)

print("TEXT SUMMARY:")
print(summary)

8.	Implement a Python program that converts speech to text. After the conversion, tokenize the sentence and analyze its emotion.

In [None]:
#!pip install SpeechRecognition
import speech_recognition as sr

r = sr.Recognizer()
file_path = "/content/harvard.wav"

# Read the audio file into an AudioData object for recognition.
with sr.AudioFile(file_path) as source:
    audio = r.record(source)

# Keep only the recognition call inside the try block so that each failure
# is reported by the handler written for it.
try:
    text = r.recognize_google(audio)
except sr.UnknownValueError:
    print("Sorry, I didn't understand that.")
except sr.RequestError as err:
    # Raised when the recognition request itself fails (for example no
    # network connection), as opposed to unintelligible speech.
    print("Speech recognition request failed:", err)
else:
    print("You said: ", text)
    # TODO: the exercise also asks to tokenize the recognized text and
    # analyze its emotion; neither step is implemented here yet.


9.	Write snippets to do the following using regular expressions. The text can be created on your own; it must contain 3 email IDs and 2 phone numbers.
-	Write an RE to extract all email IDs in the given text.
-	Write an RE to extract all mobile numbers.
-	Write RE to extract the names from the below list which match a certain pattern S u _ _ _
Sunil, Shyam, Ankit, Surjeet, Sumit, Subhi, Surbhi, Siddharth, Sujan
-	Work with the following functions:
re.search(), re.match(), re.sub(), re.compile(), re.findall()
-	Write an RE that matches a string that has 'ab' followed by zero or more 'c'.
-	Write an RE that matches 'a' followed by zero or more copies of the sequence 'bc'.
-	Write an RE that matches 'ab' followed by zero or one 'c'.


In [None]:
#1 extract all email ids
import re
text = "My email is john@example.com."
# Local part: word characters plus '.', '+' and '-'.
# Domain: one label followed by one or more ".label" parts, so addresses
# such as first.last@mail.example.co.in are captured whole, while a
# sentence-ending full stop after the address is left out.
email_pattern = r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+'
emails = re.findall(email_pattern, text)

print(emails)

#2 extract all mobile numbers
import re
text = "My phone number is 123-456-7890. 9003363162"
# Ten digits, optionally grouped 3-3-4 with hyphens. The lookarounds reject
# a match that is only a slice of a longer run of digits.
phone_pattern = r'(?<!\d)\d{3}-?\d{3}-?\d{4}(?!\d)'
phones = re.findall(phone_pattern, text)

print(phones)

#3 names matching the pattern S u _ _ _
import re
names = ['Sunil', 'Shyam', 'Ankit', 'Surjeet', 'Sumit', 'Subhi', 'Surbhi', 'Siddharth', 'Sujan']
name_pattern = r'Su\w{3}'
# re.match only anchors at the start, so longer names such as 'Surjeet'
# would also pass. re.fullmatch requires the whole name to be 'Su' plus
# exactly three more characters.
matching_names = [name for name in names if re.fullmatch(name_pattern, name)]

print(matching_names)

#4
import re
# 'ab' followed by zero or more 'c'.
pattern = r'abc*'
text = "abcc abc abccc ab abbbc"
# Compile the pattern first, then collect every non-overlapping match.
matches = re.compile(pattern).findall(text)

print(matches)

#5
import re
text = "a abcbc abcbcbc ab abcb"
# 'a' followed by zero or more copies of the sequence 'bc'. The group is
# non-capturing so that findall returns the whole match, not the group.
pattern = r'a(?:bc)*'
matches = re.findall(pattern, text)

print(matches)

#6
import re
text = "a abcbcbc abcbcbc ab abcb"
# 'ab' followed by zero or one 'c'.
pattern = r'abc?'
matches = re.findall(pattern, text)

print(matches)


