In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.probability import FreqDist

In [2]:
# Read the whole corpus into memory once. The context manager closes the
# file even if read() raises, and the old seek(0) just before close() had
# no effect, so it is dropped.
with open('datasets/FIFAWorldCup2018.txt') as f:
    fifa_txt = f.read()

print(fifa_txt)

2018 FIFA World CUP is an international tournament where all teams compete religously and they showcase beautiful talent. It took place in Russia from 14 June to 15 July 2018. It was the first World Cup to be held in Eastern Europe, and the 11th time that it had been held in Europe. At an estimated cost of over 14.2 billion, it was the most expensive World Cup. It was also the first World Cup to use the video assistant referee  system. 
The finals involved 32 teams, of which 31 came through qualifying competitions, while the host nation qualified automatically. Of the 32 teams, 20 had also appeared in the previous tournament in 2014, while both Iceland and Panama made their first appearances at a FIFA World Cup. A total of 64 matches were played in 12 venues across 11 cities. 
The final took place on 15 July at the Luzhniki Stadium in Moscow, between France and Croatia. France won the match 4–2 to claim their second World Cup title, marking the fourth consecutive title won by a Europea

Question 1:

In [3]:
def GetNMostFrequentNouns(txt, n):
    """Return the ``n`` most frequent noun tokens found in ``txt``.

    The text is word-tokenized and POS-tagged; tokens tagged NN, NNP, NNS
    or NNPS are upper-cased and counted.

    Returns the result of ``FreqDist.most_common(n)``: a list of
    ``(WORD, count)`` pairs.
    """
    noun_tags = ('NN', 'NNP', 'NNS', 'NNPS')
    tagged_tokens = nltk.pos_tag(word_tokenize(txt))
    counts = FreqDist(
        token.upper() for token, label in tagged_tokens if label in noun_tags
    )
    return counts.most_common(n)

def GetNMostFrequentVerbs(txt, n):
    """Return the ``n`` most frequent verb tokens found in ``txt``.

    The text is word-tokenized and POS-tagged; tokens tagged VB, VBD, VBG,
    VBN, VBP or VBZ are upper-cased and counted.

    Returns the result of ``FreqDist.most_common(n)``: a list of
    ``(WORD, count)`` pairs.
    """
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    tagged_tokens = nltk.pos_tag(word_tokenize(txt))
    counts = FreqDist(
        token.upper() for token, label in tagged_tokens if label in verb_tags
    )
    return counts.most_common(n)

def GetNMostFrequentDeterminers(txt, n):
    """Return the ``n`` most frequent determiner tokens found in ``txt``.

    The text is word-tokenized and POS-tagged; tokens tagged DT are
    upper-cased and counted.

    Returns the result of ``FreqDist.most_common(n)``: a list of
    ``(WORD, count)`` pairs.
    """
    determiner_tags = ('DT',)
    tagged_tokens = nltk.pos_tag(word_tokenize(txt))
    counts = FreqDist(
        token.upper() for token, label in tagged_tokens if label in determiner_tags
    )
    return counts.most_common(n)

def GetNMostFrequentPrepositions(txt, n):
    """Return the ``n`` most frequent tokens tagged IN found in ``txt``.

    The text is word-tokenized and POS-tagged; tokens tagged IN are
    upper-cased and counted.

    Returns the result of ``FreqDist.most_common(n)``: a list of
    ``(WORD, count)`` pairs.
    """
    preposition_tags = ('IN',)
    tagged_tokens = nltk.pos_tag(word_tokenize(txt))
    counts = FreqDist(
        token.upper() for token, label in tagged_tokens if label in preposition_tags
    )
    return counts.most_common(n)

In [4]:
# Report the top three words of each part-of-speech category for the corpus.
top_word_reports = (
    ("GetNMostFrequentNouns: ", GetNMostFrequentNouns),
    ("GetNMostFrequentVerbs: ", GetNMostFrequentVerbs),
    ("GetNMostFrequentDeterminers: ", GetNMostFrequentDeterminers),
    ("GetNMostFrequentPrepositions: ", GetNMostFrequentPrepositions),
)
for report_label, top_words_of in top_word_reports:
    print(report_label, top_words_of(fifa_txt, 3))

GetNMostFrequentNouns:  [('WORLD', 15), ('CUP', 15), ('FIFA', 12)]
GetNMostFrequentVerbs:  [('WAS', 11), ('HAD', 7), ('WERE', 7)]
GetNMostFrequentDeterminers:  [('THE', 57), ('A', 10), ('AN', 5)]
GetNMostFrequentPrepositions:  [('OF', 30), ('IN', 18), ('FROM', 10)]


Question 2:

In [5]:
def PrintSyntaxTree(txt):
    """Print the POS-tagged tokens of the first sentence of ``txt``.

    Despite the name, no parse tree is built: the output is the list of
    ``(token, tag)`` pairs returned by ``nltk.pos_tag`` for the first
    sentence found by ``sent_tokenize``. If the text contains no sentence
    an empty list is printed instead of raising ``IndexError``.
    """
    sentences = sent_tokenize(txt)
    if not sentences:
        # Nothing to tag; mirror the "no tokens" result rather than crash.
        print([])
        return
    print(nltk.pos_tag(word_tokenize(sentences[0])))

PrintSyntaxTree(fifa_txt)

[('2018', 'CD'), ('FIFA', 'NNP'), ('World', 'NNP'), ('CUP', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('international', 'JJ'), ('tournament', 'NN'), ('where', 'WRB'), ('all', 'DT'), ('teams', 'NNS'), ('compete', 'VBP'), ('religously', 'RB'), ('and', 'CC'), ('they', 'PRP'), ('showcase', 'VBP'), ('beautiful', 'JJ'), ('talent', 'NN'), ('.', '.')]


Question 3:

In [6]:
def TextAfterRemovingPunctuations(txt):
    r"""Return the ``\w+`` runs of ``txt`` joined by single spaces.

    Everything that is not matched by ``\w`` (punctuation, whitespace, ...)
    acts as a separator, so e.g. ``"14.2"`` comes back as ``"14 2"``.
    """
    return ' '.join(regexp_tokenize(txt, pattern=r'\w+'))

def TextAfterRemovingDigits(txt):
    r"""Return ``txt`` with every digit removed and nothing else changed.

    The text is split into maximal runs of non-digit characters (``\D+``)
    and the runs are concatenated back together.
    """
    # Each \D+ run already carries its own whitespace, so the runs are glued
    # together directly. Joining with ' ' would insert a spurious space
    # wherever a number was removed ("a1b" -> "a b" instead of "ab").
    return ''.join(regexp_tokenize(txt, pattern=r'\D+'))

def AllCapitalizedWordsFromText(txt):
    r"""Return capitalized letter runs that are not in sentence-initial position.

    A match is one ASCII uppercase letter followed by one or more ASCII
    lowercase letters, excluding matches located at a ``^`` position or
    directly after ``". "``.

    NOTE(review): the pattern has no word boundaries, so it can match a
    fragment inside a longer token, and non-ASCII letters end a match.
    NOTE(review): whether ``^`` means start of text or start of every line
    depends on the regex flags NLTK's tokenizer applies -- confirm.
    """
    capitalized_pattern = r'(?<!^)(?<!\. )[A-Z][a-z]+'
    return regexp_tokenize(txt, pattern=capitalized_pattern)

def AllEmailsFromText(txt):
    r"""Return every whitespace-delimited token of ``txt`` that contains ``@``.

    The pattern is ``\S+@\S+``: at least one non-space character on each
    side of an ``@``. No further validation of the address is done.
    """
    email_like_pattern = r'\S+@\S+'
    return regexp_tokenize(txt, pattern=email_like_pattern)

# Run each regex helper on the corpus and print its result under a heading.
# The third field holds the extra trailing arguments passed to print(): two of
# the reports are followed by a blank line, the other two are not.
regex_reports = (
    ("TextAfterRemovingPunctuations: \n", TextAfterRemovingPunctuations, ("\n",)),
    ("TextAfterRemovingDigits: \n", TextAfterRemovingDigits, ()),
    ("AllCapitalizedWordsFromText: \n", AllCapitalizedWordsFromText, ("\n",)),
    ("AllEmailsFromText: \n", AllEmailsFromText, ()),
)
for report_heading, extract, trailing_args in regex_reports:
    print(report_heading, extract(fifa_txt), *trailing_args)

TextAfterRemovingPunctuations: 
 2018 FIFA World CUP is an international tournament where all teams compete religously and they showcase beautiful talent It took place in Russia from 14 June to 15 July 2018 It was the first World Cup to be held in Eastern Europe and the 11th time that it had been held in Europe At an estimated cost of over 14 2 billion it was the most expensive World Cup It was also the first World Cup to use the video assistant referee system The finals involved 32 teams of which 31 came through qualifying competitions while the host nation qualified automatically Of the 32 teams 20 had also appeared in the previous tournament in 2014 while both Iceland and Panama made their first appearances at a FIFA World Cup A total of 64 matches were played in 12 venues across 11 cities The final took place on 15 July at the Luzhniki Stadium in Moscow between France and Croatia France won the match 4 2 to claim their second World Cup title marking the fourth consecutive title won

Question 4:

In [7]:
def ChunkingVer1(txt):
    """Group proper nouns of ``txt`` into space-joined chunks ended by verbs.

    Tokens whose tag starts with NNP are accumulated; a token whose tag
    starts with VB closes the pending group (the verb is not kept). Tokens
    with any other tag are skipped without closing the group, so a chunk may
    join proper nouns that were not adjacent in the text.

    Returns the list of chunk strings, including a group still pending at
    the end of the text.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(txt))

    found = []
    pending = []
    for token, tag in tagged:
        if tag.startswith('NNP'):
            pending.append(token)
            continue
        if tag.startswith('VB') and pending:
            found.append(' '.join(pending))
            pending = []

    # Emit whatever was still being collected when the text ended.
    if pending:
        found.append(' '.join(pending))
    return found

def ChunkingVer2(txt):
    """Group verbs of ``txt`` into space-joined chunks ended by adjectives.

    Tokens whose tag starts with VB are accumulated; a token whose tag
    starts with JJ closes the pending group (the adjective is not kept).
    Tokens with any other tag are skipped without closing the group, so a
    chunk may join verbs that were not adjacent in the text.

    Returns the list of chunk strings, including a group still pending at
    the end of the text.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(txt))

    found = []
    pending = []
    for token, tag in tagged:
        if tag.startswith('VB'):
            pending.append(token)
            continue
        if tag.startswith('JJ') and pending:
            found.append(' '.join(pending))
            pending = []

    # Emit whatever was still being collected when the text ended.
    if pending:
        found.append(' '.join(pending))
    return found

def ChunkingVer3(txt):
    """Group determiners of ``txt`` into space-joined chunks ended by nouns.

    Tokens whose tag starts with DT are accumulated; a token whose tag
    starts with NN closes the pending group (the noun is not kept). Tokens
    with any other tag are skipped without closing the group, so a chunk may
    join determiners that were not adjacent in the text.

    Returns the list of chunk strings, including a group still pending at
    the end of the text.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(txt))

    found = []
    pending = []
    for token, tag in tagged:
        if tag.startswith('DT'):
            pending.append(token)
            continue
        if tag.startswith('NN') and pending:
            found.append(' '.join(pending))
            pending = []

    # Emit whatever was still being collected when the text ended.
    if pending:
        found.append(' '.join(pending))
    return found

def ChunkingVer4(txt):
    """Group verbs of ``txt`` into space-joined chunks ended by adverbs.

    Tokens whose tag starts with VB are accumulated; a token whose tag
    starts with RB closes the pending group (the adverb is not kept). Tokens
    with any other tag are skipped without closing the group, so a chunk may
    join verbs that were not adjacent in the text.

    Returns the list of chunk strings, including a group still pending at
    the end of the text.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(txt))

    found = []
    pending = []
    for token, tag in tagged:
        if tag.startswith('VB'):
            pending.append(token)
            continue
        if tag.startswith('RB') and pending:
            found.append(' '.join(pending))
            pending = []

    # Emit whatever was still being collected when the text ended.
    if pending:
        found.append(' '.join(pending))
    return found

def ChunkingVer5(txt):
    """Extract chunks of determiners/prepositions/adjectives ending in a noun.

    Consecutive tokens tagged DT, IN or JJ* are collected, and the first
    following NN* token completes the chunk. A token with any other tag
    discards the words collected so far, so every chunk consists of words
    that were adjacent in the text. Leading words that never reach a noun
    are dropped.

    Returns the list of chunk strings in text order.
    """
    tokens = nltk.word_tokenize(txt)
    tagged_tokens = nltk.pos_tag(tokens)

    chunks = []
    current_chunk = []

    for word, pos in tagged_tokens:
        if pos.startswith(('DT', 'IN', 'JJ')):  # Determiner, preposition or adjective
            current_chunk.append(word)
        elif pos.startswith('NN'):  # Noun closes the chunk
            current_chunk.append(word)
            chunks.append(' '.join(current_chunk))
            current_chunk = []
        else:
            # Any other tag breaks adjacency. Without this reset, stale words
            # leaked into the next chunk (e.g. "that ... in Europe").
            current_chunk = []

    return chunks

def ChunkingVer6(input_string):
    """Extract chunks of zero or more adjectives followed by a noun.

    Consecutive JJ* tokens are collected, and the first following NN* token
    completes the chunk. A token with any other tag discards the adjectives
    collected so far, so every chunk consists of words that were adjacent in
    the text. Adjectives that never reach a noun are dropped.

    Returns the list of chunk strings in text order.
    """
    tokens = nltk.word_tokenize(input_string)
    tagged_tokens = nltk.pos_tag(tokens)

    chunks = []
    current_chunk = []

    for word, pos in tagged_tokens:
        if pos.startswith('JJ'):  # Adjective
            current_chunk.append(word)
        elif pos.startswith('NN'):  # Noun closes the chunk
            current_chunk.append(word)
            chunks.append(' '.join(current_chunk))
            current_chunk = []
        else:
            # Any other tag breaks adjacency. Without this reset, an earlier
            # adjective could be glued onto an unrelated later noun.
            current_chunk = []

    return chunks


In [8]:
# Run every chunker variant on the corpus and print its chunks.
chunker_reports = (
    ("ChunkingVer1: ", ChunkingVer1),
    ("ChunkingVer2: ", ChunkingVer2),
    ("ChunkingVer3: ", ChunkingVer3),
    ("ChunkingVer4: ", ChunkingVer4),
    ("ChunkingVer5: ", ChunkingVer5),
    ("ChunkingVer6: ", ChunkingVer6),
)
for chunker_label, chunker in chunker_reports:
    print(chunker_label, chunker(fifa_txt))

ChunkingVer1:  ['FIFA World CUP', 'Russia June July', 'World Cup', 'Eastern Europe', 'Europe', 'World Cup', 'World Cup', 'Iceland Panama', 'FIFA World Cup A', 'July Luzhniki Stadium Moscow France Croatia France', 'World Cup', 'FIFA World Cup', 'January', 'February', 'FIFA World Cup Mexico', 'Indonesia', 'FIFA February', 'Australia Japan United States', 'UEFA', 'FIFA World Cup', 'England Russia Netherlands/Belgium Portugal/Spain FIFA Executive Committee', 'Zürich December', 'Russia', 'Portugal/Spain', 'Belgium/Netherlands England', 'English Football Association', 'Russian FIFA', 'England Sepp Blatter', 'Russia', 'Garcia Report', 'Michael J. Garcia', 'Hans-Joachim Eckert FIFA Eckert', 'FIFA', 'Garcia', 'FA', 'Eckert Russia Greg Dyke', 'David Bernstein', 'World Cup June', 'World Cup', 'FIFA World Cup Brazil', 'World Cup', 'FIFA World Cup Construction Renovation Transport Infrastructure', 'Construction', 'World Cup', 'Twelve']
ChunkingVer2:  ['is', 'compete showcase', 'took was', 'be held 

Question 5:

In [11]:
from nltk.parse.generate import generate

def get_most_frequent_words(text, category, num_words=2):
    """Return the most frequent lower-cased words of one POS category.

    Args:
        text: Raw text to analyse.
        category: POS tag prefix; a token is counted when its tag starts
            with this string (e.g. "NN" also matches NNS, NNP, NNPS).
        num_words: How many of the most common words to return.

    Returns:
        The result of ``FreqDist.most_common(num_words)``: a list of
        ``(word, count)`` pairs.
    """
    # Tag the text in its original casing, as the other frequency helpers in
    # this notebook do, and lower-case only afterwards for counting.
    # Lower-casing before tagging throws away capitalization cues.
    tagged_words = nltk.pos_tag(word_tokenize(text))

    freq_dist = FreqDist(
        word.lower() for word, pos in tagged_words if pos.startswith(category)
    )

    return freq_dist.most_common(num_words)

def _quote_cfg_terminal(word):
    """Wrap ``word`` in whichever quote character it does not contain."""
    if "'" not in word:
        return f"'{word}'"
    if '"' not in word:
        return f'"{word}"'
    raise ValueError(f"cannot quote {word!r} as a CFG terminal")


def _lexical_rule(symbol, ranked_words):
    """Build ``symbol -> 'w1' | 'w2' ...`` from ``(word, count)`` pairs."""
    if not ranked_words:
        raise ValueError(f"no words found in the text for category {symbol}")
    alternatives = " | ".join(_quote_cfg_terminal(word) for word, _ in ranked_words)
    return f"{symbol} -> {alternatives}"


def generate_grammar(txt):
    """Build a small CFG from the most frequent words of ``txt`` and print samples.

    The most frequent determiners, verbs, prepositions and nouns (as returned
    by ``get_most_frequent_words``) become the terminals of a fixed set of
    phrase-structure rules. The grammar's productions are printed, followed
    by up to ten sentences produced by ``generate``.

    Raises:
        ValueError: if the text contains no word at all for one of the four
            categories, or a word cannot be quoted as a terminal.
    """
    # Get the most frequent words for each category
    determiners = get_most_frequent_words(txt, category="DT")
    verbs = get_most_frequent_words(txt, category="VB")
    prepositions = get_most_frequent_words(txt, category="IN")
    nouns = get_most_frequent_words(txt, category="NN")

    # Define the CFG rules. The lexical rules use however many words were
    # found (at least one) instead of indexing [0] and [1] blindly, and each
    # word is quoted so that an apostrophe in a token (e.g. "'s") cannot
    # break the grammar string.
    grammar_rules = [
        "S -> NP VP",
        "NP -> DT N",
        "NP -> DT N VP",
        "VP -> VB NP",
        "VP -> VB NP PP",
        "PP -> IN NP",
        _lexical_rule("DT", determiners),
        _lexical_rule("VB", verbs),
        _lexical_rule("IN", prepositions),
        _lexical_rule("N", nouns),
    ]

    # Create the CFG
    cfg_grammar = nltk.CFG.fromstring("\n".join(grammar_rules))
    print(cfg_grammar.productions(), "\n\n")

    for sentence in generate(cfg_grammar, n=10):
        print(' '.join(sentence))

In [12]:
# Build the grammar from the corpus and print up to ten generated sentences.
generate_grammar(fifa_txt)

[S -> NP VP, NP -> DT N, NP -> DT N VP, VP -> VB NP, VP -> VB NP PP, PP -> IN NP, DT -> 'the', DT -> 'a', VB -> 'was', VB -> 'had', IN -> 'of', IN -> 'in', N -> 'world', N -> 'cup'] 


the world was the world
the world was the cup
the world was a world
the world was a cup
the world was the world was the world
the world was the world was the cup
the world was the world was a world
the world was the world was a cup
the world was the world was the world was the world
the world was the world was the world was the cup
