In [18]:
import nltk
from nltk.probability import ConditionalFreqDist

## Frequency Distribution of words in a text

In [19]:
# Sample passage for the frequency demos. It is pasted with its bracketed
# citation markers (e.g. "[10][11]") still attached to the preceding words.
text = """
On 24 February 2022, Russia invaded Ukraine in a major escalation of the Russo-Ukrainian War, which began in 2014. The invasion has likely resulted in tens of thousands of deaths on both sides. It has caused Europe's largest refugee crisis since World War II.[10][11] An estimated 8 million Ukrainians were displaced within their country by late May and 7.8 million fled the country by 8 November 2022,[12][13][14][15] while Russia, within five weeks of the invasion, experienced its greatest emigration since the 1917 October Revolution.[16]

Following the 2014 Ukrainian Revolution, Russia annexed Crimea, and Russian-backed paramilitaries seized part of the Donbas region of south-eastern Ukraine, which consists of Luhansk and Donetsk oblasts, sparking a regional war.[17][18] In March 2021, Russia began a large military build-up along its border with Ukraine, eventually amassing up to 190,000 troops and their equipment. Despite the build-up, denials of plans to invade or attack Ukraine were issued by various Russian government officials up to the day before the invasion.[22] On 21 February 2022, Russia recognised the Donetsk People's Republic and the Luhansk People's Republic, two self-proclaimed breakaway quasi-states in the Donbas.[23] The next day, the Federation Council of Russia authorised the use of military force and Russian troops entered both territories.[24]
"""

# text.split() breaks on whitespace only, so punctuation and citation markers
# stay glued to their tokens (e.g. 'War,' is counted separately from 'War').
fd = nltk.FreqDist(text.split())
fd

FreqDist({'the': 14, 'of': 10, 'and': 6, 'Russia': 5, 'in': 4, 'a': 3, 'by': 3, 'to': 3, 'On': 2, 'February': 2, ...})

## Conditional Frequency Distribution of words in a text

In [20]:
# Condition = token length, sample = the token itself, so cfd[n] is the
# frequency distribution of every whitespace-delimited token of length n.
cfd = ConditionalFreqDist(map(lambda token: (len(token), token), text.split()))
cfd[4]

FreqDist({'both': 2, 'were': 2, 'War,': 1, 'tens': 1, 'late': 1, 'fled': 1, 'five': 1, '1917': 1, '2014': 1, 'part': 1, ...})

# HW 1: Determine the Frequency Distribution and Conditional Frequency Distribution of any one of the presidential inaugural addresses

## Frequency Distribution of words in a text

In [3]:
import nltk
from nltk.corpus import inaugural

In [4]:
# List the file IDs available in the inaugural corpus; as the output shows,
# each one has the form '<year>-<president>.txt'.
inaugural.fileids()

['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt',
 '1829-Jackson.txt',
 '1833-Jackson.txt',
 '1837-VanBuren.txt',
 '1841-Harrison.txt',
 '1845-Polk.txt',
 '1849-Taylor.txt',
 '1853-Pierce.txt',
 '1857-Buchanan.txt',
 '1861-Lincoln.txt',
 '1865-Lincoln.txt',
 '1869-Grant.txt',
 '1873-Grant.txt',
 '1877-Hayes.txt',
 '1881-Garfield.txt',
 '1885-Cleveland.txt',
 '1889-Harrison.txt',
 '1893-Cleveland.txt',
 '1897-McKinley.txt',
 '1901-McKinley.txt',
 '1905-Roosevelt.txt',
 '1909-Taft.txt',
 '1913-Wilson.txt',
 '1917-Wilson.txt',
 '1921-Harding.txt',
 '1925-Coolidge.txt',
 '1929-Hoover.txt',
 '1933-Roosevelt.txt',
 '1937-Roosevelt.txt',
 '1941-Roosevelt.txt',
 '1945-Roosevelt.txt',
 '1949-Truman.txt',
 '1953-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1965-Johnson.txt',
 '1969-Nixon.txt',
 '1973-Nixon.txt',
 '1

In [5]:
# Load the tokens of the '1861-Lincoln.txt' address from the corpus reader.
# Punctuation marks are tokens in their own right here (',' and '.' show up
# in the counts below), and counting is case-sensitive.
text = inaugural.words(fileids='1861-Lincoln.txt')
fd = nltk.FreqDist(text)
fd

FreqDist({'the': 240, ',': 195, 'of': 146, 'to': 132, '.': 110, 'and': 101, 'be': 76, 'in': 72, 'that': 57, 'a': 53, ...})

## Conditional Frequency Distribution of words in a text

In [6]:
from nltk.probability import ConditionalFreqDist

In [7]:
# Condition = token length, sample = the token itself, so cfd[n] is the
# frequency distribution of all corpus tokens that are n characters long.
cfd = ConditionalFreqDist(map(lambda token: (len(token), token), text))
cfd[4]

FreqDist({'that': 57, 'will': 25, 'this': 23, 'with': 20, 'have': 20, 'from': 16, 'such': 15, 'upon': 15, 'them': 13, 'than': 13, ...})

## Chinese Segmentation using Jieba

In [8]:
import jieba

The Forward Maximum Matching algorithm scans from the start of the string, repeatedly taking the longest dictionary word that matches at the current position and splitting it off. For example, take the string: thetabledownthere

Forward Matching Algorithm will result in the following words: ['theta', 'bled', 'own', 'there']

The Backward Matching Algorithm works similarly to the Forward Matching Algorithm, except that it scans from the end of the string toward the start.

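It results in the following words, listed in the order they are found (right to left): ['there', 'down', 'table', 'the'] — i.e. "the table down there" when read in the original order.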

In [14]:
# cut_all=True selects jieba's "full mode", which emits every dictionary word
# it can find in the sentence, so the returned segments overlap (visible in
# the output below). This is NOT forward maximum matching; pass cut_all=False
# (the default, "precise mode") to get a single non-overlapping segmentation.
seg = jieba.cut("中文维基百科计划搭其他12种主要语言维基百科计划同时成立", cut_all=True)
print(" ".join(seg))

中文 维基 维基百 维基百科 百科 计划 搭 其他 12 种 主要 语言 维基 维基百 维基百科 百科 计划 同时 成立


## Basic Text Processing Pipeline

In [10]:
import nltk

In [12]:
sent = "Become an expert in NLP"
# word_tokenize is not a plain whitespace split: it also separates punctuation
# into tokens of its own. This sentence contains no punctuation, so the result
# here happens to match sent.split().
words = nltk.word_tokenize(sent)
print(words)

['Become', 'an', 'expert', 'in', 'NLP']


In [16]:
# Passages to run through the pipeline; kept in a list so the loop that
# follows can process more than one passage.
texts = ["""7th Heaven is an American television drama series created by executive producer Brenda Hampton, 
and co-executive produced by Aaron Spelling and E. Duke Vincent through Spelling Television.[1] 
The series revolves around a family headed by parents Eric Camden (Stephen Collins), a Protestant Reverend, 
and Annie Camden (Catherine Hicks), a homemaker. Their seven children are Matt (Barry Watson), Mary (Jessica Biel), 
Lucy (Beverley Mitchell), Simon (David Gallagher), Ruthie (Mackenzie Rosman) and twins Sam and David (Nikolas and Lorenzo 
Brino)."""
]
# Pipeline: passage -> sentences -> word tokens -> part-of-speech tags.
# Note: the loop variable rebinds the notebook-level name `text` that the
# frequency-distribution cells also use.
for text in texts:
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        # Tag every token of the sentence; the tags follow the Penn Treebank tag set.
        tagged = nltk.pos_tag(words)
        # One call emits the tokens, the (token, tag) pairs and a separator, each on its own line.
        print(words, tagged, "------", sep="\n")

['7th', 'Heaven', 'is', 'an', 'American', 'television', 'drama', 'series', 'created', 'by', 'executive', 'producer', 'Brenda', 'Hampton', ',', 'and', 'co-executive', 'produced', 'by', 'Aaron', 'Spelling', 'and', 'E.', 'Duke', 'Vincent', 'through', 'Spelling', 'Television', '.']
[('7th', 'CD'), ('Heaven', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('American', 'JJ'), ('television', 'NN'), ('drama', 'NN'), ('series', 'NN'), ('created', 'VBN'), ('by', 'IN'), ('executive', 'JJ'), ('producer', 'NN'), ('Brenda', 'NNP'), ('Hampton', 'NNP'), (',', ','), ('and', 'CC'), ('co-executive', 'JJ'), ('produced', 'VBN'), ('by', 'IN'), ('Aaron', 'NNP'), ('Spelling', 'NNP'), ('and', 'CC'), ('E.', 'NNP'), ('Duke', 'NNP'), ('Vincent', 'NNP'), ('through', 'IN'), ('Spelling', 'NNP'), ('Television', 'NNP'), ('.', '.')]
------
['[', '1', ']', 'The', 'series', 'revolves', 'around', 'a', 'family', 'headed', 'by', 'parents', 'Eric', 'Camden', '(', 'Stephen', 'Collins', ')', ',', 'a', 'Protestant', 'Reverend', ',', 'and