In [1]:
import nltk
from nltk.corpus import brown

test_sentence_tokens = ['a','fact','about','the','unicorn','is','the','same','as','an','alternative','fact','about','the','unicorn','.']

words = brown.words()
fdist1 = nltk.FreqDist(w.lower() for w in words)

total_words = len(words)

In [2]:
fdist1

FreqDist({'the': 69971, ',': 58334, '.': 49346, 'of': 36412, 'and': 28853, 'to': 26158, 'a': 23195, 'in': 21337, 'that': 10594, 'is': 10109, ...})

In [3]:
print('Frequency of tokens in sample sententence in Brown according to NLTK:')

for word in test_sentence_tokens:
    print(word,fdist1[word])


Frequency of tokens in sample sententence in Brown according to NLTK:
a 23195
fact 447
about 1815
the 69971
unicorn 0
is 10109
the 69971
same 686
as 7253
an 3740
alternative 34
fact 447
about 1815
the 69971
unicorn 0
. 49346


In [4]:
# input('Pausing: Hit Return when Ready.')

print('Given that there are',total_words,'in the Brown Corpus, the unigram probability of these words')
print('is as follows (rounded to 3 significant digits):')

for word in test_sentence_tokens:
    unigram_probability = fdist1[word]/total_words
    print(word,float('%.3g' % unigram_probability))
    ## print(word,round((fdist1[word]/total_words),3))
    

Given that there are 1161192 in the Brown Corpus, the unigram probability of these words
is as follows (rounded to 3 significant digits):
a 0.02
fact 0.000385
about 0.00156
the 0.0603
unicorn 0.0
is 0.00871
the 0.0603
same 0.000591
as 0.00625
an 0.00322
alternative 2.93e-05
fact 0.000385
about 0.00156
the 0.0603
unicorn 0.0
. 0.0425


In [5]:
words2 = []
previous = 'EMPTY'
sentences = 0
for word in words:
    if previous in ['EMPTY','.','?','!']:
        ## insert word_boundaries at beginning of Brown,
        ## and after end-of-sentence markers (overgenerate due to abbreviations, etc.)
        words2.append('*start_end*')
    if fdist1[word]==1:
        ## words occurring only once are treated as Out of Vocabulary Words
        words2.append('*oov*')
    else:
        words2.append(word)
    previous = word

In [6]:
words2.append('*start_end*')

In [7]:
fdist2 = nltk.FreqDist(w.lower() for w in words2)

In [8]:
print('There are',fdist2['*oov*'],'instances of OOVs')

print('Unigram probabilities including OOV probabilities.')

There are 15673 instances of OOVs
Unigram probabilities including OOV probabilities.


In [9]:
def get_unigram_probability(word):
    if word in fdist1:
        unigram_probability = fdist2[word]/total_words
    else:
        unigram_probability = fdist2['*oov*']/total_words
    return(unigram_probability)

In [10]:
for word in test_sentence_tokens:
    unigram_probability = get_unigram_probability(word)
    print(word,float('%.3g' % unigram_probability))

# input('Pausing: Hit Return when Ready.')
## make new version that models Out of Vocabulary (OOV) words

print('Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*')
print('Assuming some idealizations: all periods, questions and exclamation marks end sentences;')

a 0.02
fact 0.000385
about 0.00156
the 0.0603
unicorn 0.0135
is 0.00871
the 0.0603
same 0.000591
as 0.00625
an 0.00322
alternative 2.93e-05
fact 0.000385
about 0.00156
the 0.0603
unicorn 0.0135
. 0.0425
Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*
Assuming some idealizations: all periods, questions and exclamation marks end sentences;


In [11]:
print('Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*')
print('Assuming some idealizations: all periods, questions and exclamation marks end sentences;')

Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*
Assuming some idealizations: all periods, questions and exclamation marks end sentences;


<generator object bigrams at 0x000001E6EE108B10>

In [26]:
print('Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*')
print('Assuming some idealizations: all periods, questions and exclamation marks end sentences;')

bigrams = nltk.bigrams(w.lower() for w in words2)
## get bigrams for words2 (words plus OOV)
# print(*map(' '.join, bigrams), sep=', ')
cfd = nltk.ConditionalFreqDist(bigrams)

Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*
Assuming some idealizations: all periods, questions and exclamation marks end sentences;


<ConditionalFreqDist with 34144 conditions>

In [29]:
for token1 in cfd:
    if not '*oov*' in cfd[token1]:
        cfd[token1]['*oov*']=100

*start_end*
the
fulton
county
grand
jury
said
friday
an
investigation
of
atlanta's
recent
primary
election
produced
``
no
evidence
''
that
any
irregularities
took
place
.
further
in
*oov*
city
executive
committee
,
which
had
over-all
charge
deserves
praise
and
thanks
atlanta
for
manner
was
conducted
september-october
term
been
charged
by
superior
court
judge
durwood
pye
to
investigate
reports
possible
hard-fought
won
mayor-nominate
ivan
allen
jr.
only
a
relative
handful
such
received
considering
widespread
interest
number
voters
size
this
it
did
find
many
georgia's
registration
laws
are
outmoded
or
inadequate
often
ambiguous
recommended
legislators
act
have
these
studied
revised
end
modernizing
improving
them
commented
on
other
topics
among
purchasing
departments
well
operated
follow
generally
accepted
practices
inure
best
both
governments
merger
proposed
however
believes
two
offices
should
be
combined
achieve
greater
efficiency
reduce
cost
administration
department
is
lacking
experien

warren
k.
hess
years'
probation
drive
driving
sentence
pronounced
victim
accident
lee
stansbery
39
reprimanded
violating
agreements
redevelopment
eastwick
hazards
pedestrians
corp.
acre
tract
bounded
dicks
parks
designated
authority's
feature
row
garden
apartments
churches
clusters
corporation
formed
reynolds
metal
builder
second
developing
constantinos
doxiadis
greece
planner
sectors
barred
landscaped
esplanade
entire
length
eliminates
grovers
ran
pedestrian
bridges
kansas
mo.
upi
fighters
injured
bomb
tore
battalion
stanton
gladden
42
figure
representation
teamsters
suffered
multiple
fractures
ankles
baptist
memorial
ignition
sets
blast
driveway
i'd
flash
lying
gladden's
younger
boy
knocked
bed
wall
hood
flies
flying
roof
front
wheel
landed
away
laboratory
explosive
device
containing
tnt
car's
starter
target
threatening
torn
dissension
outspoken
critic
union's
began
organizing
firemen
reward
bombing
kas.
guards
mining
shiflett
secretary-treasurer
active
ankara
turkey
oct.
ap
turkish


trustees
by-laws
taxing
roadblock
tax-exempt
institutions
segregated
standards
institution
grade
tax-exemption
privileges
conforms
aforementioned
statutory
provisions
obstacles
desegregation
intend
exempt
criteria
endowments
colored
integrated
challenged
affiliated
newspaper
trustees'
commitment
excellence
paramount
generous
recognize
obligation
promote
creed
insofar
governing
documents
applications
prospective
irrespective
corporate
existence
derives
desiring
intellectual
moral
fulfillment
riding
slid
pole
waddell
ne
knoll
cir.
hammons
evidently
occupants
willard
olvey
ponce
leon
coleman
se
lacerations
bruises
renewed
picketing
stand-ins
theaters
worded
contacted
coahr
gather
eve
operators
likelihood
three-day
sporadic
negotiate
friday's
inability
indifference
integrate
pledged
extensive
presence
picket
profits
uptown
buckhead
killingsworth
72
venable
kililngsworth
s
w
cafeteria
pittsboro
survivors
sisters
bessie
bloom
gettysburg
68
marietta
nw
painter
oakland
blanchard's
hearn
emma
o

ancestor
ranges
temperate
patients
belgian
administered
1908
touched
conscience
commencing
paternalism
unmatched
colonialism
ordinarily
independence
congolese
imagines
infant
tin
cobalt
uranium
brussels
beneficiaries
gunther
describes
societe
generale
colossus
anaconda
producing
lumped
ruthless
exploited
compassion
instituted
ration
obliged
eat
hungry
25,000
puzzle
chien
carpenters
indefinite
fearing
elite
unrest
manifestly
unprepared
oversimplification
gale
prepare
colonies
pas
une
sang
detested
mess
motivations
kasavubu
splitting
balkanizing
moise
tshombe
near-balkanization
federalism
notably
patrice
lumumba
hurry
fragmentation
provincial
provinces
leopoldville
kasai
kivu
katanga
equator
western-style
universal
well-wishers
assume
respectable
publique
twenty-four
tribesmen
commenced
mutiny
looting
engulfed
civilians
seceded
succeeding
blamed
vague
withdrawal
secretary-general
hammarskjold
preferable
nonwhite
tunisia
guinea
mali
eire
sweden
reluctant
withdraw
obstructed
complicated
u.

nervousness
fullness
aurally
dramatically
silvio
varviso
injected
tucker
edgardo
bel
canto
serviceable
covent
bellini
tenda
sonambula
ping-pong
sp-44001
bands
interruptions
sundry
disc
london's
jackets
directionality
mix
farther
placement
nearer
mixed
percussion
percussive
volley
twenties
sp-44006
memories
keating's
kombo
tingling
moods
sp-44005
sp-44002
frequencies
brassy
pianos
sp-44007
resultant
potentialities
manage
authentic
authenticity
imprimatur
impeccably
unimpeachable
studios
cadre
sketches
interiors
architectural
complied
storyline
unrealistic
naomi
boaz
meticulously
involves
intrigues
twists
traced
gospels
murrow
gibbon
macaulay
ignorant
francois
d'albert
hungarian-born
conservatory
jenni
jenni's
vying
drawback
mozart's
sonata
clattered
noisily
brahm's
artists'
large-scale
brahmsian
directness
bella
fleming
refers
nickname
sentimental
ravel-like
idiom
conjugal
felicity
smack
dohnanyi
hubay
paganini
virtuoso
sleeve
lyric
sarasate
saint-saens
xydis
philharmonic
lewisohn
greek

40,000,000
heaviest
concentrations
coasts
upsurge
markedly
watering
texoma
corp
thirty-two
ninety
sleek
outboards
sailboats
smartly
boatmen
year-'round
tedious
trailer
hauled
horizons
canoes
camping
accommodating
plywood
plastic
toilets
galleys
bunks
purchasers
boatman
encompass
phenomenal
horsepower
wherein
owning
launch
rating
unduly
terrified
infancy
burdened
aloft
water's
carefree
industry's
foreseeing
coordinated
reorganized
naebm
booklets
squadrons
piloting
dine
ever-increasing
sparked
coined
commercially
dock
yachtel
boatel
pertain
nomenclature
boatyards
4,000
interlocking
interact
tappets
tappet
lengthwise
locking
angles
notches
sloping
slide
pulls
levers
turnout
turnouts
flange
snag
figs.
connecting
wedge-shaped
derails
rails
prototype
larson's
fig.
photos
1/8''
1/4''
1/2''
widths
sanding
right-hand
12''
1''
5-3/4''
sq.
spacing
left-hand
compress
solder
tack-solder
drilled
spacers
horizontally
center-punch
screws
2-56
3-3/4''
drilling
contacts
reassemble
roundhead
nuts
align
t

yore
jist
sergeant
twister
modes
halda
stillwell
georgian
forgit
yank
parodied
thiot
plantation
pups
fall-in
maladies
diarrhoea
looseness
stationed
shitts
rebs
dyerear
escapes
reb
blots
homely
admonitions
planters'
laundering
owing
usages
stereotyped
boon
companions
amorous
exploits
pappy's
comrade
sis
alf
sed
gust
doo
orney
drunkard
damed
helion
infantryman
pa
ornery
suns
thay
choicest
disparagement
reub
stuck-up
illiterate
ass
alabamian
ignoramus
floridian
guts
1862
fay
louisianan
a.b.
m.a.
pemberton
regiment
cavalry
shucks
mauldin
sherman's
corporal
gunpowder
alabamans
greenest
booty
imperialism
legends
rationalize
justifying
cohesive
spanish-american
1880s
curricula
outlived
citizen's
rewriting
self-deception
heterogeneous
weems's
uniting
incarnation
rebelling
patrick
hale
crockett
patriotism
propagandistic
homogeneous
oral
myths
literate
definitions
bogus
bunyan
culturally
teutonic
lore
anthropologists
amateurs
propagandists
footnote
collector
hunts
acutely
uncritical
shudder
prop

plutarch
montaigne
dialogues
coleridge's
alexander's
wordsworth
romantics
hegel's
tragedians
elimination
dictate
respite
judiciary
shearn
quiescent
disowned
inherently
balked
snapping
supporter
sensationalism
seduction
industriously
watterson
courier-journal
constituency
carmack
disgrace
pro-hearst
devery
[
]
indiana's
confer
surreptitiously
koenigsberg
despairingly
grinning
ihmsen
intently
chief's
aspirant
floradora
trevelyan's
garibaldi's
garibaldi
trevelyan
messina
emmanuel
statuto
radetzky
galantuomo
papal
assessing
trilogy
manin
venetian
appreciative
bright's
crimean
grey's
northumberland
contemptible
interplay
dynamics
liberal-radical
deserted
pre-french
merges
unconnected
1870
reprinted
lowell
excursions
boroughs
self-satisfaction
humanistic
regius
blenheim
ramillies
wycliffe
macaulay's
eras
savoy
etched
reread
expectant
kehl
rhine
strasbourg
franco-german
overturning
streetcars
cordon
waspish
bicycle
barricade
titre
alors
imperiously
bravo
mustached
thanking
andre
nouvelle
alsa

belge
foreami
kwango
pendant
bruxelles
l'institut
recherche
scientifique
afrique
centrale
irsac
maquet
l'universite
officielle
demographie
romaniuk
statistique
neesen
walle
censuses
successively
stratified
strata
semester's
hindsight
auditors
pregnancy
contagious
in-laws
associating
kohnstamm-negative
concretistic
hesitancy
guilford-martin
guilford
kohnstamm-positive
rorschach
kohnstamm
nonreactors
24%
arm-elevation
arm-levitation
suggestibility
autosuggestibility
inhibiting
involuntary
nonreactivity
relinquishing
aniseikonic
stationary
constriction
socioeconomic
tenement
unstructured
third-grade
over-achievers
compulsivity
castaneda
multiphastic
permeates
stanford
sub-tests
wechsler
under-achievement
i.q.
descriptive
surmised
compulsives
decrement
memorizing
panicked
sarason
roleplaying
questionnaires
suitcase
antagonists
receptionist
outstandingly
criticizing
laxness
manager's
blackboard
supervisor's
introject
hebephrenic
cheeks
granny
scolding
annoy
condescending
schizophrenic
conde

pugh
camels
tripoli
jerez
iq
mushr
ozon
tonsil
chinaman
tooth-hurty
kool-aid
bobbie
oakmont
edythe
mousie
allegheny
debs
john-and-linda
longue
conneaut
dillinger
linda's
horne's
scatterbrained
sewickley
edythe's
webber
mont
bobbie's
steels
macisaacs
lovejoy
carnegie-illinois
stuart-family
murkland
john'll
nadine
francie
thom's
thom
nadine's
wally's
francie's
gladdy
gladdy's
michelson
fairview
ishii
aa
groggins
papanicolaou
smear
bancroft's
coverlet
outta
yielding-mediterranian-woman-
orly
rhine-main
elysees
hun
bugatti
farina
a40-ajk
mercedes
triomphe
d'eiffel
maxim's
jour
nuit
montmartre
panther
pils
tuborg
capricorn
topcoat
remy
rotten
allons
bon
j'ai
suzanne
louvre
jeroboam
theresa
stubblefield
appian
colosseum
stubblefields
azalea
elec
carraway
tuxapoka
spagna
elec's
emma's
rosie
whittaker
carrozza
alligator
dinsmore
dolan
christians'
lucille's
mmm
idiot's
johnnie's
petted
faneuil
alma
schmalma
all-american-boy
kirby's
vivian
mockery
pyhrric
peony
what's-his-name
bobbsey
kissin'
ka

In [15]:
def multiply_list(inlist):
    out = 1
    for number in inlist:
        out *= number
    return(out)

In [16]:
def get_bigram_probability(first,second):
    if not second in cfd[first]:
        print('Backing Off to Unigram Probability for',second)
        unigram_probability = get_unigram_probability(second)
        return(unigram_probability)
    else:
        bigram_frequency = cfd[first][second]
    unigram_frequency = fdist2[first]
    bigram_probability = bigram_frequency/unigram_frequency
    return(bigram_probability)

In [17]:
def calculate_bigram_freq_of_sentence_token_list(tokens):
    prob_list = []
    ## assume that 'START' precedes the first token
    previous = '*start_end*'
    for token in tokens:
        if not token in fdist2:
            token = '*oov*'
        next_probability = get_bigram_probability(previous,token)
        print(previous,token,(float('%.3g' % next_probability)))
        prob_list.append(next_probability)
        previous = token
    ## assume that 'END' follows the last token
    next_probability = get_bigram_probability(previous,'*start_end*')
    print(previous,'*start_end*',next_probability)
    prob_list.append(next_probability)
    probability = multiply_list(prob_list)
    print('Total Probability',float('%.3g' % probability))
    return(probability)

In [18]:
result = calculate_bigram_freq_of_sentence_token_list(test_sentence_tokens)

*start_end* a 0.0182
a fact 0.000388
fact about 0.00447
about the 0.182
the *oov* 0.0293
*oov* is 0.00485
is the 0.0786
the same 0.00898
same as 0.035
as an 0.029
an alternative 0.00241
Backing Off to Unigram Probability for fact
alternative fact 0.000385
fact about 0.00447
about the 0.182
the *oov* 0.0293
*oov* . 0.0865
. *start_end* 1.0
Total Probability 1.12e-30
