In [1]:
import nltk
nltk.data.path.append('E:\\Python\\nltk_data')

## Wordnet

**WordNet** is a semantically oriented dictionary of English, similar to a traditional thesaurus but with a richer structure. NLTK includes the English WordNet, with 155,287 words and 117,659 synonym sets.

In [2]:
from nltk.corpus import wordnet as wn

In [3]:
wn.synsets('motorcar')

[Synset('car.n.01')]

In [4]:
wn.synset('car.n.01')._lemma_names

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [5]:
for synset in wn.synsets('car'):
    print(synset._lemma_names)

['car', 'auto', 'automobile', 'machine', 'motorcar']
['car', 'railcar', 'railway_car', 'railroad_car']
['car', 'gondola']
['car', 'elevator_car']
['cable_car', 'car']


We can access all the lemmas involving the word *car* as follows:

In [6]:
wn.lemmas('car')

[Lemma('car.n.01.car'),
 Lemma('car.n.02.car'),
 Lemma('car.n.03.car'),
 Lemma('car.n.04.car'),
 Lemma('cable_car.n.01.car')]

In [7]:
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
sorted([lemma._name for synset in types_of_motorcar for lemma in synset._lemmas])

['Model_T',
 'S.U.V.',
 'SUV',
 'Stanley_Steamer',
 'ambulance',
 'beach_waggon',
 'beach_wagon',
 'bus',
 'cab',
 'compact',
 'compact_car',
 'convertible',
 'coupe',
 'cruiser',
 'electric',
 'electric_automobile',
 'electric_car',
 'estate_car',
 'gas_guzzler',
 'hack',
 'hardtop',
 'hatchback',
 'heap',
 'horseless_carriage',
 'hot-rod',
 'hot_rod',
 'jalopy',
 'jeep',
 'landrover',
 'limo',
 'limousine',
 'loaner',
 'minicar',
 'minivan',
 'pace_car',
 'patrol_car',
 'phaeton',
 'police_car',
 'police_cruiser',
 'prowl_car',
 'race_car',
 'racer',
 'racing_car',
 'roadster',
 'runabout',
 'saloon',
 'secondhand_car',
 'sedan',
 'sport_car',
 'sport_utility',
 'sport_utility_vehicle',
 'sports_car',
 'squad_car',
 'station_waggon',
 'station_wagon',
 'stock_car',
 'subcompact',
 'subcompact_car',
 'taxi',
 'taxicab',
 'tourer',
 'touring_car',
 'two-seater',
 'used-car',
 'waggon',
 'wagon']

In [8]:
motorcar.hypernyms()

[Synset('motor_vehicle.n.01')]

In [9]:
wn.synset('walk.v.01').entailments()

[Synset('step.v.01')]

In [10]:
wn.lemma('supply.n.02.supply').antonyms()

[Lemma('demand.n.02.demand')]

# Processing Raw Text


In [11]:
import urllib.request

In [12]:
url = "http://www.gutenberg.org/files/1342/1342-0.txt"
with urllib.request.urlopen(url) as response:
    raw = response.read().decode('utf-8')
type(raw)

str

In [13]:
len(raw)

790335

In [14]:
raw[:75]

'\ufeff\r\nThe Project Gutenberg EBook of Pride and Prejudice, by Jane Austen\r\n\r\nTh'

In [15]:
tokens = nltk.word_tokenize(raw)
type(tokens)

list

In [16]:
len(tokens)

146110

In [17]:
tokens[:10]

['\ufeff',
 'The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Pride',
 'and',
 'Prejudice',
 ',']

In [18]:
words = [w.lower() for w in tokens]
vocab = sorted(set(words))
len(vocab)

7747

In [19]:
empty = []
nested = [empty, empty, empty]
nested

[[], [], []]

In [20]:
nested[1].append('Python')
nested

[['Python'], ['Python'], ['Python']]

In [21]:
nested = [[]] * 3
nested[1].append('Anaconda')
nested

[['Anaconda'], ['Anaconda'], ['Anaconda']]

In [22]:
id(nested[0]), id(nested[1]), id(nested[2])

(70046464, 70046464, 70046464)

In [23]:
import random
size = 5
python = ['Python']
nested2 = [python] * size
position = random.choice(range(size))
nested2[position] = ['Python']
nested2

[['Python'], ['Python'], ['Python'], ['Python'], ['Python']]

In [24]:
nested2[0] == nested2[1] == nested2[2] == nested2[3] == nested2[4]

True

In [25]:
nested2[0] is nested2[1] is nested2[2] is nested2[3] is nested2[4]

False

In [26]:
[id(nest) for nest in nested2]

[220359336, 220359336, 220359336, 220359336, 220192600]

In [27]:
[id(nest) for nest in reversed(nested2)]

[220192600, 220359336, 220359336, 220359336, 220359336]

In [28]:
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(f'{m}: {fdist[m]}',end = ' ')

can: 94 could: 87 may: 93 might: 38 must: 53 will: 389 

In [29]:
nltk.corpus.indian.tagged_words()

[('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM'), ...]