# NLTK (Natural Language Toolkit)
- tokenization
- lemmatization
- POS tagging
- spelling correction

## spaCy
- NER - Named Entity Recognition

## Tokenization

In [1]:
import nltk

In [3]:
# Sample paragraph used by the tokenization cells below. The en dash after "cultures" is
# space-separated so that it is tokenized on its own, as in the recorded outputs.
data = "Jakarta, Indonesia's massive capital, sits on the northwest coast of the island of Java. A historic mix of cultures – Javanese, Malay, Chinese, Arab, Indian and European – has influenced its architecture, language and cuisine. The old town, Kota Tua, is home to Dutch colonial buildings, Glodok (Jakarta’s Chinatown) and the old port of Sunda Kelapa, where traditional wooden schooners dock."

In [5]:
# Fetch the NLTK data packages this notebook relies on: 'punkt' (unpacked under tokenizers/)
# and 'wordnet' (unpacked under corpora/). Each call returns True on success.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Juanda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Juanda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [6]:
# Split the paragraph into a list of sentences.
nltk.sent_tokenize(data)

["Jakarta, Indonesia's massive capital, sits on the northwest coast of the island of Java.",
 'A historic mix of cultures – Javanese, Malay, Chinese, Arab, Indian and European – has influenced its architecture, language and cuisine.',
 'The old town, Kota Tua, is home to Dutch colonial buildings, Glodok (Jakarta’s Chinatown) and the old port of Sunda Kelapa, where traditional wooden schooners dock.']

In [7]:
# Split the same text into word-level tokens; punctuation marks come out as separate tokens.
nltk.word_tokenize(data)

['Jakarta',
 ',',
 'Indonesia',
 "'s",
 'massive',
 'capital',
 ',',
 'sits',
 'on',
 'the',
 'northwest',
 'coast',
 'of',
 'the',
 'island',
 'of',
 'Java',
 '.',
 'A',
 'historic',
 'mix',
 'of',
 'cultures',
 '–',
 'Javanese',
 ',',
 'Malay',
 ',',
 'Chinese',
 ',',
 'Arab',
 ',',
 'Indian',
 'and',
 'European',
 '–',
 'has',
 'influenced',
 'its',
 'architecture',
 ',',
 'language',
 'and',
 'cuisine',
 '.',
 'The',
 'old',
 'town',
 ',',
 'Kota',
 'Tua',
 ',',
 'is',
 'home',
 'to',
 'Dutch',
 'colonial',
 'buildings',
 ',',
 'Glodok',
 '(',
 'Jakarta',
 '’',
 's',
 'Chinatown',
 ')',
 'and',
 'the',
 'old',
 'port',
 'of',
 'Sunda',
 'Kelapa',
 ',',
 'where',
 'traditional',
 'wooden',
 'schooners',
 'dock',
 '.']

In [8]:
# Rebind `data` to a short two-sentence example (the paragraph above is no longer
# reachable under this name) and split it into sentences.
data = "Hello, my name is Juanda. How are you?"
nltk.sent_tokenize(data)

['Hello, my name is Juanda.', 'How are you?']

In [9]:
# Word-level tokens for the short example.
nltk.word_tokenize(data)

['Hello', ',', 'my', 'name', 'is', 'Juanda', '.', 'How', 'are', 'you', '?']

## Lemmatization

In [10]:
from nltk.stem import WordNetLemmatizer
# One lemmatizer instance, reused by every lemmatization cell below.
wd = WordNetLemmatizer()
# Irregular plural reduced to its singular form.
wd.lemmatize('wives')

'wife'

In [16]:
# No part-of-speech argument given; 'goes' is still reduced to 'go'.
wd.lemmatize('goes')

'go'

In [17]:
# Second argument is the part of speech; 'v' presumably selects the verb lemma (see NLTK docs).
wd.lemmatize('gone','v')

'go'

In [18]:
# Part of speech 'a' (presumably adjective): the comparative form maps back to 'happy'.
wd.lemmatize('happier','a')

'happy'

In [14]:
# Without a part-of-speech argument 'going' comes back unchanged; compare the explicit
# 'v' call used for 'gone'. NOTE(review): presumably the default POS is noun - confirm in NLTK docs.
wd.lemmatize('going')

'going'

In [15]:
# Another call without a part-of-speech argument: 'cries' is reduced to 'cry'.
wd.lemmatize('cries')

'cry'

## POS Tagging

In [21]:
# pos_tag needs the 'averaged_perceptron_tagger' resource. Fetch it here (quietly, so the
# cell's displayed result is unchanged) so the cell also works when the notebook is run
# top to bottom on a fresh kernel.
nltk.download('averaged_perceptron_tagger', quiet=True)
# Tokenize the sentence, then label every token with its part-of-speech tag.
nltk.pos_tag(nltk.word_tokenize("I love playing football at garden."))

[('I', 'PRP'),
 ('love', 'VBP'),
 ('playing', 'VBG'),
 ('football', 'NN'),
 ('at', 'IN'),
 ('garden', 'NN'),
 ('.', '.')]

In [20]:
# Resources for the POS-tagging cells: 'tagsets' (unpacked under help/) and the
# 'averaged_perceptron_tagger' model (unpacked under taggers/).
# NOTE: this cell was executed (In [20]) before the pos_tag cell shown above it (In [21]);
# on a fresh kernel it has to run first.
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Juanda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Juanda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [23]:
# Print the tagset description and example words for the PRP tag (the tag assigned to 'I' above).
nltk.help.upenn_tagset("PRP")

PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us


## Spelling Correction

In [24]:
# Jaccard distance between the two words' character sets: six distinct letters in the union,
# five shared, so the distance is 1/6. Letter order and repeats are ignored because sets are used.
nltk.jaccard_distance(set('orange'),set('orenge'))

0.16666666666666666

In [31]:
# Default vocabulary that recommend() searches when no candidate list is supplied.
dictionary = ['banana','mango','grapes','watermelon','orange']
def recommend(word, candidates=None):
    """Suggest the candidate spelling closest to ``word``.

    Closeness is the Jaccard distance between the *sets of characters* of the
    two words, so letter order and repeated letters are ignored.

    Args:
        word: The (possibly misspelled) word to correct.
        candidates: Optional iterable of known words to choose from.
            Defaults to the module-level ``dictionary``.

    Returns:
        The first candidate with the smallest distance to ``word``, or ``""``
        when ``word`` is empty or shares no character with any candidate.
    """
    if candidates is None:
        candidates = dictionary

    target = set(word)
    if not target:
        # Nothing to compare against; also avoids a division by zero inside
        # jaccard_distance when a candidate is empty as well.
        return ""

    best_word = ""
    best_dist = 1  # a distance of 1 means "no characters in common"
    for cand in candidates:
        dist = nltk.jaccard_distance(set(cand), target)
        # Strict '<' keeps the earliest candidate on ties and rejects
        # candidates that share nothing with the word.
        if dist < best_dist:
            best_word, best_dist = cand, dist
    return best_word

In [32]:
# Sanity check: a word that is already in the dictionary is returned unchanged.
recommend('grapes')

'grapes'

## spaCy

In [1]:
import spacy
from spacy import displacy

In [4]:
# Load the small English pipeline by its full package name; the bare 'en' shortcut is
# rejected by spaCy v3+. Install it with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

## Named Entity Recognition

In [7]:
# Run the pipeline on a sentence that mentions a person name, a city, a company and a date.
# The resulting `doc` is reused by the entity and dependency visualizations below.
doc = nlp("My name is Harry Potter, I live in Jakarta and work for the company Microsoft Inc and on 12 June, 2020 I am doing good.")

In [8]:
# Render the entity ('ent') view of `doc` inline in the notebook.
displacy.render(doc,style='ent',jupyter=True)

## Dependency Parsing

In [11]:
# displaCy options for the dependency view: compact layout, white text on a sea-green background.
# Colours must be valid CSS values, so the hex text colour needs its leading '#'.
options = {'compact':True,'bg':'seagreen','color':'#fff'}
# Render the dependency ('dep') view of `doc` inline in the notebook.
displacy.render(doc,style='dep',jupyter=True,options=options)