In [2]:
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag

In [3]:
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [4]:
text = "Machine learning is the science of getting computers to act without being explicitly programmed. In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. Many researchers also think it is the best way to make progress towards human-level AI. In this class, you will learn about the most effective machine learning techniques, and gain practice implementing them and getting them to work for yourself. More importantly, you'll learn about not only the theoretical underpinnings of learning, but also gain the practical know-how needed to quickly and powerfully apply these techniques to new problems. Finally, you'll learn about some of Silicon Valley's best practices in innovation as it pertains to machine learning and AI."

In [5]:
sentences = sent_tokenize(text)

In [6]:
sentences[0:2]

['Machine learning is the science of getting computers to act without being explicitly programmed.',
 'In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome.']

In [7]:
words = word_tokenize(text)

In [11]:
words[0:4]

['Machine', 'learning', 'is', 'the']

#### Stemming and lemmatization using NLTK

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [7]:
ps = nltk.PorterStemmer()
ls = nltk.LancasterStemmer()
wl = nltk.WordNetLemmatizer()

In [8]:
word = "behaviors"
print(ps.stem(word))
print(ls.stem(word))
print(wl.lemmatize(word))

behavior
behavy
behavior


In [9]:
word = "behaving"
print(ps.stem(word))
print(ls.stem(word))
print(wl.lemmatize(word))

behav
behav
behaving


In [12]:
stemmed = [ ls.stem(w) for w in words]

In [13]:
stemmed

['machin',
 'learn',
 'is',
 'the',
 'sci',
 'of',
 'get',
 'comput',
 'to',
 'act',
 'without',
 'being',
 'explicit',
 'program',
 '.',
 'in',
 'the',
 'past',
 'decad',
 ',',
 'machin',
 'learn',
 'has',
 'giv',
 'us',
 'self-driving',
 'car',
 ',',
 'pract',
 'speech',
 'recognit',
 ',',
 'effect',
 'web',
 'search',
 ',',
 'and',
 'a',
 'vast',
 'improv',
 'understand',
 'of',
 'the',
 'hum',
 'genom',
 '.',
 'machin',
 'learn',
 'is',
 'so',
 'pervas',
 'today',
 'that',
 'you',
 'prob',
 'us',
 'it',
 'doz',
 'of',
 'tim',
 'a',
 'day',
 'without',
 'know',
 'it',
 '.',
 'many',
 'research',
 'also',
 'think',
 'it',
 'is',
 'the',
 'best',
 'way',
 'to',
 'mak',
 'progress',
 'toward',
 'human-level',
 'ai',
 '.',
 'in',
 'thi',
 'class',
 ',',
 'you',
 'wil',
 'learn',
 'about',
 'the',
 'most',
 'effect',
 'machin',
 'learn',
 'techn',
 ',',
 'and',
 'gain',
 'pract',
 'impl',
 'them',
 'and',
 'get',
 'them',
 'to',
 'work',
 'for',
 'yourself',
 '.',
 'mor',
 'import',
 ','

In [15]:
lm = nltk.WordNetLemmatizer()

In [16]:
lemm = [ lm.lemmatize(w) for w in words]

In [31]:
lemm

['Machine',
 'learning',
 'is',
 'the',
 'science',
 'of',
 'getting',
 'computer',
 'to',
 'act',
 'without',
 'being',
 'explicitly',
 'programmed',
 '.',
 'In',
 'the',
 'past',
 'decade',
 ',',
 'machine',
 'learning',
 'ha',
 'given',
 'u',
 'self-driving',
 'car',
 ',',
 'practical',
 'speech',
 'recognition',
 ',',
 'effective',
 'web',
 'search',
 ',',
 'and',
 'a',
 'vastly',
 'improved',
 'understanding',
 'of',
 'the',
 'human',
 'genome',
 '.',
 'Machine',
 'learning',
 'is',
 'so',
 'pervasive',
 'today',
 'that',
 'you',
 'probably',
 'use',
 'it',
 'dozen',
 'of',
 'time',
 'a',
 'day',
 'without',
 'knowing',
 'it',
 '.',
 'Many',
 'researcher',
 'also',
 'think',
 'it',
 'is',
 'the',
 'best',
 'way',
 'to',
 'make',
 'progress',
 'towards',
 'human-level',
 'AI',
 '.',
 'In',
 'this',
 'class',
 ',',
 'you',
 'will',
 'learn',
 'about',
 'the',
 'most',
 'effective',
 'machine',
 'learning',
 'technique',
 ',',
 'and',
 'gain',
 'practice',
 'implementing',
 'them',
 

In [32]:
words[0:5]

['Machine', 'learning', 'is', 'the', 'science']

In [9]:
postags = pos_tag(words)

In [8]:
pos_tag("apple")

[('a', 'DT'), ('p', 'NN'), ('p', 'NN'), ('l', 'NN'), ('e', 'NN')]

In [10]:
postags[0:20]

[('Machine', 'NN'),
 ('learning', 'NN'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('science', 'NN'),
 ('of', 'IN'),
 ('getting', 'VBG'),
 ('computers', 'NNS'),
 ('to', 'TO'),
 ('act', 'VB'),
 ('without', 'IN'),
 ('being', 'VBG'),
 ('explicitly', 'RB'),
 ('programmed', 'VBN'),
 ('.', '.'),
 ('In', 'IN'),
 ('the', 'DT'),
 ('past', 'JJ'),
 ('decade', 'NN'),
 (',', ',')]

In [15]:
nltk.help.upenn_tagset("JJ")

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


In [42]:
word1 = "Google Incorporation Pvt Limited is starting a new office at Hyderabad"

In [43]:
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk import pos_tag

In [44]:
word1 = word_tokenize(word1)

In [45]:
tagged = pos_tag(word1)

In [46]:
tagged

[('Google', 'NNP'),
 ('Incorporation', 'NNP'),
 ('Pvt', 'NNP'),
 ('Limited', 'NNP'),
 ('is', 'VBZ'),
 ('starting', 'VBG'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('office', 'NN'),
 ('at', 'IN'),
 ('Hyderabad', 'NNP')]

In [47]:
a = [a[0] for a in tagged if a[1][1] == 'NNP']

In [48]:
a

[]

# Download NER for windows from : https://pypi.python.org/pypi/ner/0.1#downloads 

In [1]:
import ner

In [51]:
text = " google chief is visiting india during the month of may, they are going to setup a new R&D facility at hyderabad"

In [2]:
%pwd

'/home/visa'

In [3]:
import os 
#os.chdir("**New Directory**")

In [6]:
import nltk 
with open('sample.txt', 'r') as f:
    sample = f.read()


sentences = nltk.sent_tokenize(sample)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

def extract_entity_names(t):
    entity_names = []

    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))

    return entity_names

entity_names = []
for tree in chunked_sentences:
    # Print results per sentence
    # print extract_entity_names(tree)

    entity_names.extend(extract_entity_names(tree))

# Print all entity names
#print entity_names

# Print unique entity names
print (entity_names)

['Google', 'Sundar Pichai', 'India', 'Hyderabad', 'Apple Incorporation Limited', 'Facebook Inc', 'Mark', 'CEO', 'Hitex Convention']


In [23]:
sentences = nltk.sent_tokenize(sample)

In [24]:
for sent in sentences:
    print sent

google chief is visiting india during the month of may, they are going to setup a new R&D facility at hyderabad


In [2]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Download NER tagger from : https://nlp.stanford.edu/software/CRF-NER.shtml#Download

In [3]:
st = StanfordNERTagger('/home/visa/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
					   '/home/visa/stanford-ner/stanford-ner.jar',
					   encoding='utf-8')

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


In [7]:
text = 'CEO of Microsoft Satya Nadella is visiting India in 2018, as part of expansion will start a new R&D faility at Hyderabad. Google will be competing with Apple inc.'
text2 = 'TCS India Pvt Ltd is expanding offices in India, Will recruit freshers from top tier colleges'
tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)

tokenized_text2 = word_tokenize(text2)
classified_text2 = st.tag(tokenized_text2)

print(classified_text)
print(classified_text2)

[('CEO', 'O'), ('of', 'O'), ('Microsoft', 'ORGANIZATION'), ('Satya', 'PERSON'), ('Nadella', 'PERSON'), ('is', 'O'), ('visiting', 'O'), ('India', 'LOCATION'), ('in', 'O'), ('2018', 'O'), (',', 'O'), ('as', 'O'), ('part', 'O'), ('of', 'O'), ('expansion', 'O'), ('will', 'O'), ('start', 'O'), ('a', 'O'), ('new', 'O'), ('R', 'O'), ('&', 'O'), ('D', 'O'), ('faility', 'O'), ('at', 'O'), ('Hyderabad', 'LOCATION'), ('.', 'O'), ('Google', 'ORGANIZATION'), ('will', 'O'), ('be', 'O'), ('competing', 'O'), ('with', 'O'), ('Apple', 'ORGANIZATION'), ('inc', 'ORGANIZATION'), ('.', 'O')]
[('TCS', 'ORGANIZATION'), ('India', 'ORGANIZATION'), ('Pvt', 'ORGANIZATION'), ('Ltd', 'ORGANIZATION'), ('is', 'O'), ('expanding', 'O'), ('offices', 'O'), ('in', 'O'), ('India', 'LOCATION'), (',', 'O'), ('Will', 'O'), ('recruit', 'O'), ('freshers', 'O'), ('from', 'O'), ('top', 'O'), ('tier', 'O'), ('colleges', 'O')]
