## Tokenization...

In [1]:
import nltk
import pandas as pd

In [2]:
from nltk.tokenize import sent_tokenize            # split the document into sentences
from nltk.tokenize import word_tokenize          # split the document or sentence into words

In [3]:
dataset = 'Hello Everyone. Welcome here. We are here.'

In [4]:
d = sent_tokenize(text=dataset, language='english')

In [5]:
for i in d:
    print(i)

Hello Everyone.
Welcome here.
We are here.


In [6]:
w = word_tokenize(dataset)

In [7]:
w

['Hello', 'Everyone', '.', 'Welcome', 'here', '.', 'We', 'are', 'here', '.']

In [8]:
for j in w:
    if j!='.':
        print(j)

Hello
Everyone
Welcome
here
We
are
here


## Stemming...

In [9]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [10]:
data = ['love','loving','lover','loved','loves','lovingly']

In [11]:
st = PorterStemmer()
st

<PorterStemmer>

In [12]:
for i in data:
    print(st.stem(i))

love
love
lover
love
love
lovingli


In [13]:
dataset = 'Hello Everyone. Welcome here. We are here.'

In [14]:
words = word_tokenize(dataset)

In [15]:
for w in words:
    print(st.stem(w))

hello
everyon
.
welcom
here
.
We
are
here
.


## Lemmatization...

In [16]:
from nltk.stem import WordNetLemmatizer

In [17]:
wnl = WordNetLemmatizer()

In [18]:
wnl.lemmatize('churches')

'church'

In [19]:
wnl.lemmatize('dogs')

'dog'

In [20]:
wnl.lemmatize('better', pos='a')       # pos shows Part Of Speech.. here 'a' showing better is an adjective

'good'

## Stop Words...

In NLP, useless words are called stopwords.
NLTK has list of stopwords stored is 16 different languages.

In [21]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [22]:
dataset = 'Hello Everyone. Welcome here. We are here. Weather is awesome'

In [23]:
# create set of stopwords in english...
sw = set(stopwords.words('english'))
sw

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [24]:
len(sw)

179

In [25]:
word_tok = word_tokenize(dataset)

In [26]:
word_tok

['Hello',
 'Everyone',
 '.',
 'Welcome',
 'here',
 '.',
 'We',
 'are',
 'here',
 '.',
 'Weather',
 'is',
 'awesome']

In [27]:
# Remove stopwords from dataset..

filtered = []
for w in word_tok:
    if w not in sw:
        filtered.append(w)

In [28]:
filtered

['Hello', 'Everyone', '.', 'Welcome', '.', 'We', '.', 'Weather', 'awesome']

## Tagging...

##### Part of Speech Tagging - it's a process of assigning one of the parts of speech to a given word... e.g. word:Paper, Tag:Noun

In [3]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [4]:
dataset = 'Tajmahal is one of the world"s most celebrated structures. Indian History.'

In [5]:
wr_tok = word_tokenize(dataset)

In [6]:
tag = pos_tag(wr_tok)

In [8]:
result = pd.DataFrame(tag, columns=['Words','POS_tag'])
result

Unnamed: 0,Words,POS_tag
0,Tajmahal,NNP
1,is,VBZ
2,one,CD
3,of,IN
4,the,DT
5,world,NN
6,'',''
7,s,VBZ
8,most,JJS
9,celebrated,JJ


In [34]:
# set of tags....

nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

## Chunking... Grouping the Information.

In [35]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

In [36]:
dataset = 'Taj mahal is one of the world"s most celebrated structures. Indian History.'

In [37]:
wtok = word_tokenize(dataset)

In [38]:
pstag = pos_tag(wtok)

In [39]:
sequence_chunk = """
chunk:
{<NNPS>}
{<NNP>+}
{<NN>+}"""

In [40]:
chunk = RegexpParser(sequence_chunk)

In [41]:
chunk_result = chunk.parse(pstag)

In [42]:
print(chunk_result)

(S
  (chunk Taj/NNP)
  (chunk mahal/NN)
  is/VBZ
  one/CD
  of/IN
  the/DT
  (chunk world/NN)
  ''/''
  s/VBZ
  most/JJS
  celebrated/JJ
  structures/NNS
  ./.
  Indian/JJ
  (chunk History/NNP)
  ./.)


## Named Entity Recognition... e.g. Tesla:Organization, Elon Musk:Person

In [43]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

In [44]:
dataset = 'Abraham Lincoln was an American statesman and lawyer, who served as the 16th President od USA'

In [45]:
tag = pos_tag(word_tokenize(dataset))

In [46]:
tag

[('Abraham', 'NNP'),
 ('Lincoln', 'NNP'),
 ('was', 'VBD'),
 ('an', 'DT'),
 ('American', 'JJ'),
 ('statesman', 'NN'),
 ('and', 'CC'),
 ('lawyer', 'NN'),
 (',', ','),
 ('who', 'WP'),
 ('served', 'VBD'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('16th', 'CD'),
 ('President', 'NNP'),
 ('od', 'MD'),
 ('USA', 'NNP')]

In [47]:
ner = ne_chunk(tag)

In [48]:
print(ner)

(S
  (PERSON Abraham/NNP)
  (PERSON Lincoln/NNP)
  was/VBD
  an/DT
  (GPE American/JJ)
  statesman/NN
  and/CC
  lawyer/NN
  ,/,
  who/WP
  served/VBD
  as/IN
  the/DT
  16th/CD
  President/NNP
  od/MD
  (ORGANIZATION USA/NNP))


In [49]:
ner.draw()

# Displaying Similar Words

In [11]:
import nltk
nltk.download('brown')           # library brown has a collection of words...

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\SANDIP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

In [12]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())

In [18]:
print(text.similar('sandip'))           # to find the words similar to 'woman'

No matches
None


In [17]:
print(text.similar("and"))         # to find the words similar to 'and'

in of to on for at as that with is or but was from when by it if the
all
None


# Creating a summary of documents

In [1]:
import os
import nltk
import re

In [2]:
eng_stop_words = nltk.corpus.stopwords.words('english')
len(eng_stop_words)

179

In [3]:
file = open('E:\Sandip\Python was conceived in the late 1980s.txt', 'r')
my_file = file.read()

In [4]:
len(my_file)

1950

In [6]:
docs = re.sub(r'[^a-zA-Z]',' ',my_file)           # remove unwanted characters...
docs

'Python was conceived in the late     s     by Guido van Rossum at Centrum Wiskunde   Informatica  CWI  in the Netherlands as a successor to ABC programming language  which was inspired by SETL      capable of exception handling and interfacing with the Amoeba operating system      Its implementation began in December           Van Rossum shouldered sole responsibility for the project  as the lead developer  until    July       when he announced his  permanent vacation  from his responsibilities as Python s Benevolent Dictator For Life  a title the Python community bestowed upon him to reflect his long term commitment as the project s chief decision maker      In January       active Python core developers elected a   member  Steering Council  to lead the project      As of       the current members of this council are Barry Warsaw  Brett Cannon  Carol Willing  Thomas Wouters  and Pablo Galindo Salgado      Python     was released on    October       with many major new features  inclu

In [7]:
docs = docs.lower()     # convert into lower case

In [9]:
docs = docs.strip()     # removing leading and trailing spaces
docs

'python was conceived in the late     s     by guido van rossum at centrum wiskunde   informatica  cwi  in the netherlands as a successor to abc programming language  which was inspired by setl      capable of exception handling and interfacing with the amoeba operating system      its implementation began in december           van rossum shouldered sole responsibility for the project  as the lead developer  until    july       when he announced his  permanent vacation  from his responsibilities as python s benevolent dictator for life  a title the python community bestowed upon him to reflect his long term commitment as the project s chief decision maker      in january       active python core developers elected a   member  steering council  to lead the project      as of       the current members of this council are barry warsaw  brett cannon  carol willing  thomas wouters  and pablo galindo salgado      python     was released on    october       with many major new features  inclu

In [19]:
tokenize = nltk.word_tokenize(docs)         # tokenize the docs into words
tokenize

['python',
 'conceived',
 'late',
 'guido',
 'van',
 'rossum',
 'centrum',
 'wiskunde',
 'informatica',
 'cwi',
 'netherlands',
 'successor',
 'abc',
 'programming',
 'language',
 'inspired',
 'setl',
 'capable',
 'exception',
 'handling',
 'interfacing',
 'amoeba',
 'operating',
 'system',
 'implementation',
 'began',
 'december',
 'van',
 'rossum',
 'shouldered',
 'sole',
 'responsibility',
 'project',
 'lead',
 'developer',
 'july',
 'announced',
 'permanent',
 'vacation',
 'responsibilities',
 'python',
 'benevolent',
 'dictator',
 'life',
 'title',
 'python',
 'community',
 'bestowed',
 'upon',
 'reflect',
 'long',
 'term',
 'commitment',
 'project',
 'chief',
 'decision',
 'maker',
 'january',
 'active',
 'python',
 'core',
 'developers',
 'elected',
 'member',
 'steering',
 'council',
 'lead',
 'project',
 'current',
 'members',
 'council',
 'barry',
 'warsaw',
 'brett',
 'cannon',
 'carol',
 'willing',
 'thomas',
 'wouters',
 'pablo',
 'galindo',
 'salgado',
 'python',
 'releas

In [12]:
filtered = [token for token in tokenize if token not in eng_stop_words]        # remove the stop words    

In [17]:
docs = ' '.join(filtered)    

In [18]:
docs

'python conceived late guido van rossum centrum wiskunde informatica cwi netherlands successor abc programming language inspired setl capable exception handling interfacing amoeba operating system implementation began december van rossum shouldered sole responsibility project lead developer july announced permanent vacation responsibilities python benevolent dictator life title python community bestowed upon reflect long term commitment project chief decision maker january active python core developers elected member steering council lead project current members council barry warsaw brett cannon carol willing thomas wouters pablo galindo salgado python released october many major new features including cycle detecting garbage collector support unicode python released december major revision language completely backward compatible many major features backported python x x version series releases python include utility automates least partially translation python code python python end l

In [20]:
len(docs)

1297

# Advanced cleaning Technique - Normalisation

In [4]:
from normalise import normalise

In [6]:
txt = 'On the 30th Jan 2020, corona virus hit india with 1st case in Kerala anywhere G.O.I started acting and allocated fund of 17287 Crores I.N.R'

In [12]:
from nltk.tokenize import word_tokenize

abbr = {'G.O.I': 'Government Of India',
       'I.N.R':'Indian Rupees'
       }
normalise_token = normalise(word_tokenize(txt), user_abbrevs = abbr, verbose = False)
display('normalise_token: ',{' '.join(normalise_token)})

'normalise_token: '

{'On the thirtieth of Jan twenty twenty , corona virus hit india with first case in Kerala anywhere Government Of India started acting and allocated fund of seventeen thousand, two hundred and eighty seven Crores Indian Rupees'}

# Feature Extraction in Text

TF-IDF  -->  Term Frequency - Inverse Document Frequency

CountVectorizer performs the task of tokenizing and counting, while TfidfTransformer normalizes the data. 
TfidfVectorizer, on the other hand, performs all three operations, thereby streamlining the process of natural language processing.

In [16]:
from sklearn.feature_extraction.text import CountVectorizer           # CountVectorizer implements both tokenization and occurrence counting in a single class..

text = ['hello, my name is Sandip and I am a data scientist']
text1 = ['You are watching name']

In [17]:
vectorize = CountVectorizer()

vectorize.fit(text)

CountVectorizer()

In [18]:
# summarize
print(vectorize.vocabulary_)

{'hello': 3, 'my': 5, 'name': 6, 'is': 4, 'sandip': 7, 'and': 1, 'am': 0, 'data': 2, 'scientist': 8}


In [30]:
# encode document
newvector = vectorize.transform(text1)

print('Newvector: ',newvector)
print('newvector_array: ',newvector.toarray())             # in the 'text' name is at 6th position, hence 'text1' has similar word, it's showing 1 at 6th position...

Newvector:    (0, 6)	1
newvector_array:  [[0 0 0 0 0 0 1 0 0]]


### TF-IDF

Purpose of TF-IDF is to highlight the words which are frequent in document but not across the documents...

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

text = ['Sandip is a Data Scientist in India.','Data Science is a promising field']

In [22]:
vectorizer = TfidfVectorizer()

In [23]:
vectorizer.fit(text)

TfidfVectorizer()

In [24]:
print(vectorizer.vocabulary_)

{'sandip': 6, 'is': 4, 'data': 0, 'scientist': 8, 'in': 2, 'india': 3, 'science': 7, 'promising': 5, 'field': 1}


In [26]:
print(vectorizer.idf_)

[1.         1.40546511 1.40546511 1.40546511 1.         1.40546511
 1.40546511 1.40546511 1.40546511]


In [25]:
text_as_input = text[0]
text_as_input

'Sandip is a Data Scientist in India.'

In [27]:
# encode the document
vector = vectorize.transform([text_as_input])

In [28]:
print(vector.toarray())

[[0 0 1 0 1 0 0 1 1]]


# Word Embedding --- means numerical representation of text

### 1. Using creating Word2Vec model

In [3]:
from gensim.models import Word2Vec

### Using Keras embedding

In [35]:
import numpy as np
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding