## TF-IDF

In [8]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Toy corpus: three short documents, one array element per document.
text_data = np.array([
    'I love Brazil. Brazil!',
    'Sweden is best',
    'Germany beats both',
])

In [16]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list,
# so common words such as "is" and "both" do not become features.
tfidf = TfidfVectorizer(stop_words='english')

In [17]:
# Learn the vocabulary and weights from the corpus and transform it in one step.
# NOTE(review): the result is later densified with .toarray(), so it is presumably a SciPy sparse matrix.
feature_matrix = tfidf.fit_transform(text_data)

In [18]:
feature_matrix.toarray()

array([[0.        , 0.        , 0.89442719, 0.        , 0.4472136 ,
        0.        ],
       [0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.70710678],
       [0.70710678, 0.        , 0.        , 0.70710678, 0.        ,
        0.        ]])

In [19]:
# Column order of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. .tolist() keeps the plain-list display.
tfidf.get_feature_names_out().tolist()

['beats', 'best', 'brazil', 'germany', 'love', 'sweden']

In [20]:
tfidf.vocabulary_

{'love': 4, 'brazil': 2, 'sweden': 5, 'best': 1, 'germany': 3, 'beats': 0}

In [21]:
import pandas as pd

In [37]:
# One row per document, one column per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
feature_space = pd.DataFrame(
    feature_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)



In [38]:
feature_space

Unnamed: 0,beats,best,brazil,germany,love,sweden
0,0.0,0.0,0.894427,0.0,0.447214,0.0
1,0.0,0.707107,0.0,0.0,0.0,0.707107
2,0.707107,0.0,0.0,0.707107,0.0,0.0


## Text conversion to lowercase, punctuation removal, and tokenization

In [63]:
# Sample sentence containing mixed case, a digit, commas and a final period.
myString = "The 5 countries include China, United States, Indonesia, India, and Brazil."

In [64]:
# Lower-cased copy of the sample sentence.
# NOTE(review): the name `str` rebinds Python's built-in `str` type for the rest of the
# kernel session. Later cells read this variable, so renaming it means updating them too.
str = myString.lower()

In [65]:
str

'the 5 countries include china, united states, indonesia, india, and brazil.'

In [66]:
from nltk import word_tokenize

In [67]:
import re
# Strip the listed punctuation marks and every digit from the lower-cased text.
# The hyphen is escaped so it is a literal character: written as `)-+` it forms a range
# that covers only `)`, `*` and `+`, which leaves any '-' in the text untouched.
# `\d` inside a character class already matches each digit, so no `+` is needed after it.
puntuation_removed_str = re.sub(r'[!@#$%^&*()\-+=,.\d]', '', str)
puntuation_removed_str

'the  countries include china united states indonesia india and brazil'

In [72]:
# Collect runs that start with a word character followed by one or more non-space
# characters. Tokens of a single character (e.g. "5") never match, and trailing
# punctuation stays attached to the word.
filtered_str = re.compile(r'\w\S+').findall(str)
filtered_str

['the',
 'countries',
 'include',
 'china,',
 'united',
 'states,',
 'indonesia,',
 'india,',
 'and',
 'brazil.']

In [80]:
# Split the punctuation-free, lower-cased sentence into word tokens with NLTK.
tokenized_str = word_tokenize(puntuation_removed_str)

In [81]:
tokenized_str

['the',
 'countries',
 'include',
 'china',
 'united',
 'states',
 'indonesia',
 'india',
 'and',
 'brazil']

In [82]:
# Rebinds myString to a noisier sample full of brackets, symbols and stray punctuation.
myString = 'You, {]$%are amazing students:at@@! at @Lambton College ! ;'

In [85]:
import string
# Translation table whose third argument lists the characters to delete:
# every character in string.punctuation.
_punctuation_table = str.maketrans('', '', string.punctuation)
test_str = myString.translate(_punctuation_table)

In [86]:
test_str

'You are amazing studentsat at Lambton College  '

In [89]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [93]:
# Remove every character listed in string.punctuation.
# The class is built from string.punctuation with re.escape instead of being copied by
# hand: the hand-written version only removed '-' because `,-.` happened to form a range
# that contains it, and it could drift from string.punctuation.
puntuation_removed_str = re.sub(f'[{re.escape(string.punctuation)}]', '', myString)
puntuation_removed_str

'You are amazing studentsat at Lambton College  '

In [96]:
# Delete everything that is neither a word character nor whitespace.
# Because `\w` includes the underscore, '_' is kept by this pattern.
_non_word_or_space = re.compile(r'[^\w\s]')
puntuation_removed_str2 = _non_word_or_space.sub('', myString)
puntuation_removed_str2

'You are amazing studentsat at Lambton College  '

## POS Tagging (Part-of-Speech Tagging)

In [100]:
from textblob import TextBlob

In [102]:
import nltk

In [103]:
# Two adjacent string literals are joined by Python into a single two-sentence string.
text = (
    "Codespeedy is a programming blog. "
    "Blog posts contain articles and tutorials on Python, CSS and even much more"
)

In [104]:
text

'Codespeedy is a programming blog. Blog posts contain articles and tutorials on Python, CSS and even much more'

In [105]:
# Wrap the sample text in a TextBlob so its part-of-speech tags can be read from .tags.
tb = TextBlob(text)

In [107]:
# Print the (word, POS tag) pairs for the sample text.
print(tb.tags)

[('Codespeedy', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('programming', 'VBG'), ('blog', 'NN'), ('Blog', 'NNP'), ('posts', 'NNS'), ('contain', 'VBP'), ('articles', 'NNS'), ('and', 'CC'), ('tutorials', 'NNS'), ('on', 'IN'), ('Python', 'NNP'), ('CSS', 'NNP'), ('and', 'CC'), ('even', 'RB'), ('much', 'RB'), ('more', 'JJR')]


In [108]:
# Tag the punctuation-stripped sample produced by the `[^\w\s]` cleanup
# and print its (word, POS tag) pairs.
tb1 = TextBlob(puntuation_removed_str2)
print(tb1.tags)

[('You', 'PRP'), ('are', 'VBP'), ('amazing', 'VBG'), ('studentsat', 'NN'), ('at', 'IN'), ('Lambton', 'NNP'), ('College', 'NNP')]


In [110]:
# List the Penn Treebank tag set with descriptions and examples.
# upenn_tagset() prints the listing itself and returns None, so wrapping it in print()
# would append a stray "None" line to the output.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

## Named Entity Recognition

In [227]:
# Two variants of the same text that differ only in capitalisation:
#   myString  -> "Jack", "Study", "toronto"
#   myString2 -> "jack", "study", "Toronto"
# Both are run through the same named-entity pipeline so the results can be compared.
myString = ' Jack Nelson worked for Microsoft and attended a conference in Italy . I Study at Lambton college in toronto.'
myString2 = ' jack Nelson worked for Microsoft and attended a conference in Italy . I study at Lambton college in Toronto.'

In [228]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [229]:
# Split each sample text into sentences: sentences, sentences2
sentences, sentences2 = map(sent_tokenize, (myString, myString2))

In [230]:
# Break every sentence into word tokens: one token list per sentence.
token_sentences = list(map(word_tokenize, sentences))

token_sentences2 = list(map(word_tokenize, sentences2))

In [231]:
# Attach a part-of-speech tag to every token: one list of (token, tag) pairs per sentence.
pos_sentences = list(map(nltk.pos_tag, token_sentences))

pos_sentences2 = list(map(nltk.pos_tag, token_sentences2))

In [232]:
pos_sentences

[[('Jack', 'NNP'),
  ('Nelson', 'NNP'),
  ('worked', 'VBD'),
  ('for', 'IN'),
  ('Microsoft', 'NNP'),
  ('and', 'CC'),
  ('attended', 'VBD'),
  ('a', 'DT'),
  ('conference', 'NN'),
  ('in', 'IN'),
  ('Italy', 'NNP'),
  ('.', '.')],
 [('I', 'PRP'),
  ('Study', 'VBP'),
  ('at', 'IN'),
  ('Lambton', 'NNP'),
  ('college', 'NN'),
  ('in', 'IN'),
  ('toronto', 'NN'),
  ('.', '.')]]

In [233]:
# Create the named entity chunks: chunked_sentences
# binary=True labels every entity simply as 'NE' instead of assigning a specific type.
# ne_chunk_sents returns a one-shot lazy iterator; list() materializes the chunk trees so
# the cells that loop over them can be re-run without silently printing nothing.
chunked_sentences = list(nltk.ne_chunk_sents(pos_sentences, binary=True))

chunked_sentences2 = list(nltk.ne_chunk_sents(pos_sentences2, binary=True))

In [236]:
# Print the named-entity subtrees found in myString.
for sent in chunked_sentences:
    # Keep only items that expose a label and are labelled 'NE'.
    ne_chunks = (
        node for node in sent
        if hasattr(node, "label") and node.label() == 'NE'
    )
    for ne_chunk in ne_chunks:
        print(ne_chunk)

(NE Jack/NNP Nelson/NNP)
(NE Microsoft/NNP)
(NE Italy/NNP)
(NE Lambton/NNP)


In [237]:
# Print the named-entity subtrees found in myString2.
for tagged_sent in chunked_sentences2:
    for node in tagged_sent:
        if not isinstance(node, nltk.tree.Tree):
            continue  # not a chunk subtree, nothing to report
        print(node)

(NE Nelson/NNP)
(NE Microsoft/NNP)
(NE Italy/NNP)
(NE Lambton/NNP)
(NE Toronto/NNP)


## Synonyms, antonyms, and word similarity

In [238]:
from nltk.corpus import wordnet

In [241]:
# Look up the WordNet synsets returned for the word "cat" and print them.
syn = wordnet.synsets("cat")
print(syn)

[Synset('cat.n.01'), Synset('guy.n.01'), Synset('cat.n.03'), Synset('kat.n.01'), Synset('cat-o'-nine-tails.n.01'), Synset('caterpillar.n.02'), Synset('big_cat.n.01'), Synset('computerized_tomography.n.01'), Synset('cat.v.01'), Synset('vomit.v.01')]


In [247]:
synonyms = []
antonyms = []
input_word = input("Enter a word:")

# Walk every synset returned for the word and every lemma attached to that synset.
# The loop variable is named `synset` so it does not overwrite an existing `syn`.
for synset in wordnet.synsets(input_word):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
        # Record every antonym of the lemma (not just the first), looking them up once;
        # a lemma without antonyms contributes nothing.
        antonyms.extend(antonym.name() for antonym in lemma.antonyms())

# Sets drop the duplicates collected across synsets.
print("Synonyms:", set(synonyms))
print("Antonyms:", set(antonyms))

Enter a word:good
Synonyms: {'upright', 'unspoilt', 'dependable', 'honorable', 'dear', 'effective', 'safe', 'good', 'respectable', 'adept', 'estimable', 'beneficial', 'skillful', 'near', 'soundly', 'salutary', 'sound', 'commodity', 'well', 'just', 'thoroughly', 'goodness', 'practiced', 'serious', 'full', 'secure', 'ripe', 'skilful', 'expert', 'in_force', 'trade_good', 'proficient', 'right', 'in_effect', 'honest', 'undecomposed', 'unspoiled'}
Antonyms: {'evilness', 'evil', 'ill', 'bad', 'badness'}


In [250]:
# Wu-Palmer similarity between the synsets 'ship.n.01' and 'boat.n.01'.
w1, w2 = (wordnet.synset(name) for name in ('ship.n.01', 'boat.n.01'))
print(w1.wup_similarity(w2))

0.9090909090909091


In [253]:
# Same comparison for two noun synsets: 'jump.n.02' and 'spring.n.02'.
w1, w2 = (wordnet.synset(name) for name in ('jump.n.02', 'spring.n.02'))
print(w1.wup_similarity(w2))

0.11764705882352941


In [258]:
# Same comparison for two verb synsets: 'jump.v.02' and 'spring.v.02'.
w1, w2 = (wordnet.synset(name) for name in ('jump.v.02', 'spring.v.02'))
print(w1.wup_similarity(w2))

0.3333333333333333


In [259]:
wordnet.synset('jump.v.02').lemmas()

[Lemma('startle.v.02.startle'),
 Lemma('startle.v.02.jump'),
 Lemma('startle.v.02.start')]

In [260]:
wordnet.synset('jump.v.02').examples()

['She startled when I walked into the room']

In [261]:
wordnet.synset('jump.n.02').lemmas()

[Lemma('leap.n.02.leap'),
 Lemma('leap.n.02.jump'),
 Lemma('leap.n.02.saltation')]

In [262]:
wordnet.synset('jump.n.02').examples()

['a successful leap from college to the major leagues']

In [263]:
wordnet.synset('jump.n.01').examples()

['a jump in attendance']

In [264]:
wordnet.synset('jump.n.01').lemmas()

[Lemma('jump.n.01.jump'), Lemma('jump.n.01.leap')]