# Stopwords 

In [9]:
import nltk 
from nltk import word_tokenize
from nltk.corpus import stopwords

In [10]:
# Build the English stop-word set.
# The name `stopwords` is rebound here from the imported corpus reader to a
# plain set (later cells test membership against it). The reader is therefore
# reached through `nltk.corpus`, which keeps this cell re-runnable: calling
# `stopwords.words(...)` a second time would raise AttributeError because a
# set has no `.words` method.
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [11]:
# Example sentence used to demonstrate stop-word removal.
text = "She bought a shirt and a book."


In [12]:
# Split the sentence into word and punctuation tokens with NLTK.
tokens = word_tokenize(text)
tokens

['She', 'bought', 'a', 'shirt', 'and', 'a', 'book', '.']

In [13]:
# Drop stop words from the token list.
# The stop-word entries are lowercase, so each token is lowercased for the
# membership test; otherwise capitalised stop words such as 'She' slip
# through. Tokens that survive keep their original casing.
text_without_stopwords = [w for w in tokens if w.lower() not in stopwords]
print(text_without_stopwords)

['bought', 'shirt', 'book', '.']


## Bag of Words 

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample sentences.
sentences = [
    "This is a sample sentence",
    "I am interested in politics",
    "You are a very good software engineer, engineer.",
]

# CountVectorizer builds the bag-of-words model.
# stop_words='english' removes scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the sentences.
vectorizer.fit(sentences)

# Show the learned vocabulary in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns a NumPy array, so it is
# converted to a list to keep the plain-list display.
vectorizer.get_feature_names_out().tolist()

['engineer',
 'good',
 'interested',
 'politics',
 'sample',
 'sentence',
 'software']

In [15]:
# Encode every sentence as a row of term counts, first as a sparse matrix and
# then as a dense array (one row per sentence, one column per vocabulary term).
vector = vectorizer.transform(sentences)
vector_spaces = vector.toarray()

# Printing the sparse matrix lists only the non-zero "(row, column)  count"
# entries; the dense array is shown as the cell result.
print(vector)
vector_spaces

  (0, 4)	1
  (0, 5)	1
  (1, 2)	1
  (1, 3)	1
  (2, 0)	2
  (2, 1)	1
  (2, 6)	1


array([[0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 1, 0, 0, 0],
       [2, 1, 0, 0, 0, 0, 1]], dtype=int64)

# TF-IDF: Term Frequency–Inverse Document Frequency

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = [
    "This is a sample sentence",
    "I am interested in politics",
    "You are a very good software engineer, engineer.",
]

# Create the TfidfVectorizer.
# stop_words='english' removes scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the sentences.
vectorizer.fit(sentences)

# Print the vocabulary in column order, then show the term -> column mapping.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns a NumPy array, so it is
# converted to a list to keep the plain-list printout.
print(vectorizer.get_feature_names_out().tolist())
vectorizer.vocabulary_

['engineer', 'good', 'interested', 'politics', 'sample', 'sentence', 'software']


{'sample': 4,
 'sentence': 5,
 'interested': 2,
 'politics': 3,
 'good': 1,
 'software': 6,
 'engineer': 0}

In [18]:
# Transform the sentences into a TF-IDF weighted document-term matrix.
# `vector_spaces` stays sparse here: print() lists only the non-zero
# "(row, column)  score" entries, and toarray() shows the dense equivalent.
vector_spaces = vectorizer.transform(sentences)
print(vector_spaces)
vector_spaces.toarray()

  (0, 5)	0.7071067811865476
  (0, 4)	0.7071067811865476
  (1, 3)	0.7071067811865476
  (1, 2)	0.7071067811865476
  (2, 6)	0.40824829046386296
  (2, 1)	0.40824829046386296
  (2, 0)	0.8164965809277259


array([[0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.70710678, 0.        ],
       [0.        , 0.        , 0.70710678, 0.70710678, 0.        ,
        0.        , 0.        ],
       [0.81649658, 0.40824829, 0.        , 0.        , 0.        ,
        0.        , 0.40824829]])

In [19]:
# Print each sentence followed by its TF-IDF row.
#
# Every printed entry has the form "(A, B)  C":
#   A : row index inside the one-row slice being printed (always 0 here,
#       because the matrix is walked one row at a time)
#   B : column index of the vocabulary term
#   C : TF-IDF score
for sentence, tfidf_row in zip(sentences, vector_spaces):
    print(sentence)
    print(tfidf_row)

This is a sample sentence
  (0, 5)	0.7071067811865476
  (0, 4)	0.7071067811865476
I am interested in politics
  (0, 3)	0.7071067811865476
  (0, 2)	0.7071067811865476
You are a very good software engineer, engineer.
  (0, 6)	0.40824829046386296
  (0, 1)	0.40824829046386296
  (0, 0)	0.8164965809277259


# Part-of-Speech Tagging


In [2]:
! pip install spacy
import spacy



In [3]:
# Load spaCy's small English pipeline. The model package is installed
# separately from spaCy itself (python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")

In [4]:
# Run the pipeline on a sample sentence (the u"" prefix is redundant in Python 3).
doc = nlp("I will google about facebook")

In [5]:
# Text of the processed Doc.
doc.text

'I will google about facebook'

In [7]:
# Check which part of speech a token was assigned.
# Coarse-grained POS tag of the first token.

doc[0].pos_

'PRON'

In [8]:
# Fine-grained POS tag of the first token.
doc[0].tag_

'PRP'

In [9]:
# Human-readable description of the 'PRP' tag.
spacy.explain('PRP')

'pronoun, personal'

In [16]:
# "left" used as a verb: print every token with its coarse-grained tag, its
# fine-grained tag and the description of the fine-grained tag.
doc2 = nlp("I left the room")
arrow = "----->"
for token in doc2:
    print(token.text, arrow, token.pos_, arrow, token.tag_, arrow, spacy.explain(token.tag_))

I -----> PRON -----> PRP -----> pronoun, personal
left -----> VERB -----> VBD -----> verb, past tense
the -----> DET -----> DT -----> determiner
room -----> NOUN -----> NN -----> noun, singular or mass


In [17]:
# "left" used as a noun: print every token with its coarse-grained tag, its
# fine-grained tag and the description of the fine-grained tag.
doc3 = nlp("to the left of the room")
arrow = "----->"
for token in doc3:
    print(token.text, arrow, token.pos_, arrow, token.tag_, arrow, spacy.explain(token.tag_))

to -----> ADP -----> IN -----> conjunction, subordinating or preposition
the -----> DET -----> DT -----> determiner
left -----> NOUN -----> NN -----> noun, singular or mass
of -----> ADP -----> IN -----> conjunction, subordinating or preposition
the -----> DET -----> DT -----> determiner
room -----> NOUN -----> NN -----> noun, singular or mass


### THE END 