## Tokenization ---Spacy

In [30]:
import spacy

# Load the small English pipeline once; the cells below reuse this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [31]:
# Parse a sentence that contains a contraction and a run of extra spaces,
# then show each token's text, coarse part-of-speech tag and dependency label.
doc2 = nlp(u"Tesla isn't   looking into startups anymore.")

for tok in doc2:
    print(tok.text, tok.pos_, tok.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
   SPACE 
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [32]:
# One line per named entity: its text, its label, and spaCy's explanation of the label.
for entity in doc2.ents:
    label_gloss = str(spacy.explain(entity.label_))
    print(' - '.join((entity.text, entity.label_, label_gloss)))

Tesla - ORG - Companies, agencies, institutions, etc.


In [33]:
# Sentence mixing an emoticon, a hyphenated word, an e-mail address and a URL;
# print every token on its own line to see how each of them is split.
doc2 = nlp(u"We're here to help :) Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

print(*doc2, sep='\n')

We
're
here
to
help
:)
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [34]:
# Sentence with abbreviations that contain periods; print one token per line.
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

print(*doc4, sep='\n')

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [35]:
#counting tokens
len(doc4)

11

## Noun Chunks---Spacy

In [36]:
# Parse the sentence and list the text of every noun chunk spaCy yields for it.
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

for noun_chunk in doc9.noun_chunks:
    print(noun_chunk.text)

Autonomous cars
insurance liability
manufacturers


In [37]:
# For each noun chunk show: chunk text, its root token, the root's dependency
# label, and the token the root attaches to.
for noun_chunk in doc9.noun_chunks:
    chunk_root = noun_chunk.root
    print(' - '.join((noun_chunk.text, chunk_root.text, chunk_root.dep_, chunk_root.head.text)))

Autonomous cars - cars - nsubj - shift
insurance liability - liability - dobj - shift
manufacturers - manufacturers - pobj - toward


In [38]:
len(list(doc9.noun_chunks))

3

## Visualizing dependency---Spacy

In [39]:
from spacy import displacy

doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})

## Entity

In [40]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. Google is good')
displacy.render(doc, style='ent', jupyter=True)

In [41]:
# One line per named entity: its text, its label, and spaCy's explanation of the label.
for entity in doc.ents:
    label_gloss = str(spacy.explain(entity.label_))
    print(' - '.join((entity.text, entity.label_, label_gloss)))

the last quarter - DATE - Absolute or relative dates or periods
Apple - ORG - Companies, agencies, institutions, etc.
nearly 20 thousand - CARDINAL - Numerals that do not fall under another type
iPods - PRODUCT - Objects, vehicles, foods, etc. (not services)
$6 million - MONEY - Monetary values, including unit
Google - ORG - Companies, agencies, institutions, etc.


In [42]:
# Group the entity texts by entity label: {label: [text, text, ...]}.
tuple_list = [(ent.label_, ent.text) for ent in doc.ents]
dic = {}
for label, mention in tuple_list:
    # setdefault creates the list the first time a label is seen.
    dic.setdefault(label, []).append(mention)
dic

{'DATE': ['the last quarter'],
 'ORG': ['Apple', 'Google'],
 'CARDINAL': ['nearly 20 thousand'],
 'PRODUCT': ['iPods'],
 'MONEY': ['$6 million']}

## Stemming ----nltk

In [43]:
import nltk
from nltk.stem.porter import *

In [44]:
p_stemmer = PorterStemmer()

In [45]:
# Sample word forms; show what the Porter stemmer reduces each one to.
words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly']
for w in words:
    print(' --> '.join((w, p_stemmer.stem(w))))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [46]:
from nltk.stem.snowball import SnowballStemmer
# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')

In [47]:
# Same word list as above, this time run through the Snowball stemmer.
for w in words:
    print(' --> '.join((w, s_stemmer.stem(w))))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair


## Lemmatization------Spacy

In [48]:
# Perform standard imports:
import spacy
nlp = spacy.load('en_core_web_sm')

In [49]:
# Sample sentence containing several forms related to "run"
# (runner, running, run, ran); `text` is reused by a later NLTK cell.
text=u"I am a runner running in a race because I love to run since I ran today"
doc1 = nlp(text)

# Per token, print: the text, the coarse POS tag, `lemma` (prints as a large
# integer ID) and `lemma_` (the lemma as a readable string).
for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [50]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import word_tokenize

In [51]:
[lemmatizer.lemmatize(word,pos='v') for word in word_tokenize(text)]

['I',
 'be',
 'a',
 'runner',
 'run',
 'in',
 'a',
 'race',
 'because',
 'I',
 'love',
 'to',
 'run',
 'since',
 'I',
 'run',
 'today']

## Stop words------Spacy

In [52]:
import spacy
nlp = spacy.load('en_core_web_sm')
len(nlp.Defaults.stop_words)

326

In [53]:
print(nlp.Defaults.stop_words)

{'therein', 'nine', 'did', 'nobody', 'or', 'every', 'former', 'something', 'himself', 'might', 'together', 'almost', "'m", 'them', 'both', 'make', 'moreover', 'during', 'whose', 'if', 'indeed', 'everywhere', 'alone', 'perhaps', 'all', 'became', 'down', 'bottom', 'much', 'on', 'does', 'could', 'still', 'until', 'never', 'without', 'namely', 'us', 'afterwards', '’s', 'whoever', 'already', 'someone', 'fifteen', 'show', 'yet', 'again', 'another', 'been', 'go', 'one', 'last', 'no', 'sixty', 'by', 're', 'themselves', 'whole', 'between', 'once', 'hereupon', 'twelve', 'with', 'through', 'yourself', 'from', 'just', 'below', 'n‘t', 'side', 'really', 'becomes', 'serious', 'this', 'eleven', 'give', 'see', 'eight', 'get', 'three', 'within', 'often', 'whenever', 'among', 'why', 'it', 'the', 'done', 'towards', 'anywhere', 'seem', 'somewhere', 'whereupon', 'onto', 'your', 'hers', '‘ll', 'myself', 'regarding', '’re', 'please', 'various', 'seemed', 'many', 'few', 'as', 'first', '’ve', 'then', '‘d', 'whi

In [54]:
nlp.vocab['myself'].is_stop

True

In [55]:
# Register 'btw' as a custom stop word in the shared default set. Use lowercase!
nlp.Defaults.stop_words.add('btw')
# Also set the stop-word flag on the vocabulary entry (lexeme) itself.
nlp.vocab['btw'].is_stop = True
# To undo the flag later, set it back to False:
# nlp.vocab['btw'].is_stop = False #to remove
# Display the size of the stop-word set after the addition.
len(nlp.Defaults.stop_words)

327

In [56]:
nlp.vocab['btw'].is_stop

True

## one function

In [57]:
def tokenize(text):
    '''Normalize, tokenize and lemmatize one piece of text.

    Steps, in order:
    1. lower-case the text and replace every character that is not a
       letter or digit with a space (this removes punctuation)
    2. split the cleaned text into word tokens with ``word_tokenize``
    3. drop tokens that are in ``nlp.Defaults.stop_words``
       (e.g. am, is, have, you, ...)
    4. lemmatize each remaining token twice: first with the lemmatizer's
       default settings, then again as a verb (``pos='v'``), so verb forms
       are reduced too (e.g. running --> run)

    Note that stop words are filtered *before* lemmatization, so a token is
    only removed when its surface form is a stop word.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    list of str
        The lemmatized, stop-word-free tokens in their original order.
    '''
    # Imported locally so the function does not rely on some other cell
    # having placed `re` in the notebook namespace.
    import re

    # Normalize case and turn punctuation into whitespace.
    cleaned = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Look the stop-word set up once instead of once per token.
    stop_words = nlp.Defaults.stop_words

    # Default lemmatization first, then the verb pass, applied per token.
    return [
        lemmatizer.lemmatize(lemmatizer.lemmatize(word), pos='v')
        for word in word_tokenize(cleaned)
        if word not in stop_words
    ]

In [58]:
documents = ['Have you seen this book?',
             'I need to do my homeworks ',
            'consider it done!',
            'this book is amazing. Have you read it?',
            'I am considering your book by today',
            'this is very bad sentences. we try to make it negative',
            'this is supposed to show very admiring sentences. adorable']

In [59]:
# Apply the cleaning/lemmatizing pipeline to every sample document.
[tokenize(document) for document in documents]

[['see', 'book'],
 ['need', 'homework'],
 ['consider'],
 ['book', 'amaze', 'read'],
 ['consider', 'book', 'today'],
 ['bad', 'sentence', 'try', 'negative'],
 ['suppose', 'admire', 'sentence', 'adorable']]

In [60]:
nlp.vocab['make'].is_stop

True

In [61]:
nlp.vocab['do'].is_stop

True

## CountVectorizer----Scikit-Learn 

In [67]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

X_train_counts = vectorizer.fit_transform(documents)
X_train_counts.shape

(7, 32)

<font color=green>This shows that our training set consists of 7 documents and 32 features (one per distinct vocabulary term).</font>

In [68]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() turns the returned array into a plain list of Python strings.
    word = vectorizer.get_feature_names_out().tolist()
else:
    word = vectorizer.get_feature_names()
word[:5]

['admiring', 'adorable', 'am', 'amazing', 'bad']

In [70]:
pd.DataFrame.sparse.from_spmatrix(X_train_counts, columns=word)

Unnamed: 0,admiring,adorable,am,amazing,bad,book,by,consider,considering,do,...,show,supposed,this,to,today,try,very,we,you,your
0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,0,0,1,0,0,1,1,0,1,0,...,0,0,0,0,1,0,0,0,0,1
5,0,0,0,0,1,0,0,0,0,0,...,0,0,1,1,0,1,1,1,0,0
6,1,1,0,0,0,0,0,0,0,0,...,1,1,1,1,0,0,1,0,0,0


In [240]:

def prep_data(text,method=CountVectorizer):
    '''
    Vectorize a collection of documents using the notebook's ``tokenize``.

    Depending on ``method`` the words of each document are either counted
    (``CountVectorizer``, the default) or weighted by their importance in
    the document relative to the whole collection (``TfidfVectorizer``).

    Parameters
    ----------
    text : iterable of str
        The raw documents to vectorize.
    method : class, optional
        Vectorizer class to instantiate; it is called as
        ``method(tokenizer=tokenize)``.

    Returns
    -------
    tuple
        ``(frame, values, vectorizer)``: a DataFrame with one row per
        document and one column per feature, the same data as a plain
        array (``frame.values``), and the fitted vectorizer instance.
    '''
    count_vector = method(tokenizer=tokenize)

    # fit_transform learns the vocabulary and builds the matrix in one pass,
    # so the documents are not tokenized twice.
    doc_array = count_vector.fit_transform(text).toarray()

    # `get_feature_names()` was removed in scikit-learn 1.2; use the
    # replacement when it exists and fall back on older releases.
    if hasattr(count_vector, 'get_feature_names_out'):
        feature_names = count_vector.get_feature_names_out()
    else:
        feature_names = count_vector.get_feature_names()

    frequency_matrix_count = pd.DataFrame(doc_array, columns=feature_names)
    return frequency_matrix_count,frequency_matrix_count.values,count_vector


In [241]:
#lets clean the documents and then apply vectorizer
df,vectorized,vectorizer=prep_data(documents)#uses countvectorizer
df

Unnamed: 0,admire,adorable,amaze,bad,book,consider,homework,need,negative,read,see,sentence,suppose,today,try
0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0
5,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1
6,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0


In [242]:
df.shape

(7, 15)

## TfidfVectorizer----Scikit-Learn 

In [243]:
documents

['Have you seen this book?',
 'I need to do my homeworks ',
 'consider it done!',
 'this book is amazing. Have you read it?',
 'I am considering your book by today',
 'this is very bad sentences. we try to make it negative',
 'this is supposed to show very admiring sentences. adorable']

In [244]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(documents) # remember to use the original X_train set
X_train_tfidf.shape

(7, 32)

In [245]:
df,vectorized,vectorizer=prep_data(documents,method=TfidfVectorizer)#uses TfidfVectorizer
df

Unnamed: 0,admire,adorable,amaze,bad,book,consider,homework,need,negative,read,see,sentence,suppose,today,try
0,0.0,0.0,0.0,0.0,0.578667,0.0,0.0,0.0,0.0,0.0,0.815564,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.632022,0.0,0.448438,0.0,0.0,0.0,0.0,0.632022,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.479185,0.560603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.675356,0.0
5,0.0,0.0,0.0,0.520647,0.0,0.0,0.0,0.0,0.520647,0.0,0.0,0.432182,0.0,0.0,0.520647
6,0.520647,0.520647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.432182,0.520647,0.0,0.0


In [246]:
df.shape

(7, 15)