In [2]:
!pip install spacy
!python -m spacy download en

Collecting spacy
  Downloading spacy-2.3.2-cp38-cp38-macosx_10_9_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 1.7 MB/s eta 0:00:01
[?25hCollecting thinc==7.4.1
  Downloading thinc-7.4.1-cp38-cp38-macosx_10_9_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 13.8 MB/s eta 0:00:01
[?25hCollecting plac<1.2.0,>=0.9.6
  Using cached plac-1.1.3-py2.py3-none-any.whl (20 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl (31 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.2-cp38-cp38-macosx_10_9_x86_64.whl (19 kB)
Collecting srsly<1.1.0,>=1.0.2
  Downloading srsly-1.0.2-cp38-cp38-macosx_10_9_x86_64.whl (183 kB)
[K     |████████████████████████████████| 183 kB 50.5 MB/s eta 0:00:01
[?25hCollecting wasabi<1.1.0,>=0.4.0
  Downloading wasabi-0.8.0-py3-none-any.whl (23 kB)
Collecting catalogue<1.1.0,>=0.0.7
  Using cached catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting preshe

In [10]:
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)
print(my_doc)

# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)


When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!
['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


In [None]:
# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

In [6]:
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

Number of stop words: 326
First ten stop words: ['others', 'you', 'hereupon', 'full', 'moreover', 'other', 'above', 'none', 'your', 'become']


In [7]:
from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [learning, data, science, ,, discouraged, !, 
, Challenges, setbacks, failures, ,, journey, ., got, !]


In [8]:
# Implementing lemmatization
lem = nlp("run runs running runner")
# finding lemma for each word
for word in lem:
    print(word.text,word.lemma_)

run run
runs runs
running running
runner runner


In [9]:
help(nlp)

Help on English in module spacy.lang.en object:

class English(spacy.language.Language)
 |  English(vocab=True, make_doc=True, max_length=1000000, meta={}, **kwargs)
 |  
 |  A text-processing pipeline. Usually you'll load this once per process,
 |  and pass the instance around your application.
 |  
 |  Defaults (class): Settings, data and factory methods for creating the `nlp`
 |      object and processing pipeline.
 |  lang (unicode): Two-letter language ID, i.e. ISO code.
 |  
 |  DOCS: https://spacy.io/api/language
 |  
 |  Method resolution order:
 |      English
 |      spacy.language.Language
 |      builtins.object
 |  
 |  Data and other attributes defined here:
 |  
 |  Defaults = <class 'spacy.lang.en.EnglishDefaults'>
 |  
 |  
 |  lang = 'en'
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from spacy.language.Language:
 |  
 |  __call__(self, text, disable=[], component_cfg=None)
 |      Apply the pipeline to some text

In [10]:
# POS tagging

# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(u"All is well that ends well.")

for word in docs:
    print(word.text,word.pos_)

All DET
is AUX
well ADV
that DET
ends VERB
well ADV
. PUNCT


In [None]:
import pandas as pd
sample_tweets = pd.read_csv("")

In [11]:
about_text = ('Gus Proto is a Python developer currently'
               ' working for a London-based Fintech'
               ' company. He is interested in learning'
               ' Natural Language Processing.')
nlp_doc = nlp(about_text)
for token in nlp_doc:
    print(token, token.idx)
    


Gus 0
Proto 4
is 10
a 13
Python 15
developer 22
currently 32
working 42
for 50
a 54
London 56
- 62
based 63
Fintech 69
company 77
. 84
He 86
is 89
interested 92
in 103
learning 106
Natural 115
Language 123
Processing 132
. 142


In [14]:
conference_help_text = ('Gus is helping organize a developer'
     'conference on Applications of Natural Language'
     ' Processing. He keeps organizing local Python meetups'
     ' and several internal talks at his workplace.')
conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
    print (token, token.lemma_,token.tag_)

Gus Gus NNP
is be VBZ
helping help VBG
organize organize VB
a a DT
developerconference developerconference NN
on on IN
Applications Applications NNPS
of of IN
Natural Natural NNP
Language Language NNP
Processing Processing NNP
. . .
He -PRON- PRP
keeps keep VBZ
organizing organize VBG
local local JJ
Python Python NNP
meetups meetup NNS
and and CC
several several JJ
internal internal JJ
talks talk NNS
at at IN
his -PRON- PRP$
workplace workplace NN
. . .


In [None]:
from spacy import displacy
# displacy.serve(conference_help_doc, style='dep')
displacy.render(conference_help_doc, style='dep')

In [7]:
import re
link = 'https://t.co/smyYriipxI'
link_pattern = r'http[s]?://[\w|.|/]+'
x = re.sub(link_pattern, 'success', link)
print(x)

success


In [9]:
mention = '@PKuchly57'
mention_pattern = r'@\w+'
x = re.match(mention_pattern, mention)
print(x)

<re.Match object; span=(0, 10), match='@PKuchly57'>


In [14]:
mention = '#'
mention_pattern = r'(@\w+)|(#)'
x = re.match(mention_pattern, mention)
print(x)

<re.Match object; span=(0, 1), match='#'>


In [17]:
import spacy
nlp = spacy.load('en')

doc = nlp('#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)')

for token in doc:
    print(token, token.lemma, token.lemma_)

# 8663801465970268676 #
FollowFriday 4558212729026417302 FollowFriday
@France_Inte 15376876697887500918 @france_inte
@PKuchly57 12282724241219763148 @PKuchly57
@Milipol_Paris 588482831550294546 @milipol_paris
for 16037325823156266367 for
being 10382539506755952630 be
top 8328343100126676325 top
engaged 12307195711836261817 engaged
members 14721843519903598875 member
in 3002984154512732771 in
my 561228191312463089 -PRON-
community 17822516981717808594 community
this 1995909169258310477 this
week 14249255431398666181 week
:) 5920004935509210957 :)


In [18]:
from spacy.lang.en import English
nlp = English()
tokenizer = nlp.Defaults.create_tokenizer(nlp)
doc = tokenizer('#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)')
for token in doc:
    print(token, token.lemma, token.lemma_)

# 8663801465970268676 #
FollowFriday 4558212729026417302 FollowFriday
@France_Inte 16733713287512299168 @France_Inte
@PKuchly57 12282724241219763148 @PKuchly57
@Milipol_Paris 9110577758730473699 @Milipol_Paris
for 16037325823156266367 for
being 3899131925553995529 being
top 8328343100126676325 top
engaged 12307195711836261817 engaged
members 1000530315840773259 members
in 3002984154512732771 in
my 227504873216781231 my
community 17822516981717808594 community
this 1995909169258310477 this
week 14249255431398666181 week
:) 5920004935509210957 :)


In [20]:

from spacy.lang.en import English
nlp = English()

doc = nlp('#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)')
for token in doc:
    print(token, token.lemma, token.lemma_)

# 8663801465970268676 #
FollowFriday 4558212729026417302 FollowFriday
@France_Inte 16733713287512299168 @France_Inte
@PKuchly57 12282724241219763148 @PKuchly57
@Milipol_Paris 9110577758730473699 @Milipol_Paris
for 16037325823156266367 for
being 3899131925553995529 being
top 8328343100126676325 top
engaged 12307195711836261817 engaged
members 1000530315840773259 members
in 3002984154512732771 in
my 227504873216781231 my
community 17822516981717808594 community
this 1995909169258310477 this
week 14249255431398666181 week
:) 5920004935509210957 :)


In [25]:
text = '123455566'
nums_pattern = r'^[0-9]+$'
if re.match(nums_pattern, text):
    print('True')

True


In [1]:
import sklearn
from sklearn.datasets import load_files
moviedir = r'/Users/somayoshida/nltk_data/corpora/movie_reviews'

# loading all files. 
movie = load_files(moviedir, shuffle=True)



# Split data into training and test sets
from sklearn.model_selection import train_test_split
docs_train, docs_test, y_train, y_test = train_test_split(movie.data, movie.target, 
                                                          test_size = 0.20, random_state = 12)


# initialize CountVectorizer
movieVzer= CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize, max_features=3000) # use top 3000 words only. 78.25% acc.
# movieVzer = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize)         # use all 25K words. Higher accuracy

# fit and tranform using training text 
docs_train_counts = movieVzer.fit_transform(docs_train)


# Convert raw frequency counts into TF-IDF values
movieTfmer = TfidfTransformer()
docs_train_tfidf = movieTfmer.fit_transform(docs_train_counts)




# testing data

# Using the fitted vectorizer and transformer, tranform the test data
docs_test_counts = movieVzer.transform(docs_test)
docs_test_tfidf = movieTfmer.transform(docs_test_counts)



# Now ready to build a classifier. 
# We will use Multinominal Naive Bayes as our model
from sklearn.naive_bayes import MultinomialNB


# Train a Multimoda Naive Bayes classifier. Again, we call it "fitting"
clf = MultinomialNB()
clf.fit(docs_train_tfidf, y_train)

# Predict the Test set results, find accuracy
y_pred = clf.predict(docs_test_tfidf)
sklearn.metrics.accuracy_score(y_test, y_pred)


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)



# trying the classifier
# very short and fake movie reviews
reviews_new = ['This movie was excellent', 'Absolute joy ride', 
            'Steven Seagal was terrible', 'Steven Seagal shone through.', 
              'This was certainly a movie', 'Two thumbs up', 'I fell asleep halfway through', 
              "We can't wait for the sequel!!", '!', '?', 'I cannot recommend this highly enough', 
              'instant classic.', 'Steven Seagal was amazing. His performance was Oscar-worthy.']

reviews_new_counts = movieVzer.transform(reviews_new)         # turn text into count vector
reviews_new_tfidf = movieTfmer.transform(reviews_new_counts)  # turn into tfidf vector


# have classifier make a prediction
pred = clf.predict(reviews_new_tfidf)

# print out results
for review, category in zip(reviews_new, pred):
    print('%r => %s' % (review, movie.target_names[category]))

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\Lab\\nltk_data\\corpora\\movie_reviews'