## Regex

In [1]:
import re

In [2]:
re.match('abc', 'abcdef')

<re.Match object; span=(0, 3), match='abc'>

In [3]:
word_regex = '\w+'
re.match(word_regex, 'hi there!') 

<re.Match object; span=(0, 2), match='hi'>

In [4]:
re.split('\s+', 'Split on spaces.')

['Split', 'on', 'spaces.']

## Tokenization

In [5]:
import nltk

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
from nltk.tokenize import word_tokenize

In [8]:
word_tokenize("Hi there!")

['Hi', 'there', '!']

#### Practice (word_tokenize, sent_tokenize)

In [9]:
import re
from nltk.tokenize import word_tokenize,sent_tokenize

In [10]:
scene_one = 'The dataset for training, I chose “Sentiment140”. which originated. from Stanford University. More info on the dataset can be found from the link.'

In [11]:
# Split scene_one into sentences: sentences
sentences = sent_tokenize(scene_one)
print(sentences)

['The dataset for training, I chose “Sentiment140”.', 'which originated.', 'from Stanford University.', 'More info on the dataset can be found from the link.']


In [12]:
tokenized_sent = word_tokenize(sentences[3])
print(tokenized_sent)

['More', 'info', 'on', 'the', 'dataset', 'can', 'be', 'found', 'from', 'the', 'link', '.']


In [13]:
unique_tokens = set(word_tokenize(scene_one))
print(unique_tokens)

{'which', 'for', '“', 'originated', 'link', 'from', '.', 'Stanford', 'info', 'I', ',', 'on', 'the', 'dataset', 'chose', 'University', 'Sentiment140', 'The', 'More', 'be', 'found', '”', 'can', 'training'}


# Bag-of-words: method to find topics

In [14]:
from nltk.tokenize import word_tokenize
from collections import Counter

In [15]:
counter = Counter(word_tokenize(
                """The cat is in the box. The cat likes the box. 
                 The box is over the cat."""))
counter

Counter({'The': 3,
         'cat': 3,
         'is': 2,
         'in': 1,
         'the': 3,
         'box': 3,
         '.': 3,
         'likes': 1,
         'over': 1})

In [16]:
counter.most_common(2)

[('The', 3), ('cat', 3)]

# Simple text preprocessing - stopwords

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
from nltk.corpus import stopwords

In [19]:
text = """The cat is in the box. The cat likes the box. 
                  The box is over the cat."""

In [20]:
tokens = [w for w in word_tokenize(text.lower()) 
                  if w.isalpha()]
tokens

['the',
 'cat',
 'is',
 'in',
 'the',
 'box',
 'the',
 'cat',
 'likes',
 'the',
 'box',
 'the',
 'box',
 'is',
 'over',
 'the',
 'cat']

In [21]:
no_stops = [t for t in tokens 
                    if t not in stopwords.words('english')]

In [22]:
Counter(no_stops).most_common(2)

[('cat', 3), ('box', 3)]

# gensim

### gensim dictionary

In [23]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [24]:
# Creating a gensim dictionary
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize

In [25]:
my_documents = ['The movie was about a spaceship and aliens.',
                 'I really liked the movie!',
                'Awesome action scenes, but boring characters.',
                'The movie was awful! I hate alien films.',
                'Space is cool! I liked the movie.',
                'More space films, please!',]

In [26]:
tokenized_docs = [word_tokenize(doc.lower()) for doc in my_documents]

In [27]:
dictionary = Dictionary(tokenized_docs)

In [28]:
dictionary.token2id

{'.': 0,
 'a': 1,
 'about': 2,
 'aliens': 3,
 'and': 4,
 'movie': 5,
 'spaceship': 6,
 'the': 7,
 'was': 8,
 '!': 9,
 'i': 10,
 'liked': 11,
 'really': 12,
 ',': 13,
 'action': 14,
 'awesome': 15,
 'boring': 16,
 'but': 17,
 'characters': 18,
 'scenes': 19,
 'alien': 20,
 'awful': 21,
 'films': 22,
 'hate': 23,
 'cool': 24,
 'is': 25,
 'space': 26,
 'more': 27,
 'please': 28}

### gensim corpus

In [29]:
# each list is one document, each document is series of tuples
# (id, frequency)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(5, 1), (7, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(0, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)],
 [(0, 1),
  (5, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1)],
 [(0, 1), (5, 1), (7, 1), (9, 1), (10, 1), (11, 1), (24, 1), (25, 1), (26, 1)],
 [(9, 1), (13, 1), (22, 1), (26, 1), (27, 1), (28, 1)]]

# Tf-idf with gensim (Term frequency - inverse document frequency)

#### Tf-idf: Ensures most common words don't show up as key words. Keeps document specific frequent words weighted high

In [30]:
from gensim.models.tfidfmodel import TfidfModel

In [31]:
tfidf = TfidfModel(corpus)

In [32]:
tfidf[corpus[1]]

[(5, 0.1746298276735174),
 (7, 0.1746298276735174),
 (9, 0.1746298276735174),
 (10, 0.29853166221463673),
 (11, 0.47316148988815415),
 (12, 0.7716931521027908)]

### practice

In [33]:
# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)

doc = ''

# Calculate the tfidf weights of doc: tfidf_weights
tfidf_weights = tfidf[doc]

# Print the first five weights
print(tfidf_weights[:5])

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 5 weighted words
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)

[]


# Named Entity Recognition


In [34]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [35]:
import nltk

In [36]:
sentence = '''In New York, I like to ride the Metro to visit MOMA 
                      and some restaurants rated well by Ruth Reichl.'''

In [37]:
tokenized_sent = nltk.word_tokenize(sentence)

In [38]:
tagged_sent = nltk.pos_tag(tokenized_sent)
tagged_sent

[('In', 'IN'),
 ('New', 'NNP'),
 ('York', 'NNP'),
 (',', ','),
 ('I', 'PRP'),
 ('like', 'VBP'),
 ('to', 'TO'),
 ('ride', 'VB'),
 ('the', 'DT'),
 ('Metro', 'NNP'),
 ('to', 'TO'),
 ('visit', 'VB'),
 ('MOMA', 'NNP'),
 ('and', 'CC'),
 ('some', 'DT'),
 ('restaurants', 'NNS'),
 ('rated', 'VBN'),
 ('well', 'RB'),
 ('by', 'IN'),
 ('Ruth', 'NNP'),
 ('Reichl', 'NNP'),
 ('.', '.')]

In [39]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [40]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [41]:
print(nltk.ne_chunk(tagged_sent))

(S
  In/IN
  (GPE New/NNP York/NNP)
  ,/,
  I/PRP
  like/VBP
  to/TO
  ride/VB
  the/DT
  (ORGANIZATION Metro/NNP)
  to/TO
  visit/VB
  (ORGANIZATION MOMA/NNP)
  and/CC
  some/DT
  restaurants/NNS
  rated/VBN
  well/RB
  by/IN
  (PERSON Ruth/NNP Reichl/NNP)
  ./.)


# Introduction to SpaCy

In [53]:
# import spacy

In [58]:
import en_core_web_sm

In [59]:
nlp = en_core_web_sm.load()

In [60]:
doc = nlp("""Berlin is the capital of Germany; 
                  and the residence of Chancellor Angela Merkel.""")

In [61]:
doc.ents

(Berlin, Germany, Angela Merkel)

In [62]:
print(doc.ents[0], doc.ents[0].label_)

Berlin GPE


In [63]:
print(doc.ents[0], doc.ents[0].text)

Berlin Berlin


# Multilingual NER with polyglot
NER for 130 language

In [None]:
# pip install polyglot

In [None]:
# from polyglot.text import Text
# ẗext = """El presidente de la Generalitat de Cataluña,
#                   Carles Puigdemont, ha afirmado hoy a la alcaldesa 
#                   de Madrid, Manuela Carmena, que en su etapa de 
#                   alcalde de Girona (de julio de 2011 a enero de 2016) 
#                   hizo una gran promoción de Madrid."""

# ptext = Text(text)

# Classifying fake news using supervised learning with NLP

### Building word count vectors with scikit-learn

In [None]:
import pandas as pd

In [2]: from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
df = ... # Load data into DataFrame
y = df['Sci-Fi']
X_train, X_test, y_train, y_test = train_test_split(
                                             df['plot'], y, 
                                             test_size=0.33, 
                                             random_state=53)
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train.values)
count_test = count_vectorizer.transform(X_test.values)

# Naive Bayes with scikit-learn

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)

In [None]:
pred = nb_classifier.predict(count_test)
metrics.accuracy_score(y_test, pred)

metrics.confusion_matrix(y_test, pred, labels=[0,1])