In [1]:
#Unstructured Text Data
#How to transform text into information-rich features
#Some commonly used techniques

#Cleaning Text
#Problem
#You have some unstructured text data and want to complete some basic cleaning
#Solution
#Most basic text cleaning operations should only require Python's core string operations,
#in particular strip, replace, and split:
#Sample titles, some padded with stray leading/trailing whitespace
text_data = [
    "    Interrobang. By Aishwarya Henrietta     ",
    "Parking and Going. By Karl Gautier",
    "     Today Is The night. By Jarek Prakas    ",
]
#Trim the whitespace at both ends of every title
strip_whitespace = list(map(str.strip, text_data))
#Display the trimmed titles
strip_whitespace

['Interrobang. By Aishwarya Henrietta',
 'Parking and Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakas']

In [33]:
#Drop every "." from the whitespace-trimmed titles
remove_periods = [title.replace(".", "") for title in strip_whitespace]

In [34]:
#Show text
remove_periods 

['Interrobang By Aishwarya Henrietta',
 'Parking and Going By Karl Gautier',
 'Today Is The night By Jarek Prakas']

In [35]:
#We also create and apply a custom transformation function:
def capitalizer(string: str) -> str:
    """Return ``string`` with all of its cased characters converted to upper case."""
    uppercased = string.upper()
    return uppercased
#Apply function
[capitalizer(string) for string in remove_periods]

['INTERROBANG BY AISHWARYA HENRIETTA',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKAS']

In [36]:
#Finally, we can use regular expressions to make powerful string operations:
#import library
import re
#Create function
def replace_letters_with_X(string: str) -> str:
    """Mask every ASCII letter in ``string`` with a capital ``X``.

    Only characters in ``a-z`` and ``A-Z`` are replaced; digits, whitespace,
    punctuation and non-ASCII letters are returned unchanged.
    """
    ascii_letter = re.compile(r"[a-zA-Z]")
    return ascii_letter.sub('X', string)

#Apply function
[replace_letters_with_X(string) for string in remove_periods]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXX']

In [37]:
###Discussion
##
#
#Parsing and Cleaning HTML
#Problem : You have text data with HTML elements and want to extract just the text.
#Solution
#Use Beautiful Soup's extensive set of options to parse and extract text from HTML:
#Load library

from bs4 import BeautifulSoup
#Create some HTML code: a div of class "full_name" wrapping a bold span and plain text
#(the stray double quote that followed </div> was not part of the markup and is removed)
html = """
       <div class ='full_name'><span style = 'font-weight:bold'>
       Masego</span> Azra</div>
       """
#Parse html with the lxml parser
soup =BeautifulSoup(html,"lxml")

#Find the div with the class "full_name", show text
#(.text concatenates the text of the div and of the span nested inside it)
soup.find("div", {"class": "full_name"}).text

'\n       Masego Azra'

In [38]:
###Discussion

In [39]:
#Removing Punctuation
#Solution
#Define a function that uses translate with a dictionary of punctuation characters:
#Load libraries
import unicodedata
import sys
#Create text
text_data = ['Hi!!!!!!!!!!!!!!!!I. Love. This. Song.......',
            '10000% Agree!!!!#LoveIT',
            'Right?!?!']
#Create a translation table mapping every punctuation code point to None.
#Unicode general categories starting with "P" are the punctuation categories.
#range() excludes its end value, so "+ 1" is needed to also examine the
#highest code point, sys.maxunicode itself.
punctuation = dict.fromkeys(
    code_point
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)
#For each string, remove any punctuation characters
#(str.translate deletes characters whose table entry is None)
[string.translate(punctuation) for string in text_data]

['HiI Love This Song', '10000 AgreeLoveIT', 'Right']

In [40]:
###Discussion

In [41]:
#Tokenizing Text 
#Problem: You have text and want to break it up into individual words.
#Solution
#Natural Language Toolkit for Python (NLTK) has a powerful set of text manipulation operations, including word tokenizing:
#Load library


from nltk.tokenize import word_tokenize


In [42]:
#Sentence to split into word tokens
string = "The science of today is the technology of tomorrow"
#Break the sentence into its individual words
word_tokenize(string)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [43]:
#We can also tokenize into sentences:
from nltk.tokenize import sent_tokenize

#Two-sentence text to split at sentence boundaries
string = "The science of today is the technology of tomorrow. Tomorrow is today."

#Break the text into its individual sentences
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

In [44]:
##Discussion

In [45]:
#Removing Stop Words
#Problem
#Given tokenized text data, you want to remove extremely common words (e.g., a, is, of, on)
#that contain little informational value.
#Solution

#Use NLTK's stopwords

from nltk.corpus import stopwords

#You will have to download the set of stop words the first time
#import nltk
#nltk.download('stopwords')
#Create word tokens
tokenized_words = [ 'i',
                  'am',
                  'going',
                  'to',
                  'go',
                  'to',
                  'the',
                  'store',
                  'and',
                  'park']
#Load stop words (kept as a list because a later cell slices it for display)
stop_words=stopwords.words('english')
#Build a set once so each membership test is a hash lookup rather than a scan of the whole list
stop_word_set = set(stop_words)
#Remove stop words, preserving the original token order
[word for word in tokenized_words if word not in stop_word_set]


['going', 'go', 'store', 'park']

In [46]:
#
##
#show stop words
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [47]:
#Stemming Words
#Problem : You have tokenized words and want to convert them into their root forms.
#Solution
#Use NLTK's PorterStemmer:
#Load Library
from nltk.stem.porter import PorterStemmer
#Word tokens to reduce to their stems
tokenized_words = ["i", "am", "humbled", "by", "this", "traditional", "meeting"]
#Instantiate the Porter stemmer
porter = PorterStemmer()

#Stem every token, keeping the original order
list(map(porter.stem, tokenized_words))


['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

In [48]:
####Discussion

In [49]:
#Tagging Parts of Speech
#Problem: You have text data and want to tag each word or character with its parts of speech.
#Solution
#Use NLTK's pre-trained parts-of-speech tagger:
#Load Libraries
from nltk import pos_tag
from nltk import word_tokenize

#Sentence to tag
text_data = "Chris loved outdoor running"
#Split the sentence into word tokens first ...
tokens = word_tokenize(text_data)
#... then run the pre-trained part-of-speech tagger over those tokens
text_tagged = pos_tag(tokens)

#Display the (word, tag) pairs
text_tagged

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [50]:
### Tag Parts of Speech
### NNP Proper Noun, Singular
### NN  Noun, Singular or Mass
### RB  Adverb
### VBD Verb, Past tense

In [51]:
#Once the text has been tagged we can use the tags to find certain parts of speech. For example, here are all the nouns:
#Tags that mark a noun (common/proper, singular/plural)
noun_tags = ["NN", "NNS", "NNP", "NNPS"]
#Keep only the words whose tag is one of the noun tags
[word for word, tag in text_tagged if tag in noun_tags]

['Chris']

In [55]:
#A more realistic situation would be that we have data where every observation contains a tweet
#and we want to convert those sentences into features for individual parts of speech (e.g. a feature with 1
#if a proper noun is present, and 0 otherwise):
#Create text
from sklearn.preprocessing import MultiLabelBinarizer
tweets = [
    "I am eating a buritto fro breakfact",
    "Political science is an amazing field",
    "San Francisco is an awesome city",
]

#For every tweet, tokenize it, tag the tokens, and keep only the tags
tagged_tweets = [
    [tag for _, tag in pos_tag(word_tokenize(tweet))]
    for tweet in tweets
]

#One-hot encode the tag lists: one column per distinct tag, 1 if the tweet contains it
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)


array([[1, 0, 1, 0, 1, 1, 1, 0],
       [1, 1, 1, 0, 0, 0, 0, 1],
       [1, 1, 1, 1, 0, 0, 0, 1]])

In [56]:
#Using classes_ we can see that each feature is a part-of-speech tag:
one_hot_multi.classes_

array(['DT', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'], dtype=object)

In [57]:
###Discussion
####

In [61]:
#Load library
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

#Get some text from the Brown Corpus, broken into sentences
sentences= brown.tagged_sents(categories ='news')

#Split into 4000 sentences for training and 623 for testing
train =sentences[:4000]
test = sentences [4000:]

#create backoff tagger: the trigram tagger defers to the bigram tagger,
#which in turn defers to the unigram tagger
unigram = UnigramTagger(train)
bigram =BigramTagger(train, backoff =unigram)
trigram =TrigramTagger(train, backoff =bigram)

#show accuracy on the held-out sentences
#Newer NLTK releases name this method `accuracy` and deprecate `evaluate`;
#fall back to `evaluate` so the cell still runs on older installs
if hasattr(trigram, "accuracy"):
    tagger_accuracy = trigram.accuracy(test)
else:
    tagger_accuracy = trigram.evaluate(test)
tagger_accuracy

0.8174734002697437

In [66]:
#Encoding Text as a Bag of Words
#Problem: You have text data and want to create a set of features indicating the number of times an
#observation's text contains a particular word.
#Solution
#Use scikit-learn's CountVectorizer:
#Load library
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

#Three short documents to vectorize
text_data = np.array([
    'I love Brazil, Brazil!',
    'Sweden is best',
    'But Germany beats both',
])

#Learn the vocabulary and count each word per document in one step
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

#Display the resulting feature matrix
bag_of_words

<3x9 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [68]:
##This output is a sparse array, which is often necessary when we have a large amount of text.
#However, in our toy example we can use toarray to view a matrix of word counts for each observation:
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 1, 0, 0, 0]], dtype=int64)

In [69]:
# We can ask the fitted vectorizer for the word associated with each feature column.
# `get_feature_names` was removed in scikit-learn 1.2; `get_feature_names_out` is its
# replacement. Fall back to the old name so the cell still runs on older installs.

#Show feature names as a plain list, in column order
if hasattr(count, "get_feature_names_out"):
    feature_names = list(count.get_feature_names_out())
else:
    feature_names = count.get_feature_names()
feature_names

['beats', 'best', 'both', 'brazil', 'but', 'germany', 'is', 'love', 'sweden']

In [70]:
###Discussion
##

In [71]:
###Create feature matrix with arguments
#Count unigrams and bigrams, drop English stop words, and restrict the
#vocabulary to the single term 'brazil'
count_2gram = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    vocabulary=['brazil'],
)
bag = count_2gram.fit_transform(text_data)
#Display the feature matrix as a dense array
bag.toarray()

array([[2],
       [0],
       [0]], dtype=int64)

In [73]:
#View the 1-Grams and the 2-grams
count_2gram.vocabulary_

{'brazil': 0}

In [76]:
###Weighting Word Importance

#Problem: You want a bag of words, but with words weighted by their importance to an observation.

#Solution
#Compare the frequency of the word in a document (a tweet, movie review, speech transcript, etc.)
#with the frequency of the word in all other documents using term frequency-inverse document frequency (tf-idf).
#scikit-learn makes this easy with TfidfVectorizer:

#Load libraries
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

#Three short documents to weight
text_data = np.array([
    'I love Brazil, Brazil!',
    'Sweden is best',
    'But Germany beats both',
])

#Learn the vocabulary and compute the tf-idf weights in one step
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

#Display the tf-idf feature matrix
feature_matrix

<3x9 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [77]:
##Just as in Recipe 6.8, the output is a sparse matrix.
#However, if we want to view the output as a dense matrix, we can use .toarray:

#Show tf-idf feature matrix as dense matrix
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.        , 0.57735027, 0.        , 0.57735027],
       [0.5       , 0.        , 0.5       , 0.        , 0.5       ,
        0.5       , 0.        , 0.        , 0.        ]])

In [78]:
#vocabulary_ shows us the word of each feature:
#Show feature names
tfidf.vocabulary_

{'love': 7,
 'brazil': 3,
 'sweden': 8,
 'is': 6,
 'best': 1,
 'but': 4,
 'germany': 5,
 'beats': 0,
 'both': 2}

In [None]:
###Discussion