## Handling Text

In [4]:
# Create text
text_data = [
    " Interrobang. By Aishwarya Henriette ",
    "Parking And Going. By Karl Gautier",
    " Today Is The night. By Jarek Prakash ",
]
# Strip leading and trailing whitespace from every string
strip_whitespace = list(map(str.strip, text_data))
# Show text
strip_whitespace

# NOTE: despite the variable name, each period is swapped for a comma, not deleted
remove_periods = [cleaned.replace('.', ',') for cleaned in strip_whitespace]
remove_periods
def capitalizer(string: str) -> str:
    """Return ``string`` with every cased character converted to upper case."""
    uppercased = string.upper()
    return uppercased
# Upper-case every cleaned string
list(map(capitalizer, remove_periods))

['INTERROBANG, BY AISHWARYA HENRIETTE',
 'PARKING AND GOING, BY KARL GAUTIER',
 'TODAY IS THE NIGHT, BY JAREK PRAKASH']

In [6]:
# Import library
import re
# Create function
def replace_letters_with_X(string: str) -> str:
    """Return ``string`` with every ASCII letter (a-z, A-Z) replaced by "X".

    Digits, punctuation, whitespace and non-ASCII characters are left as-is.
    """
    ascii_letter = re.compile(r"[a-zA-Z]")
    return ascii_letter.sub("X", string)
# Mask the letters of every cleaned string
list(map(replace_letters_with_X, remove_periods))

['XXXXXXXXXXX, XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX, XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX, XX XXXXX XXXXXXX']

## Parsing and Cleaning HTML

In [13]:
# Load library
from bs4 import BeautifulSoup
# Create some HTML code (the stray double quote that trailed </div> is removed
# so the parsed document no longer contains a junk text node after the <div>)
html = """
<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>
"""
# Parse html with the lxml parser
soup = BeautifulSoup(html, "lxml")
# Find the <div> whose class is "full_name" and show its text content
soup.find('div', {'class':'full_name'}).text

'Masego Azra'

### Removing Punctuation

In [14]:
# Load libraries
import unicodedata
import sys
# Create text
text_data = ['Hi!!!! I. Love. This. Song....',
             '10000% Agree!!!! #LoveIT',
             'Right?!?!']
# Create a dictionary mapping every punctuation code point (Unicode general
# category starting with "P") to None.
# range() excludes its stop value, so use sys.maxunicode + 1 to make sure the
# highest code point is examined as well.
punctuation = dict.fromkeys(
    i for i in range(sys.maxunicode + 1)
    if unicodedata.category(chr(i)).startswith('P')
)
# For each string, remove any punctuation characters
# (str.translate deletes characters whose code point maps to None)
[string.translate(punctuation) for string in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

### Tokenizing Text

In [8]:
import nltk
from nltk.tokenize import word_tokenize

# Data directory for NLTK -- replace this placeholder with the path to your NLTK data directory
nltk_data_dir = "path/to/nltk_data"
# Add the directory to NLTK's data search path
nltk.data.path.append(nltk_data_dir)
# Download the 'punkt' tokenizer data into that directory
nltk.download("punkt", download_dir=nltk_data_dir)

string = "The science of today is the technology of tomorrow"
# Split the sentence into word tokens
word_tokenize(string)

[nltk_data] Downloading package punkt to path/to/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

### Removing Stop Words

In [9]:
# Load library
from nltk.corpus import stopwords
# You will have to download the set of stop words the first time
# import nltk
# nltk.download('stopwords')
# Create word tokens
tokenized_words = ['i',
                   'am',
                   'going',
                   'to',
                   'go',
                   'to',
                   'the',
                   'store',
                   'and',
                   'park']
# Load stop words (kept as the list returned by NLTK)
stop_words = stopwords.words('english')
# Build a set once so each membership test below is a hash lookup instead of
# a scan over the whole stop-word list
stop_word_set = set(stop_words)
# Remove stop words
[word for word in tokenized_words if word not in stop_word_set]

['going', 'go', 'store', 'park']

### Stemming Words

In [11]:
# Load library
from nltk.stem.porter import PorterStemmer
# Words to be stemmed
tokenized_words = [
    'i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting',
]
# Instantiate the Porter stemmer
porter = PorterStemmer()
# Stem every token
list(map(porter.stem, tokenized_words))

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

### Tagging Parts of Speech

In [17]:
from nltk import pos_tag
from nltk import word_tokenize
# Data directory for NLTK -- replace this placeholder with the path to your NLTK data directory
nltk_data_dir = "path/to/nltk_data"
# Add the directory to NLTK's data search path
nltk.data.path.append(nltk_data_dir)

# Download the tokenizer data first, then the tagger data
for resource in ("punkt", "averaged_perceptron_tagger"):
    nltk.download(resource, download_dir=nltk_data_dir)

text_data = "Chris loved outdoor running"
# Tokenize, then tag each token with the pre-trained part-of-speech tagger
tokens = word_tokenize(text_data)
text_tagged = pos_tag(tokens)
# Show the (word, tag) pairs
text_tagged

[nltk_data] Downloading package punkt to path/to/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     path/to/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [18]:
from sklearn.preprocessing import MultiLabelBinarizer
# Create text
tweets = [
    "I am eating a burrito for breakfast",
    "Political science is an amazing field",
    "San Francisco is an awesome city",
]
# Collect the part-of-speech tags of every tweet, one list per tweet
tagged_tweets = []
for tweet in tweets:
    tweet_tag = nltk.pos_tag(word_tokenize(tweet))
    # Keep only the tag from each (word, tag) pair
    tags_only = [pair[1] for pair in tweet_tag]
    tagged_tweets.append(tags_only)
# Use one-hot encoding to convert the tags into features
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)


array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

### Encoding Text as a Bag of Words

In [21]:
# Load library
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Create text
text_data = np.array([
    'I love Brazil. Brazil!',
    'Sweden is best',
    'Germany beats both',
])
# Create the bag of words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)
# Show feature matrix
bag_of_words
# Show the feature matrix as a dense array
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

### Weighting Word Importance

In [23]:
# Load libraries
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Create text
text_data = np.array([
    'I love Brazil. Brazil!',
    'Sweden is best',
    'Germany beats both',
])
# Fit the tf-idf vectorizer on the text and build the feature matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
# Show tf-idf feature matrix
feature_matrix
# Show the same matrix in dense form
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])