In [1]:
import numpy as np

# Importing and cleaning HTML

In [2]:
from bs4 import BeautifulSoup

In [3]:
# Small HTML sample: a <div> of class 'full_name' whose first name sits in a bold <span>.
html = """
 <div class='full_name'><span style='font-weight:bold'>
 Masego</span> Azra</div>
 """

In [4]:
soup = BeautifulSoup(html, "lxml")

In [5]:
# Locate the <div> whose class is 'full_name' and show its text content.
soup.find('div', attrs={'class': 'full_name'}).text

'\n Masego Azra'

# Removing Punctuation

In [6]:
text = "hi& ! lol I am8 here, waitning. stfu. m.f. :p"

In [7]:
# string.punctuation holds every ASCII punctuation character, including the
# quote characters ' and ". It is imported under an alias so that the name
# `string` itself is not bound (later cells use `string` as a variable name).
from string import punctuation as punct

In [8]:
# Keep only the characters of `text` that do not appear in `punct`.
''.join(char for char in text if char not in punct)

'hi  lol I am8 here waitning stfu mf p'

In [9]:
import unicodedata
import sys

In [10]:
text_data = ['Hi!!!! I. Love. This. Song....','10000% Agree!!!! #LoveIT','Right?!?!']

In [11]:
# Translation table for str.translate(): every code point whose Unicode
# general category starts with 'P' (punctuation) is mapped to None, which
# makes str.translate() delete that character.
# range() excludes its stop value, so the stop is sys.maxunicode + 1 in order
# to examine the final code point as well.
punctuation = dict.fromkeys(
    codepoint
    for codepoint in range(sys.maxunicode + 1)
    if unicodedata.category(chr(codepoint)).startswith('P')
)

In [12]:
# Apply the punctuation-deleting table to each document in turn.
[doc.translate(punctuation) for doc in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

In [13]:
# How this works: build a translation table (a dictionary) whose keys are the
# Unicode ordinals of every punctuation character
# and whose values are all None.
# str.translate then applies the table to each string, deleting every character mapped to None.

# Tokenizing Text

In [14]:
import nltk

## word tokenizer

In [15]:
string = "The science of today is the technology of tomorrow"

In [16]:
nltk.tokenize.word_tokenize(string)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

## Sentence tokenizer

In [17]:
string = "The science of today is the technology of tomorrow. The end is near"

In [18]:
nltk.tokenize.sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'The end is near']

# Removing Stopwords

In [19]:
from nltk.corpus import stopwords

In [20]:
stop_words = stopwords.words('english')

In [21]:
# Example sentence, already split into lower-case tokens.
tokenized_words = ['i', 'am', 'going', 'to', 'go', 'to', 'the', 'store', 'and', 'park']

In [22]:
[word for word in tokenized_words if word not in stop_words]

['going', 'go', 'store', 'park']

# Stemming words

In [23]:
from nltk.stem.porter import PorterStemmer

In [24]:
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

In [25]:
porter_stemmer = PorterStemmer()

In [26]:
# Stem each token with the Porter stemmer and collect the results in a list.
list(map(porter_stemmer.stem, tokenized_words))

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

# Tagging parts of speech (PoS)

In [27]:
from nltk import pos_tag
from nltk import word_tokenize

In [28]:
text_data = "Chris loved outdoor running"

In [29]:
pos_tag(word_tokenize(text_data))

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

# Encoding Text as a Bag of Words

In [30]:
# create a bag-of-words representation of the text: a sparse matrix of per-document token counts

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
text_data = np.array(['I love Brazil. Brazil!','Sweden is best','Germany beats both'])

In [33]:
cv = CountVectorizer()

In [34]:
cv.fit_transform(text_data).toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [35]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so convert it to a plain list for display.
cv.get_feature_names_out().tolist()

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

# TFIDF

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
tfidf = TfidfVectorizer()

In [38]:
tfidf.fit_transform(text_data).toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])