# Natural Language Processing

# Loading Dataset

In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
df = pd.read_csv('tweets.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


In [3]:
df = df.drop(['id', 'keyword', 'location'], axis=1)

In [4]:
# Getting Value Counts

df['target'].value_counts()

target
0    9256
1    2114
Name: count, dtype: int64

In [5]:
pd.set_option('display.max_colwidth', 1)
df.head()

Unnamed: 0,text,target
0,"Communal violence in Bhainsa, Telangana. ""Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…",1
1,"Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…",1
2,Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI,1
3,Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9,1
4,"""Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8",0


## Lower Casing

In [6]:
method1 = df['text'].str.lower()
method2 = df['text'].apply(str.lower)
method3 = df['text'].apply(lambda x : x.lower())
method4 = df['text'].map(str.lower)

## Removing Punctuations

In [7]:
# Method 1
punc1 = df['text'].str.replace(r'[^\w\s]', '', regex=True)

# Method 2
import string
translator = str.maketrans('', '', string.punctuation)
punc2 = df['text'].apply(lambda x: x.translate(translator))

# Method 3
def remove_punctuation(text):
    return ''.join([char for char in text if char not in string.punctuation])
punc3 = df['text'].apply(remove_punctuation)

# Method 4
import re
def remove_punctuation_with_re(text):
    return re.sub(r'^[\w\s]','', text)
punc4 = df['text'].apply(remove_punctuation_with_re)

# Method 5
def remove_punctuation_with_filter(text):
    return ''.join(filter(lambda x: x not in string.punctuation, text))
punc5 = df['text'].apply(remove_punctuation_with_filter)
punc5.head()

0    Communal violence in Bhainsa Telangana Stones were pelted on Muslims houses and some houses and vehicles were set ablaze…           
1    Telangana Section 144 has been imposed in Bhainsa from January 13 to 15 after clash erupted between two groups on January 12 Po…    
2    Arsonist sets cars ablaze at dealership httpstcogOQvyJbpVI                                                                          
3    Arsonist sets cars ablaze at dealership httpstco0gL7NUCPlb httpstcou1CcBhOWh9                                                       
4    Lord Jesus your love brings freedom and pardon Fill me with your Holy Spirit and set my heart ablaze with your l… httpstcoVlTznnPNi8
Name: text, dtype: object

## Removing Numbers

In [8]:
# Method 1
rem_num1 = df['text'].str.replace(r'\d+', '', regex=True)

# Method 2
def remove_numbers_with_isdigits(text):
    return ''.join([char for char in text if not char.isdigit()])
rem_num2 = df['text'].apply(remove_numbers_with_isdigits)

# Method 3
import string
translation_table = str.maketrans('', '', string.digits)
rem_num3 = df['text'].apply(lambda x: x.translate(translation_table))

# Method 4
import re
def remove_numbers_with_re(text):
    return re.sub(r'\d+', '', text)
rem_num4 = df['text'].apply(remove_numbers_with_re)

# Method 5
def remove_numbers_with_filter(text):
    return ''.join(filter(lambda x: not x.isdigit(), text))
rem_num5 = df['text'].apply(remove_numbers_with_filter)

rem_num5.head()

0    Communal violence in Bhainsa, Telangana. "Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…              
1    Telangana: Section  has been imposed in Bhainsa from January  to , after clash erupted between two groups on January . Po…                 
2    Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI                                                                            
3    Arsonist sets cars ablaze at dealership https://t.co/gLNUCPlb https://t.co/uCcBhOWh                                                        
4    "Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi
Name: text, dtype: object

## Removal of Extra Spaces

In [9]:
df['text'].head()

0    Communal violence in Bhainsa, Telangana. "Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…               
1    Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…         
2    Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI                                                                             
3    Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9                                                     
4    "Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8
Name: text, dtype: object

In [10]:
# Method 1
rem_space1 = df['text'].str.strip().str.replace(r'\s+', ' ', regex=True)

# Method 2
def remove_extra_spaces(text):
    return ' '.join(text.split())
rem_space2 = df['text'].apply(remove_extra_spaces)

# Method 3
rem_space3 = df['text'].str.replace(r'\s+', ' ', regex=True)

# Method 4
import re
def remove_extra_spaces_with_re(text):
    return re.sub(r'\s+', ' ', text.strip())

rem_space4 = df['text'].apply(remove_extra_spaces_with_re)

# Method 5
def remove_extra_spaces_filter(text):
    return ' '.join(filter(lambda x: x.strip(), text.split()))
rem_space5 = df['text'].apply(remove_extra_spaces_filter)

rem_space5.head()

0    Communal violence in Bhainsa, Telangana. "Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…               
1    Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…         
2    Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI                                                                             
3    Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9                                                     
4    "Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8
Name: text, dtype: object

## Replacing Repeated Punctuation

In [11]:
df['text'].head()

0    Communal violence in Bhainsa, Telangana. "Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…               
1    Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…         
2    Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI                                                                             
3    Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9                                                     
4    "Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8
Name: text, dtype: object

In [12]:
# Method 1
punc_rep1 = df['text'].str.replace(r'([!?./\@])\1+', r'\1', regex=True)

# Method 2
import re
def replace_repeated_puncs(text):
    return re.sub(r'([!?/\.])\1+', r'\1', text)
punc_rep2 = df['text'].apply(replace_repeated_puncs)

# Method 3
def remove_repeated_punc_lists(text):
    result = []
    for char in text:
        if result and char ==  result[-1] and char in "!?/\.":
            continue
        result.append(char)
    return ''.join(result)
punc_rep3 = df['text'].apply(remove_repeated_punc_lists)

# Method 4
def replace_repeated_puncs_translate(text):
    punctuations = r'!/\?.@'
    for p in punctuations:
        text = text.replace(p * 2, p)
    return text
punc_rep4 = df['text'].apply(replace_repeated_puncs_translate)

punc_rep4.head()

0    Communal violence in Bhainsa, Telangana. "Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…              
1    Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…        
2    Arsonist sets cars ablaze at dealership https:/t.co/gOQvyJbpVI                                                                             
3    Arsonist sets cars ablaze at dealership https:/t.co/0gL7NUCPlb https:/t.co/u1CcBhOWh9                                                      
4    "Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https:/t.co/VlTznnPNi8
Name: text, dtype: object

## Removing Emojis

In [13]:
# Dataset containing emojis
df_emoji = pd.DataFrame({
    'text': [
        "Hello there! 😀 How are you? 🤔",
        "I love programming! 💻✨",
        "Let's remove emojis! 🎉🎈",
        "No emojis here."
    ]
})
df_emoji.head()

Unnamed: 0,text
0,Hello there! 😀 How are you? 🤔
1,I love programming! 💻✨
2,Let's remove emojis! 🎉🎈
3,No emojis here.


In [14]:
# pip install clean-text
%pip install clean-text demoji




In [15]:
# Method 1
from cleantext import clean
emoji_rem1 = df_emoji['text'].apply(lambda x: clean(x, no_emoji=True))

# Method 2
def remove_emojis(text):
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F"  # emoticons
                                 "\U0001F300-\U0001F5FF"  # symbols & pictographs
                                 "\U0001F680-\U0001F6FF"  # transport & map symbols
                                 "\U0001F700-\U0001F77F"  # alchemical symbols
                                 "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
                                 "\U00002702-\U000027B0"  # dingbats
                                 "\U000024C2-\U0001F251"  # enclosed characters
                                 "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
emoji_rem2 = df_emoji['text'].apply(remove_emojis)

# Method 3
import demoji
demoji.download_codes()
emoji_rem3 = df_emoji['text'].apply(lambda x: demoji.replace(x, ""))

# Method 4
emoji_rem4 = df_emoji['text'].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))

emoji_rem4.head()

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
  demoji.download_codes()


0    Hello there!  How are you? 
1    I love programming!        
2    Let's remove emojis!       
3    No emojis here.            
Name: text, dtype: object

## Removing Emoticons

In [16]:
# Dataset for Emoticons
df_emoticons = pd.DataFrame({
    'text': [
        "Hello there! :) How are you? :D",
        "I love programming! <3",
        "Let's remove emoticons! :P :o",
        "No emoticons here."
    ]
})
df_emoticons.head()

Unnamed: 0,text
0,Hello there! :) How are you? :D
1,I love programming! <3
2,Let's remove emoticons! :P :o
3,No emoticons here.


In [17]:
# Method 1
def remove_emoticons(text):
    emoticon_pattern = re.compile(r'[:;=][)DdpP\(\[<3]')
    return emoticon_pattern.sub('', text)
emoticon_rem1 = df_emoticons['text'].apply(remove_emoticons)

# Method 2
emoticon_rem2 = df_emoticons['text'].str.replace(r'[:;=][)DdoOpP\(\[<]', '', regex=True)

# Method 3
emoticons = [':)', ':(', ':D', ':P', '<3', ':o']

def remove_custom_emoticons(text):
    for emoticon in emoticons:
        text = text.replace(emoticon, '')
    return text
emoticon_rem3 = df_emoticons['text'].apply(remove_custom_emoticons)

emoticon_rem2.head()

0    Hello there!  How are you? 
1    I love programming! <3     
2    Let's remove emoticons!    
3    No emoticons here.         
Name: text, dtype: object

## Handling Contractions

In [18]:
%pip install contractions

Note: you may need to restart the kernel to use updated packages.


In [19]:
# Dataset for contractions
df_con = pd.DataFrame({
    'text': [
        "I'll be there within 5 min.",
        "She'd like to know how I'd done that!",
        "It's awesome to meet new friends.",
        "We've been waiting for this day for so long."
    ]
})

df_con.head()

Unnamed: 0,text
0,I'll be there within 5 min.
1,She'd like to know how I'd done that!
2,It's awesome to meet new friends.
3,We've been waiting for this day for so long.


In [20]:
# Method 1
import contractions
def expand_contractions(text):
    return ' '.join([contractions.fix(word) for word in text.split()])
con1 = df_con['text'].apply(expand_contractions)

# Method 2
contraction_mapping = {
    "I'll": "I will",
    "She'd": "She would",
    "It's": "It is",
    "We've": "We have",
    "I'd": "I would",
    "don't": "do not",
    "can't": "cannot"
}
def expand_contractions_regex(text):
    pattern = re.compile(r'\b(' + '|'.join(contraction_mapping.keys()) + r')\b')
    return pattern.sub(lambda x: contraction_mapping[x.group(0)], text)
con2 = df_con['text'].apply(expand_contractions_regex)

# Method 3
contraction_mapping = {
    "I'll": "I will",
    "She'd": "She would",
    "It's": "It is",
    "We've": "We have",
    "I'd": "I would",
}

# Function to expand contractions using manual mapping
def manual_expand_contractions(text):
    for contraction, expansion in contraction_mapping.items():
        text = text.replace(contraction, expansion)
    return text
con3 = df_con['text'].apply(manual_expand_contractions)

con3.head()

0    I will be there within 5 min.                 
1    She would like to know how I would done that! 
2    It is awesome to meet new friends.            
3    We have been waiting for this day for so long.
Name: text, dtype: object

## Finding and Removing Html Tags

In [21]:
# Dataset
df_html = pd.DataFrame({
    'text': [
        "<h1>Hello World!</h1>",
        "<p>This is a <strong>test</strong> string.</p>",
        "<div>Another <em>example</em> with <a href='#'>links</a>.</div>",
        "No HTML tags here."
    ]
})

df_html.head()

Unnamed: 0,text
0,<h1>Hello World!</h1>
1,<p>This is a <strong>test</strong> string.</p>
2,<div>Another <em>example</em> with <a href='#'>links</a>.</div>
3,No HTML tags here.


In [22]:
%pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [23]:
# Method 1
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)
html_rem1 = df_html['text'].apply(remove_html_tags)

# Method 2
from bs4 import BeautifulSoup
def remove_html_tags_bs(text):
    return BeautifulSoup(text, "html.parser").get_text()
html_rem2 = df_html['text'].apply(remove_html_tags_bs)


## Finding and Removing URL's

In [24]:
# Dataset
url_df = pd.DataFrame({
    'Text': [
        "Check out this link: https://www.example.com and let me know what you think.",
        "Visit http://example.org for more information.",
        "No links here, just plain text.",
        "Another link: www.test.com is also available."
    ]
})

url_df.head()

Unnamed: 0,Text
0,Check out this link: https://www.example.com and let me know what you think.
1,Visit http://example.org for more information.
2,"No links here, just plain text."
3,Another link: www.test.com is also available.


In [25]:
# Method 1
url_rem1 = url_df['Text'].str.replace(r'http[s]?://\S+|www\.\S+', '', regex=True)

# Method 2
def url_remove(text):
    return re.sub(r'http[s]?://\S+|www\.\S+', '', text)
url_rem2 = url_df['Text'].apply(url_remove)

# Method 3
url_rem3 = url_df['Text'].apply(lambda x: clean(x, no_urls=True, lower=False, replace_with_url='', ))

# Method 4
from urllib.parse import urlparse
def url_remove_parser(text):
    words = text.split()
    return ' '.join(word for word in words if not urlparse(word).scheme)

url_rem4 = url_df['Text'].apply(url_remove_parser)
url_rem4.head()

0    Check out this and let me know what you think.
1    Visit for more information.                   
2    No links here, just plain text.               
3    Another www.test.com is also available.       
Name: Text, dtype: object

## Removing Emails

In [26]:
# Dataset
email_df = pd.DataFrame({
    'Text': [
        "Contact us at support@example.com for assistance.",
        "My email is john.doe@gmail.com and I need help.",
        "No emails here, just plain text.",
        "Reach out to info@company.org."
    ]
})

email_df.head()

Unnamed: 0,Text
0,Contact us at support@example.com for assistance.
1,My email is john.doe@gmail.com and I need help.
2,"No emails here, just plain text."
3,Reach out to info@company.org.


In [27]:
# Method 1
email_rem1 = email_df['Text'].str.replace(r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b', '', regex=True)

# Method 2
def remove_emails(text):
    return re.sub(r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b', '', text)
email_rem2 = email_df['Text'].apply(remove_emails)

# Method 3 (Makes all small case)
email_rem3 = email_df['Text'].apply(lambda x: clean(x, lower=False, no_emails=True, replace_with_email=""))

# Method 4 
def is_email(word):
    return re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', word) is not None

def remove_emails_custom(text):
    words = text.split()
    return ' '.join(word for word in words if not is_email(word))
email_rem4 = email_df['Text'].apply(remove_emails_custom)

email_rem4.head()

0    Contact us at for assistance.   
1    My email is and I need help.    
2    No emails here, just plain text.
3    Reach out to info@company.org.  
Name: Text, dtype: object

## Extracting URL & Emails

In [28]:
email_df.head()

Unnamed: 0,Text
0,Contact us at support@example.com for assistance.
1,My email is john.doe@gmail.com and I need help.
2,"No emails here, just plain text."
3,Reach out to info@company.org.


In [29]:
# Using str functions
get_email1 = email_df['Text'].str.extract(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})')

# Using regex library
import re
def extract_emails(text):
    return re.findall(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text)
get_email2 = email_df['Text'].apply(extract_emails)

get_email2.head()

0    [support@example.com]
1    [john.doe@gmail.com] 
2    []                   
3    [info@company.org]   
Name: Text, dtype: object

## Extracting URL's

In [30]:
url_df.head()

Unnamed: 0,Text
0,Check out this link: https://www.example.com and let me know what you think.
1,Visit http://example.org for more information.
2,"No links here, just plain text."
3,Another link: www.test.com is also available.


In [31]:
# Using str function
expression = r'(https?://[^\s]+|www\.[^\s]+)'
get_url1 = url_df['Text'].str.extract(pat=expression, expand=True)

# Using regex
def extract_urls(text):
    return re.findall(r'(https?://[^\s]+|www\.[^\s]+)', text)
get_url2 = url_df['Text'].apply(extract_urls)

get_url1.head()

Unnamed: 0,0
0,https://www.example.com
1,http://example.org
2,
3,www.test.com


## Standardizing and Spell Check

In [32]:
%pip install textblob pyspellchecker




In [33]:
from textblob import TextBlob

spell_df = pd.DataFrame({
    'Text': [
        "I havv a dreem.",
        "This is an exmple of spell chekcing.",
        "Python is a gr8 programming language.",
        "Lets go to the park tommorow."
    ]
})

spell_df.head()

Unnamed: 0,Text
0,I havv a dreem.
1,This is an exmple of spell chekcing.
2,Python is a gr8 programming language.
3,Lets go to the park tommorow.


In [34]:
%pip install autocorrect manual_spellchecker

Note: you may need to restart the kernel to use updated packages.


In [35]:
# Using TextBlob
def correct_spelling(text):
    return str(TextBlob(text).correct())
spell1 = spell_df['Text'].apply(correct_spelling)

# Using autocorrect (Gives good results)
import itertools
from autocorrect import Speller
def standardize_text(text):
    spell = Speller(lang='en')
    return spell(text)
spell2 = spell_df['Text'].apply(standardize_text)

spell1.head()

0    I have a dream.                      
1    His is an example of spell checking. 
2    Python is a grm programming language.
3    Gets go to the park tomorrow.        
Name: Text, dtype: object

# Tokenization

##### Tokenization is process of splitting a phrase, sentence, paragraph or text in a document into smaller units called 'TOKENS'.
* Tokens are building blocks of Natural Language.
* Tokens are used to construct Vocabulary [ set of unique tokens].
* Types of Tokenization:
    * Sentence Tokenization
    * Word Tokenization
    * Sub-words Tokenization
    * Character Tokenization
    * Regular Expression Tokenization (for custom tokens)

### Sample Dataset

In [36]:
data = {
    'text': [
        "This is the first sentence. This is the second sentence.",
        "The quick brown fox jumps over the lazy dog. It is a common saying.",
        "Sentence 1. Sentence 2! Sentence 3?",
        "This is a longer sentence with multiple clauses and punctuation marks.",
        "A single sentence without any punctuation."
    ]
}

sam = pd.DataFrame(data)
sam.head()

Unnamed: 0,text
0,This is the first sentence. This is the second sentence.
1,The quick brown fox jumps over the lazy dog. It is a common saying.
2,Sentence 1. Sentence 2! Sentence 3?
3,This is a longer sentence with multiple clauses and punctuation marks.
4,A single sentence without any punctuation.


## Sentence Tokenization

#### Python str function

In [37]:
sent_token1 = sam['text'].str.split('.')
sent_token1.head()

0    [This is the first sentence,  This is the second sentence, ]             
1    [The quick brown fox jumps over the lazy dog,  It is a common saying, ]  
2    [Sentence 1,  Sentence 2! Sentence 3?]                                   
3    [This is a longer sentence with multiple clauses and punctuation marks, ]
4    [A single sentence without any punctuation, ]                            
Name: text, dtype: object

#### Using Regex Library

In [38]:
import re
sent_token2 = sam['text'].apply(lambda x: re.split(r'(?<=[.!?]) +', x))
sent_token2.head()

0    [This is the first sentence., This is the second sentence.]             
1    [The quick brown fox jumps over the lazy dog., It is a common saying.]  
2    [Sentence 1., Sentence 2!, Sentence 3?]                                 
3    [This is a longer sentence with multiple clauses and punctuation marks.]
4    [A single sentence without any punctuation.]                            
Name: text, dtype: object

#### NLTK

In [39]:
# Using NLTK
from nltk.tokenize import sent_tokenize
def sentence_tokenize(text):
    return sent_tokenize(text, language='english')
sent_token3 = sam['text'].apply(sentence_tokenize)
sent_token3.head()

0    [This is the first sentence., This is the second sentence.]             
1    [The quick brown fox jumps over the lazy dog., It is a common saying.]  
2    [Sentence 1., Sentence 2!, Sentence 3?]                                 
3    [This is a longer sentence with multiple clauses and punctuation marks.]
4    [A single sentence without any punctuation.]                            
Name: text, dtype: object

#### Using Spacy

In [40]:
%pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [41]:
import spacy
nlp = spacy.load('en_core_web_sm')
def sent_tokenize_spacy(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return sentences

#sent_token4 = sam['text'].apply(sent_tokenize_spacy)
# or
sent_token4 = sam['text'].apply(lambda x:[sent.text for sent in nlp(x).sents])

sent_token4.head()

0    [This is the first sentence., This is the second sentence.]             
1    [The quick brown fox jumps over the lazy dog., It is a common saying.]  
2    [Sentence 1., Sentence 2! Sentence 3?]                                  
3    [This is a longer sentence with multiple clauses and punctuation marks.]
4    [A single sentence without any punctuation.]                            
Name: text, dtype: object

## Word Tokenization

#### Using Python

In [42]:
word_tokenize1 = sam['text'].str.split(" ")
word_tokenize1.head()

0    [This, is, the, first, sentence., This, is, the, second, sentence.]               
1    [The, quick, brown, fox, jumps, over, the, lazy, dog., It, is, a, common, saying.]
2    [Sentence, 1., Sentence, 2!, Sentence, 3?]                                        
3    [This, is, a, longer, sentence, with, multiple, clauses, and, punctuation, marks.]
4    [A, single, sentence, without, any, punctuation.]                                 
Name: text, dtype: object

#### Using Regex Library

In [47]:
import re
word_tokenize2 = sam['text'].apply(lambda x: re.findall(r'\b\w+\b', x))
word_tokenize2.head()

0    [This, is, the, first, sentence, This, is, the, second, sentence]                
1    [The, quick, brown, fox, jumps, over, the, lazy, dog, It, is, a, common, saying] 
2    [Sentence, 1, Sentence, 2, Sentence, 3]                                          
3    [This, is, a, longer, sentence, with, multiple, clauses, and, punctuation, marks]
4    [A, single, sentence, without, any, punctuation]                                 
Name: text, dtype: object

#### Using NLTK

In [49]:
from nltk.tokenize import word_tokenize
word_tokenize3 = sam['text'].apply(word_tokenize)
word_tokenize3.head()

0    [This, is, the, first, sentence, ., This, is, the, second, sentence, .]               
1    [The, quick, brown, fox, jumps, over, the, lazy, dog, ., It, is, a, common, saying, .]
2    [Sentence, 1, ., Sentence, 2, !, Sentence, 3, ?]                                      
3    [This, is, a, longer, sentence, with, multiple, clauses, and, punctuation, marks, .]  
4    [A, single, sentence, without, any, punctuation, .]                                   
Name: text, dtype: object

#### Using Spacy

In [51]:
word_tokenize4 = sam['text'].apply(lambda x: [token.text for token in nlp(x)])
word_tokenize4.head()

0    [This, is, the, first, sentence, ., This, is, the, second, sentence, .]               
1    [The, quick, brown, fox, jumps, over, the, lazy, dog, ., It, is, a, common, saying, .]
2    [Sentence, 1, ., Sentence, 2, !, Sentence, 3, ?]                                      
3    [This, is, a, longer, sentence, with, multiple, clauses, and, punctuation, marks, .]  
4    [A, single, sentence, without, any, punctuation, .]                                   
Name: text, dtype: object

#### Using Textblob

In [52]:
from textblob import TextBlob

word_tokenize5 = sam['text'].apply(lambda x: TextBlob(x).words)
word_tokenize5.head()

0    [This, is, the, first, sentence, This, is, the, second, sentence]                
1    [The, quick, brown, fox, jumps, over, the, lazy, dog, It, is, a, common, saying] 
2    [Sentence, 1, Sentence, 2, Sentence, 3]                                          
3    [This, is, a, longer, sentence, with, multiple, clauses, and, punctuation, marks]
4    [A, single, sentence, without, any, punctuation]                                 
Name: text, dtype: object

#### Using Gensim

In [53]:
from gensim.utils import simple_preprocess

word_tokenize4 = sam['text'].apply(simple_preprocess)
word_tokenize4.head()

0    [this, is, the, first, sentence, this, is, the, second, sentence]             
1    [the, quick, brown, fox, jumps, over, the, lazy, dog, it, is, common, saying] 
2    [sentence, sentence, sentence]                                                
3    [this, is, longer, sentence, with, multiple, clauses, and, punctuation, marks]
4    [single, sentence, without, any, punctuation]                                 
Name: text, dtype: object

#### Using Keras

In [54]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
word_tokenize5 = sam['text'].apply(text_to_word_sequence)
word_tokenize5.head()

0    [this, is, the, first, sentence, this, is, the, second, sentence]                
1    [the, quick, brown, fox, jumps, over, the, lazy, dog, it, is, a, common, saying] 
2    [sentence, 1, sentence, 2, sentence, 3]                                          
3    [this, is, a, longer, sentence, with, multiple, clauses, and, punctuation, marks]
4    [a, single, sentence, without, any, punctuation]                                 
Name: text, dtype: object

## Sub-word Tokenization

In [66]:
sam['text'].head()

0    This is the first sentence. This is the second sentence.              
1    The quick brown fox jumps over the lazy dog. It is a common saying.   
2    Sentence 1. Sentence 2! Sentence 3?                                   
3    This is a longer sentence with multiple clauses and punctuation marks.
4    A single sentence without any punctuation.                            
Name: text, dtype: object

#### Using Transformers library

In [61]:
%pip install transformers




In [65]:
from transformers import BertTokenizer, AutoTokenizer

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
auto_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_subword_tokenize1 = sam['text'].apply(lambda x: bert_tokenizer.tokenize(x))
auto_subword_tokenize1 = sam['text'].apply(lambda x: auto_tokenizer.tokenize(x))
# bert_subword_tokenize1.head()
print("===================")
auto_subword_tokenize1.head()




0    [this, is, the, first, sentence, ., this, is, the, second, sentence, .]                     
1    [the, quick, brown, fox, jumps, over, the, lazy, dog, ., it, is, a, common, saying, .]      
2    [sentence, 1, ., sentence, 2, !, sentence, 3, ?]                                            
3    [this, is, a, longer, sentence, with, multiple, clauses, and, pun, ##ct, ##uation, marks, .]
4    [a, single, sentence, without, any, pun, ##ct, ##uation, .]                                 
Name: text, dtype: object