<a href="https://colab.research.google.com/github/realbluesnail/UNCC_DSBA6188/blob/main/DSBA6188_Preprocessing_Conventional.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Setup
- Import packages
- [Optional] Mount your Google Drive (if you would like to read and save data files from your Google Drive)

In [None]:
import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
# from wordcloud import WordCloud ## don't need it this time

In [None]:
import nltk
import nltk.corpus


In [None]:
# Import the NLTK tokenizers that are compared in the Tokenization section
from nltk.tokenize import (word_tokenize,
                           sent_tokenize,
                           TreebankWordTokenizer,
                           wordpunct_tokenize,
                           TweetTokenizer,
                           MWETokenizer)
# Get the tokenizer data used to divide text into sentences (NLTK 'punkt_tab' package)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Optional: mount Google Drive at /content/drive/ so data files can be read from
# and saved to it.
# NOTE(review): google.colab is presumably only importable inside Colab — skip this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Tokenization

- Experiment and compare different tokenization techniques


In [None]:
# Sample text: a Twitter message mixing a mention, a URL, punctuation,
# an emoticon and emoji, so the tokenizers can be compared on each of these

sample_text = "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. Terrible! You shoulda got David Carr Carr of Third Day to do it. ;D ☹️👽"




In [None]:
# Whitespace tokenization: str.split() with no separator splits on runs of
# whitespace, so punctuation stays attached to the neighbouring word
# (e.g. 'Awww,' and 'bummer.'). Splitting on ',' — as this cell did before —
# is not whitespace tokenization; splitting on a chosen symbol is shown in the next cell.
tokens = sample_text.split()
print(tokens)

['@switchfoot http://twitpic.com/2y1zl - Awww', " that's a bummer. Terrible! You shoulda got David Carr of Third Day to do it. ;D ☹️👽"]


In [None]:
# Split text using a specific symbol (or letter) as the separator — here 'w'.
# Consecutive separators produce empty strings in the result.
tokens = sample_text.split(sep='w')
print(tokens)

['@s', 'itchfoot http://t', 'itpic.com/2y1zl - A', '', '', ", that's a bummer. Terrible! You shoulda got David Carr of Third Day to do it. ;D ☹️👽"]


In [None]:
# Try different tokenization tools under NLTK

# 1. Word tokenizer
# (word_tokenize is imported from nltk.tokenize in the setup section)
# from nltk.tokenize import word_tokenize
tokens = word_tokenize(sample_text)
print(tokens)

['@', 'switchfoot', 'http', ':', '//twitpic.com/2y1zl', '-', 'Awww', ',', 'that', "'s", 'a', 'bummer', '.', 'Terrible', '!', 'You', 'shoulda', 'got', 'David', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', '.', ';', 'D', '☹️👽']


In [None]:
# 2. Word punctuation tokenizer
# (wordpunct_tokenize is imported from nltk.tokenize in the setup section)
# from nltk.tokenize import wordpunct_tokenize
tokens = wordpunct_tokenize(sample_text)
print(tokens)


['@', 'switchfoot', 'http', '://', 'twitpic', '.', 'com', '/', '2y1zl', '-', 'Awww', ',', 'that', "'", 's', 'a', 'bummer', '.', 'Terrible', '!', 'You', 'shoulda', 'got', 'David', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', '.', ';', 'D', '☹️👽']


In [None]:
# 3. Treebank word tokenizer
# (TreebankWordTokenizer is imported from nltk.tokenize in the setup section)
# from nltk.tokenize import TreebankWordTokenizer
# Unlike the function-style tokenizers above, this one is a class: create an
# instance first, then call its tokenize() method
my_tokenizer = TreebankWordTokenizer()
tokens = my_tokenizer.tokenize(sample_text)
print(tokens)


['@', 'switchfoot', 'http', ':', '//twitpic.com/2y1zl', '-', 'Awww', ',', 'that', "'s", 'a', 'bummer.', 'Terrible', '!', 'You', 'shoulda', 'got', 'David', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it.', ';', 'D', '☹️👽']


In [None]:
# 4. Tweet Tokenizer
# (TweetTokenizer is imported from nltk.tokenize in the setup section)
# from nltk.tokenize import TweetTokenizer
# The recorded output shows the mention, the URL and the ';D' emoticon each kept as one token
my_tokenizer = TweetTokenizer()
tokens = my_tokenizer.tokenize(sample_text)
print(tokens)


['@switchfoot', 'http://twitpic.com/2y1zl', '-', 'Awww', ',', "that's", 'a', 'bummer', '.', 'Terrible', '!', 'You', 'shoulda', 'got', 'David', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', '.', ';D', '☹', '️', '👽']


In [None]:
# 5. MWE (multi-word expression) Tokenizer
# from nltk.tokenize import MWETokenizer
# Register the expression ('David', 'Carr') when constructing the tokenizer.
my_tokenizer = MWETokenizer([('David', 'Carr')])
# MWETokenizer works on an already-tokenized list, so the text is first split
# with TweetTokenizer and the registered expression is then merged into one token.
tokens = my_tokenizer.tokenize(
    TweetTokenizer().tokenize(sample_text)
)
print(tokens)

['@switchfoot', 'http://twitpic.com/2y1zl', '-', 'Awww', ',', "that's", 'a', 'bummer', '.', 'Terrible', '!', 'You', 'shoulda', 'got', 'David_Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', '.', ';D', '☹', '️', '👽']


In [None]:
# SpaCy
import spacy
# Load the 'en_core_web_sm' pipeline; `nlp` is called on text in the next cell
nlp = spacy.load('en_core_web_sm')



In [None]:
# Run the spaCy pipeline on the sample text, then collect the text of each token
doc = nlp(sample_text)
tokens = [token.text for token in doc]
print(tokens)

['@switchfoot', 'http://twitpic.com/2y1zl', '-', 'Awww', ',', 'that', "'s", 'a', 'bummer', '.', 'Terrible', '!', 'You', 'shoulda', 'got', 'David', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', '.', ';D', '☹', '️', '👽']


In [None]:
# Gensim
from gensim.utils import tokenize
# Wrap the result in list() so the tokens can be printed.
# The recorded output shows digits, punctuation and emoji are dropped
# ('2y1zl' becomes 'y' and 'zl').
tokens = list(tokenize(sample_text))
print(tokens)


['switchfoot', 'http', 'twitpic', 'com', 'y', 'zl', 'Awww', 'that', 's', 'a', 'bummer', 'Terrible', 'You', 'shoulda', 'got', 'David', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', 'D']


In [None]:
# Keras
from tensorflow.keras.preprocessing.text import text_to_word_sequence
# The recorded output shows the tokens lowercased and with punctuation removed
tokens = text_to_word_sequence(sample_text)
print(tokens)


['switchfoot', 'http', 'twitpic', 'com', '2y1zl', 'awww', "that's", 'a', 'bummer', 'terrible', 'you', 'shoulda', 'got', 'david', 'carr', 'of', 'third', 'day', 'to', 'do', 'it', 'd', '☹️👽']


In [None]:
# Keras Tokenizer: fit a vocabulary on the text, then encode the text with it
from tensorflow.keras.preprocessing.text import Tokenizer
my_tokenizer = Tokenizer()
# Both calls take a list of texts, hence the single-element list
my_tokenizer.fit_on_texts([sample_text])
# `tokens` holds the result of texts_to_sequences, not the words themselves
tokens = my_tokenizer.texts_to_sequences([sample_text])
# Print the fitted word -> index mapping (note: `tokens` itself is not printed)
print(my_tokenizer.word_index)


{'carr': 1, 'switchfoot': 2, 'http': 3, 'twitpic': 4, 'com': 5, '2y1zl': 6, 'awww': 7, "that's": 8, 'a': 9, 'bummer': 10, 'terrible': 11, 'you': 12, 'shoulda': 13, 'got': 14, 'david': 15, 'of': 16, 'third': 17, 'day': 18, 'to': 19, 'do': 20, 'it': 21, 'd': 22, '☹️👽': 23}


In [None]:
# Other tokenizers
# Sentence tokenizer: splits the text into sentences rather than words
tokens = sent_tokenize(sample_text)
print(tokens)


["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.", 'Terrible!', 'You shoulda got David Carr Carr of Third Day to do it.', ';D ☹️👽']


In [None]:
# Subwords tokenizer


# Pre-processing raw text

- Remove Stopwords
- Remove Punctuations
- Stemming
- Lemmatization
- Remove URLs

In [None]:
# Remove Stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
# Keep the English stopword list as a set for membership checks
stop_words = set(stopwords.words('english'))

tokens = word_tokenize(sample_text)
print(tokens)
# Compare the lowercase form of each token against the stopword set, so
# capitalised stopwords such as 'You' are removed as well
filtered_tokens = [
    word
    for word in tokens
    if word.lower() not in stop_words
]
print(filtered_tokens)




['@', 'switchfoot', 'http', ':', '//twitpic.com/2y1zl', '-', 'Awww', ',', 'that', "'s", 'a', 'bummer', '.', 'Terrible', '!', 'You', 'shoulda', 'got', 'David', 'Carr', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', '.', ';', 'D', '☹️👽']
['@', 'switchfoot', 'http', ':', '//twitpic.com/2y1zl', '-', 'Awww', ',', "'s", 'bummer', '.', 'Terrible', '!', 'shoulda', 'got', 'David', 'Carr', 'Carr', 'Third', 'Day', '.', ';', '☹️👽']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Remove Punctuations
import string
tokens = word_tokenize(sample_text)
print(tokens)
# string.punctuation is one long string, so `token not in string.punctuation`
# is a substring test: multi-character punctuation tokens such as '...' or '--'
# are not substrings of it and would be kept. Instead, drop every token that
# consists only of punctuation characters (an empty token is dropped too).
# Tokens that merely contain punctuation (e.g. "'s", '//twitpic.com/2y1zl') are kept.
filtered_tokens = [
    token
    for token in tokens
    if not all(char in string.punctuation for char in token)
]
print(filtered_tokens)


['@', 'switchfoot', 'http', ':', '//twitpic.com/2y1zl', '-', 'Awww', ',', 'that', "'s", 'a', 'bummer', '.', 'Terrible', '!', 'You', 'shoulda', 'got', 'David', 'Carr', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', '.', ';', 'D', '☹️👽']
['switchfoot', 'http', '//twitpic.com/2y1zl', 'Awww', 'that', "'s", 'a', 'bummer', 'Terrible', 'You', 'shoulda', 'got', 'David', 'Carr', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', 'D', '☹️👽']


In [None]:
# Stemming

from nltk.stem.porter import PorterStemmer
# Stemmer object used for every token below
porter_stemmer = PorterStemmer()

tokens = word_tokenize(sample_text)
print(tokens)
# Apply the stemmer to each token; the recorded output shows the results
# lowercased ('Terrible' -> 'terribl')
stemmed_tokens = list(map(porter_stemmer.stem, tokens))
print(stemmed_tokens)


['@', 'switchfoot', 'http', ':', '//twitpic.com/2y1zl', '-', 'Awww', ',', 'that', "'s", 'a', 'bummer', '.', 'Terrible', '!', 'You', 'shoulda', 'got', 'David', 'Carr', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', '.', ';', 'D', '☹️👽']
['@', 'switchfoot', 'http', ':', '//twitpic.com/2y1zl', '-', 'awww', ',', 'that', "'s", 'a', 'bummer', '.', 'terribl', '!', 'you', 'shoulda', 'got', 'david', 'carr', 'carr', 'of', 'third', 'day', 'to', 'do', 'it', '.', ';', 'd', '☹️👽']


In [None]:
# Lemmatization 1: treat every token as a noun

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Download necessary data if you haven't already
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

tokens = word_tokenize("good better great goose geese meeting meet meets going go gone")
print(tokens)
# wordnet.NOUN is the 'n' part-of-speech code. Because every token is lemmatized
# as a noun, the recorded output leaves 'going', 'gone' and 'better' unchanged.
lemmatized_tokens = [lemmatizer.lemmatize(word, pos=wordnet.NOUN) for word in tokens]
print(lemmatized_tokens)



['good', 'better', 'great', 'goose', 'geese', 'meeting', 'meet', 'meets', 'going', 'go', 'gone']
['good', 'better', 'great', 'goose', 'goose', 'meeting', 'meet', 'meet', 'going', 'go', 'gone']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Lemmatization 2: lemmatize each token using its part-of-speech tag

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Data packages needed by this cell: the WordNet corpus and the English
# averaged-perceptron tagger
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

# Create a WordNetLemmatizer instance for lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

def pos_tagger(nltk_tag):
    """Map an NLTK part-of-speech tag to the matching WordNet POS constant.

    Only the first letter of the tag is considered: tags starting with 'J' map
    to ``wordnet.ADJ``, 'V' to ``wordnet.VERB``, 'N' to ``wordnet.NOUN`` and
    'R' to ``wordnet.ADV``.

    Args:
        nltk_tag: Tag string as produced by ``nltk.pos_tag`` (e.g. 'JJ', 'VBG').

    Returns:
        The WordNet POS constant for the tag, or ``None`` when the tag starts
        with none of the four prefixes.
    """
    prefix_to_wordnet_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet_pos:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None

tokens = word_tokenize("good better great goose geese meeting meet going go gone")

# Tag every token with its part of speech (e.g. 'JJ', 'VBG')
tokens_pos_tagged = nltk.pos_tag(tokens)
print(tokens_pos_tagged)

# Convert each NLTK tag into the WordNet POS code expected by the lemmatizer
# (None when pos_tagger has no mapping for the tag)
tokens_wordnet_tagged = [(word, pos_tagger(nltk_tag)) for word, nltk_tag in tokens_pos_tagged]
print(tokens_wordnet_tagged)

# Lemmatize with the tag when one is available; keep the token unchanged otherwise.
# Uses the `wordnet_lemmatizer` created in this cell rather than the `lemmatizer`
# variable from the "Lemmatization 1" cell, so this cell no longer depends on
# that earlier cell having been run.
tokens_lemma = [
    word if wordnet_pos is None else wordnet_lemmatizer.lemmatize(word, wordnet_pos)
    for word, wordnet_pos in tokens_wordnet_tagged
]

# Optional: join the lemmas back into a single string
#tokens_lemma = " ".join(tokens_lemma)
print(tokens)
print(tokens_lemma)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('good', 'JJ'), ('better', 'JJR'), ('great', 'JJ'), ('goose', 'JJ'), ('geese', 'JJ'), ('meeting', 'NN'), ('meet', 'NN'), ('going', 'VBG'), ('go', 'VB'), ('gone', 'VBN')]
[('good', 'a'), ('better', 'a'), ('great', 'a'), ('goose', 'a'), ('geese', 'a'), ('meeting', 'n'), ('meet', 'n'), ('going', 'v'), ('go', 'v'), ('gone', 'v')]
['good', 'better', 'great', 'goose', 'geese', 'meeting', 'meet', 'going', 'go', 'gone']
['good', 'good', 'great', 'goose', 'geese', 'meeting', 'meet', 'go', 'go', 'go']


In [None]:
# Remove URLs using a regular expression
import re # Regular Expression

# Matches text starting with http:// or https://, or with "www.", up to the
# next whitespace character
url_pattern = re.compile(r'https?://\S+|www\.\S+')

# The URL is stripped from the raw text BEFORE tokenizing. Filtering tokens
# afterwards is not used here: the recorded word_tokenize output splits the URL
# into 'http', ':' and '//twitpic.com/2y1zl', none of which match the pattern.
sample_text_wo_url = re.sub(url_pattern, '', sample_text)
tokens = word_tokenize(sample_text_wo_url)
print(tokens)

['@', 'switchfoot', '-', 'Awww', ',', 'that', "'s", 'a', 'bummer', '.', 'Terrible', '!', 'You', 'shoulda', 'got', 'David', 'Carr', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', '.', ';', 'D', '☹️👽']


# Take home Assignment:
Create your own pre-processing pipeline (from raw text to tokens)

In [None]:
# [Optional step]: If you would like to take a look at more sample data from the Twitter dataset.
# To load the example data set, note that you might need to change the file path to where you
# saved tweeter_training.csv. The data set is also available on Canvas for download.
# The path below points inside the Google Drive mount (/content/drive/).
# header=None: no row of the file is used as a header, so the column names are assigned explicitly below.
tw_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/tweeter_training.csv', encoding='ISO-8859-1', header=None)
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
tw_df.columns = column_names