In [21]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import wordnet
# Fetch the NLTK data packages used later in this notebook. Each call is a
# no-op when the package is already present locally (see the cell output).
# NOTE(review): presumably 'punkt_tab' backs word_tokenize, 'wordnet' and
# 'omw-1.4' back WordNetLemmatizer, and the two tagger packages back pos_tag
# (the '_eng' name being the one newer NLTK releases look for) — confirm
# against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/chandlerbeon/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chandlerbeon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/chandlerbeon/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/chandlerbeon/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/chandlerbeon/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [7]:
# Load the classification dataset from a path relative to the notebook's
# working directory and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably the index
# that was written out with the CSV; passing index_col=0 would avoid carrying
# it as a data column — confirm nothing downstream relies on it.
df = pd.read_csv("data/Project1-ClassificationDataset.csv")
df.head()

Unnamed: 0,full_text,summary,keywords,publish_date,authors,url,leaf_label,root_label
0,'Personalize Your NBA App Experience for the '...,'Personalize Your NBA App Experience for the '...,"['original', 'content', 'live', 'slate', 'game...",,['Official Release'],https://www.nba.com/news/nba-app-new-features-...,basketball,sports
1,'Mike Will attends the Pre-GRAMMY Gala and GRA...,'Mike WiLL Made-It has secured a partnership w...,"['lead', 'espn', 'nbas', 'madeit', 'nba', 'lat...",2023-10-18 16:22:29+00:00,['Marc Griffin'],https://www.vibe.com/news/entertainment/mike-w...,basketball,sports
2,'The Golden State Warriors are struggling to f...,'The Golden State Warriors are struggling to f...,"['insider', 'york', 'thing', 'nbc', 'tag', 'nb...",,[],https://www.nbcnewyork.com/tag/featured-nba/,basketball,sports
3,"'On Nov. 28, the NBA and Nike will collaborate...","'On Nov. 28, the NBA and Nike will collaborate...","['watch', 'telecast', 'ultimate', 'membership'...",,['Official Release'],https://www.nba.com/news/watch-nba-games-ultim...,basketball,sports
4,'The NBA announced additions and innovations t...,'The NBA announced additions and innovations t...,"['experience', 'bring', 'media', 'crennan', 'n...",2023-10-17 12:00:17+00:00,"['Chris Novak', 'About Chris Novak']",https://awfulannouncing.com/tech/nba-app-2023-...,basketball,sports


In [8]:
# Set seeds
# A single constant keeps every source of randomness in this cell in sync.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Split dataset (80% train / 20% test) on the raw text and the coarse label.
# random_state is passed explicitly so the split does not silently depend on
# the state of NumPy's global RNG (i.e. on what ran before this call).
train, test = train_test_split(
    df[["full_text", "root_label"]],
    test_size=0.2,
    random_state=SEED,
)

## Preprocessing & Feature Extraction using TF-IDF

In [None]:
# Given function to parse out HTML-related characters
def clean(text):
    """Strip HTML-related artifacts from a raw document string.

    Steps, in order:
      1. remove lines that begin with an http(s) URL;
      2. decode a few HTML entities / tags (``<br />``, ``&quot;``, ``&#39;``);
      3. turn line breaks (``\\n`` and ``\\r``) into spaces;
      4. expand the chat abbreviation " u " to " you " and drop backticks;
      5. collapse runs of spaces and of repeated ``!`` / ``?``;
      6. replace ``&amp;`` with "and";
      7. drop non-ASCII characters and any remaining ``<...>`` tags.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # Drop URL-only lines, including the line break(s) that follow them.
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"", texter)
    # &#39; is the HTML entity for an apostrophe, not a double quote.
    texter = re.sub('&#39;', "'", texter)
    # Handle both kinds of line break *before* collapsing spaces so that
    # "\r\n" sequences do not leave double spaces behind.
    texter = re.sub(r'[\r\n]', " ", texter)
    texter = re.sub(' u ', " you ", texter)
    texter = re.sub('`', "", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    # Silently discard anything that cannot be represented in ASCII.
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    # Non-greedy match so each tag is removed individually.
    texter = re.sub(r'<.*?>', '', texter)
    return texter

In [85]:
# Define custom transformer to clean the dataset
class DocumentPreprocessingTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalizes raw document text.

    For every document it (1) strips HTML-related artifacts via ``clean``,
    (2) deletes all ASCII punctuation characters and digits, and
    (3) lower-cases the result.

    Note that punctuation is deleted rather than replaced by a space, so
    hyphenated words such as "well-known" become a single token "wellknown".
    """

    # Translation table that deletes punctuation and digits. Built once for
    # the class instead of once per document.
    _STRIP_TABLE = str.maketrans('', '', string.punctuation + string.digits)

    def fit(self, X, y=None):
        """No-op fit; nothing is learned from the data.

        ``y`` is accepted (and ignored) so the transformer can be used in a
        Pipeline that is fitted with labels, e.g. ``pipe.fit(X, y)``.
        """
        return self

    def transform(self, X):
        """Return the cleaned documents.

        Parameters
        ----------
        X : pandas.Series of str
            Raw documents.

        Returns
        -------
        pandas.Series of str
            Cleaned, punctuation/digit-free, lower-cased documents, with the
            same index as ``X``.
        """
        table = self._STRIP_TABLE
        # Lower-casing is also done by CountVectorizer, but it has to happen
        # here already so the lemmatization step sees lower-case tokens.
        return X.apply(lambda doc: clean(doc).translate(table).lower())

# Define custom transformer for lemmatization
class LemmatizationPOSTransfomer(BaseEstimator, TransformerMixin):
    """Stateless transformer that lemmatizes documents using POS tags.

    Each document is tokenized, POS-tagged, and every token is lemmatized
    with the WordNet part of speech derived from its tag. The lemmas are
    joined back into a single space-separated string per document.
    """

    def lemmatize_documents(self, doc, wnl):
        """Lemmatize one document and return it as a single string.

        Parameters
        ----------
        doc : str
            The document text.
        wnl : WordNetLemmatizer
            Lemmatizer instance to use (shared across documents).

        Returns
        -------
        str
            Space-joined lemmas of the document's tokens.
        """
        tagged_tokens = pos_tag(word_tokenize(doc))
        return ' '.join(
            wnl.lemmatize(token, self.get_wordnet_pos(tag))
            for token, tag in tagged_tokens
        )

    def get_wordnet_pos(self, tag):
        """Map a POS tag to a WordNet POS constant, based on its first letter.

        J -> adjective, V -> verb, R -> adverb; everything else (including
        tags starting with N) is treated as a noun.
        """
        if tag.startswith('J'):  # Adjective
            return wordnet.ADJ
        if tag.startswith('V'):  # Verb
            return wordnet.VERB
        if tag.startswith('R'):  # Adverb
            return wordnet.ADV
        # Nouns and every unrecognized tag default to noun.
        return wordnet.NOUN

    def fit(self, X, y=None):
        """No-op fit; nothing is learned from the data.

        ``y`` is accepted (and ignored) so the transformer can be used in a
        Pipeline that is fitted with labels, e.g. ``pipe.fit(X, y)``.
        """
        return self

    def transform(self, X):
        """Return the lemmatized documents.

        Parameters
        ----------
        X : pandas.Series of str
            Documents to lemmatize.

        Returns
        -------
        pandas.Series of str
            Lemmatized documents, with the same index as ``X``.
        """
        # One lemmatizer instance is reused for the whole batch.
        wnl = WordNetLemmatizer()
        return X.apply(lambda doc: self.lemmatize_documents(doc, wnl))

    

In [87]:
# Text -> TF-IDF feature pipeline. Stages run in the order listed.
pipe = Pipeline(steps=[
    ('doc_preprocess', DocumentPreprocessingTransformer()),        # 1) clean raw text
    ('lemmatization', LemmatizationPOSTransfomer()),               # 2) POS-aware lemmatization
    ('cvector', CountVectorizer(stop_words='english', min_df=3)),  # 3) counts, English stop words removed
    ('tfidf', TfidfTransformer()),                                 # 4) re-weight counts as TF-IDF
])

In [88]:
# Fit every pipeline stage on the training documents only and transform them
# in the same pass; the result is displayed below as a sparse matrix
# (one row per training document, one column per vocabulary term).
tfidf_processed = pipe.fit_transform(train['full_text'])
tfidf_processed

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 412591 stored elements and shape (2780, 13795)>