### Tokenization and Lemmatization/Stemming in python
 
- The goal of this notebok is to demonstrate the word stemming capabilities of the nltk and spaCy package


In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import porter, WordNetLemmatizer


In [5]:
with open('/Users/nmiles/PACMan_dist/libs/stopwords.txt', 'r') as test_file:
    tmp = test_file.readlines()
    stop_words = [val.strip('\n') for val in tmp]


In [6]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /Users/nmiles/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/nmiles/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nmiles/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Read in some example text to play with.

In [7]:
with open('../C25/0001.txtx', 'r') as test_file:
    text = test_file.readlines()
    text = [val.strip('\n') for val in text]
# text = ' '.join(text) 

In [8]:
text = [val.strip('\n') for val in text]
# text = ' '.join(text) 

In [9]:
text

['Hubble Space Telescope',
 '',
 'Cycle 25 AR Proposal',
 '',
 '1',
 '',
 'Numerical Modeling of Superluminous and Peculiar',
 'Supernovae',
 'Scientific Category: Stellar Physics',
 'Scientific Keywords: Circumstellar Matter, Massive Stars, Radiative Transfer, Supernovae, Transients',
 'Budget Size: Regular',
 'Theory: Yes',
 '',
 'Abstract',
 'The Hubble Space Telescope (HST) has been instrumental in elucidating the nature of the intriguing',
 'superluminous supernovae (SLSNe) explosions by providing unparalleled observations of the progenitor stars,',
 'supernova imposters such as "Luminous Blue Variables" (LBVs) and their host galaxy properties. Furthermore,',
 'HST has directly imaged one of the earliest SLSN discovered, SN 2006gy, more than two years after the',
 'explosion. Now, more than a decade since the first modern discovery of SLSNe and with more than a hundred',
 'members of the class observed, the question on the explosion and energy input mechanism of these',
 'unpreced

In [10]:
lexicon = [val.split(' ')[0] for val in text if val != '']

In [11]:
for word in lexicon[:10]:
        print(word)

Hubble
Cycle
1
Numerical
Supernovae
Scientific
Scientific
Budget
Theory:
Abstract


In [12]:
lexicon = [word_tokenize(word) for word in lexicon if len(word) != 0]

In [13]:
def nltk2wn_tag(nltk_tag):
    """Convenience function for converting NLTK POS tags to wordnet equivalents
    """
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        # If it's unclear what it is, just assume the default [NOUN]
        return wordnet.NOUN

In [14]:
# Determine the proper parts-of-speech tag for each token and convert them from NLTK to wordnet
final_lexicon = []
for lex in lexicon:
    pos_tag = nltk.pos_tag(lex)
    wdnet_pos_tag = nltk2wn_tag(pos_tag[0][1])
    final_lexicon.append((lex[0], wdnet_pos_tag))

IndexError: list index out of range

In [None]:
final_lexicon[0]

In [15]:
# Use the wordnet lemmatizer 
lemmatizer = WordNetLemmatizer()

In [16]:
# Use the robust Porter Stemmer
stemmer = porter.PorterStemmer()

In [17]:
for lex in final_lexicon[:10]:
    print(f'Word: {lex} \nLemma: {lemmatizer.lemmatize(lex[0], pos=lex[1])}\nStem: {stemmer.stem(lex[0])}\n')

Word: ('Hubble', 'a') 
Lemma: Hubble
Stem: hubbl

Word: ('Cycle', 'n') 
Lemma: Cycle
Stem: cycl

Word: ('1', 'n') 
Lemma: 1
Stem: 1

Word: ('Numerical', 'a') 
Lemma: Numerical
Stem: numer

Word: ('Supernovae', 'n') 
Lemma: Supernovae
Stem: supernova

Word: ('Scientific', 'n') 
Lemma: Scientific
Stem: scientif

Word: ('Scientific', 'n') 
Lemma: Scientific
Stem: scientif

Word: ('Budget', 'v') 
Lemma: Budget
Stem: budget

Word: ('Theory', 'n') 
Lemma: Theory
Stem: theori

Word: ('Abstract', 'n') 
Lemma: Abstract
Stem: abstract



<hr>

Perform the same steps using a class-based approach with spaCy.

- spaCy is different in that it prefers to receive the abstract in a single chunk 

In [50]:
import spacy
from spacy.lang.en import English
import string

In [51]:
nlp = spacy.load("en_core_web_sm")

In [52]:
print(len(nlp.Defaults.stop_words))

354


In [53]:
print(len(stop_words))

339


In [54]:
spacy_stop = set(nlp.Defaults.stop_words)
custom_stop = set(stop_words)


In [55]:
missing_stop_words = custom_stop.difference(spacy_stop)
print(len(missing_stop_words))

0


In [56]:
# combine them into a single list of stop words
nlp.Defaults.stop_words |= set(missing_stop_words)

In [57]:
print(len(nlp.Defaults.stop_words))

354


In [60]:
with open('../C25/0002.txtx', 'r') as test_file:
    t = test_file.readlines()
    scijust = [val.strip('\n') for val in t]
    scijust = ' '.join(scijust) 

In [62]:
scidoc = nlp(scijust)
print(len(scidoc))

6119


In [42]:
trim_stop_words = []
autogen_stop_words = []

In [44]:
for token in scidoc:
    if token.is_stop:
        autogen_stop_words.append(token)
        continue
    trim_stop_words.append(token)

In [46]:
print(len(trim_stop_words)/len(scidoc))

0.71762534604737


In [47]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(text, return_type='str'):
    # Creating our token object, which contains each word token parsed from the text.
    mytokens = parser(text)
    num_tokens = len(mytokens)
    # Next, lemmatize each token and standardize the capitalization to be lower case
    mytokens = [
        word.lemma_.lower().strip()
        if word.lemma_ != "-PRON-" else word.lower_ 
        for word in mytokens 
    ]

    # Removing stop words and punctuation
    mytokens = [
        word for word in mytokens 
        if word not in stop_words and word not in punctuations
    ]
    print(f"Processed text represents {len(mytokens)/num_tokens:0.2f}% of the input text")
    
    return mytokens

In [48]:
tokens = spacy_tokenizer(scijust)

Processed text represents 0.60% of the input text


In [49]:
tokens

['scientific',
 'justification',
 'metal',
 'poor',
 'dwarf',
 'galaxies',
 'cosmological',
 'probes',
 'high',
 'z',
 'star',
 'formation',
 '9',
 'according',
 'hierarchical',
 'formation',
 'dwarf',
 'm',
 '10',
 'm',
 'galaxy',
 '\ufeff1',
 'collapse',
 'start',
 'form',
 'star',
 'supply',
 'build',
 'block',
 'formation',
 'massive',
 'galaxy',
 'merge',
 'accretion',
 'remnant',
 'process',
 'present',
 'day',
 'dwarf',
 'site',
 'early',
 'star',
 'formation',
 'sf',
 'activity',
 'universe',
 'chemically',
 'unevolved',
 'young',
 'stellar',
 'population',
 'nearby',
 'guide',
 'provide',
 'essential',
 'information',
 'early',
 'galaxy',
 'formation',
 'stellar',
 'evolution',
 'extract',
 'faint',
 'detect',
 'z',
 '9',
 '11',
 'e',
 'g',
 'ellis',
 '2013',
 'oesch',
 '2016',
 'understand',
 'true',
 'mode',
 'sf',
 'cosmic',
 'time',
 'remain',
 'longstanding',
 'subject',
 'extensive',
 'effort',
 'make',
 'past',
 'pin',
 'star',
 'formation',
 'history',
 'sfh',
 'metal'