Example tokenization

In [4]:
import nltk
nltk.download('punkt')

sentence = "This is a cat."
tokens = nltk.word_tokenize(sentence)
print(tokens)

['This', 'is', 'a', 'cat', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     /storage/home/sqs6406/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
sentence = "What're you doing?"
tokens = nltk.word_tokenize(sentence)
print(tokens)

['What', "'re", 'you', 'doing', '?']


In [6]:
sentence = "State College is a home rule municipality in Centre County in the Commonwealth of Pennsylvania. "
tokens = nltk.word_tokenize(sentence)
print(tokens)

['State', 'College', 'is', 'a', 'home', 'rule', 'municipality', 'in', 'Centre', 'County', 'in', 'the', 'Commonwealth', 'of', 'Pennsylvania', '.']


Lemmatization

In [7]:
# import these modules 
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer() 

sentence = "The boy's cars are different colors."

tokens = nltk.word_tokenize(sentence)
print(tokens)

lemmatized_tokens = [lemmatizer.lemmatize(w) for w in tokens]
print(lemmatized_tokens)

lemmatized_tokens = [lemmatizer.lemmatize(w, pos='v') for w in tokens]
print(lemmatized_tokens)

['The', 'boy', "'s", 'cars', 'are', 'different', 'colors', '.']
['The', 'boy', "'s", 'car', 'are', 'different', 'color', '.']
['The', 'boy', "'s", 'cars', 'be', 'different', 'color', '.']


[nltk_data] Downloading package wordnet to
[nltk_data]     /storage/home/sqs6406/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stemming

In [8]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
   
ps = PorterStemmer() 
  
# choose some words to be stemmed 
words = ["program", "programs", "programer", "programing", "programers"] 
  
for w in words: 
    print(w, " : ", ps.stem(w)) 

program  :  program
programs  :  program
programer  :  program
programing  :  program
programers  :  program


Stemming v.s. Lemmatization

In [9]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

tokens = ['studies', 'studying']

stemmed_tokens = [ps.stem(w) for w in tokens]
lemmatized_tokens = [lemmatizer.lemmatize(w, pos='v') 
                      for w in tokens]

print('Stemming: ', stemmed_tokens)
print('Lemmatization: ', lemmatized_tokens)

Stemming:  ['studi', 'studi']
Lemmatization:  ['study', 'study']


Named Entity Recognition

In [21]:
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp("State College is a home rule municipality in Centre County in the Commonwealth of Pennsylvania.")

print([ent for ent in doc.ents])

[State College, Centre County, the Commonwealth of Pennsylvania]


## Document Representation

### Dataset

The data set we’ll use is a list of over one million news headlines published over a period of 15 years and can be downloaded from [Kaggle](https://www.kaggle.com/therohk/million-headlines/data).

In [23]:
conf = SparkConf()
sc=SparkContext(conf=conf)
sc


NameError: name 'SparkConf' is not defined

In [24]:
import pandas as pd
data = pd.read_csv("/storage/home/sqs6406/StoryResults.csv", error_bad_lines=False);
data_text = data[['Story']]
data_text['index'] = data_text.index
documents = data_text[:10]

print(len(documents))
print(documents[:5])

10
                                               Story  index
0  My lover has left. I miss him so much. I write...      0
1  My mom was sending me a letter in the mail abo...      1
2  I had to buy a stamp at the post office. I am ...      2
3  I decided it was time for me to send my mom an...      3
4  Rachel was sitting in her dorm and decided to ...      4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text['index'] = data_text.index


## Data Pre-processing
We will perform the following steps:
1.   Tokenization: Split the text into sentences and the sentences into words. 
2.   Lowercase the words and remove punctuation.
3.   Words that have fewer than 3 characters are removed.
4.   All stopwords are removed.
5.   Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
6.   Words are stemmed — words are reduced to their root form.

Loading gensim and nltk libraries

In [25]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, PorterStemmer 
from nltk.stem.porter import *
import numpy as np
np.random.seed(16)
import nltk
nltk.download('wordnet')

ModuleNotFoundError: No module named 'gensim'

Write a function to perform lemmatize and stem preprocessing steps on the data set.

In [None]:
def lemmatize_stemming(text):
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result


Select a document to preview after preprocessing.

In [None]:
doc_sample = documents[documents['index'] == 3].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

Preprocess the headline text, saving the results as ‘processed_docs’

In [None]:
processed_docs = documents['Story'].map(preprocess)
print(processed_docs)

##Bag of Words on the Data set

Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [None]:
dictionary = gensim.corpora.Dictionary(processed_docs)
print('# of tokens: ', len(dictionary))
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

###Gensim filter_extremes

Filter out tokens that appear in
*   less than 2 documents (absolute number) or
*   more than 0.1 documents (fraction of total corpus size, not absolute number).
*   after the above two steps, keep only the first 1000 most frequent tokens.



In [None]:
dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=1000)
print('# of tokens after filtering: ', len(dictionary))

### Gensim doc2bow

For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[3]

Preview Bag Of Words for our sample preprocessed document.

In [None]:
bow_doc_3 = bow_corpus[3]
for i in range(len(bow_doc_3)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_3[i][0], 
                                               dictionary[bow_doc_3[i][0]], 
bow_doc_3[i][1]))

###TF-IDF

Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.

In [None]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]


score = 0

from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    score += 1
    #break
print(score)

Transform into sparse matrix

In [None]:
from gensim.matutils import corpus2csc
corpus_tfidf_matrix = corpus2csc(corpus_tfidf)

### Document clustering

In [None]:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=7)
clusters = model.fit_predict(corpus_tfidf_matrix.T)
print(model.labels_)