# Text Data Cleaning and Preprocessing Assignment

In [19]:
# import nltk
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [36]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np

In [2]:
def corpus_stats(corpus):
    print("Corpus Statistics")
    print("Number of documents: " + str(len(corpus.fileids())))
    print("Number of paragraphs: " + str(len(corpus.paras())))
    print("Number of sentences: " + str(len(corpus.sents())))
    print("Number of words: " + str(len(corpus.words())))
    print("Vocabulary: " + str(len(set(w.lower() for w in corpus.words()))))
    print("Avg chars per word: " + str(round(len(corpus.raw())/len(corpus.words()),1)))
    print("Avg words per sentence: " + str(round(len(corpus.words())/len(corpus.sents()),1)))

### Read the O'Reilly RSS plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [3]:
PATH = 'oreilly_radar/'
DOC_PATTERN = r'articles_rss.*\.p'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)
corpus_stats(corpus)

Corpus Statistics
Number of documents: 60
Number of paragraphs: 61
Number of sentences: 1801
Number of words: 58848
Vocabulary: 6866
Avg chars per word: 5.2
Avg words per sentence: 32.7


### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [4]:
docs = []

for fileid in corpus.fileids():
    doc = corpus.raw(fileid)
    docs.append(doc)

### Sentence tokenize each document in the list of documents.

In [5]:
doc_sents = [sent_tokenize(doc) for doc in docs]
doc_sents[0]

['Perhaps the most important event this month isn’t technical, but the start of the US Justice Dept.’s lawsuit against Google.',
 'That will certainly play out over years rather than months, but it’s significance is less about this particular case than the idea that legal and regulatory systems will play a large role in the evolution of technology in the US.',
 'In the short term, it’s worth watching the CPPA, GDPR, California’s Props 22 and 24, and FCC interference with social media’s enforcement of rules around community behavior.',
 'Long term, this is only the beginning.Artificial Intelligence and Machine LearningPartial differential equations are the key to a number of difficult and important problems.',
 'In a surprising breakthrough, it’s been shown that deep learning can be used to solve PDEs, and that they are orders of magnitude faster than typical numerical methods.solve PDEsAgence is a dynamic film/multiplayer VR game with intelligent agents.',
 'AI might not be what pushes

### Word tokenize each sentence within each document.

You should end up with a nested list structure where the outer list contains all the sentences in each document and the inner list contains the tokenized sentences.

In [41]:
s_tokenized = []
for sents in doc_sents:
    temp = []
    for sent in sents:
        temp.append(word_tokenize(sent))
    s_tokenized.append(temp)
    
s_tokenized[0]

[['Perhaps',
  'the',
  'most',
  'important',
  'event',
  'this',
  'month',
  'isn',
  '’',
  't',
  'technical',
  ',',
  'but',
  'the',
  'start',
  'of',
  'the',
  'US',
  'Justice',
  'Dept.',
  '’',
  's',
  'lawsuit',
  'against',
  'Google',
  '.'],
 ['That',
  'will',
  'certainly',
  'play',
  'out',
  'over',
  'years',
  'rather',
  'than',
  'months',
  ',',
  'but',
  'it',
  '’',
  's',
  'significance',
  'is',
  'less',
  'about',
  'this',
  'particular',
  'case',
  'than',
  'the',
  'idea',
  'that',
  'legal',
  'and',
  'regulatory',
  'systems',
  'will',
  'play',
  'a',
  'large',
  'role',
  'in',
  'the',
  'evolution',
  'of',
  'technology',
  'in',
  'the',
  'US',
  '.'],
 ['In',
  'the',
  'short',
  'term',
  ',',
  'it',
  '’',
  's',
  'worth',
  'watching',
  'the',
  'CPPA',
  ',',
  'GDPR',
  ',',
  'California',
  '’',
  's',
  'Props',
  '22',
  'and',
  '24',
  ',',
  'and',
  'FCC',
  'interference',
  'with',
  'social',
  'media',
  '’',

### Tag each token with its part of speech.

In [44]:
s_pos = []
for sents in s_tokenized:
    temp = []
    for sent in sents:
        temp.append(pos_tag(sent))
    s_pos.append(temp)
        
s_pos[0]

[[('Perhaps', 'RB'),
  ('the', 'DT'),
  ('most', 'RBS'),
  ('important', 'JJ'),
  ('event', 'NN'),
  ('this', 'DT'),
  ('month', 'NN'),
  ('isn', 'NN'),
  ('’', 'NNP'),
  ('t', 'NN'),
  ('technical', 'JJ'),
  (',', ','),
  ('but', 'CC'),
  ('the', 'DT'),
  ('start', 'NN'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('US', 'NNP'),
  ('Justice', 'NNP'),
  ('Dept.', 'NNP'),
  ('’', 'NNP'),
  ('s', 'VBD'),
  ('lawsuit', 'NN'),
  ('against', 'IN'),
  ('Google', 'NNP'),
  ('.', '.')],
 [('That', 'DT'),
  ('will', 'MD'),
  ('certainly', 'RB'),
  ('play', 'VB'),
  ('out', 'RP'),
  ('over', 'IN'),
  ('years', 'NNS'),
  ('rather', 'RB'),
  ('than', 'IN'),
  ('months', 'NNS'),
  (',', ','),
  ('but', 'CC'),
  ('it', 'PRP'),
  ('’', 'VBZ'),
  ('s', 'JJ'),
  ('significance', 'NN'),
  ('is', 'VBZ'),
  ('less', 'RBR'),
  ('about', 'IN'),
  ('this', 'DT'),
  ('particular', 'JJ'),
  ('case', 'NN'),
  ('than', 'IN'),
  ('the', 'DT'),
  ('idea', 'NN'),
  ('that', 'IN'),
  ('legal', 'JJ'),
  ('and', 'CC'),
  ('re

### Word tokenize the raw text of each document and remove stop words.

In [46]:
tokenized = [word_tokenize(doc) for doc in docs]

In [47]:
no_stopwords = []
for token in tokenized:
    temp = []
    for word in token:
        if word.lower() not in stopwords.words('english'):
            temp.append(word.lower())
    no_stopwords.append(temp)
        
no_stopwords[0]

['perhaps',
 'important',
 'event',
 'month',
 '’',
 'technical',
 ',',
 'start',
 'us',
 'justice',
 'dept.',
 '’',
 'lawsuit',
 'google',
 '.',
 'certainly',
 'play',
 'years',
 'rather',
 'months',
 ',',
 '’',
 'significance',
 'less',
 'particular',
 'case',
 'idea',
 'legal',
 'regulatory',
 'systems',
 'play',
 'large',
 'role',
 'evolution',
 'technology',
 'us',
 '.',
 'short',
 'term',
 ',',
 '’',
 'worth',
 'watching',
 'cppa',
 ',',
 'gdpr',
 ',',
 'california',
 '’',
 'props',
 '22',
 '24',
 ',',
 'fcc',
 'interference',
 'social',
 'media',
 '’',
 'enforcement',
 'rules',
 'around',
 'community',
 'behavior',
 '.',
 'long',
 'term',
 ',',
 'beginning.artificial',
 'intelligence',
 'machine',
 'learningpartial',
 'differential',
 'equations',
 'key',
 'number',
 'difficult',
 'important',
 'problems',
 '.',
 'surprising',
 'breakthrough',
 ',',
 '’',
 'shown',
 'deep',
 'learning',
 'used',
 'solve',
 'pdes',
 ',',
 'orders',
 'magnitude',
 'faster',
 'typical',
 'numerical

### For every document, stem all the words in the document.

In [26]:
stemmer = SnowballStemmer('english')

stem_words = []
for doc in doc_tokenized:
    for sent in doc:
        stem_words.append([stemmer.stem(word.lower()) for word in sent])
        
stem_words[0]

['perhap',
 'the',
 'most',
 'import',
 'event',
 'this',
 'month',
 'isn',
 '’',
 't',
 'technic',
 ',',
 'but',
 'the',
 'start',
 'of',
 'the',
 'us',
 'justic',
 'dept.',
 '’',
 's',
 'lawsuit',
 'against',
 'googl',
 '.']

### Iterate through each document, computing and printing the following document statistics for each.

- Number of sentences
- Average words per sentence
- Vocabulary
- Lexical Diversity

In [48]:
num_sent = len(doc[0])
avg_words_sent = sum([len(sent) for sent in doc]) / num_sent
    
doc_words = []
for sent in doc:
    doc_words = np.unique(doc_words + sent)
    print(doc_words)
    
print('-----------------------------')
print('Number of sentences = ', num_sent)
print('Average words per sentence = ', avg_words_sent)
#print('Unique words (vocabulary) = ', vocab)

[',' '.' 'Dept.' 'Google' 'Justice' 'Perhaps' 'US' 'against' 'but' 'event'
 'important' 'isn' 'lawsuit' 'month' 'most' 'of' 's' 'start' 't'
 'technical' 'the' 'this' '’']


UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U12'), dtype('<U12')) -> dtype('<U12')