# Text Data Cleaning and Preprocessing Assignment

In [1]:
# import nltk
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

In [2]:
from nltk import pos_tag
from nltk import pos_tag_sents
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
def corpus_stats(corpus):
    """Print summary statistics for a corpus reader.

    Parameters
    ----------
    corpus : object
        Any object exposing ``fileids()``, ``paras()``, ``sents()``,
        ``words()`` and ``raw()`` (e.g. the PlaintextCorpusReader used
        in this notebook).

    Returns
    -------
    None
        The statistics are written to standard output.

    Notes
    -----
    "Avg chars per word" divides the length of the raw text (which
    includes whitespace and punctuation) by the number of word tokens.
    Both averages are reported as 0.0 when the corpus has no words or
    no sentences, instead of raising ZeroDivisionError.
    """
    # Ask the reader for each view exactly once; every call to
    # corpus.words() / corpus.sents() would otherwise build the view again.
    words = corpus.words()
    num_words = len(words)
    num_sents = len(corpus.sents())
    vocabulary = {w.lower() for w in words}

    # Guard the divisions so an empty corpus still produces a report.
    avg_chars_per_word = round(len(corpus.raw()) / num_words, 1) if num_words else 0.0
    avg_words_per_sent = round(num_words / num_sents, 1) if num_sents else 0.0

    print("Corpus Statistics")
    print("Number of documents: " + str(len(corpus.fileids())))
    print("Number of paragraphs: " + str(len(corpus.paras())))
    print("Number of sentences: " + str(num_sents))
    print("Number of words: " + str(num_words))
    print("Vocabulary: " + str(len(vocabulary)))
    print("Avg chars per word: " + str(avg_chars_per_word))
    print("Avg words per sentence: " + str(avg_words_per_sent))

### Read the O'Reilly RSS plain-text article files into a corpus using NLTK's PlaintextCorpusReader.

In [4]:
# Directory holding the article files, relative to the notebook.
PATH = 'oreilly_radar/'
# Regular expression selecting which files under PATH belong to the corpus.
DOC_PATTERN = r'articles_rss.*\.p'

# Build the corpus reader over the matching files and print its summary.
corpus = PlaintextCorpusReader(root=PATH, fileids=DOC_PATTERN)
corpus_stats(corpus)

Corpus Statistics
Number of documents: 60
Number of paragraphs: 61
Number of sentences: 1801
Number of words: 58848
Vocabulary: 6866
Avg chars per word: 5.2
Avg words per sentence: 32.7


### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [5]:
# Raw text of every document, in the order the corpus lists its fileids.
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

### Sentence tokenize each document in the list of documents.

In [6]:
# One list of sentence strings per document.
doc_sents = list(map(sent_tokenize, docs))
# Show the sentences of the first document as the cell output.
doc_sents[0]

['Perhaps the most important event this month isn’t technical, but the start of the US Justice Dept.’s lawsuit against Google.',
 'That will certainly play out over years rather than months, but it’s significance is less about this particular case than the idea that legal and regulatory systems will play a large role in the evolution of technology in the US.',
 'In the short term, it’s worth watching the CPPA, GDPR, California’s Props 22 and 24, and FCC interference with social media’s enforcement of rules around community behavior.',
 'Long term, this is only the beginning.Artificial Intelligence and Machine LearningPartial differential equations are the key to a number of difficult and important problems.',
 'In a surprising breakthrough, it’s been shown that deep learning can be used to solve PDEs, and that they are orders of magnitude faster than typical numerical methods.solve PDEsAgence is a dynamic film/multiplayer VR game with intelligent agents.',
 'AI might not be what pushes

### Word tokenize each sentence within each document.

You should end up with a nested list structure: the outer list holds one entry per document, each entry is a list of that document's sentences, and each sentence is a list of its word tokens.

In [7]:
# document -> sentence -> list of word tokens
s_tokenized = [
    [word_tokenize(sentence) for sentence in sentences]
    for sentences in doc_sents
]

### Tag each token with its part of speech.

In [8]:
# Tag every token with its part of speech, keeping the
# document -> sentence -> (token, tag) nesting of s_tokenized.
# pos_tag_sents tags all sentences of a document in a single call, which
# NLTK recommends over calling pos_tag once per sentence when tagging
# more than one sentence.
s_pos = [pos_tag_sents(sentences) for sentences in s_tokenized]

### Word tokenize the raw text of each document and remove stop words.

In [9]:
# One flat list of word tokens per document, taken from its raw text.
tokenized = list(map(word_tokenize, docs))

In [10]:
# Load the English stop-word list once and keep it in a set: calling
# stopwords.words('english') for every token repeats the lookup needlessly,
# and membership tests against a list are linear in the list's length.
english_stopwords = set(stopwords.words('english'))

# For each document, lower-case every token and keep those that are not
# stop words (the kept tokens are stored lower-cased).
no_stopwords = []
for doc_tokens in tokenized:
    lowered = (word.lower() for word in doc_tokens)
    no_stopwords.append([word for word in lowered if word not in english_stopwords])

### For every document, stem all the words in the document.

In [11]:
# Snowball stemmer configured for English.
stemmer = SnowballStemmer('english')

# For each document, the stem of every lower-cased token in `tokenized`.
stem_words = [
    [stemmer.stem(word.lower()) for word in doc_tokens]
    for doc_tokens in tokenized
]

### Iterate through each document, computing and printing the following document statistics for each.

- Number of sentences
- Average words per sentence
- Vocabulary
- Lexical Diversity

In [31]:
# Print per-document statistics: sentence count, average tokens per
# sentence, vocabulary size (unique lower-cased tokens) and lexical
# diversity (vocabulary size divided by total token count).
for doc, sentences, sentence_tokens in zip(docs, doc_sents, s_tokenized):
    # Tokenize the document once and reuse the result for both the
    # vocabulary and the lexical-diversity denominator.
    tokens = word_tokenize(doc)

    num_sent = len(sentences)
    total_sentence_tokens = sum(len(sent) for sent in sentence_tokens)
    # An empty document has no sentences or tokens; report 0.0 rather
    # than raising ZeroDivisionError.
    avg_words_sent = total_sentence_tokens / num_sent if num_sent else 0.0
    vocab = len({word.lower() for word in tokens})
    lex_div = vocab / len(tokens) if tokens else 0.0

    print('-----------------------------')
    print('Number of sentences = ', num_sent)
    print('Average words per sentence = ', avg_words_sent)
    print('Unique words (vocabulary) = ', vocab)
    print('Lexical diversity = ', lex_div)

-----------------------------
Number of sentences =  34
Average words per sentence =  33.6764705882353
Unique words (vocabulary) =  545
Lexical diversity =  0.4759825327510917
-----------------------------
Number of sentences =  10
Average words per sentence =  28.8
Unique words (vocabulary) =  156
Lexical diversity =  0.5416666666666666
-----------------------------
Number of sentences =  8
Average words per sentence =  34.5
Unique words (vocabulary) =  167
Lexical diversity =  0.605072463768116
-----------------------------
Number of sentences =  4
Average words per sentence =  53.75
Unique words (vocabulary) =  130
Lexical diversity =  0.6046511627906976
-----------------------------
Number of sentences =  43
Average words per sentence =  32.395348837209305
Unique words (vocabulary) =  599
Lexical diversity =  0.43000717875089733
-----------------------------
Number of sentences =  8
Average words per sentence =  25.75
Unique words (vocabulary) =  143
Lexical diversity =  0.6941747572815534
-----------------------------
Number of sentences =  10
Average words per sentence =  38.8
Unique words (vocabulary) =  208
Lexical diversity =  0.5360824742268041
-----------------------------
Number of sentences =  84
Average words per sentence =  40.98809523809524
Unique words (vocabulary) =  1011
Lexical diversity =  0.29363926808016266
-----------------------------
Number of sentences =  12
Average words per sentence =  27.25
Unique words (vocabulary) =  171
Lexical diversity =  0.5229357798165137
-----------------------------
Number of sentences =  51
Average words per sentence =  25.705882352941178
Unique words (vocabulary) =  454
Lexical diversity =  0.34630053394355453
