# Text Data Cleaning and Preprocessing Assignment

In [1]:
# import nltk
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

In [2]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag
from nltk import pos_tag_sents
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
def corpus_stats(corpus):
    """Print summary statistics for a corpus reader.

    Args:
        corpus: Object exposing ``fileids()``, ``paras()``, ``sents()``,
            ``words()`` and ``raw()`` (for example the
            ``PlaintextCorpusReader`` built later in this notebook).

    Prints the document, paragraph, sentence and word counts, the
    case-insensitive vocabulary size, and two averages. "Avg chars per word"
    is the length of the raw text divided by the word count, so whitespace
    and punctuation in the raw text are included in the numerator. Both
    averages are reported as 0.0 when their denominator is zero instead of
    raising ZeroDivisionError.
    """
    # words() and sents() used to be requested repeatedly; fetch each once so
    # a reader that rebuilds its view on every call only does that work once.
    words = corpus.words()
    num_words = len(words)
    num_sents = len(corpus.sents())

    # Guard the ratios so an empty corpus still produces a full report.
    avg_chars_per_word = round(len(corpus.raw()) / num_words, 1) if num_words else 0.0
    avg_words_per_sent = round(num_words / num_sents, 1) if num_sents else 0.0

    print("Corpus Statistics")
    print("Number of documents: " + str(len(corpus.fileids())))
    print("Number of paragraphs: " + str(len(corpus.paras())))
    print("Number of sentences: " + str(num_sents))
    print("Number of words: " + str(num_words))
    print("Vocabulary: " + str(len(set(w.lower() for w in words))))
    print("Avg chars per word: " + str(avg_chars_per_word))
    print("Avg words per sentence: " + str(avg_words_per_sent))

### Read the O'Reilly RSS plain text file articles into a corpus using NLTK's PlaintextCorpusReader.

In [4]:
# Directory holding the article files, and the filename regex that selects them.
PATH = 'oreilly_radar/'
DOC_PATTERN = r'articles_rss.*\.p'

# Build a reader over every matching file, then print its summary statistics.
corpus = PlaintextCorpusReader(root=PATH, fileids=DOC_PATTERN)
corpus_stats(corpus)

Corpus Statistics
Number of documents: 60
Number of paragraphs: 61
Number of sentences: 1810
Number of words: 59115
Vocabulary: 6899
Avg chars per word: 5.2
Avg words per sentence: 32.7


### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [5]:
# One raw-text string per file, in the order the corpus lists its fileids.
docs = [corpus.raw(file_id) for file_id in corpus.fileids()]

### Sentence tokenize each document in the list of documents.

In [6]:
# Split each document's raw text into a list of sentence strings.
doc_sents = list(map(sent_tokenize, docs))
doc_sents[0]

['The AI Who Mistook a Bald Head for a Football — Second-tier Scottish football club Inverness Caledonian Thistle doesn’t have a camera operator for matches at their stadium so the club uses an AI-controlled camera that’s programmed to follow the ball for their broadcasts.',
 'But in a recent match against Ayr United, the AI controller kept moving the camera off the ball to focus on the bald head of the linesman, making the match all but unwatchable.',
 'No fans allowed in the stadium either, so the broadcast was the only way to watch.',
 'Watch the video, it is hilarious and tragic.',
 'I’m sure there’s a serious lesson to be drawn from this, but I’m too busy snickering to draw it.The AI Who Mistook a Bald Head for a FootballWhy Is Subtracting These Two Times (in 1927) Giving a Strange Result?',
 '— You already knew timezones are a hellmouth, but now you have another example of how deep the hellmouth goes.',
 'Basically at midnight at the end of 1927, the clocks went back 5 minutes an

### Word tokenize each sentence within each document.

You should end up with a nested list structure: the outer list has one entry per document, each entry is the list of that document's sentences, and each sentence is itself a list of word tokens.

In [7]:
# documents -> sentences -> word tokens
s_tokenized = [
    [word_tokenize(sentence) for sentence in sentences]
    for sentences in doc_sents
]

### Tag each token with its part of speech.

In [8]:
# Tag each document's sentences as one batch. pos_tag_sents returns one list of
# (token, tag) pairs per sentence, the same shape as calling pos_tag sentence
# by sentence, while avoiding a separate tagger set-up for every sentence.
s_pos = [pos_tag_sents(doc_sentences) for doc_sentences in s_tokenized]

### Word tokenize the raw text of each document and remove stop words.

In [9]:
# Tokenize each document's full raw text into one flat list of word tokens.
tokenized = list(map(word_tokenize, docs))

In [10]:
# Fetch the stop-word list once and keep it in a set. The original evaluated
# stopwords.words('english') for every token and searched the resulting list.
english_stopwords = set(stopwords.words('english'))

# For each document, keep the lowercased tokens that are not stop words.
no_stopwords = []
for doc_tokens in tokenized:
    lowered = [word.lower() for word in doc_tokens]
    no_stopwords.append([word for word in lowered if word not in english_stopwords])

### For every document, stem all the words in the document.

In [11]:
stemmer = SnowballStemmer('english')

# Lowercase and stem every token of every document (stop words included,
# since this works from `tokenized`, not `no_stopwords`).
stem_words = [
    [stemmer.stem(tok.lower()) for tok in doc_tokens]
    for doc_tokens in tokenized
]

### Iterate through each document, computing and printing the following document statistics for each.

- Number of sentences
- Average words per sentence
- Vocabulary
- Lexical Diversity

In [12]:
# Walk the per-document structures in parallel. `tokenized` already holds the
# word tokens of each document's raw text, so reuse it instead of running
# word_tokenize on every document two more times.
for sentences, tokenized_sents, tokens in zip(doc_sents, s_tokenized, tokenized):
    num_sent = len(sentences)
    num_tokens = len(tokens)

    # Guard both ratios so a document with no sentences or tokens reports 0.0
    # instead of raising ZeroDivisionError.
    avg_words_sent = sum(len(sent) for sent in tokenized_sents) / num_sent if num_sent else 0.0
    # Vocabulary is the number of distinct lowercased tokens.
    vocab = len({word.lower() for word in tokens})
    lex_div = vocab / num_tokens if num_tokens else 0.0

    print('-----------------------------')
    print('Number of sentences = ', num_sent)
    print('Average words per sentence = ', avg_words_sent)
    print('Unique words (vocabulary) = ', vocab)
    print('Lexical diversity = ', lex_div)

-----------------------------
Number of sentences =  14
Average words per sentence =  29.5
Unique words (vocabulary) =  216
Lexical diversity =  0.5230024213075061
-----------------------------
Number of sentences =  34
Average words per sentence =  33.6764705882353
Unique words (vocabulary) =  545
Lexical diversity =  0.4759825327510917
-----------------------------
Number of sentences =  51
Average words per sentence =  25.705882352941178
Unique words (vocabulary) =  454
Lexical diversity =  0.34630053394355453
-----------------------------
Number of sentences =  8
Average words per sentence =  34.5
Unique words (vocabulary) =  167
Lexical diversity =  0.605072463768116
-----------------------------
Number of sentences =  4
Average words per sentence =  53.75
Unique words (vocabulary) =  130
Lexical diversity =  0.6046511627906976
-----------------------------
Number of sentences =  43
Average words per sentence =  32.395348837209305
Unique words (vocabulary) =  599
Lexical diversity