# Text Vectorization and Feature Engineering Assignment

In [1]:
import pandas as pd
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# import nltk
# nltk.download('wordnet')

In [2]:
def corpus_stats(corpus):
    print("Corpus Statistics")
    print("Number of documents: " + str(len(corpus.fileids())))
    print("Number of paragraphs: " + str(len(corpus.paras())))
    print("Number of sentences: " + str(len(corpus.sents())))
    print("Number of words: " + str(len(corpus.words())))
    print("Vocabulary: " + str(len(set(w.lower() for w in corpus.words()))))
    print("Avg chars per word: " + str(round(len(corpus.raw())/len(corpus.words()),1)))
    print("Avg words per sentence: " + str(round(len(corpus.words())/len(corpus.sents()),1)))

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [3]:
PATH = 'lite_cnn/'
DOC_PATTERN = r'articles_text.*\.p'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)
corpus_stats(corpus)

Corpus Statistics
Number of documents: 14
Number of paragraphs: 14
Number of sentences: 427
Number of words: 13668
Vocabulary: 2927
Avg chars per word: 5.0
Avg words per sentence: 32.0


### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [4]:
docs = []

for fileid in corpus.fileids():
    doc = corpus.raw(fileid)
    docs.append(doc)
    
docs[4]

'Most important takeaways from the first day of public impeachment hearings(CNN) - The House Intelligence Committee opened historic impeachment hearings Wednesday to investigate whether President Donald Trump (and his allies) abused his office in an attempt to strong-arm Ukraine into opening an inquiry into his political rival, former Vice President Joe Biden.I monitored the highly anticipated -- and hugely high-stakes -- hearing as it happened. Below, my thoughts on the biggest moments of the day.Adam Schiff appeals to history -- and the futureIntelligence Committee Chairman Adam Schiff\'s opening statement had a very clear message: These hearings aren\'t about just Trump. They\'re about how the presidency functions (and should function) within our democracy -- and about the checks and balances between the legislative and executive branches built into the Constitution."Our answer to these questions will affect not only the future of this presidency, but the future of the presidency it

### Preprocess and clean the documents according to the steps below.

- Word Tokenize
- Lowercase
- Remove Stopwords
- Remove Punctuation
- Lemmatize
- Stem

In [5]:
# tokenize
doc_words = [word_tokenize(doc) for doc in docs]

In [6]:
# lowercase
doc_words_lower = []
for i in range(0, len(doc_words)):
    temp = [word.lower() for word in doc_words[i]]
    doc_words_lower.append(temp)

In [7]:
# remove stop words
doc_words_nostop = []
for i in range(0, len(doc_words_lower)):
    temp = []
    for word in doc_words_lower[i]:
        if word not in stopwords.words('english'):
            temp.append(word)
    doc_words_nostop.append(temp)

In [8]:
# remove punctuation
doc_words_nopunc = []
for i in range(0, len(doc_words_nostop)):
    temp = []
    for word in doc_words_nostop[i]:
        if word.isalpha():
            temp.append(word)
    doc_words_nopunc.append(temp)

In [9]:
# lemmatize
doc_words_lemma = []
lemmatizer = WordNetLemmatizer()
for i in range(0, len(doc_words_nopunc)):
    temp = []
    for word in doc_words_nopunc[i]:
        temp.append(lemmatizer.lemmatize(word))
    doc_words_lemma.append(temp)

In [10]:
# Stemming
doc_words_stem = []
stemmer = SnowballStemmer('english')
for i in range(0, len(doc_words_lemma)):
    temp = []
    for word in doc_words_lemma[i]:
        temp.append(stemmer.stem(word))
    doc_words_stem.append(temp)

In [11]:
doc_words_clean = doc_words_stem

### Count vectorize the preprocessed documents.

In [12]:
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(docs)

count = pd.DataFrame(vectors.toarray(),
                    columns=vectorizer.get_feature_names())
count

Unnamed: 0,000,10,10th,11,11if,13,14,15,154,16,...,yes,yesterday,yet,york,yorker,you,young,younger,your,zelensky
0,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,1,0,1,...,0,0,0,2,0,4,0,1,1,0
4,0,0,0,2,1,0,0,0,0,0,...,1,0,0,1,0,4,0,0,0,7
5,0,1,0,0,0,0,0,0,0,1,...,4,0,0,1,0,9,1,0,0,0
6,0,2,0,0,0,0,0,0,0,0,...,0,0,0,1,1,5,0,0,0,0
7,1,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,2,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### One hot vectorize the preprocessed documents.

In [13]:
vectorizer = CountVectorizer(binary=True)
vectors = vectorizer.fit_transform(docs)

one_hot = pd.DataFrame(vectors.toarray(),
                    columns=vectorizer.get_feature_names())
one_hot

Unnamed: 0,000,10,10th,11,11if,13,14,15,154,16,...,yes,yesterday,yet,york,yorker,you,young,younger,your,zelensky
0,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,1,0,1,...,0,0,0,1,0,1,0,1,1,0
4,0,0,0,1,1,0,0,0,0,0,...,1,0,0,1,0,1,0,0,0,1
5,0,1,0,0,0,0,0,0,0,1,...,1,0,0,1,0,1,1,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,0,0,0,0
7,1,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,1,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF vectorize the preprocessed documents.

In [14]:
vectorizer = TfidfVectorizer(binary=True)
vectors = vectorizer.fit_transform(docs)

tfidf = pd.DataFrame(vectors.toarray(),
                    columns=vectorizer.get_feature_names())
tfidf

Unnamed: 0,000,10,10th,11,11if,13,14,15,154,16,...,yes,yesterday,yet,york,yorker,you,young,younger,your,zelensky
0,0.0,0.0,0.123131,0.0,0.0,0.0,0.085709,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.041976,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.057857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.064009,0.0,0.083118,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.062667,0.043621,0.054239,0.0,0.048259,...,0.0,0.0,0.0,0.036627,0.0,0.033852,0.0,0.062667,0.043621,0.0
4,0.0,0.0,0.0,0.045821,0.045821,0.0,0.0,0.0,0.0,0.0,...,0.035286,0.0,0.0,0.026781,0.0,0.024752,0.0,0.0,0.0,0.039659
5,0.0,0.039138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0433,...,0.0433,0.0,0.0,0.032863,0.0,0.030373,0.0433,0.0,0.0,0.0
6,0.0,0.034151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.028675,0.049062,0.026503,0.0,0.0,0.0,0.0
7,0.058286,0.046876,0.0,0.0,0.0,0.0,0.0,0.0,0.067342,0.0,...,0.0,0.067342,0.0,0.0,0.0,0.0,0.05186,0.0,0.046876,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.081234,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Use Doc2Vec to vectorize the preprocessed documents.

Set the size of the vectors to be the same size as those of the other methods using the `vector_size` argument.

In [15]:
!pip install gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_documents = [TaggedDocument(doc, [i]) 
             for i, doc in enumerate(docs)]

tagged_documents[0]



TaggedDocument(words='Pink taking a break to focus on family(CNN) - Pink has been working pretty hard and it sounds like she will be taking a step back in 2020.Speaking with "Entertainment Tonight" on the Country Music Association Awards red carpet, the singer was joined by her husband, Carey Hart, and their kids Willow, 8, and Jameson, 2.Pink was there to perform her song "Love Me Anyway" with country star Chris Stapleton, and she  talked about how hectic things have been. "We did two and a half years of [music] and Willow\'s back in school now, Jameson\'s going to start pre-school soon," Pink said. "It\'s kind of the year of the family."The star also praised her husband, with whom she will celebrate 14 years of marriage in January."Carey has a lot going on as well," she said of Hart, who went from being a professional motocross competitor to racing off-road trucks. "He\'s super supportive, he follows me around the world and now it\'s his turn."According to Billboard, Pink\'s Beautifu

In [16]:
model = Doc2Vec(tagged_documents)

In [17]:
doc2vec = pd.DataFrame([[document]+list(model[document]) 
                        for document in range(len(tagged_documents))]).drop(0, axis=1)

In [18]:
doc2vec.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,0.047925,-0.005739,-0.171166,-0.055812,-0.23052,-0.173506,-0.091844,0.01166,-0.066426,0.034212,...,-0.035768,-0.108113,0.051208,0.174373,0.158844,0.184676,-0.085756,-0.006547,0.135862,0.078276
1,0.068717,-0.016631,-0.242746,-0.084747,-0.361728,-0.266111,-0.145188,0.011542,-0.093729,0.045493,...,-0.055433,-0.16208,0.079884,0.257535,0.238913,0.274923,-0.123338,-0.011578,0.212131,0.120439
2,0.058669,-0.007198,-0.225815,-0.075253,-0.316862,-0.239437,-0.128044,0.011317,-0.08177,0.035624,...,-0.048052,-0.142826,0.067961,0.235669,0.205295,0.254463,-0.112174,-0.019673,0.178726,0.108304
3,0.066797,-0.003521,-0.23476,-0.081026,-0.342171,-0.256211,-0.13444,0.019612,-0.087049,0.038552,...,-0.0512,-0.156206,0.06971,0.253198,0.221818,0.26475,-0.121015,-0.013008,0.194999,0.116466
4,0.049966,-0.004302,-0.264636,-0.083893,-0.354937,-0.255226,-0.136958,0.016981,-0.08031,0.033115,...,-0.064819,-0.153612,0.08361,0.253169,0.225385,0.275139,-0.123308,-0.016878,0.202967,0.118105
