# Project2 Part1 - Text Analysis through TFIDF computation


In [77]:
from text_analyzer import read_sonnets, clean_corpus, tf, get_top_k, idf, tf_idf, cosine_sim

import pandas as pd

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [78]:
# run text_analyzer.py with default arguments
!python text_analyzer.py


Sonnet 1 TF (Top 20):
[('the', 6), ('thy', 5), ('to', 4), ('and', 3), ('that', 2), ('might', 2), ('but', 2), ('by', 2), ('his', 2), ('tender', 2), ('thou', 2), ('thine', 2), ('own', 2), ('self', 2), ('worlds', 2), ('from', 1), ('fairest', 1), ('creatures', 1), ('we', 1), ('desire', 1), ('increase', 1), ('thereby', 1), ('beautys', 1), ('rose', 1), ('never', 1), ('die', 1), ('as', 1), ('riper', 1), ('should', 1), ('time', 1), ('decease', 1), ('heir', 1), ('bear', 1), ('memory', 1), ('contracted', 1), ('bright', 1), ('eyes', 1), ('feedst', 1), ('lights', 1), ('flame', 1), ('with', 1), ('selfsubstantial', 1), ('fuel', 1), ('making', 1), ('a', 1), ('famine', 1), ('where', 1), ('abundance', 1), ('lies', 1), ('foe', 1), ('sweet', 1), ('too', 1), ('cruel', 1), ('art', 1), ('now', 1), ('fresh', 1), ('ornament', 1), ('only', 1), ('herald', 1), ('gaudy', 1), ('spring', 1), ('within', 1), ('bud', 1), ('buriest', 1), ('content', 1), ('churl', 1), ('makst', 1), ('waste', 1), ('in', 1), ('niggardi

## a. Read about argparse.
Look at its implementation in the Python Script. Follow the instruction and answer the questions in the Argparse section.

#### TODO: answer here


## b. Read and Clean the data

In [79]:
d_corpus='data/shakespeare_sonnets/'

# return dictionary with keys corresponding to file names and values being the respective contents
corpus = read_sonnets(d_corpus)

# return corpus (dict) with each sonnet cleaned and tokenized for further processing
corpus = clean_corpus(corpus)

In [80]:
corpus['1'] # keys 1-154 for all sonnets (type: str)

['from',
 'fairest',
 'creatures',
 'we',
 'desire',
 'increase',
 'that',
 'thereby',
 'beautys',
 'rose',
 'might',
 'never',
 'die',
 'but',
 'as',
 'the',
 'riper',
 'should',
 'by',
 'time',
 'decease',
 'his',
 'tender',
 'heir',
 'might',
 'bear',
 'his',
 'memory',
 'but',
 'thou',
 'contracted',
 'to',
 'thine',
 'own',
 'bright',
 'eyes',
 'feedst',
 'thy',
 'lights',
 'flame',
 'with',
 'selfsubstantial',
 'fuel',
 'making',
 'a',
 'famine',
 'where',
 'abundance',
 'lies',
 'thy',
 'self',
 'thy',
 'foe',
 'to',
 'thy',
 'sweet',
 'self',
 'too',
 'cruel',
 'thou',
 'that',
 'art',
 'now',
 'the',
 'worlds',
 'fresh',
 'ornament',
 'and',
 'only',
 'herald',
 'to',
 'the',
 'gaudy',
 'spring',
 'within',
 'thine',
 'own',
 'bud',
 'buriest',
 'thy',
 'content',
 'and',
 'tender',
 'churl',
 'makst',
 'waste',
 'in',
 'niggarding',
 'pity',
 'the',
 'world',
 'or',
 'else',
 'this',
 'glutton',
 'be',
 'to',
 'eat',
 'the',
 'worlds',
 'due',
 'by',
 'the',
 'grave',
 'and',

## c. TF

In [81]:
# assign 1.txt to variable sonnet to process and find its TF (Note corpus is of type dic, but sonnet1 is just a str)
sonnet1 = corpus['1']

# determine tf of sonnet
sonnet1_tf = tf(sonnet1)

# get sorted list and slice out top 20
sonnet1_top20 = get_top_k(sonnet1_tf)
# print
# print("Sonnet 1 (Top 20):")
df = pd.DataFrame(sonnet1_top20, columns=["word", "count"])
df.head(20)

Unnamed: 0,word,count
0,the,6
1,thy,5
2,to,4
3,and,3
4,that,2
5,might,2
6,but,2
7,by,2
8,his,2
9,tender,2


In [82]:
# TF of entire corpus
flattened_corpus = [word for sonnet in corpus.values() for word in sonnet]
corpus_tf = tf(flattened_corpus)
corpus_top20 = get_top_k(corpus_tf)
# print
# print("Corpus TF (Top 20):")
df = pd.DataFrame(corpus_top20, columns=["word", "count"])
df.head(20)

Unnamed: 0,word,count
0,and,491
1,the,430
2,to,408
3,my,397
4,of,372
5,i,343
6,in,322
7,that,320
8,thy,287
9,thou,235


### Q: Discussion
Do you believe the most frequent words would discriminate between documents well? Why or why not? Any thoughts on how we can improve this representation? Does there appear to be any ‘noise’? If so, where? If not, it should be clear by the end of the assignment.

#### TODO: answer here

## d. IDF

In [83]:
# IDF of corpus
corpus_idf = idf(corpus)

corpus_tf_ordered = get_top_k(corpus_idf)
# print top 20 to add to report
df = pd.DataFrame(corpus_tf_ordered, columns=["word", "score"])
df.head(20)

Unnamed: 0,word,score
0,a,0.462242
1,abhor,5.036953
2,abide,4.343805
3,able,5.036953
4,about,5.036953
5,above,3.650658
6,absence,3.427515
7,absent,3.93834
8,abundance,3.650658
9,abundant,5.036953


### Q: observe and briefly comment on the difference in top 20 lists (comparing TF of corpus vs its IDF).

#### TODO: answer here

## e. TF-IDF

In [84]:
# TFIDF of Sonnet1 w.r.t. corpus
sonnet1_tfidf = tf_idf(corpus_idf, sonnet1_tf)
sonnet1_tfidf_ordered = get_top_k(sonnet1_tfidf)
# print
# print("Sonnet 1 TFIDF (Top 20):")
df = pd.DataFrame(sonnet1_tfidf_ordered, columns=["word", "score"])
df.head(20)

Unnamed: 0,word,score
0,the,0.834677
1,thy,3.151167
2,to,0.213384
3,and,0.119221
4,that,0.323511
5,might,3.89182
6,but,0.672944
7,by,1.725131
8,his,2.023202
9,tender,6.490386


### Q. What is different with this list than just using TF?

#### TODO: answer here

## f. Compare all documents

In [85]:
# TODO: Visualize as a heatmap


### Q. Observe the heatmap. What insight do you get from it?

#### TODO: answer here