In [1]:
# Import all of the things you need to import!
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.stem.porter import PorterStemmer

pd.options.display.max_columns = 30
%matplotlib inline

# Homework 14 (or so): TF-IDF text analysis and clustering

Hooray, we kind of figured out how text analysis works! Some of it is still magic, but at least the **TF** and **IDF** parts make a little sense. Kind of. Somewhat.

No, just kidding, we're *professionals* now.

## Investigating the Congressional Record

The [Congressional Record](https://en.wikipedia.org/wiki/Congressional_Record) is more or less what happened in Congress every single day. Speeches and all that. A good large source of text data, maybe?

Let's pretend it's totally secret but we just got it leaked to us in a data dump, and we need to check it out. It was leaked from [this page here](http://www.cs.cornell.edu/home/llee/data/convote.html).

In [None]:
# If you'd like to download it through the command line...
#!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz

'curl' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
# And then extract it through the command line...
#!tar -zxf convote_v1.1.tar.gz

You can explore the files if you'd like, but we're going to get the ones from `convote_v1.1/data_stage_one/development_set/`. It's a bunch of text files.

In [2]:
# glob finds files matching a certain filename pattern
import glob

# Give me all the text files
paths = glob.glob('convote_v1.1/data_stage_one/development_set/*')
paths[:5]

['convote_v1.1/data_stage_one/development_set\\052_400011_0327014_DON.txt',
 'convote_v1.1/data_stage_one/development_set\\052_400011_0327025_DON.txt',
 'convote_v1.1/data_stage_one/development_set\\052_400011_0327044_DON.txt',
 'convote_v1.1/data_stage_one/development_set\\052_400011_0327046_DON.txt',
 'convote_v1.1/data_stage_one/development_set\\052_400011_1479036_DON.txt']

In [3]:
len(paths)

702

So great, we have 702 of them. Now let's import them.

In [13]:
speeches = []
for path in paths:
    with open(path) as speech_file:
        speech = {
            'pathname': path,
            'filename': path.split('/')[-1],
            'content': speech_file.read()
        }
    speeches.append(speech)
speeches_df = pd.DataFrame(speeches)
#speeches_df.head()
speeches_df['pathname'][0]

'convote_v1.1/data_stage_one/development_set\\052_400011_0327014_DON.txt'

In class we had the `texts` variable. For the homework can just do `speeches_df['content']` to get the same sort of list of stuff.

**Take a look at the contents of the first 5 speeches**

In [14]:
texts =speeches_df['content']
texts[:5]

0    mr. chairman , i thank the gentlewoman for yie...
1    mr. chairman , i want to thank my good friend ...
2    mr. chairman , i rise to make two fundamental ...
3    mr. chairman , reclaiming my time , let me mak...
4    mr. chairman , i thank my distinguished collea...
Name: content, dtype: object

# Doing our analysis

Use the `sklearn` package and a plain boring `CountVectorizer` to get a list of all of the tokens used in the speeches. If it won't list them all, that's ok! Make a dataframe with those terms as columns.

**Be sure to include English-language stopwords**

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')

In [17]:
Xc = count_vectorizer.fit_transform(texts)

In [18]:
Xc

<702x9106 sparse matrix of type '<class 'numpy.int64'>'
	with 56106 stored elements in Compressed Sparse Row format>

In [19]:
Xc.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [21]:
pd.DataFrame(Xc.toarray()).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,9091,9092,9093,9094,9095,9096,9097,9098,9099,9100,9101,9102,9103,9104,9105
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
Xc_feature= pd.DataFrame(Xc.toarray(), columns=count_vectorizer.get_feature_names())
Xc_feature.head(3)

Unnamed: 0,000,00007,018,050,092,10,100,106,107,108,108th,109th,10th,11,110,...,yields,york,yorkers,young,younger,youngsters,youth,yuan,zero,zeroing,zeros,zigler,zirkin,zoe,zoellick
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Okay, it's **far** too big to even look at. Let's try to get a list of features from a new `CountVectorizer` that only takes the top 100 words.

In [24]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

In [25]:
porter_stemmer = PorterStemmer()

def stemming_tokenizer(str_input):
    
    words = re.sub(r"[^A-Za-z]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    #print(words)
    return words
    


In [27]:
count_vectorizer = CountVectorizer(stop_words='english', tokenizer=stemming_tokenizer, max_features=100)
Xc100 = count_vectorizer.fit_transform(texts)
print(count_vectorizer.get_feature_names())

['act', 'allow', 'amend', 'american', 'amp', 'ani', 'appropri', 'associ', 'balanc', 'base', 'becaus', 'believ', 'chairman', 'children', 'china', 'civil', 'colleagu', 'committe', 'congress', 'continu', 'countri', 'court', 'day', 'debat', 'discrimin', 'doe', 'educ', 'elect', 'faith', 'feder', 'fund', 'gentleman', 'good', 'govern', 'gt', 'h', 'ha', 'head', 'help', 'hous', 'import', 'includ', 'issu', 'just', 'know', 'law', 'lawsuit', 'legisl', 'like', 'make', 'mani', 'member', 'million', 'mr', 'nation', 'nbsp', 'need', 'new', 'offer', 'onli', 'organ', 'peopl', 'polici', 'program', 'protect', 'provid', 'provis', 'r', 'religi', 'repres', 'requir', 'right', 'rule', 's', 'say', 'school', 'servic', 'speaker', 'start', 'state', 'support', 'teacher', 'thank', 'thi', 'think', 'time', 'today', 'trade', 'unit', 'urg', 'use', 'veri', 'vote', 'wa', 'want', 'way', 'work', 'xz', 'year', 'yield']


In [None]:
#count_vectorizer.get_feature_names()

Now let's push all of that into a dataframe with nicely named columns.

In [31]:
df_Xc = pd.DataFrame(Xc100.toarray(), columns=count_vectorizer.get_feature_names())
df_Xc.head(3)

Unnamed: 0,act,allow,amend,american,amp,ani,appropri,associ,balanc,base,becaus,believ,chairman,children,china,...,time,today,trade,unit,urg,use,veri,vote,wa,want,way,work,xz,year,yield
0,3,1,2,3,0,0,0,0,4,0,1,1,3,0,0,...,3,2,0,1,0,0,2,1,1,1,2,0,1,0,2
1,1,1,1,0,0,0,0,0,2,0,1,0,2,0,0,...,2,2,0,0,0,0,1,1,0,1,3,0,3,1,0
2,1,0,0,1,0,0,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1


Everyone seems to start their speeches with "mr chairman" - how many speeches are there total, and many don't mention "chairman" and how many mention neither "mr" nor "chairman"?

In [32]:
df_Xc['act'].count()

702

In [38]:
df_Xc[df_Xc["chairman"]==0]['chairman'].count()

250

In [43]:
df_Xc[df_Xc["mr"]==0]['mr'].count()

78

In [48]:
total = df_Xc[df_Xc["mr"]==0]['mr'].count() + df_Xc[df_Xc["chairman"]==0]['chairman'].count()
print(total,"speaches in total do not mention neither 'mr' nor 'chairman'")

328 speaches in total do not mention neither 'mr' nor 'chairman'


What is the index of the speech thank is the most thankful, a.k.a. includes the word 'thank' the most times?

In [49]:
thank = df_Xc[df_Xc["thank"]!=0]
thank.head(3)

Unnamed: 0,act,allow,amend,american,amp,ani,appropri,associ,balanc,base,becaus,believ,chairman,children,china,...,time,today,trade,unit,urg,use,veri,vote,wa,want,way,work,xz,year,yield
0,3,1,2,3,0,0,0,0,4,0,1,1,3,0,0,...,3,2,0,1,0,0,2,1,1,1,2,0,1,0,2
1,1,1,1,0,0,0,0,0,2,0,1,0,2,0,0,...,2,2,0,0,0,0,1,1,0,1,3,0,3,1,0
4,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,...,1,0,0,0,0,0,1,2,0,0,0,0,1,0,2


In [50]:
thank_column = thank['thank']
thank_column.sort(inplace=False, ascending=False).head(1)

  from ipykernel import kernelapp as app


577    9
Name: thank, dtype: int64

If I'm searching for `China` and `trade`, what are the top 3 speeches to read according to the `CountVectoriser`?

In [51]:
china_trade = df_Xc['china'] + df_Xc['trade']

In [52]:
china_trade.sort(inplace=False, ascending=False).head(3)

  if __name__ == '__main__':


379    93
399    37
367    27
dtype: int64

Now what if I'm using a `TfidfVectorizer`?

In [None]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer, use_idf=False, norm='l1')
X = tfidf_vectorizer.fit_transform(texts)
pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())

In [None]:
# checking l2_vectorizer.get_feature_names()
l2_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer, use_idf=True)
X = l2_vectorizer.fit_transform(texts)
l2_df = pd.DataFrame(X.toarray(), columns=l2_vectorizer.get_feature_names())
l2_df

**What's the content of the speeches?** Here's a way to get them:

In [None]:
# index 0 is the first speech, which was the first one imported.
paths[0]

In [None]:
# Pass that into 'cat' using { } which lets you put variables in shell commands
# that way you can pass the path to cat
!echo {paths[0]}
!type a.text

**Now search for something else!** Another two terms that might show up. `elections` and `chaos`? Whatever you thnik might be interesting.

In [None]:
df.columns

In [None]:
election_chaos = df['elect'] + df['chao']

In [None]:
election_chaos.sort(inplace=False, ascending=False).head(5)

In [None]:
pd.DataFrame([df['elect'], df['chao'], df['elect'] + df['chao']], index=["elect", "chao", "elect + chao"]).T

# Enough of this garbage, let's cluster

Using a **simple counting vectorizer**, cluster the documents into **eight categories**, telling me what the top terms are per category.

Using a **term frequency vectorizer**, cluster the documents into **eight categories**, telling me what the top terms are per category.

Using a **term frequency inverse document frequency vectorizer**, cluster the documents into **eight categories**, telling me what the top terms are per category.

In [None]:
from sklearn.cluster import KMeans

number_of_clusters = 8
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

In [None]:
#count vectorization
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = count_vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

In [None]:
#texts

In [None]:
results = pd.DataFrame()
results['text'] = texts
results['category'] = km.labels_
results

In [None]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

In [None]:
results = pd.DataFrame()
results['text'] = texts
results['category'] = km.labels_
results

In [None]:

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = l2_vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

**Which one do you think works the best?**

# Harry Potter time

I have a scraped collection of Harry Potter fanfiction at https://github.com/ledeprogram/courses/raw/master/algorithms/data/hp.zip.

I want you to read them in, vectorize them and cluster them. Use this process to find out **the two types of Harry Potter fanfiction**. What is your hypothesis?

In [3]:
import glob
paths = glob.glob('hp/hp/*')
paths[:5]

['hp/hp\\10001898.txt',
 'hp/hp\\10004131.txt',
 'hp/hp\\10004927.txt',
 'hp/hp\\10007980.txt',
 'hp/hp\\10010343.txt']

In [4]:
len(paths)

1328

In [31]:
reviews = []
for path in paths:
    with open(path) as review_file:
        review = {
            'pathname': path,
            'filename': path.split('/')[-1],
            'content': review_file.read()
        }
    reviews.append(review)
reviews_df = pd.DataFrame(reviews)
reviews_df.head()

Unnamed: 0,content,filename,pathname
0,Prologue: The MissionDisclaimer: All character...,hp\10001898.txt,hp/hp\10001898.txt
1,BlackDisclaimer: I do not own Harry PotterAuth...,hp\10004131.txt,hp/hp\10004131.txt
2,"Chapter 1""I'm pregnant.""""""""Mum please say some...",hp\10004927.txt,hp/hp\10004927.txt
3,"Author's Note: Hey, just so you know, this is ...",hp\10007980.txt,hp/hp\10007980.txt
4,Disclaimer: I do not own Harry Potter and frie...,hp\10010343.txt,hp/hp\10010343.txt


In [20]:
texts =reviews_df['content']
texts

0       Prologue: The MissionDisclaimer: All character...
1       BlackDisclaimer: I do not own Harry PotterAuth...
2       Chapter 1"I'm pregnant.""""Mum please say some...
3       Author's Note: Hey, just so you know, this is ...
4       Disclaimer: I do not own Harry Potter and frie...
5       Disclaimer: I don't own any character in the H...
6       DISCLAIMER: I don't own Harry Potter and its c...
7       Katherine Rose-TylerChapter One: the Introduct...
8       I am no longer that shy little boy anymore.I w...
9       Happy New year! *throws confetti*I've really b...
10      2014"It's ridiculous." The red-headed boy shoo...
11      Disclaimer: Did you really think I was J.K. Ro...
12      This is my first story on fanfic and I'm nervo...
13      DISCLAIMER: I don't own anything here that loo...
14      A/N: So, this is my second ongoing story, and ...
15      Disclaimer: I do not own Harry Potter. Enjoy t...
16      For my friend, constant cheerleader and talent...
17      Discla

### Vectorize
#### Count Vectorization

In [21]:
from sklearn.feature_extraction.text import CountVectorizer


In [22]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

In [23]:
def stemming_tokenizer(str_input):
    
    words = re.sub(r"[^A-Za-z]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    #print(words)
    return words

In [29]:
count_vectorizer = CountVectorizer(stop_words="english", tokenizer=stemming_tokenizer, max_features = 100)
Z = count_vectorizer.fit_transform(texts)
#print(count_vectorizer.get_feature_names())

In [None]:
pd.DataFrame(Z.toarray(), columns=count_vectorizer.get_feature_names()).head(3)

In [32]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer, use_idf=False, norm='l2')
Z = tfidf_vectorizer.fit_transform(texts)
pd.DataFrame(Z.toarray(), columns=tfidf_vectorizer.get_feature_names()).head(3)

MemoryError: 