# KMeans clustering ANY documents

## Read in your files if they're all separate

In [24]:
import pandas as pd
import glob

def read_text_file(path, encoding='latin-1'):
    """Return the full contents of the text file at `path`.

    latin-1 is the default because it maps every possible byte to a
    character, so decoding never raises on files whose encoding is mixed or
    unknown. Pass encoding='utf-8' if you know your files are clean UTF-8.
    """
    # The context manager closes the handle as soon as the read finishes,
    # instead of leaving one open file per document until garbage collection.
    with open(path, encoding=encoding) as handle:
        return handle.read()


# glob returns paths in arbitrary, OS-dependent order; sorting keeps the row
# order (and every row index shown further down) stable between runs.
filenames = sorted(glob.glob("fanfiction-twilight/*.txt"))

contents = [read_text_file(filename) for filename in filenames]

df = pd.DataFrame({
    'text': contents,
    'filenames': filenames
})
df.head()

Unnamed: 0,filenames,text
0,fanfiction-twilight/10016071.txt,What follows is the original one-shot-no edits...
1,fanfiction-twilight/10016524.txt,A/N: Hey guys this is my new story hope you e...
2,fanfiction-twilight/10019441.txt,I saw him. Him and Bella. But who cares about ...
3,fanfiction-twilight/10021891.txt,Disclaimer: All publicly recognizable characte...
4,fanfiction-twilight/10029139.txt,Thou art a heartless monsterDisclaimer: All re...


## Or read in your CSV with the text column if not

In [None]:
# import pandas as pd
# df = pd.read_csv("")

## Vectorize your documents

What are the options when creating a `TfidfVectorizer`?

In [3]:
TfidfVectorizer?

Object `TfidfVectorizer` not found.


Let's think about:

* **ngram_range**: Do we just want single words? Or more? (1,2) is one- and two-word phrases, etc.
* **max_features**: Can it make things faster? An integer, `1` and up: keep only that many of the most frequent terms
* **max_df**: Should we ignore words that show up too often? `0.0`-`1.0` for percent, OR an integer for absolute document counts
* **min_df**: Should we ignore words that show up too little? `0.0`-`1.0` for percent, OR an integer for absolute document counts
* **vocabulary**: Only care about certain words

Also... how many documents do we have?

In [25]:
# shape is (rows, columns); df has one row per document, so rows = number of documents
df.shape

(623, 2)

In [38]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

def textblob_tokenizer(str_input):
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens]
    return words

from sklearn.feature_extraction import stop_words
custom_stopwords = ['just', 'said'] + list(stop_words.ENGLISH_STOP_WORDS)

# Vectorize and save into a new dataframe
vec = TfidfVectorizer(tokenizer=textblob_tokenizer,
                      #stop_words='english',
                      stop_words=custom_stopwords,
                      max_df=0.8, 
                      min_df=0.15,
                      #max_features=250,
                      use_idf=True)

# Fit from the 'text' column of our dataframe
matrix = vec.fit_transform(df['text'])

# Then turn it into a new dataframe
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())

CPU times: user 44.5 s, sys: 132 ms, total: 44.6 s
Wall time: 44.8 s


In [39]:
# Preview the first five rows of the TF-IDF table (one row per document, one column per term)
results.head()

Unnamed: 0,'d,'ll,'m,'re,'ve,1,abl,abov,accept,act,...,wors,wrap,write,wrong,ye,yeah,year,yell,young,younger
0,0.120373,0.17439,0.072623,0.068492,0.0,0.0,0.0,0.082942,0.0,0.0,...,0.0,0.1045,0.0,0.030917,0.025884,0.0,0.042692,0.0,0.0,0.0
1,0.0,0.043417,0.084375,0.0,0.0,0.025429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.041052,0.034368,0.0,0.014172,0.023702,0.0,0.0
2,0.0,0.020692,0.120637,0.021671,0.022188,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033064,0.0,0.0,0.024569,0.029347,0.0,0.0,0.0,0.0
3,0.227994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0488,0.0,0.0,0.235848,0.0,0.050682,0.0
4,0.039813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.042804,0.0,0.0,0.0,0.0,0.0


> ...Try it without the TextBlob tokenizer

## Cluster your documents

In [40]:
%%time
from sklearn.cluster import KMeans

# How many clusters?
number_of_clusters=4
km = KMeans(n_clusters=number_of_clusters)

print("Fitting", number_of_clusters, "clusters usinga ", matrix.shape, "matrix")

# Let's fit it!
km.fit(matrix)
km.fit

Fitting 4 clusters usinga  (623, 665) matrix
CPU times: user 12 s, sys: 57.6 ms, total: 12 s
Wall time: 12.1 s


## See what they look like

In [43]:
print("Top terms per cluster:")

# For each centroid, column indices sorted from the heaviest term to the lightest
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on versions that predate it.
terms = (vec.get_feature_names_out()
         if hasattr(vec, 'get_feature_names_out')
         else vec.get_feature_names())

# Iterate over the centroids themselves so the loop always matches the fitted
# model, even if number_of_clusters was edited without re-fitting.
for i, ranked_columns in enumerate(order_centroids):
    top_ten_words = [terms[ind] for ind in ranked_columns[:10]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: 'm hand befor ask say smile head bella 're mom
Cluster 1: stori vampir love human 'm hope girl onli way life
Cluster 2: jacob bella charli edward 'm hand love befor away year
Cluster 3: edward bella alic jasper carlisl emmett 'm hand love ask


## Push the category back to the original dataframe

In [42]:
# km.labels_ has one cluster id per row of the fitted matrix; make sure that
# still lines up with df before attaching it (it won't if df was reloaded or
# filtered after vectorizing).
assert len(km.labels_) == len(df), "cluster labels and documents are out of sync"
df['category'] = km.labels_

# Show a sample rather than dumping every row into the notebook
df.head(10)

Unnamed: 0,filenames,text,category
0,fanfiction-twilight/10016071.txt,What follows is the original one-shot-no edits...,0
1,fanfiction-twilight/10016524.txt,A/N: Hey guys this is my new story hope you e...,3
2,fanfiction-twilight/10019441.txt,I saw him. Him and Bella. But who cares about ...,3
3,fanfiction-twilight/10021891.txt,Disclaimer: All publicly recognizable characte...,0
4,fanfiction-twilight/10029139.txt,Thou art a heartless monsterDisclaimer: All re...,1
5,fanfiction-twilight/10029797.txt,"Hey guys, Rayne here. Just letting you know th...",3
6,fanfiction-twilight/10030147.txt,I know I know a new story. I couldn't resist i...,3
7,fanfiction-twilight/10042940.txt,"AN: Hello, welcome to my new story! I'm bounci...",1
8,fanfiction-twilight/10046233.txt,"After a long time, I don't know anybody is sti...",1
9,fanfiction-twilight/10047055.txt,"Cole's POV I watched the time on my phone, tap...",0


## Be pleased

In [31]:
# Keep only the documents that were assigned to cluster 1
df.loc[df['category'] == 1]

Unnamed: 0,filenames,text,category
2,fanfiction-twilight/10019441.txt,I saw him. Him and Bella. But who cares about ...,1
6,fanfiction-twilight/10030147.txt,I know I know a new story. I couldn't resist i...,1
9,fanfiction-twilight/10047055.txt,"Cole's POV I watched the time on my phone, tap...",1
25,fanfiction-twilight/10103988.txt,those who sparklechapter 1I never looked at Ed...,1
26,fanfiction-twilight/10109144.txt,"""Someday someone is going to look at you with ...",1
27,fanfiction-twilight/10109176.txt,"Summary: After being changed against her will,...",1
29,fanfiction-twilight/10120506.txt,"I don't own anything Twilight, though I am tak...",1
30,fanfiction-twilight/10130957.txt,"Chapter 1 Birthday BoyMonday, June 20""Good mo...",1
39,fanfiction-twilight/10161589.txt,"She stumbled as she tried to pick up her pace,...",1
46,fanfiction-twilight/10189248.txt,Disclaimer: The author does not own any public...,1
