In [2]:
import pandas as pd
import numpy as np
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
okc = pd.read_csv('../Assets/A/one_long_essay.csv', index_col='Unnamed: 0')

In [4]:
okc.shape

(53951, 32)

In [5]:
def denull(essay):
    if type(essay) == float:
        return ''
    else: return essay
    
okc.essays = okc.essays.apply(denull)

In [17]:
t0 = time()
vec = TfidfVectorizer(encoding='utf-8', stop_words='english', max_features=2000)
tf = vec.fit_transform(okc.essays)
print "vectorized essays in %g seconds" %(time()-t0)

vectorized essays in 17.2678 seconds


In [18]:
tf = pd.DataFrame(tf.toarray(), columns=vec.get_feature_names())

mean_words = tf.iloc[:,:2000].mean(axis=0)
mean_words.sort_values(inplace=True, ascending=False)

In [23]:
for word in mean_words.head(300).index:
    print word

like
love
good
friends
music
people
life
time
just
things
food
new
don
really
work
know
want
movies
family
ve
enjoy
going
fun
books
think
working
looking
make
lot
pretty
favorite
making
great
world
trying
person
ll
live
day
laugh
school
home
years
read
try
way
say
city
movie
big
doing
im
right
reading
little
smile
travel
having
cooking
art
long
meet
shows
open
tv
guy
getting
watching
san
living
rock
book
feel
stuff
bay
best
night
dancing
better
humor
thing
job
usually
kind
playing
hard
play
eyes
old
sense
probably
happy
sf
watch
coffee
francisco
currently
man
area
wine
year
eat
dinner
games
listening
mind
need
nice
friend
girl
dance
traveling
maybe
enjoying
moved
bit
talk
sports
funny
learning
water
spend
recently
tell
future
ask
hanging
bad
writing
easy
interesting
got
taking
house
thinking
let
list
hair
game
look
different
interested
times
awesome
self
especially
places
place
company
week
cook
days
actually
dog
adventure
eating
conversation
social
real
fiction
heart
sex
men
country
c

In [47]:
# clustering may work better (or at least not work faster) if I run on a small subset of features
# try excluding the 50 most common words and only include the 500 after that

t0 = time()
vec = TfidfVectorizer(encoding='utf-8', stop_words='english',max_df=50, max_features=500)
tf = vec.fit_transform(okc.essays)
print "vectorized essays in %g seconds" %(time()-t0)

vectorized essays in 18.9986 seconds


In [28]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

In [33]:
# Compute DBSCAN
db = DBSCAN(eps=0.05, min_samples=10).fit(tf)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)

print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(tf, labels))

Estimated number of clusters: 492


KeyboardInterrupt: 

In [48]:
# Can I get fewer clusters?
print labels

[-1  0  1 ...,  0  0 -1]


In [49]:
# Save feature names to tf
tf = pd.DataFrame(tf.toarray(), columns=vec.get_feature_names())
# Save labels to tf and examine content
tf['labels'] = pd.Series(labels)

In [60]:
# Most essays are in cluster 0, the other 491 are spread out among the others and about 5000 are not in a cluster
# Do these nonzero clusters MEAN anything?
# Lets examine the original text?
tf['labels'].value_counts()

 0      37877
-1       4945
 42        40
 322       38
 468       37
 280       37
 320       36
 26        35
 90        34
 104       34
 425       33
 324       33
 383       32
 118       32
 6         32
 46        32
 186       32
 211       32
 423       32
 216       32
 200       32
 71        31
 135       31
 40        31
 276       31
 108       31
 73        31
 293       31
 470       31
 334       31
        ...  
 369       15
 368       15
 465       15
 197       15
 91        15
 270       15
 281       15
 237       15
 351       15
 480       15
 323       14
 102       14
 410       14
 309       14
 233       14
 82        13
 204       13
 483       13
 459       13
 433       13
 37        13
 249       13
 466       13
 397       13
 74        13
 478       12
 112       12
 144       12
 491       11
 239       10
Name: labels, dtype: int64

In [65]:
okc.ix[tf[tf['labels']==1].index].essays[2]

'my name is paige  i am smart  have a great sense of humor  a nice smile  extroverted  and pick up on things quickly  i m laid back and am trying to find a work life balance because my job challenges me every day  i love being challenged  i like my work  i like to golf  go for a run  play soccer  practice various musical instruments  sing  dance  learn about stuff all the time  and meet new people hang out with friends  full time environmental consultant in the east bay  i am so far living out my  near term  dream from my college days   so far so good  i am still happy with where my life is headed  i have future plans to go back to school and get a graduate degree  not sure what yet   being coordinated  attracting others to the dance floor  being fun  creative  music  observing things  catching details  analyzing and breaking down stuff  explaining things simply on a broad scale  expressing myself pretty simply when there s a lot more going on besides what i may say  being myself  maps

In [67]:
okc.ix[tf[tf['labels']==1].index].essays[2695]

'i am a california native who has recently returned to the bay area after living on the east coast for 15 years  currently  i am spending a lot of time getting to know my new surroundings  preferably on foot  and am enjoying being near to family and old friends   i have an adventurous spirit and like to find the adventure in any activity  from a mundane trip to the grocery store to heli hiking on a glacier   i am inquisitive  happy  and love to travel i am helping people organize their finances  finding great new places to visit  my smile and that i laugh a lot  i just subscribed to the ft and think it s a great paper  i would like to get back to reading the new yorker   my favorite book is seabiscuit and my favorite author is jane austen  right now i am reading a biography of andrew carnegie and julia child s  my life in france    just a few movies and tv shows that i like in no particular order  swingers  into the wild  juno  movies by woodie allen  state of mind  bbc doc on north ko

In [70]:
okc.ix[tf[tf['labels']==1].index].essays[3527]

' update this was a little bit out of date  so i ve added more about my life   i m a chilean guy that had a dream to enjoy the world  that dream brought me to the united states of america  in order to use my creativity at its full extent  and pursue a master s degree at carnegie mellon university  i ve been here in the usa for 4 years  and i love it  i ve been discovering new ways to enjoy life  having fun  meeting people  making relationships and friendships better  and acting like everyday is my last day on earth  i am here because i m looking for an adventure buddy  someone who wants to do crazy things like skydiving  going to a fun concert  and dancing till our bodies give up  or just sitting down and watching a movie in my couch  i work as a game developer  sitting in the computer all day  but working with really interesting creative people from many weird disciplines  you might ask why  because i love making people happy  i enjoy the fact that i can create some kind of reality wh

In [74]:
okc.ix[tf[tf['labels']==1].index].essays[4084]

'my life is a struggle between the quixotic and the practical  on one hand  i like dive bars  tom waits and the rewards of manual labor  on the other  i enjoy romanticizing  video games  intellectual pursuits  and other irresponsible flights of fancy   i ll be working in mountain view for the summer  and intend to take in as much west coast authenticity as possible in the months i have there  earning my masters degree in hci at carnegie mellon  working on personal projects  making tools for nasa astronauts  and trying to design exceptional user experiences every day   for reals  i give the greatest hugs and have references to prove it   i don t know who that man with the glasses is  but i like his tabasco   books   kingkiller chronicles  pratchett  heinlein  catch 22  don quixote  tolkien  douglas adams  lovecraft  twain  calvino  gaiman  graphic novels   movies   the social network  ferris bueller s day off  big fish  high fidelity  the nightmare before christmas  shaun of the dead  i

In [80]:
# Do people from cluster 1 have much in common?  Not sure.

# Try some others
# 42 was a popular label
label_42 = okc.ix[tf[tf['labels']==42].index]

In [82]:
label_42.sex.value_counts()

f    34
m     6
Name: sex, dtype: int64

In [84]:
label_42[label_42['sex']=='m'].orientation.value_counts()

straight    4
gay         2
Name: orientation, dtype: int64

In [87]:
label_42

Unnamed: 0,age,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,...,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status,essays
254,25,average,mostly anything,socially,never,graduated from high school,hey well to describe myself im a person w...,i do alot with my life first and and foremos...,cutting hair making people laugh giving adv...,probably that im short but not snooki short h...,...,has a kid,straight,likes dogs and has cats,catholicism but not too serious about it,f,taurus and it&rsquo;s fun to think about,sometimes,"english (fluently), spanish (poorly)",single,hey well to describe myself im a person w...
823,40,average,strictly anything,not at all,never,,i d like to meet some new people i consider m...,,,,...,,straight,likes dogs and likes cats,christianity,m,taurus,sometimes,"english, portuguese (okay), spanish (poorly)",single,i d like to meet some new people i consider m...
1630,22,fit,,rarely,never,graduated from two-year college,i m proud to say i m an bay area native i gre...,,,,...,,bisexual,likes dogs and has cats,,f,sagittarius,sometimes,english,single,i m proud to say i m an bay area native i gre...
2441,30,,anything,often,never,working on college/university,oaktown girl who enjoys road trips whether it ...,i am a hairstylist and a student at the art in...,,,...,,straight,,other and laughing about it,f,libra and it&rsquo;s fun to think about,,english,single,oaktown girl who enjoys road trips whether it ...
2935,31,athletic,,socially,never,graduated from high school,,i m a hairstylist in downtown sf love what i d...,,i m real and people recognize that in me,...,,straight,,catholicism,m,sagittarius,,"english, italian, spanish, french",single,i m a hairstylist in downtown sf love what i d...
2985,30,thin,,socially,sometimes,working on college/university,ima stylist with a penchant for booze bobs and...,pulling out the big guns im a hairstylist i ...,doing hair drinking beer zelda,probably my hair it s can be funky my accen...,...,,straight,likes dogs and has cats,atheism and laughing about it,f,aries and it&rsquo;s fun to think about,when drinking,english (fluently),single,ima stylist with a penchant for booze bobs and...
6003,26,average,,socially,never,,i fall victim to my own repetition i m someti...,i m an rda that helps fix smiles hairstylist ...,being reliable and loyal standing up for what...,my hair my smile my tattoos my contagio...,...,,straight,likes dogs,catholicism but not too serious about it,f,pisces but it doesn&rsquo;t matter,no,"english, spanish (poorly)",single,i fall victim to my own repetition i m someti...
6801,36,used up,mostly anything,socially,never,graduated from college/university,seems like these self summaries are a work in ...,ex corporate girl turned hairstylist aka part...,,,...,doesn&rsquo;t have kids,straight,likes dogs and likes cats,catholicism but not too serious about it,f,leo and it matters a lot,no,english (fluently),single,seems like these self summaries are a work in ...
9403,25,full figured,strictly anything,rarely,never,graduated from two-year college,i m a newly married hairstylist who loves craf...,i m working on building my clientele to eventu...,cutting hair picking things up with my toes ...,,...,,bisexual,likes dogs,agnosticism and laughing about it,f,aries and it&rsquo;s fun to think about,when drinking,"english (fluently), spanish (okay)",married,i m a newly married hairstylist who loves craf...
9926,30,curvy,mostly anything,socially,,graduated from college/university,hi i m ceci i work as a hairstylist apprenti...,working laughing learning eating drinking ...,lounging by the pool,,...,doesn&rsquo;t have kids,straight,likes dogs and likes cats,,f,pisces and it&rsquo;s fun to think about,,english,single,hi i m ceci i work as a hairstylist apprenti...


In [88]:
label_42.essays[254]

'hey    well to describe myself   im a person who loves to hav fun and loves to laugh   doin spur of the moment things and goin on adventures   im very much into music such as rockabilly   metal   classic rock etc  car shows anything vintage and fun  and movies  and art   i also have a child that keeps me very busy haha    i do alot with my life   first and and foremost im a mother   next comes my job   im hairstylist  barber cutting hair   making people laugh  giving advice   liking good music    probably that im short  but not snooki short haha   chelsea handler books   twilight books  cook books and biographys    movies  fear and loathing   casino   jackie brown  cars  vegas vacation etc  tv shows  it changes from day to day but right now big brother   basket ball wives  awkward etc  music  ahem    i really do love all kinds but the tops are   mostly any rockabilly  phsycobilly   the cramps  bob marley  bay area rap  tupac  rhianna  beyonce   janet jackson  nicki minaj  the eagles  

In [89]:
label_42.essays[823]

'i d like to meet some new people  i consider myself to be creative  i feel like i never have to work because i love my job  i m a hairstylist   i work at a salon for men in the financial district of san francisco  i ve been thinking about trying a site like this but never have  i find it hard describing myself  i m a cool person   truely a nice guy     i m told i act younger than i am  but my values are pretty old school  i m fun to be around   i like having a good time  i think i m pretty funny too  i m very easy going   i get along with pretty much everybody  i love to travel  mostly to latin america or wherever i can experience new cultures  i speak portuguese pretty well   some spanish  i love sports  i like working out at the gym or playing outdoors when it s nice  i feel blessed to live in the san francisco bay area      family friends sports exercise love god    '

In [90]:
label_42.essays[1630]

'i m proud to say i m an bay area native  i grew up in the east bay and just recently moved to san francisco  finally   to be closer to my workplace  i cut hair at an awsome salon downtown and love my job more than i ever thought i could  making new clients  following the latest trends  and learning new cutting techniques is more fun than it is work   i attended an advanced placement high school that allowed me to attain both my high school diploma and associate s degree while also letting me graduate from high school a year before the rest of my class  when my career as a hairstylist reaches its  peak  i plan on going back to school to study world history and physical anthropology just for the hell of it  yeah  i m kind of a huge nerd    i love living in this city and meeting people who are driven  hard working  and responsible  but know how to cut loose and have a good time every now and again   '

Label 42 seems to be mostly hair stylists and other people who are into hair. Vast majority are women although data set is majority male.

In [91]:
# 322 is another popular label.  What's that?
label_322 = okc.ix[tf[tf['labels']==322].index]

In [92]:
label_322

Unnamed: 0,age,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,...,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status,essays
2552,24,average,,socially,never,working on college/university,well what can i say about myself i am here to...,i am currently going to sfsu finishing my b a ...,cooking i love to cook and i have a passion f...,well i am assuming this is more psychical i g...,...,,gay,has dogs,agnosticism but not too serious about it,f,capricorn but it doesn&rsquo;t matter,no,english,single,well what can i say about myself i am here to...
3092,20,average,mostly vegetarian,socially,never,working on college/university,apparently has become a new favorite word o...,working at a doggy daycare it s really fun an...,being a girl cooking vegetarian cuisine and...,people start speaking to me in spanish and i r...,...,doesn&rsquo;t want kids,straight,has dogs and likes cats,,f,cancer,no,"english (fluently), spanish (fluently)",single,apparently has become a new favorite word o...
3884,18,a little extra,strictly vegetarian,not at all,never,working on two-year college,i m currently a pre nursing student attending ...,i go to school work part time and i aim to tr...,cooking i love breakfast foods and desserts ...,my height i m a lot taller than most women ...,...,"doesn&rsquo;t have kids, and doesn&rsquo;t wan...",straight,has dogs and likes cats,agnosticism but not too serious about it,f,leo and it&rsquo;s fun to think about,no,"english (fluently), spanish (poorly)",single,i m currently a pre nursing student attending ...
4081,23,thin,strictly anything,not at all,never,graduated from high school,first i am a mommy that s my number one prior...,being a mom and part time inventory management,everything i try,my eyes,...,has a kid,straight,has dogs,other but not too serious about it,f,leo and it matters a lot,trying to quit,english (fluently),single,first i am a mommy that s my number one prior...
4546,33,,mostly anything,socially,sometimes,graduated from college/university,90 of the world s problems can be solved by c...,trying to live a life in balance of freelance ...,beating you at scrabble losing umbrellas and ...,i am funnier than i am patient i don t rea...,...,,straight,has dogs and likes cats,judaism and laughing about it,m,aquarius but it doesn&rsquo;t matter,sometimes,"english (fluently), portuguese (fluently), spa...",single,90 of the world s problems can be solved by c...
4657,34,average,anything,socially,,dropped out of college/university,is not typed here,working learning muay thai and getting in the...,my job driving at a brisk pace ordering out ...,two days ago i sneezed while on a parking lot ...,...,"doesn&rsquo;t have kids, but might want them",straight,has dogs,atheism,m,aquarius but it doesn&rsquo;t matter,no,"english (fluently), italian (poorly)",single,is not typed here working learning muay thai...
4717,29,average,,socially,never,graduated from masters program,life is good i work hard have amazing people...,i got my masters in social welfare a few years...,being bad at something and enjoying it anyway ...,i think the order depends on who you ask but p...,...,"doesn&rsquo;t have kids, but wants them",straight,has dogs and likes cats,judaism and laughing about it,f,cancer and it&rsquo;s fun to think about,when drinking,"english (okay), spanish (poorly)",single,life is good i work hard have amazing people...
8082,20,average,,,never,,i am willing to doi new things and metting new...,right now im goin to school gettin my aa degree,what i good at is takibg care of animal i lea...,,...,,straight,,,m,,,english,single,i am willing to doi new things and metting new...
8232,25,average,mostly anything,socially,never,working on space camp,i m a crazy fun looking kind of guy i do enjo...,i would love to become a manager at a hotel,sports and anything that has to do with a quic...,my hair i have distinctive hair its the slic...,...,,straight,likes dogs and likes cats,catholicism but not too serious about it,m,cancer but it doesn&rsquo;t matter,when drinking,"english (fluently), spanish (fluently), italia...",single,i m a crazy fun looking kind of guy i do enjo...
8319,33,a little extra,anything,often,sometimes,graduated from two-year college,i really dont enjoy typing all that much i d...,learning living experiencing and loving eve...,understanding people,how loud i am my smile and dimples,...,"doesn&rsquo;t have kids, but wants them",bisexual,has dogs,other but not too serious about it,f,sagittarius and it matters a lot,yes,english,single,i really dont enjoy typing all that much i d...


In [93]:
# Label 322 looks like people who talk about cooking and food a lot
label_322.essays[2552]

'well what can i say about myself  i am here to look for new friends  i have lived in the bay area for only about a year now and i love it  would love to share my adventures with someone  i really love the outdoors  i love hiking  cycling  and sight seeing i also love trying new things you can say i have a bucket list  i am passionate about life  funny or so i am told   and focused  i am currently going to sfsu finishing my b a in psychology  i also work part time  i really want more out of life so i am just looking for new people that want the same things  cooking  i love to cook and i have a passion for trying new foods  ummm lol thats all i have talent in  well i am assuming this is more psychical  i guess my smile and my sense of humor  i love any book that has to do with history or culture  i love horror movies and drama  i can t stand chick flicks  i love all music except country and heavy metal  i am open to listening to it if someone thinks they can turn me on to it  i love asi

In [94]:
label_322.essays[3092]

' apparently   has become a new favorite word of mine  like  perpendicular   it s fun to say don t you think  do you have any favorite words   perrrr piiinnn diiic uuulaarrr  as naturally for myself  i m laid back  ummm    witty  and uber sarcastic  i cannot really describe myself without the fear of sounding    vain  always been a happy chick and i tend to have a good time everywhere i go and i also tend to dance out of nowhere  it s my thang  ya dig  what comes after happiness  humor  sure  humor  it s a little dark  a little vulgar  and it involves making fun of myself and my friends  have you ever played  your team   i appreciate everything in life  my friends  my family  my co workers  a blanket  nature  the city  psh  even those creepy crawlies  i also say thank you a lot because i enjoy making others feel appreciated and important  it totally boosts their esteem up  it makes me happy   i swear too much by the way  hehe   can t think of anything else  i was born and raised in nor

In [95]:
label_322.essays[3884]

'i m currently a pre nursing student attending community college in the bay area  i absolutely love animals  and although i currently only have one dog  in the future i dream of owning many horses  pigs  dogs and chickens  i d like to be in a situation where i ideally have 1 2 amazing dogs that are mine  while fostering 2 3 shelter dogs   i do photography as a minor hobby  and while i do not see it as a career option for myself  it is something i find very enjoyable  highly intelligent people are instantly attractive to me  i enjoy science fiction books and movies as well as the fantasy genre   i played water polo and swam all four years of high school and currently play polo on a master s team  i started off the first day of polo practice freshman year hardcore doggy paddling it  senior year i was a captain and earned the mvp award  the product of determination and countless hours of hard work and possibly a bit of stubbornness as well   recently  i have gotten into backpacking and hi

In [96]:
# Try 468 now
label_468 = okc.ix[tf[tf['labels']==468].index]

In [97]:
label_468

Unnamed: 0,age,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,...,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status,essays
7166,36,fit,,socially,never,graduated from college/university,,,,,...,,straight,,catholicism but not too serious about it,f,sagittarius,no,"english, spanish (okay)",single,books authors outliers the help maya ange...
7325,42,a little extra,,socially,never,,hi thanks for taking the time to check me out...,i m trying to make the most of it and enjoy my...,,my laugh,...,,straight,has dogs,,f,,no,english,single,hi thanks for taking the time to check me out...
7791,23,,,socially,never,graduated from college/university,i love traveling and exploring new places and ...,i m currently working as a behavior therapist ...,making time for those i care about i don t li...,my smile,...,"doesn&rsquo;t have kids, but wants them",straight,has dogs,,f,leo but it doesn&rsquo;t matter,no,"english (fluently), chinese (okay), indonesian...",single,i love traveling and exploring new places and ...
9096,20,athletic,,often,sometimes,working on college/university,hi im kaciah im not exactly sure what im doi...,i am in college right now and am really enjoyi...,singing listening loving,my eyes my personality haha i dont know ...,...,,straight,has dogs and likes cats,agnosticism but not too serious about it,f,scorpio but it doesn&rsquo;t matter,sometimes,"english (fluently), spanish (okay)",single,hi im kaciah im not exactly sure what im doi...
9410,29,thin,,socially,,graduated from college/university,i believe in pink i believe that laughing is...,kicking ass and taking names,turning a negative into a positive breaking t...,,...,,straight,likes dogs and dislikes cats,catholicism,f,virgo,no,english,single,i believe in pink i believe that laughing is...
12307,28,full figured,mostly anything,socially,never,college/university,just got settled in oakland working as a studi...,lots of things i never thought i would be doin...,umm figuring out how things work making thi...,tits and ass not really usually my hair,...,wants kids,straight,likes dogs and likes cats,,f,sagittarius and it&rsquo;s fun to think about,no,english (fluently),single,just got settled in oakland working as a studi...
13035,45,thin,anything,socially,never,graduated from masters program,laughter and communication keep me going as do...,enjoying it,reading people thinking of others parenting ...,my smile and my positive energy,...,"has kids, but doesn&rsquo;t want more",straight,,,f,,no,english,single,laughter and communication keep me going as do...
14858,29,skinny,mostly anything,socially,never,graduated from masters program,i m a fun loving midwest girl trying to enjoy ...,besides loving my job i recently started road...,being open minded and non judgmental being pa...,some people say my smile other say my blonde ...,...,doesn&rsquo;t have kids,straight,,catholicism,f,libra but it doesn&rsquo;t matter,no,english,single,i m a fun loving midwest girl trying to enjoy ...
17971,37,athletic,anything,often,,graduated from masters program,dear you note as of 2 12 welcome to my self...,i m a family nurse practitioner i love it it...,making people feel good endurance sports bad...,1 dude she s tall 2 she calls everyone my...,...,,bisexual,has dogs and likes cats,other and somewhat serious about it,f,scorpio and it&rsquo;s fun to think about,no,"english (fluently), spanish (okay), french (po...",seeing someone,dear you note as of 2 12 welcome to my self...
18040,30,athletic,anything,socially,,graduated from college/university,i m outgoing and love to meet new people i ge...,enjoying every minute,singing karaoke,my smile,...,"doesn&rsquo;t have kids, but might want them",straight,likes dogs and has cats,agnosticism but not too serious about it,f,libra,no,"english, french (poorly)",single,i m outgoing and love to meet new people i ge...


In [100]:
label_468.sex.value_counts()

f    35
m     2
Name: sex, dtype: int64

In [101]:
label_468.essays[34722]

'i ll be your best friend  i m generally easy to talk to and i have a pretty fantastic sense of humor in that there aren t too many jokes that can offend me  i m better at being one of the guys than one of the girls  and am pretty easy going and laidback   i like to think that i m kind of smart  i m kind of a nerd too  having graduated with an engineering degree  i like  and understand   xkcd  math jokes are fun too   in general  i ll try most things at least once   like anybody else  i enjoy having a good time  whether i m staying in or going out  i m loud and i m fun  and i try to make the most of my time with other people  i m currently working as a support rep at one of the tech start ups in san francisco  i m working on my health and fitness  as well  my 2012 fitness goals are to be strong enough to do an unassisted pull up  don t laugh   and run a half marathon  to that end  i m signed up for the norcal tough mudder in september and the us half in november  i m working on traveli

In [102]:
label_468.essays[38605]

'i m a biggg dork  i enjoy playing guitar bass  finding new music  tattoos  piercings  sculpture  finding awesome grafitti  eating oreos  photography  riding go carts  eating chocolate ice cream  watching movies  eating leftover mac n  cheese with ketchup  drinking yuengling  building stuff  making ceramics  going camping  hiking  or getting lost  taking long drives to random places  finding new favorite spots  eating ketchup  and lots of other stuff  i ll add more        you have to leave the city of your comfort and go into the wilderness of your intuition  what you will discover will be wonderful  what you will discover will be yourself    i am creative  laid back  and eclectic just graduated college with 2 degrees   one in studio ceramics and the other psych  right now i m just working and saving for grad school    fixing things  listening  giving advice  thinking of random things to do  and some other things that only certain people get to experience    i m short   and i have nice

In [103]:
label_468.essays[32368]

'enjoy savoring every moment of life  trying to concentrate my senses on even the littlest of things like the warmth of the sun hitting my cheeks  or the wind blowing through my hair like gentle fingers massaging my scalp  or the beauty of taking in one breath at a time  i am very easily amused as i can laugh and be entertained by looking at the clouds in the sky and deciding what they look like and comparing with friends  views of the clouds  log rolling down a hill can be fun  i adore swimmng  even if it just means being underwater  floating  or feeling at one with the peaceful rhythm of the water and its waves  my absolute favorite hobbies are helping and loving  even when i am really sick  i have a pretty severe case of lupus  but have for about ten years  so i know how to listen to my body and make the best out of pretty much anything  including month   hospitalizations  what brings me the most joy is observing others and figuring out how they really feel  even if they are trying 

In [104]:
label_468.essays[30370]

'hello all     my name is brenda and i was convinced to go on this dating site by my crazy friend amanda whose username is giggletrons  i m super busy working most of the time and rarely get to go out like i used to in order to meet new people so i ll give this a spin or two  i like to read and play video games  i enjoy laughing with my friends and when i m in a good mood  i might just dance a little bit  i grew up in new york but since then i have moved from city to city and am currently living in san francisco  i would love to meet a guy who is nice  has a great sense of humor  and is level headed but spontaneous     i m currently interning with an animation company and working part time with an airline company   i m trying to find a more permanent position but this economy sucks  video games and essays  ha ha  also driving like a new yorker    that s a plus right  my height  well that s for my friends  i m taller than most of my friends  or my eyes  favorite books   my sister s keep

In [105]:
# Some of these clusters don't look very clustery...

#try running clusters again with slightly higher epsilon

# Compute DBSCAN
db = DBSCAN(eps=0.08, min_samples=10).fit(tf)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)

Estimated number of clusters: 492


KeyboardInterrupt: 

In [113]:
# Save new labels to tf and compare content
tf['labels_2'] = pd.Series(labels)

In [116]:
tf['labels_2'].shape

(53951,)

In [119]:
# Changing epsilon from .5 to .8 did not change the labels at all!!!

(tf['labels_2'] == tf['labels']).value_counts()

True    53951
dtype: int64

In [120]:
#try running clusters again with slightly higher epsilon

# Compute DBSCAN
db = DBSCAN(eps=0.1, min_samples=10).fit(tf)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)

# Save new labels to tf and compare content
tf['labels_2'] = pd.Series(labels)

Estimated number of clusters: 492


In [121]:
# Still the same darn clusters
# keep ramping up epsilon

# Compute DBSCAN
db = DBSCAN(eps=0.2, min_samples=10).fit(tf)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)

# Save new labels to tf and compare content
tf['labels_2'] = pd.Series(labels)

Estimated number of clusters: 492


In [125]:
# How many words do each cluster have in common?

labels_1 = tf[tf['labels']==1]

In [145]:
labels_1.columns

Index([      u'03',       u'04',       u'07',     u'1960',     u'1996',
           u'1997',     u'1998',     u'49er',       u'51',       u'5k',
       ...
         u'yearly',   u'yelled',  u'yiddish',   u'yonder',    u'zevon',
          u'zines',     u'zola', u'zucchini',   u'labels', u'labels_2'],
      dtype='object', length=502)

In [147]:
labels_1['yearly'].value_counts()[0]

30

In [153]:
sums_1 = pd.DataFrame([labels_1.sum(axis=0)])

In [160]:
# so label 1 is anything that contains the word carnegie?
for col in sums_1.columns:
    if sums_1[col].values[0] > 0:
        print col
        print sums_1[col]

carnegie
0    30.0
Name: carnegie, dtype: float64
labels
0    30.0
Name: labels, dtype: float64
labels_2
0    30.0
Name: labels_2, dtype: float64


In [161]:
labels_42 = tf[tf['labels']==42]
sums_42 = pd.DataFrame([labels_42.sum(axis=0)])

for col in sums_42.columns:
    if sums_42[col].values[0] > 0:
        print col
        print sums_42[col]

hairstylist
0    40.0
Name: hairstylist, dtype: float64
labels
0    1680.0
Name: labels, dtype: float64
labels_2
0    1680.0
Name: labels_2, dtype: float64


In [178]:
labels_43 = tf[tf['labels']==43]
sums_43 = pd.DataFrame([labels_43.sum(axis=0)])

for col in sums_43.columns:
    if sums_43[col].values[0] > 0:
        print col
        print sums_43[col]

parenthesis
0    20.0
Name: parenthesis, dtype: float64
labels
0    860.0
Name: labels, dtype: float64
labels_2
0    860.0
Name: labels_2, dtype: float64


In [168]:
# What is label 0?
# This contains vast majority of essays
#
label_0 = okc.ix[tf[tf['labels']==0].index]

In [169]:
label_0.shape

(37877, 32)

In [172]:
label_0.essays

1         i imagine that yes is the only living thing  ...
3        i am originally from cincinnati  a mid western...
4        a spirited  compassionate woman who is as comf...
5        100  puerto rican working to support my 10 yea...
6         brevity is the charm of eloquence   i like th...
9        ends here  is my business  going backwards  yo...
12       i m a traveling rn from chicago who stayed in ...
15       i am currently starting my own company in the ...
16       summary  so many ways to tell the story of the...
19       i m 5 4   107 lbs  i spend a lot of time outsi...
21       i m an outgoing  easy to get along with type o...
22       i love meeting new people and experiencing new...
23       i am the king of adventure  picnics  biking  f...
24       i m one of the happiest people you will ever m...
25       i am new to city  just moved here from chile  ...
27       using my blinker  and procrastinating  my smil...
28       for the past ten years i have been to a lot of.

In [189]:
tf[tf['labels']==0]

Unnamed: 0,03,04,07,1960,1996,1997,1998,49er,51,5k,...,yearly,yelled,yiddish,yonder,zevon,zines,zola,zucchini,labels,labels_2
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [164]:
labels_0 = tf[tf['labels']==0]
sums_0 = pd.DataFrame([labels_0.sum(axis=0)])
print sums_0

for col in sums_0.columns:
    if sums_0[col].values[0] > 0:
        print col
        print sums_0[col]

    03   04   07  1960  1996  1997  1998  49er   51   5k    ...     yearly  \
0  0.0  0.0  0.0   0.0   0.0   0.0   0.0   0.0  0.0  0.0    ...        0.0   

   yelled  yiddish  yonder  zevon  zines  zola  zucchini  labels  labels_2  
0     0.0      0.0     0.0    0.0    0.0   0.0       0.0     0.0       0.0  

[1 rows x 502 columns]


In [148]:
for column in labels_1.columns:
    if labels_1[column].value_counts()[0]>10:

        print labels_1[column]
        print labels_1[column].value_counts()

2        0.0
2695     0.0
3527     0.0
4084     0.0
4518     0.0
6793     0.0
8942     0.0
9448     0.0
15940    0.0
21659    0.0
22678    0.0
22830    0.0
24104    0.0
26285    0.0
28058    0.0
29857    0.0
30989    0.0
31051    0.0
35239    0.0
36639    0.0
39407    0.0
40143    0.0
40335    0.0
41295    0.0
42236    0.0
44170    0.0
46711    0.0
47525    0.0
49461    0.0
52695    0.0
Name: 03, dtype: float64
0.0    30
Name: 03, dtype: int64
2        0.0
2695     0.0
3527     0.0
4084     0.0
4518     0.0
6793     0.0
8942     0.0
9448     0.0
15940    0.0
21659    0.0
22678    0.0
22830    0.0
24104    0.0
26285    0.0
28058    0.0
29857    0.0
30989    0.0
31051    0.0
35239    0.0
36639    0.0
39407    0.0
40143    0.0
40335    0.0
41295    0.0
42236    0.0
44170    0.0
46711    0.0
47525    0.0
49461    0.0
52695    0.0
Name: 04, dtype: float64
0.0    30
Name: 04, dtype: int64
2        0.0
2695     0.0
3527     0.0
4084     0.0
4518     0.0
6793     0.0
8942     0.0
9448     0.0


KeyError: 0.0

In [6]:
# try increasing max_df in tfidf vectorizer to help prevent 1 giant cluster from forming

t0 = time()
vec = TfidfVectorizer(encoding='utf-8', stop_words='english',max_df=100, max_features=500)
tf = vec.fit_transform(okc.essays)
print "vectorized essays in %g seconds" %(time()-t0)

vectorized essays in 18.2054 seconds


In [7]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

In [8]:
#try running clusters again with slightly higher epsilon

# Compute DBSCAN
db = DBSCAN(eps=0.1, min_samples=10).fit(tf)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)

# Save new labels to tf and compare content
tf['labels'] = pd.Series(labels)

Estimated number of clusters: 498


ValueError: shape mismatch: objects cannot be broadcast to a single shape

In [13]:
labels

array([ 0, -1, -1, ...,  0,  0, -1])

In [12]:
pd.Series(labels).value_counts()

 0      28960
-1      11583
 152       52
 311       50
 377       47
 52        47
 175       47
 62        46
 352       45
 327       45
 55        44
 25        43
 97        43
 346       42
 198       42
 11        42
 233       41
 165       41
 472       41
 26        41
 60        41
 282       41
 382       40
 256       40
 145       40
 202       39
 92        39
 166       39
 456       39
 397       38
        ...  
 171       17
 51        17
 457       17
 476       17
 445       17
 297       17
 378       17
 495       16
 356       16
 190       16
 180       16
 422       16
 402       16
 324       16
 412       16
 475       16
 236       16
 127       16
 453       16
 57        15
 280       15
 468       14
 232       14
 18        14
 396       14
 482       13
 469       13
 355       13
 497       13
 404       12
dtype: int64

In [14]:
# Looks like increasing max_df from 50 to 100 shrank our mega-cluster.
# Other clusters may still be united by just one word.

# Run tf-idf vectorization again increasing both max_df AND max_features
# Increasing max_df should cut down on megacluster
# Increasing max_features may help smaller clusters form based on two or three words rather just one

# Like cluster #42 above was made just on the word "hairstylist"
# A higher max df_would have included features like 'salon' that these also shared.


t0 = time()
vec = TfidfVectorizer(encoding='utf-8', stop_words='english',max_df=150, max_features=700)
tf = vec.fit_transform(okc.essays)
print "vectorized essays in %g seconds" %(time()-t0)

vectorized essays in 20.1625 seconds


In [15]:
# Compute DBSCAN
db = DBSCAN(eps=0.1, min_samples=10).fit(tf)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)

Estimated number of clusters: 643


In [16]:
pd.Series(labels).value_counts()

-1      23037
 3      18941
 17        46
 136       42
 261       42
 174       39
 170       39
 53        37
 206       37
 209       36
 223       35
 147       35
 269       35
 129       35
 235       35
 423       35
 263       34
 262       34
 142       34
 15        34
 484       34
 584       34
 64        34
 54        33
 298       33
 163       33
 80        33
 26        33
 23        33
 575       33
        ...  
 240       11
 623       11
 99        11
 445       10
 546       10
 461       10
 108       10
 313       10
 579       10
 549       10
 550       10
 482       10
 479       10
 504       10
 495       10
 299       10
 609       10
 140       10
 531       10
 636       10
 598       10
 561       10
 328       10
 361       10
 596       10
 208       10
 618       10
 414       10
 110       10
 45        10
dtype: int64

In [23]:
# What is holding cluster 3 together???
# How does that compare to other clusters

tf_df = pd.DataFrame(tf.toarray(), columns=vec.get_feature_names())

tf_df['label'] = pd.Series(labels)

In [35]:
tf_df.mean(axis=0)

2004               0.001130
2005               0.001335
2006               0.001356
3000               0.001300
400                0.001364
40s                0.001525
95                 0.001356
aaron              0.001469
abandon            0.001215
abandoned          0.001409
abba               0.001414
abs                0.001458
absorbed           0.001352
abuse              0.001356
academia           0.001495
accomplishment     0.001486
accurately         0.001394
additional         0.001224
address            0.001205
ai                 0.001073
aimee              0.001334
alert              0.001291
amlie              0.001331
amores             0.001473
anchor             0.001661
angle              0.001492
antiques           0.001434
apocalyptic        0.001367
apologies          0.001260
appeals            0.001352
                    ...    
vehicle            0.001507
virtually          0.001317
vivaldi            0.001392
vulnerability      0.001270
walnut             0

In [46]:
# Let's have a look at cluster 17 (the biggest of the smaller clusters)
# Is this held together by just one word, or are there more than one?

tf_3 = tf_df[tf_df['label']==3]
tf_3.shape

(18941, 701)

In [50]:
#
tf_df.ix[9,:].value_counts()

0.0    700
3.0      1
Name: 9, dtype: int64

In [42]:
tf_df[tf_df.label==25].sum(axis=0).value_counts()

0.0      699
450.0      1
18.0       1
dtype: int64

In [55]:
# Pandas is not doing a great job on this df
# do computations on original scipy sparse tf

# The means for each column match the means from original scipy df

tf.mean(axis=0)

matrix([[ 0.00113008,  0.00133458,  0.00135638,  0.00130019,  0.00136353,
          0.00152491,  0.00135569,  0.00146914,  0.00121453,  0.00140949,
          0.00141444,  0.00145823,  0.00135174,  0.00135627,  0.00149547,
          0.00148553,  0.00139437,  0.00122375,  0.00120459,  0.0010726 ,
          0.00133385,  0.00129107,  0.00133149,  0.0014732 ,  0.00166053,
          0.00149238,  0.00143367,  0.00136688,  0.00125975,  0.00135228,
          0.00130303,  0.00144598,  0.00138437,  0.00129699,  0.0015955 ,
          0.00131743,  0.00135337,  0.00147335,  0.00155378,  0.00148517,
          0.00138805,  0.0014867 ,  0.00173098,  0.00126742,  0.00135223,
          0.00132766,  0.00152537,  0.00152693,  0.00145777,  0.00141106,
          0.00166996,  0.00141234,  0.00143437,  0.00129137,  0.00118181,
          0.00140517,  0.00149497,  0.00150729,  0.00136918,  0.0014939 ,
          0.00153678,  0.00148733,  0.00129601,  0.00135853,  0.0014674 ,
          0.00138254,  0.00145573,  0.

In [65]:
# Must convert to dense to filter values?
# Even a regular numpy array converts everything to 0.
# must do all compuatations on original scipy sparse
# fml

tf_a = tf.toarray()

np.sum(tf_a[labels==3], axis=0)



array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0