# HW5_week6_Modeling text data with a hierarchy of clusters

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans                # we'll be using scikit-learn's KMeans for this assignment
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
import json
%matplotlib inline

### 1. Data Preprocessing 

In [3]:
# People dataset; later cells read its 'name' and 'text' columns.
wiki = pd.read_csv('people_wiki.csv')

Load the TF-IDF vectors:

In [17]:
def load_sparse_csr(filename):
    '''Load a SciPy CSR matrix from a NumPy ``.npz`` archive.

    filename: path to an ``.npz`` file containing the arrays ``data``,
              ``indices``, ``indptr`` and ``shape`` (the three CSR component
              arrays plus the matrix dimensions).

    Returns a ``scipy.sparse.csr_matrix`` rebuilt from those arrays.
    Raises KeyError if one of the four arrays is missing from the archive.
    '''
    # np.load returns a lazily-read NpzFile for .npz archives and keeps the
    # underlying file open; the context manager guarantees it is closed once
    # the arrays have been read into memory.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # Plain Python ints so the shape is accepted regardless of the stored dtype.
        shape = tuple(int(dim) for dim in loader['shape'])

    return csr_matrix((data, indices, indptr), shape=shape)

# Sparse TF-IDF matrix; its rows are assumed to line up with the rows of `wiki` -- TODO confirm.
tf_idf = load_sparse_csr('people_wiki_tf_idf.npz')

In [13]:
# Load the vocabulary mapping; later cells treat it as a dict of word -> TF-IDF column index.
with open ('people_wiki_map_index_to_word.json', 'r') as f:
    map_index_to_word = json.load(f)

In [18]:
# Rescale every TF-IDF row to unit length (sklearn's normalize defaults to the L2 norm per row).
tf_idf = normalize(tf_idf) # x / sqrt(sum(x^2))

### 2. Implement modeling

Rather than passing around the three variables (the dataframe, the data matrix and the centroid) separately, we package them into a Python dictionary. The wrapper function takes a single dictionary (representing a parent cluster) and returns two dictionaries (representing the child clusters).

In [31]:
def bipartition(cluster, maxiter = 400, num_runs = 4, seed = None):
    '''Split one cluster into two child clusters using k-means with k=2.

    cluster:  dictionary describing the parent cluster, with the keys
                * dataframe: dataframe whose rows line up one-to-one with the
                             rows of matrix
                * matrix:    same data, in matrix format
                * centroid:  centroid for this particular cluster (not read
                             here, so a root cluster may omit it)
    maxiter:  forwarded to KMeans as max_iter
    num_runs: forwarded to KMeans as n_init
    seed:     forwarded to KMeans as random_state

    Returns a tuple (left_child, right_child) of dictionaries with the same
    three keys; the left child holds the rows labelled 0 by k-means and the
    right child the rows labelled 1.
    '''

    data_matrix = cluster['matrix']
    dataframe   = cluster['dataframe']

    # Run k-means on the data matrix with k=2. We use scikit-learn here to simplify workflow.
    # n_jobs is deliberately not passed: it was deprecated in scikit-learn 0.23 and
    # removed in 1.0 (where it raises TypeError), and a single job was already the
    # default on the older releases that accepted it.
    kmeans_model = KMeans(n_clusters=2, max_iter=maxiter, n_init=num_runs, random_state=seed)
    kmeans_model.fit(data_matrix)
    centroids = kmeans_model.cluster_centers_
    cluster_assignment = np.asarray(kmeans_model.labels_)

    # Apply the same boolean row mask to the matrix and to the dataframe so the
    # two views of each child stay aligned.
    children = []
    for label in (0, 1):
        members = cluster_assignment == label
        children.append({'matrix': data_matrix[members],
                         'dataframe': dataframe[members],
                         'centroid': centroids[label]})

    return tuple(children)

In [32]:
# Root of the hierarchy: the whole dataset packaged as a cluster, then split in two.
wiki_data = {'matrix': tf_idf, 'dataframe': wiki} # no 'centroid' for the root cluster
left_child, right_child = bipartition(wiki_data, maxiter = 100, num_runs = 6, seed = 1)

In [33]:
# Dump both child clusters (centroid, matrix summary and dataframe).
# The parenthesized form prints the same text on Python 2 and also runs on
# Python 3, matching the print(...) calls used elsewhere in this notebook.
print(left_child)
print(right_child)

{'centroid': array([  0.00000000e+00,   8.57526623e-06,   0.00000000e+00, ...,
         1.38560691e-04,   6.46049863e-05,   2.26551103e-05]), 'matrix': <11510x547979 sparse matrix of type '<type 'numpy.float64'>'
	with 1885831 stored elements in Compressed Sparse Row format>, 'dataframe':                                                      URI  \
0            <http://dbpedia.org/resource/Digby_Morrell>   
17     <http://dbpedia.org/resource/Paddy_Dunne_(Gael...   
21           <http://dbpedia.org/resource/Ceiron_Thomas>   
22            <http://dbpedia.org/resource/Adel_Sellimi>   
25             <http://dbpedia.org/resource/Vic_Stasiuk>   
28            <http://dbpedia.org/resource/Leon_Hapgood>   
30               <http://dbpedia.org/resource/Dom_Flora>   
33               <http://dbpedia.org/resource/Bob_Reece>   
41     <http://dbpedia.org/resource/Bob_Adams_(Americ...   
48              <http://dbpedia.org/resource/Marc_Logan>   
49          <http://dbpedia.org/resource/Corey_Woo

### 3. Visualize the bipartition

We provide you with a modified version of the visualization function from the k-means assignment. For each cluster, we print the top 5 words with highest TF-IDF weights in the centroid and display excerpts for the 8 nearest neighbors of the centroid.

In [42]:
# Convert the dictionary map_index_to_word (word -> column index) into a list of
# words ordered by column index, so that words[i] is the word for column i
# (assumes the indices are contiguous and start at 0 -- TODO confirm).
words = sorted(map_index_to_word, key = map_index_to_word.get)

In [51]:
def display_single_tf_idf_cluster(cluster, map_index_to_word):
    '''Print a short text summary of one cluster.

    cluster:           dictionary with the keys 'dataframe' (must provide the
                       columns 'name' and 'text'), 'matrix' (one row per
                       dataframe row) and 'centroid' (1-D array over the
                       matrix columns)
    map_index_to_word: dictionary mapping each word to its column index

    Prints the (up to) 5 words with the largest centroid weights on one line,
    then, for the (up to) 8 rows closest to the centroid, the name, the
    Euclidean distance and the first 180 characters of text.

    The word lookup uses only the map_index_to_word argument, so the function
    does not depend on any notebook-level variable.
    '''

    wiki_subset   = cluster['dataframe']
    tf_idf_subset = cluster['matrix']
    centroid      = cluster['centroid']

    # Print top 5 words with largest TF-IDF weights in the cluster
    # (fewer if the centroid has fewer than 5 components).
    order = centroid.argsort()[::-1]
    top_indices = [int(i) for i in order[:5]]
    # Invert the mapping only for the indices we need instead of sorting the
    # whole vocabulary.
    wanted = set(top_indices)
    index_to_word = dict((index, word) for word, index in map_index_to_word.items()
                         if index in wanted)
    print(' '.join('{0:s}:{1:.3f}'.format(index_to_word[i], centroid[i]) for i in top_indices))

    # Compute distances from the centroid to all data points in the cluster.
    distances = pairwise_distances(tf_idf_subset, [centroid], metric = 'euclidean').flatten()
    # compute nearest neighbors of the centroid within the cluster.
    nearest_neighbors = distances.argsort()
    # For 8 nearest neighbors, print the title as well as first 180 characters of text.
    # Wrap the text at the 90-character mark. Clusters with fewer than 8 rows
    # simply list every row.
    for rank in range(min(8, len(nearest_neighbors))):
        position = nearest_neighbors[rank]
        row = wiki_subset.iloc[position]
        # Keep at most the first 25 whitespace-separated tokens.
        text = ' '.join(row['text'].split(None, 25)[0:25])
        print('* {0:50s} {1:.5f}\n  {2:s}\n  {3:s}'.format(row['name'],
              distances[position], text[:90], text[90:180]))
    print('')

In [54]:
# Summarize the left child of the root split: top centroid words and the articles nearest the centroid.
display_single_tf_idf_cluster(left_child, map_index_to_word)

league:0.040 season:0.036 team:0.029 football:0.029 played:0.028 
* Todd Williams                                      0.95468
  todd michael williams born february 13 1971 in syracuse new york is a former major league 
  baseball relief pitcher he attended east syracuseminoa high school
* Gord Sherven                                       0.95622
  gordon r sherven born august 21 1963 in gravelbourg saskatchewan and raised in mankota sas
  katchewan is a retired canadian professional ice hockey forward who played
* Justin Knoedler                                    0.95639
  justin joseph knoedler born july 17 1980 in springfield illinois is a former major league 
  baseball catcherknoedler was originally drafted by the st louis cardinals
* Chris Day                                          0.95648
  christopher nicholas chris day born 28 july 1975 is an english professional footballer who
   plays as a goalkeeper for stevenageday started his career at tottenham
* Tony Smith (football

In [53]:
# Summarize the right child of the root split in the same way.
display_single_tf_idf_cluster(right_child, map_index_to_word)

she:0.025 her:0.017 music:0.012 he:0.011 university:0.011 
* Anita Kunz                                         0.97401
  anita e kunz oc born 1956 is a canadianborn artist and illustratorkunz has lived in london
   new york and toronto contributing to magazines and working
* Janet Jackson                                      0.97472
  janet damita jo jackson born may 16 1966 is an american singer songwriter and actress know
  n for a series of sonically innovative socially conscious and
* Madonna (entertainer)                              0.97475
  madonna louise ciccone tkoni born august 16 1958 is an american singer songwriter actress 
  and businesswoman she achieved popularity by pushing the boundaries of lyrical
* %C3%81ine Hyland                                   0.97536
  ine hyland ne donlon is emeritus professor of education and former vicepresident of univer
  sity college cork ireland she was born in 1942 in athboy co
* Jane Fonda                                         0.9

### 4. Perform recursive bipartitioning

**Criteria:** Should we keep subdividing the clusters? If so, which cluster should we subdivide?  
We would like to achieve a similar level of granularity for all clusters.

To help identify the clusters we've built so far, let's give them easy-to-read aliases:

In [55]:
# Aliases chosen from the summaries printed above: the left child's top words are
# sports terms (league, season, team), the right child's are not (she, her, music).
athletes = left_child
non_athletes = right_child

In [56]:
# Bipartition the cluster of athletes (same k-means settings as the root split)
left_child_athletes, right_child_athletes = bipartition(athletes, maxiter = 100, num_runs = 6, seed = 1)

In [57]:
# Summarize the left child of the athletes split.
display_single_tf_idf_cluster(left_child_athletes, map_index_to_word)

baseball:0.111 league:0.103 major:0.051 games:0.046 season:0.045 
* Steve Springer                                     0.89344
  steven michael springer born february 11 1961 is an american former professional baseball 
  player who appeared in major league baseball as a third baseman and
* Dave Ford                                          0.89598
  david alan ford born december 29 1956 is a former major league baseball pitcher for the ba
  ltimore orioles born in cleveland ohio ford attended lincolnwest
* Todd Williams                                      0.89823
  todd michael williams born february 13 1971 in syracuse new york is a former major league 
  baseball relief pitcher he attended east syracuseminoa high school
* Justin Knoedler                                    0.90097
  justin joseph knoedler born july 17 1980 in springfield illinois is a former major league 
  baseball catcherknoedler was originally drafted by the st louis cardinals
* Kevin Nicholson (baseball)        

In [58]:
# Summarize the right child of the athletes split.
display_single_tf_idf_cluster(right_child_athletes, map_index_to_word)

season:0.034 football:0.033 team:0.031 league:0.029 played:0.027 
* Gord Sherven                                       0.95562
  gordon r sherven born august 21 1963 in gravelbourg saskatchewan and raised in mankota sas
  katchewan is a retired canadian professional ice hockey forward who played
* Ashley Prescott                                    0.95656
  ashley prescott born 11 september 1972 is a former australian rules footballer he played w
  ith the richmond and fremantle football clubs in the afl between
* Chris Day                                          0.95656
  christopher nicholas chris day born 28 july 1975 is an english professional footballer who
   plays as a goalkeeper for stevenageday started his career at tottenham
* Jason Roberts (footballer)                         0.95658
  jason andre davis roberts mbe born 25 january 1978 is a former professional footballer and
   now a football punditborn in park royal london roberts was
* Todd Curley                         

In [59]:
# Aliases chosen from the summaries printed above: the left child is dominated by
# baseball terms; the right child mixes football and ice hockey articles.
baseball            = left_child_athletes
ice_hockey_football = right_child_athletes

#### Bipartition the cluster of ice_hockey_football

In [63]:
# Bipartition the cluster of ice_hockey_football
# NOTE(review): this call uses num_runs = 8 while the other splits use 6 -- confirm this is intentional.
left_child_ice_hockey_football, right_child_ice_hockey_football = bipartition(ice_hockey_football, maxiter = 100, num_runs = 8, seed = 1)

In [64]:
# Summarize the left child of the ice_hockey_football split.
display_single_tf_idf_cluster(left_child_ice_hockey_football, map_index_to_word)

championships:0.045 tour:0.044 championship:0.035 world:0.031 won:0.031 
* Alessandra Aguilar                                 0.93849
  alessandra aguilar born 1 july 1978 in lugo is a spanish longdistance runner who specialis
  es in marathon running she represented her country in the event
* Heather Samuel                                     0.93966
  heather barbara samuel born 6 july 1970 is a retired sprinter from antigua and barbuda who
   specialized in the 100 and 200 metres in 1990
* Viola Kibiwot                                      0.94008
  viola jelagat kibiwot born december 22 1983 in keiyo district is a runner from kenya who s
  pecialises in the 1500 metres kibiwot won her first
* Ayelech Worku                                      0.94022
  ayelech worku born june 12 1979 is an ethiopian longdistance runner most known for winning
   two world championships bronze medals on the 5000 metres she
* Krisztina Papp                                     0.94070
  krisztina papp 

In [65]:
# Summarize the right child of the ice_hockey_football split.
display_single_tf_idf_cluster(right_child_ice_hockey_football, map_index_to_word)

football:0.048 season:0.043 league:0.041 played:0.036 coach:0.034 
* Todd Curley                                        0.94580
  todd curley born 14 january 1973 is a former australian rules footballer who played for co
  llingwood and the western bulldogs in the australian football league
* Tony Smith (footballer, born 1957)                 0.94607
  anthony tony smith born 20 february 1957 is a former footballer who played as a central de
  fender in the football league in the 1970s and
* Chris Day                                          0.94624
  christopher nicholas chris day born 28 july 1975 is an english professional footballer who
   plays as a goalkeeper for stevenageday started his career at tottenham
* Ashley Prescott                                    0.94634
  ashley prescott born 11 september 1972 is a former australian rules footballer he played w
  ith the richmond and fremantle football clubs in the afl between
* Jason Roberts (footballer)                         0.9

#### Cluster of non-athletes.

In [61]:
# Bipartition the cluster of non-athletes
left_child_non_athletes, right_child_non_athletes = bipartition(non_athletes, maxiter=100, num_runs=6, seed=1)

# Summarize both children, left first.
display_single_tf_idf_cluster(left_child_non_athletes, map_index_to_word)
display_single_tf_idf_cluster(right_child_non_athletes, map_index_to_word)

he:0.013 music:0.012 university:0.011 film:0.010 his:0.009 
* Wilson McLean                                      0.97870
  wilson mclean born 1937 is a scottish illustrator and artist he has illustrated primarily 
  in the field of advertising but has also provided cover art
* Julian Knowles                                     0.97938
  julian knowles is an australian composer and performer specialising in new and emerging te
  chnologies his creative work spans the fields of composition for theatre dance
* James A. Joseph                                    0.98042
  james a joseph born 1935 is an american former diplomatjoseph is professor of the practice
   of public policy studies at duke university and founder of
* Barry Sullivan (lawyer)                            0.98054
  barry sullivan is a chicago lawyer and as of july 1 2009 the cooney conway chair in advoca
  cy at loyola university chicago school of law
* Archie Brown                                       0.98081
  archibal

In [67]:
# Aliases chosen from the cluster summaries: the left child's top words include
# 'he' and 'his'. NOTE(review): confirm the right child against its own summary.
male_non_athletes = left_child_non_athletes
female_non_athletes = right_child_non_athletes

#### Male_non_athletes

In [68]:
# Bipartition the cluster of male_non_athletes
left_child_male_non_athletes, right_child_male_non_athletes = bipartition(male_non_athletes, maxiter = 100, num_runs = 6, seed = 1)

# Summarize both children, left first.
display_single_tf_idf_cluster(left_child_male_non_athletes, map_index_to_word)
display_single_tf_idf_cluster(right_child_male_non_athletes, map_index_to_word)

university:0.017 he:0.015 law:0.013 served:0.013 research:0.013 
* Barry Sullivan (lawyer)                            0.97075
  barry sullivan is a chicago lawyer and as of july 1 2009 the cooney conway chair in advoca
  cy at loyola university chicago school of law
* James A. Joseph                                    0.97344
  james a joseph born 1935 is an american former diplomatjoseph is professor of the practice
   of public policy studies at duke university and founder of
* David Anderson (British Columbia politician)       0.97383
  david a anderson pc oc born august 16 1937 in victoria british columbia is a former canadi
  an cabinet minister educated at victoria college in victoria
* Sven Erik Holmes                                   0.97469
  sven erik holmes is a former federal judge and currently the vice chairman legal risk and 
  regulatory and chief legal officer for kpmg llp a
* Andrew Fois                                        0.97558
  andrew fois is an attorney livi

#### Female_non_athletes

In [69]:
# Bipartition the cluster of female_non_athletes
left_child_female_non_athletes, right_child_female_non_athletes = bipartition(female_non_athletes, maxiter = 100, num_runs = 6, seed = 1)

# Summarize both children, left first.
display_single_tf_idf_cluster(left_child_female_non_athletes, map_index_to_word)
display_single_tf_idf_cluster(right_child_female_non_athletes, map_index_to_word)

she:0.121 her:0.100 actress:0.031 film:0.030 music:0.028 
* Janet Jackson                                      0.92374
  janet damita jo jackson born may 16 1966 is an american singer songwriter and actress know
  n for a series of sonically innovative socially conscious and
* Barbara Hershey                                    0.92524
  barbara hershey born barbara lynn herzstein february 5 1948 once known as barbara seagull 
  is an american actress in a career spanning nearly 50 years
* Madonna (entertainer)                              0.92753
  madonna louise ciccone tkoni born august 16 1958 is an american singer songwriter actress 
  and businesswoman she achieved popularity by pushing the boundaries of lyrical
* Cher                                               0.92909
  cher r born cherilyn sarkisian may 20 1946 is an american singer actress and television ho
  st described as embodying female autonomy in a maledominated industry
* Candice Bergen                               