In [2]:
from sklearn.datasets import load_files 

import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

from __future__ import print_function

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.externals import joblib
from sklearn.cluster import KMeans

In [3]:
# preparar funciones de procesamiento de texto

def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [4]:
dataset = load_files('data/txt_sentoken', shuffle=False)

synopses = []
N = len(dataset['data'])

for i in range(N):
    synopses.append(dataset['data'][i].decode('utf-8').strip())    

In [5]:
totalvocab_stemmed = []

for i in synopses:
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)
        
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')
f_text = [word for word in totalvocab_stemmed if word not in stopwords]

vocab_frame = pd.DataFrame({'words': f_text}, index = range(len(f_text)))

In [6]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(synopses) #fit the vectorizer to synopses
print(tfidf_matrix.shape)

terms = tfidf_vectorizer.get_feature_names()

(2000, 165)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [7]:
num_clusters = 5

km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [8]:
# Recuento del número de elementos en cada cluster
for i in range(num_clusters):
    print ('El cluster %i tiene %i elementos' % (i, clusters.count(i)))

El cluster 0 tiene 535 elementos
El cluster 1 tiene 592 elementos
El cluster 2 tiene 385 elementos
El cluster 3 tiene 308 elementos
El cluster 4 tiene 180 elementos


In [17]:
num_clusters = 10
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [19]:
print("Top terms per cluster:")
print()

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]     
        
for i in range(num_clusters):
    print("[[ Cluster %d ]]" % i, end='\n\n')
    
    print("  WORDS /// ", end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        #print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=' / ')
        print(' %s' % terms[ind], end=' /')
    print('\n')
    
print('\n')    

Top terms per cluster:

[[ Cluster 0 ]]

  WORDS ///  stori / love / charact / doe / like / time /

[[ Cluster 1 ]]

  WORDS ///  life / world / charact / man / live / make /

[[ Cluster 2 ]]

  WORDS ///  john / origin / like / new / time / effect /

[[ Cluster 3 ]]

  WORDS ///  like / becom / charact / high / play / scene /

[[ Cluster 4 ]]

  WORDS ///  charact / veri / play / perform / good / star /

[[ Cluster 5 ]]

  WORDS ///  action / scene / like / charact / plot / make /

[[ Cluster 6 ]]

  WORDS ///  like / just / scene / make / good / time /

[[ Cluster 7 ]]

  WORDS ///  did / did n't / like / just / know / charact /

[[ Cluster 8 ]]

  WORDS ///  comedi / funni / laugh / like / charact / just /

[[ Cluster 9 ]]

  WORDS ///  bad / guy / good / like / just / act /



