In [21]:
from sklearn.datasets import fetch_20newsgroups
import nltk.data
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import logging
from gensim.models import Word2Vec,KeyedVectors
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from stemming.porter2 import stem
from sklearn.cluster import KMeans
import time
from sklearn.ensemble import RandomForestClassifier

In [2]:
#loading the news dataset from sklearn dataset library
twenty_train = fetch_20newsgroups(subset='train',shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

# tokenize to sentences based on the notations followed in english literature
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print("\n".join(twenty_train.data[0].split("\n")[:3]))
print(twenty_train.target_names[twenty_train.target[0]])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
rec.autos


In [3]:
# function to parse sentences to words and remove stopwords from sentence
def sentence_to_wordlist( sentence, remove_stopwords=True ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    text = BeautifulSoup(sentence).get_text()
    #  
    # 2. Remove non-letters
    text = re.sub("[^a-zA-Z]"," ",text)
    #
    # 3. Convert words to lower case and split them
    words = text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [4]:
# function to parse text to sentences using tokenizer mentioned above
def text_to_sentences(text,tokenizer,remove_stopwords=True):
    raw_sentences = tokenizer.tokenize(text.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( sentence_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [5]:
#train_sentences and test_sentences contain the filtered sentences of training and testing data respectively

train_sentences = []  # Initialize an empty list of sentences
test_sentences = []

print "Parsing sentences from training set"
for i in range(len(twenty_train.data)):
    train_sentences += [[stem(str(word)) for word in sentence] \
                        for sentence in [text_to_sentences(twenty_train.data[i], tokenizer)]]
print len(train_sentences)
print train_sentences[0]

print "Parsing sentences from unlabeled set"
for i in range(len(twenty_test.data)):
    test_sentences += [[stem(str(word)) for word in sentence] \
                       for sentence in [text_to_sentences(twenty_test.data[i], tokenizer)]]
print len(test_sentences)
print test_sentences[0]

Parsing sentences from training set




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))
  'Beautiful Soup.' % markup)
  'Beautiful Soup.' % markup)


11314
["[u'lerxst', u'wam', u'umd', u'edu', u'thing', u'subject', u'car']", "[u'nntp', u'posting', u'host', u'rac', u'wam', u'umd', u'edu', u'organization', u'university', u'maryland', u'college', u'park', u'lines', u'wondering', u'anyone', u'could', u'enlighten', u'car', u'saw', u'day']", "[u'door', u'sports', u'car', u'looked', u'late', u'early']", "[u'called', u'bricklin']", "[u'doors', u'really', u'small']", "[u'addition', u'front', u'bumper', u'separate', u'rest', u'body']", "[u'know']", "[u'anyone', u'tellme', u'model', u'name', u'engine', u'specs', u'years', u'production', u'car', u'made', u'history', u'whatever', u'info', u'funky', u'looking', u'car', u'please', u'e', u'mail']", "[u'thanks', u'il', u'brought', u'neighborhood', u'lerxst']"]
Parsing sentences from unlabeled set


  'Beautiful Soup.' % markup)


7532
["[u'v', u'mb', u'k', u'ubvmsd', u'cc', u'buffalo', u'edu', u'neil', u'b', u'gandler', u'subject', u'need', u'info', u'bonneville', u'organization', u'university', u'buffalo', u'lines', u'news', u'software', u'vax', u'vms', u'vnews', u'nntp', u'posting', u'host', u'ubvmsd', u'cc', u'buffalo', u'edu', u'little', u'confused', u'models', u'bonnevilles']", "[u'heard', u'le', u'se', u'lse', u'sse', u'ssei']", "[u'could', u'someone', u'tell', u'differences', u'far', u'features', u'performance']", "[u'also', u'curious', u'know', u'book', u'value', u'prefereably', u'model']", "[u'much', u'less', u'book', u'value', u'usually', u'get']", "[u'words', u'much', u'demand', u'time', u'year']", "[u'heard', u'mid', u'spring', u'early', u'summer', u'best', u'time', u'buy']", "[u'neil', u'gandler']"]


In [7]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words
# Initialize and train the model (this will take some time)

print "Training model..."
model = Word2Vec(train_sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=False)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)
#model.save_word2vec_format(model_name,binary=False)

2017-03-23 18:06:31,385 : INFO : collecting all words and their counts
2017-03-23 18:06:31,386 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-03-23 18:06:31,528 : INFO : PROGRESS: at sentence #10000, processed 166860 words, keeping 130308 word types
2017-03-23 18:06:31,547 : INFO : collected 144031 word types from a corpus of 187884 raw words and 11314 sentences
2017-03-23 18:06:31,548 : INFO : Loading a fresh vocabulary


Training model...


2017-03-23 18:06:31,641 : INFO : min_count=40 retains 28 unique words (0% of original 144031, drops 144003)
2017-03-23 18:06:31,642 : INFO : min_count=40 leaves 7150 word corpus (3% of original 187884, drops 180734)
2017-03-23 18:06:31,643 : INFO : deleting the raw counts dictionary of 144031 items
2017-03-23 18:06:31,651 : INFO : sample=0.001 downsamples 28 most-common words
2017-03-23 18:06:31,655 : INFO : downsampling leaves estimated 1012 word corpus (14.2% of prior 7150)
2017-03-23 18:06:31,657 : INFO : estimated required memory for 28 words and 300 dimensions: 81200 bytes
2017-03-23 18:06:31,659 : INFO : resetting layer weights
2017-03-23 18:06:31,661 : INFO : training model with 4 workers on 28 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2017-03-23 18:06:31,662 : INFO : expecting 11314 sentences, matching count from corpus used for vocabulary survey
2017-03-23 18:06:31,912 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-

In [14]:
print model.wv.syn0.shape[0]
# Set "k" (num_clusters) to be number of categories in newsgroup dataset
word_vectors = model.wv.syn0
num_clusters = 20

start = time.time()
# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print "Time taken for K Means clustering: ", elapsed, "seconds."

28
Time taken for K Means clustering:  0.0819571018219 seconds.


In [16]:
# Create a Word / Index dictionary, mapping each vocabulary word to
# a cluster number                                                                                            
word_centroid_map = dict(zip( model.wv.index2word, idx ))

In [17]:
for cluster in range(num_clusters):
    #
    # Print the cluster number  
    print "\nCluster %d" % cluster
    #
    # Find all of the words for that cluster number, and print them out
    words = []
    for i in xrange(0,len(word_centroid_map.values())):
        if( word_centroid_map.values()[i] == cluster ):
            words.append(word_centroid_map.keys()[i])
    print words


Cluster 0
["[u'good']"]

Cluster 1
['[]']

Cluster 2
["[u'serdar', u'argic', u'closed', u'roads', u'mountain', u'passes', u'might', u'serve', u'ways', u'escape', u'turks', u'proceeded', u'work', u'extermination']"]

Cluster 3
["[u'really']", "[u'e']"]

Cluster 4
["[u'keith']", "[u'article']"]

Cluster 5
["[u'yes']"]

Cluster 6
["[u'survivors']", "[u'jon']"]

Cluster 7
["[u'think']", "[u'thanks']"]

Cluster 8
["[u'p']"]

Cluster 9
["[u'sure']", "[u'etc']", "[u'know']"]

Cluster 10
["[u'true']", "[u'right']"]

Cluster 11
["[u'sahak', u'melkonian']"]

Cluster 12
["[u'ohanus', u'appressian', u'soviet', u'armenia', u'today', u'longer', u'exists', u'single', u'turkish', u'soul']"]

Cluster 13
["[u'wrong']"]

Cluster 14
["[u'alink', u'ksand', u'private', u'activities', u'net']", "[u'gordon', u'banks', u'n', u'jxp', u'skepticism', u'chastity', u'intellect', u'geb', u'cadre', u'dsl', u'pitt', u'edu', u'shameful', u'surrender', u'soon']"]

Cluster 15
["[u'clh']"]

Cluster 16
["[u'thanks', u'adv

In [18]:
#This function will give us a numpy array for each news, each with a number of features equal to the number of clusters. 
def create_bag_of_centroids( wordlist, word_centroid_map ):
    #
    # The number of clusters is equal to the highest cluster index
    # in the word / centroid map
    num_centroids = max( word_centroid_map.values() ) + 1
    #
    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    #
    # Loop over the words in the news. If the word is in the vocabulary,
    # find which cluster it belongs to, and increment that cluster count 
    # by one
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1
    #
    # Return the "bag of centroids"
    return bag_of_centroids

In [22]:
# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros( ( len(twenty_train.data), num_clusters), \
    dtype="float32" )

# Transform the training set news into bags of centroids
counter = 0
for sentence in train_sentences:
    train_centroids[counter] = create_bag_of_centroids( sentence, \
        word_centroid_map )
    counter += 1

# Repeat for test news data 
test_centroids = np.zeros(( len(twenty_test.data), num_clusters), \
    dtype="float32" )

counter = 0
for sentence in test_sentences:
    test_centroids[counter] = create_bag_of_centroids( sentence, \
        word_centroid_map )
    counter += 1

# Fit a random forest and extract predictions 
forest = RandomForestClassifier(n_estimators = 100)

# Fitting the forest may take a few minutes
print "Fitting a random forest to labeled training data..."
forest = forest.fit(train_centroids,twenty_train.target)
result = forest.predict(test_centroids)

print np.mean(result==twenty_test.target)

Fitting a random forest to labeled training data...
0.0886882634095
