In [1]:
from sklearn.svm import LinearSVC
from gensim.models import Word2Vec
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
#from abstract import *
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from sklearn.grid_search import GridSearchCV
import numpy as np
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import classification_report




In [2]:
categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',categories=categories, remove=('headers', 'footers', 'quotes'))

In [3]:
from pprint import pprint
#pprint(newsgroups_train)


# Using TF-IDF 

In [4]:
vectorizer = TfidfVectorizer()
le = LabelEncoder()

# Training 
vectors_train = vectorizer.fit_transform(newsgroups_train.data)
labels_train = le.fit_transform(newsgroups_train.target)

# Just for debug quickly reassign labels
X = vectors_train
y = labels_train


clf = MultinomialNB(alpha=.01)
#clf = svm.SVC(kernel='linear', C = 1.0)

# Evaluate using cross validation
cross_val = KFold(len(newsgroups_train.target),n_folds=5,shuffle=True)

#for train_index, test_index in cross_val:
#    X_train,X_test = X[train_index],X[test_index]
#    y_train,y_test = y[train_index],y[test_index]
#    clf.fit(X_train,y_train)
#    y_pred = clf.predict(X_test)
#    print classification_report(le.inverse_transform(y_test),le.inverse_transform(y_pred))
#    print metrics.confusion_matrix(y_pred,y_test)

clf.fit(vectors_train, newsgroups_train.target)
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)

#print metrics.f1_score(newsgroups_test.target, pred)
print classification_report(le.inverse_transform(newsgroups_test.target),le.inverse_transform(pred))


             precision    recall  f1-score   support

          0       0.68      0.68      0.68       319
          1       0.92      0.89      0.91       389
          2       0.81      0.90      0.85       394
          3       0.68      0.60      0.64       251

avg / total       0.79      0.79      0.79      1353



# Using Doc2Vec

In [5]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec

from collections import OrderedDict
import multiprocessing
from gensim.models.doc2vec import TaggedDocument
import sys

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"


In [6]:
from collections import namedtuple
import nltk

NewsgroupDocument = namedtuple('NewsGroupDocument', 'words tags split category')
doc_count = 0 # Used to generate unique id for all documents across both train and test

all_newsgroup_documents = []

#Used to convert newsgroup corpus into Doc2Vec formats
def convert_newsgroup(docs,labels,split):
    global doc_count
    tagged_documents = []
    
    for doc, label in zip(docs,labels):
        doc_count += 1
        #print doc
        words = gensim.utils.to_unicode(doc).split() # expected by gensim
        tags = [doc_count] #needs to be a list. Exp with having multiple tags
        all_newsgroup_documents.append(NewsgroupDocument(words,tags,split,label))
        #print words
        
        #if doc_count == 5:
        #    print all_newsgroup_documents
        #    break
            #sys.exit(0)
    

convert_newsgroup(newsgroups_test.data,newsgroups_test.target,'test')
convert_newsgroup(newsgroups_train.data,newsgroups_train.target,'train')

train_docs = [doc for doc in all_newsgroup_documents if doc.split == 'train']
test_docs = [doc for doc in all_newsgroup_documents if doc.split == 'test']
doc_list = all_newsgroup_documents[:]  # for reshuffling per pass

print('%d docs: %d train, %d test' % (len(doc_list), len(train_docs), len(test_docs)))

3387 docs: 2034 train, 1353 test


### Set Up Doc2Vec

In [7]:
# Models to evaluate
simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=200, window=3, negative=100, hs=0, min_count=1, workers=cores),
    # PV-DBOW 
    #Doc2Vec(dm=0, size=300, negative=5, hs=0, min_count=1, workers=cores),
    # PV-DM w/average
    #Doc2Vec(dm=1, dm_mean=1, size=300, window=10, negative=5, hs=0, min_count=1, workers=cores),
]

# speed setup by sharing results of 1st model's vocabulary scan
simple_models[0].build_vocab(all_newsgroup_documents)  # PV-DM/concat requires one special NULL word so it serves as template
print(simple_models[0])

for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

Doc2Vec(dm/c,d200,n100,w3,t8)


### Prediction and Evaluation

### Training

Why is bulk training done with all the  docs from train, test,  and dev ? Compared to other methods training is only done on the training dataset 

In [8]:
from collections import defaultdict
best_error = defaultdict(lambda :1.0)  # to selectively-print only best errors achieved

In [9]:
#Get Vectors From Word2Vec
def extract_vectors(model,docs):
    
    vectors_list = []
    
    for doc_no in range(len(docs)):
        doc_label = docs[doc_no].tags[0]
        doc_vector = model.docvecs[doc_label]
        vectors_list.append(doc_vector)
        
    return vectors_list

def evaluation():
    clf = MultinomialNB(alpha=.01)

    # Evaluate using cross validation
    cross_val = KFold(len(newsgroups_train.target),n_folds=5,shuffle=True)

    for train_index, test_index in cross_val:
        X_train,X_test = X[train_index],X[test_index]
        y_train,y_test = y[train_index],y[test_index]
        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        print classification_report(le.inverse_transform(y_test),le.inverse_transform(y_pred))
        print metrics.confusion_matrix(y_pred,y_test)


In [11]:

from random import shuffle
alpha, min_alpha, passes = (0.025, 0.001, 25)
alpha_delta = (alpha - min_alpha) / passes
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC


for epoch in range(passes):
    shuffle(doc_list)
    
    for name, train_model in models_by_name.items():
        
        #Train
        print alpha
        train_model.alpha, train_model.min_alpha = alpha, alpha

        train_model.train(doc_list)
            
        #Evaluation
        train_vectors = extract_vectors(train_model,train_docs)
        test_vectors = extract_vectors(train_model,test_docs)
      
        #clf = MultinomialNB(alpha=.01)
        penalties = np.array([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.5,2.0])
        model = LinearSVC()
        grid = GridSearchCV(estimator=model ,param_grid=dict(C=penalties))
        grid.fit(train_vectors, newsgroups_train.target)
        print(grid)
        
        # summarize the results of the grid search
        print(grid.best_score_)
        print(grid.best_estimator_.C)
        #clf = svm.SVC(kernel='linear', C = 10)
        #clf.fit(train_vectors, newsgroups_train.target)
        #pred = clf.predict(test_vectors)
        
        print classification_report(le.inverse_transform(newsgroups_test.target),le.inverse_transform(pred))
        print("%i passes : %s " % (epoch + 1, name))
        
    alpha -= alpha_delta

0.025
GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.5,
        2. ])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)
0.590953785644
2.0
             precision    recall  f1-score   support

          0       0.68      0.68      0.68       319
          1       0.92      0.89      0.91       389
          2       0.81      0.90      0.85       394
          3       0.68      0.60      0.64       251

avg / total       0.79      0.79      0.79      1353

1 passes : Doc2Vec(dm/c,d200,n100,w3,t8) 
0.02404
GridSearchCV(cv=None, error_score='raise',
       estimator=Line