In [1]:
from sklearn.svm import LinearSVC
from gensim.models import Word2Vec
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
#from abstract import *
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from sklearn.grid_search import GridSearchCV
import numpy as np
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import classification_report
import sys



In [2]:
categories = ['alt.atheism','rec.sport.baseball','talk.politics.mideast','comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',categories=categories, remove=('headers', 'footers', 'quotes'))

In [3]:
from pprint import pprint
#pprint(newsgroups_train)

# Preprocessing 

In [4]:
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer


def preprocessing(text,stem=False, stop=False, sent=False):
    
    
    # Remove punctuations
    exclude = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in exclude)
    
 
    tokens = word_tokenize(text)
    
    if stop:
        stop = stopwords.words('english')
        tokens =[word for word in tokens if word not in stop]
        tokens = [word.lower() for word in tokens]

    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
    
    if sent:
        tokens = ' '.join(tokens)
        
    return tokens
    




# Using TF-IDF 

In [5]:
vectorizer = TfidfVectorizer(tokenizer=preprocessing,
                             stop_words=stopwords.words('english'))
le = LabelEncoder()

# Training 
vectors_train = vectorizer.fit_transform(newsgroups_train.data)
labels_train = le.fit_transform(newsgroups_train.target)

# Just for debug quickly reassign labels
X = vectors_train
y = labels_train


#clf = MultinomialNB(alpha=.01)
clf = svm.SVC(kernel='linear', C = 1.0)

# Evaluate using cross validation
cross_val = KFold(len(newsgroups_train.target),n_folds=5,shuffle=True)

#for train_index, test_index in cross_val:
#    X_train,X_test = X[train_index],X[test_index]
#    y_train,y_test = y[train_index],y[test_index]
#    clf.fit(X_train,y_train)
#    y_pred = clf.predict(X_test)
#    print classification_report(le.inverse_transform(y_test),le.inverse_transform(y_pred))
#    print metrics.confusion_matrix(y_pred,y_test)

clf.fit(vectors_train, newsgroups_train.target)
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)

#print metrics.f1_score(newsgroups_test.target, pred)
print classification_report(le.inverse_transform(newsgroups_test.target),le.inverse_transform(pred))


             precision    recall  f1-score   support

          0       0.73      0.70      0.71       319
          1       0.88      0.87      0.88       389
          2       0.82      0.90      0.86       397
          3       0.80      0.83      0.81       394
          4       0.87      0.78      0.82       376

avg / total       0.82      0.82      0.82      1875



# Using Doc2Vec

In [6]:
#Preprocessing for Doc2Vec 
def clean_news(articles):
    
    clean = []
    
    for article in articles:
        clean.append(preprocessing(article,stop=True,sent=False, stem=False))
        #print clean
        #sys.exit(1)
    
    return clean
        

newsgroups_train.data = clean_news(newsgroups_train.data)
newsgroups_test.data = clean_news(newsgroups_test.data)

In [7]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec

from collections import OrderedDict
import multiprocessing
from gensim.models.doc2vec import TaggedDocument
import sys

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"


In [8]:
from collections import namedtuple
import nltk
from gensim.models.doc2vec import  LabeledSentence
import sys



#NewsgroupDocument = namedtuple('NewsGroupDocument', 'words tags split category')
#doc_count = 0 # Used to generate unique id for all documents across both train and test

all_newsgroup_documents = []

#Used to convert newsgroup corpus into Doc2Vec formats
def convert_newsgroup(docs,split):
    #global doc_count
    tagged_documents = []
    
    for i,v in enumerate(docs):
        label = '%s_%s'%(split,i)
        tagged_documents.append(LabeledSentence(v, [label]))
    
    return tagged_documents
    
    #for doc, label in zip(docs,labels):
      #  doc_count += 1
        #print doc
        #words = gensim.utils.to_unicode(doc).split() # expected by gensim
        #tags = [doc_count] #needs to be a list. Exp with having multiple tags
        #all_newsgroup_documents.append(NewsgroupDocument(words,tags,split,label))
        #print words
        
        #if doc_count == 5:
        #    print all_newsgroup_documents
        #    break
            #sys.exit(0)
    
test_docs = convert_newsgroup(newsgroups_test.data,'test')
train_docs = convert_newsgroup(newsgroups_train.data,'train')

all_newsgroup_documents.extend(train_docs)
all_newsgroup_documents.extend(test_docs)
#train_docs = [doc for doc in all_newsgroup_documents if doc.split == 'train']
#test_docs = [doc for doc in all_newsgroup_documents if doc.split == 'test']
doc_list = all_newsgroup_documents[:]  # for reshuffling per pass

print('%d docs: %d train, %d test' % (len(doc_list), len(train_docs), len(test_docs)))
print(len(newsgroups_train.target))

4693 docs: 2818 train, 1875 test
2818


### Set Up Doc2Vec

In [9]:
#Doc2Vec(dm=0, dm_concat=1, size=300, window=5, negative=5, hs=0, min_count=3, workers=cores)
#Doc2Vec(dm=0, dm_mean=1, size=300, window=5, negative=5, hs=0, min_count=3, workers=cores),

dbow_model = Doc2Vec(dm=0, dm_concat=1,sample=1e-5, size=300, window=5, negative=5, hs=0, min_count=2, workers=cores)
dm_model =  Doc2Vec(dm=1, dm_mean=1, sample=1e-5, size=300, window=10, negative=5, hs=0, min_count=2, workers=cores)

# TODO speed setup by sharing results of 1st model's vocabulary scan
dbow_model.load_word2vec_format('/home/skillachie/Downloads/GoogleNews-vectors-negative300.bin.gz', binary=True)
dbow_model.build_vocab(all_newsgroup_documents)  # PV-DM/concat requires one special NULL word so it serves as template

dm_model.load_word2vec_format('/home/skillachie/Downloads/GoogleNews-vectors-negative300.bin.gz', binary=True)
dm_model.build_vocab(all_newsgroup_documents)  # PV-DM/concat requires one special NULL word so it serves as template



# Models to evaluate
#simple_models = [
 
    # PV-DBOW  0.86 with Stem & hs=0
    #Doc2Vec(dm=0, dm_concat=1,sample=1e-5, size=300, window=5, negative=5, hs=0, min_count=2, workers=cores),
    
    #
    #Doc2Vec(dm=0, dm_mean=1, sample=1e-5,size=300, window=5, negative=5, hs=0, min_count=2, workers=cores),
    
    
    #Doc2Vec(dm=0, size=300, negative=5, hs=0, min_count=1, workers=cores),
    
    # PV-DM w/average No good 0.84
    #Doc2Vec(dm=1, dm_mean=1, sample=1e-5, size=300, window=10, negative=5, hs=0, min_count=2, workers=cores),
    
    # PV-DM w/sum
    #Doc2Vec(dm=1, dm_mean=0, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
#]

# speed setup by sharing results of 1st model's vocabulary scan
#simple_models[0].load_word2vec_format('/home/skillachie/Downloads/GoogleNews-vectors-negative300.bin.gz', binary=True)
#simple_models[0].build_vocab(all_newsgroup_documents)  # PV-DM/concat requires one special NULL word so it serves as template


#print(simple_models[0])

#for model in simple_models[1:]:
    #model.reset_from(simple_models[0])
#    model.load_word2vec_forma('/home/skillachie/nlpArea51/doc2vec/GoogleNews-vectors-negative300.bin', binary=True)
#    model.build_vocab(all_newsgroup_documents)
#    print(model)

#models_by_name = OrderedDict((str(model), model) for model in simple_models)

In [10]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
dbow_dmm_model = ConcatenatedDoc2Vec([dbow_model, dm_model])
#models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

### Prediction and Evaluation

### Training

Why is bulk training done with all the  docs from train, test,  and dev ? Compared to other methods training is only done on the training dataset 

In [11]:
from collections import defaultdict
best_error = defaultdict(lambda :1.0)  # to selectively-print only best errors achieved

In [12]:
#Get Vectors From Word2Vec
def extract_vectors(model,docs):
    
    vectors_list = []
    
    for doc_no in range(len(docs)):
        doc_label = docs[doc_no].tags[0]
        doc_vector = model.docvecs[doc_label]
        vectors_list.append(doc_vector)
        
    return vectors_list

# TODO inferred vectors

def get_infer_vectors(model,docs):
    
    vecs = []
    for doc in docs:
        vecs.append(model.infer_vector(doc.words))
    return vecs
    
    

In [13]:

from random import shuffle
alpha, min_alpha, passes = (0.025, 0.001, 100)
alpha_delta = (alpha - min_alpha) / passes
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC


for epoch in range(passes):
    shuffle(doc_list)
    
    #for name, train_model in models_by_name.items():
        
    #Train
    print alpha
    
    dbow_model.alpha, dbow_model.min_alpha = alpha, alpha
    dbow_model.train(doc_list)
    
    dm_model.alpha, dm_model.min_alpha = alpha, alpha
    dm_model.train(doc_list)
    
    
    dbow_dmm_model.alpha, dbow_dmm_model.min_alpha = alpha, alpha
    dbow_dmm_model.train(doc_list)
    
    
    
        
    alpha -= alpha_delta
    

0.025
0.02476
0.02452
0.02428
0.02404
0.0238
0.02356
0.02332
0.02308
0.02284
0.0226
0.02236
0.02212
0.02188
0.02164
0.0214
0.02116
0.02092
0.02068
0.02044
0.0202
0.01996
0.01972
0.01948
0.01924
0.019
0.01876
0.01852
0.01828
0.01804
0.0178
0.01756
0.01732
0.01708
0.01684
0.0166
0.01636
0.01612
0.01588
0.01564
0.0154
0.01516
0.01492
0.01468
0.01444
0.0142
0.01396
0.01372
0.01348
0.01324
0.013
0.01276
0.01252
0.01228
0.01204
0.0118
0.01156
0.01132
0.01108
0.01084
0.0106
0.01036
0.01012
0.00988
0.00964
0.0094
0.00916
0.00892
0.00868
0.00844
0.0082
0.00796
0.00772
0.00748
0.00724
0.007
0.00676
0.00652
0.00628
0.00604
0.0058
0.00556
0.00532
0.00508
0.00484
0.0046
0.00436
0.00412
0.00388
0.00364
0.0034
0.00316
0.00292
0.00268
0.00244
0.0022
0.00196
0.00172
0.00148
0.00124


In [14]:
#Evaluation
train_vectors = extract_vectors(dbow_dmm_model,train_docs)
test_vectors = extract_vectors(dbow_dmm_model,test_docs)



#model = LinearSVC()
#penalties = np.array([0.001,0.002,0.003,0.004,0.005,0.007,0.008,0.009,0.01,0.05,0.04,0.03,0.02])
#grid = GridSearchCV(estimator=model ,n_jobs=7,param_grid=dict(C=penalties))
#grid.fit(train_vectors, newsgroups_train.target)
        
# summarize the results of the grid search
#print(grid.best_score_)
#print(grid.best_estimator_.C)

#clf = LinearSVC(C=0.009)
clf = LinearSVC(C=0.0025)
clf.fit(train_vectors, newsgroups_train.target)

predDoc = clf.predict(test_vectors)
        
print classification_report(le.inverse_transform(newsgroups_test.target),le.inverse_transform(predDoc))

             precision    recall  f1-score   support

          0       0.81      0.74      0.77       319
          1       0.91      0.92      0.91       389
          2       0.83      0.93      0.87       397
          3       0.89      0.85      0.87       394
          4       0.86      0.84      0.85       376

avg / total       0.86      0.86      0.86      1875

