In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from time import time
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

# 6.6 Stochastic Gradient Descent

Let's compare the speed of ordinary logistic regression with logistic regression that uses stochastic gradient descent to fit its parameters. The topic of the exercise is not important (it is hard to find a good example, as you need a huge dataset to see a notable performance improvement). For those interested, it concerns an email classifier: it reads an email and predicts what the topic of the email is. So this is actually your first natural language processing example.

In [None]:
# Load the pre-saved train/test arrays into memory.
def _load_first_array(npz_path):
    """Return the array stored under the key 'arr_0' in the .npz archive.

    The archive is opened in a ``with`` block so that its underlying file
    handle is closed as soon as the array has been read, rather than being
    left open until the NpzFile object happens to be garbage-collected.
    """
    with np.load(npz_path) as archive:
        return archive['arr_0']


X_train_np = _load_first_array('data/sgd_X_train_np.npz')
X_test_np = _load_first_array('data/sgd_X_test_np.npz')
Y_train_np = _load_first_array('data/sgd_target_Y_train_np.npz')
Y_test_np = _load_first_array('data/sgd_target_Y_test_np.npz')

In [None]:
# Peek at the first training example to see what the raw data looks like.
X_train_np[0]

In [None]:
# Training labels. Each value encodes a topic:
# 1='alt.atheism', 2='talk.religion.misc', 3='comp.graphics', 4='sci.space'
Y_train_np 

In [None]:
# Turn the raw training texts into a sparse feature matrix and time the step.
use_hashing = True
n_features = 500

t0 = time()

if not use_hashing:
    # Learn a TF-IDF vocabulary from the training texts and vectorize them.
    vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        max_df=0.5,
        stop_words='english',
    )
    X_train = vectorizer.fit_transform(X_train_np)
else:
    # Hash tokens into a fixed number of columns; only transform() is needed.
    vectorizer = HashingVectorizer(
        stop_words='english',
        n_features=n_features,
    )
    X_train = vectorizer.transform(X_train_np)

duration = time() - t0
print("completed in: " + str(duration) + ' seconds')

In [None]:
# Extract features from the test dataset with the vectorizer created in the
# previous cell. Only transform() is called here, so nothing is fitted on the
# test texts.
t0 = time()
X_test = vectorizer.transform(X_test_np)
duration = time() - t0
# Report how long the transform took.
print("completed in: " + str(duration) + ' seconds')

In [None]:
# Mapping from integer feature index to the original token string.
# No mapping is built when hashing is used, so feature_names is None then.
if use_hashing:
    feature_names = None
else:
    # get_feature_names() was removed in scikit-learn 1.2; its replacement,
    # get_feature_names_out(), returns the learned vocabulary as an array.
    feature_names = np.asarray(vectorizer.get_feature_names_out())

Here comes the part we are actually interested in: comparing the speed of the two approaches.

In [None]:
# Benchmark classifiers
def benchmark(clf):
    """Fit `clf` on the training data, then print its fit time and test score.

    Reads the notebook-level `X_train`, `Y_train_np`, `X_test` and
    `Y_test_np`. The label arrays are cast to integers before being passed
    to the estimator. Only the `fit` call is timed; scoring is not.

    Parameters
    ----------
    clf : object
        An estimator exposing `fit(X, y)` and `score(X, y)`.

    Returns
    -------
    None
        All results are printed rather than returned.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    # Use the builtin `int`: the `np.int` alias was removed in NumPy 1.24
    # and raises AttributeError there. The resulting dtype is the same.
    clf.fit(X_train, Y_train_np.astype(int))
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    acc = clf.score(X_test, Y_test_np.astype(int))
    print('Score: ' + str(acc))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Upper bound on the number of passes over the training data for SGD.
iterations = 2
# loss='log_loss' fits a logistic regression model. The older spelling 'log'
# was deprecated in scikit-learn 1.1 and removed in 1.3.
# random_state is fixed so the shuffled SGD fit gives the same score on
# every run.
benchmark(SGDClassifier(loss='log_loss', alpha=.001, max_iter=iterations,
                        random_state=0))

In [None]:
# Baseline: LogisticRegression with its default settings, run through the
# same benchmark() helper so it can be compared with the SGD classifier.
benchmark(LogisticRegression())

The scores are very similar, but the classifier trained with stochastic gradient descent is faster to fit. For this example the training time is negligible either way, but imagine the training data being a thousand to a million times bigger, which in practice is often the case.

Sources for Notebook:
- Nullege.com