# Extracting features from text files
### CountVectorizer
Convert a collection of text documents to a matrix of token counts

$\begin{pmatrix}tf(t_1, d_1) & \cdots & tf(t_m, d_1) \\ tf(t_1, d_2) & \cdots & tf(t_m, d_2) \\ \vdots & \vdots & \vdots \\ tf(t_1, d_n) & \cdots & tf(t_m, d_n) \end{pmatrix}$

$tf(t, d)$: term frequency of term $t$ in the document $d$, i.e. the number of occurrences of term $t$ in the document $d$.


# Loading the Data From fetch_20newsgroups



In [5]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to two of the 20 newsgroups.
categories = ['comp.graphics', 'comp.sys.mac.hardware']

# Load the training split; the fixed random_state makes the shuffle repeatable.
data_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Number of training documents that were loaded.
len(data_train.data)

1162

# Visualize the data

In [6]:
# Show the raw text of the first training document (headers included).
# The parenthesised single-argument form runs on both Python 2 and Python 3.
print(data_train.data[0])

From: winstead@faraday.ece.cmu.edu (Charles Holden Winstead)
Subject: ftp site for Radius software???
Organization: Electrical and Computer Engineering, Carnegie Mellon

Hey All,

Does anyone know if I can ftp to get the newest version of Radiusware
and soft pivot from Radius?  I bought a pivot monitor, but it has an
old version of this software and won't work on my C650, and Radius said
it would be 4-5 weeks until delivery.

Thanks!

-Chuck





# Output Labels of the Training Data
Two different categories:

- `comp.graphics`
- `comp.sys.mac.hardware`

In [15]:
# Integer category index of every training document.
print(data_train.target)
print('-' * 50)
# Category names; a document's target value is an index into this list.
print(data_train.target_names)
print('-' * 50)
# Category name of the first document, looked up through its target index.
print(data_train.target_names[data_train.target[0]])

[1 0 0 ..., 0 1 0]
--------------------------------------------------
['comp.graphics', 'comp.sys.mac.hardware']
--------------------------------------------------
comp.sys.mac.hardware


## The files/data are loaded in memory in the `data` attribute


In [16]:
# Number of documents held in memory in the `data` attribute.
print(len(data_train.data))

1162


# CountVectorizer

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# min_df=1 keeps every term that appears in at least one document.
vectorizer = CountVectorizer(min_df=1)
# Bare last expression: the notebook displays the vectorizer's repr with all its parameters.
vectorizer

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

## Stop Words


In [21]:
from sklearn.feature_extraction import text

# scikit-learn's built-in English stop-word list (a frozenset of strings).
stop_words = text.ENGLISH_STOP_WORDS
print(stop_words)
print('-' * 50)
# Number of words in the stop-word list.
print(len(stop_words))


frozenset(['all', 'six', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'fifty', 'four', 'not', 'own', 'through', 'yourselves', 'go', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'nei

## Demo of CountVectorizer

    fit_transform(raw_documents, y=None)
    
 Learn the vocabulary dictionary and return the document-term matrix.
    
 Returns:  X : sparse matrix, [n_samples, n_features]
 - Document-term matrix.   

In [30]:
# Toy corpus: four short documents used to illustrate the vectorizer.
corpus = [
    'This is the first document. Not a document is this',
    'This is the second second document.',
    'And this is the third one.',
    'Hi Sandeep, let me split you!!',
]

# `fit_transform(corpus)` is equivalent to `fit(corpus)` then `transform(corpus)`.
# X is a sparse document-term matrix; printing it lists the non-zero entries
# as "(document index, term index)  count".
X = vectorizer.fit_transform(corpus)
print(X)

# Three separator lines, kept so the printed output stays unchanged.
print('-' * 100)
print('-' * 100)
print('-' * 100)

# Feature names are the vocabulary terms, so their count is the number of
# columns of X.
# NOTE: scikit-learn >= 1.0 offers get_feature_names_out(); get_feature_names()
# is deprecated there and removed in 1.2.
print('Number of Features')
print(len(vectorizer.get_feature_names()))

  (0, 7)	1
  (0, 1)	2
  (0, 2)	1
  (0, 12)	1
  (0, 4)	2
  (0, 14)	2
  (1, 10)	2
  (1, 1)	1
  (1, 12)	1
  (1, 4)	1
  (1, 14)	1
  (2, 8)	1
  (2, 13)	1
  (2, 0)	1
  (2, 12)	1
  (2, 4)	1
  (2, 14)	1
  (3, 15)	1
  (3, 11)	1
  (3, 6)	1
  (3, 5)	1
  (3, 9)	1
  (3, 3)	1
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Number of Features
16


In [31]:
# use `toarray()` to convert sparse matrices to ordinary matrices (multi-dim arrays)
# Each row is one document of `corpus`; each column is one vocabulary term.
X.toarray()

array([[0, 2, 1, 0, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 2, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0],
       [0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1]])

In [33]:

# A single unseen document used to exercise the already-fitted vectorizer.
test_corpus = ['Another random document is this Sandeep.']

# `transform` (unlike `fit_transform`) reuses the vocabulary learned from the
# training corpus, so only terms already in that vocabulary are counted.
Y = vectorizer.transform(test_corpus)

# 'Another' and 'random' are not in the vocabulary, so they are simply dropped.
Y.toarray()

array([[0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]])

# CountVectorizer on the fetch_20newsgroups

In [36]:
# Separate vectorizer for the newsgroup data, configured with the 'english' stop-word option.
count_vect = CountVectorizer(stop_words='english')
# Learn the vocabulary from the training posts and build their document-term count matrix.
X_train_counts = count_vect.fit_transform(data_train.data)
X_train_counts.shape # (n_documents, n_terms): 1162 docs, 19320 terms in the training dataset

(1162, 19320)