In [1]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
# Load the training reviews from the aclImdb/train/ directory; the returned
# object exposes the raw documents as `.data` and their targets as `.target`
# (both are unpacked in the next cell).
reviews_train = load_files('aclImdb/train/')

In [13]:
# Unpack the loaded training data into the raw documents and their targets.
text_train = reviews_train.data
y_train = reviews_train.target

# Sanity check: container type and number of training reviews.
print("type of text_train: {}".format(type(text_train)))
print('length of text_train: {}'.format(len(text_train)))

# NOTE(review): scratch check unrelated to the review data; kept only so the
# cell's recorded output stays the same. Candidate for removal.
a = np.array(3)
print(a)

type of text_train: <class 'list'>
length of text_train: 25000
3


In [4]:
# Replace the HTML line-break markup with a single space. The pattern and the
# replacement are bytes literals because the reviews are handled as bytes here.
text_train = [
    doc.replace(b"<br />", b" ")
    for doc in text_train
]

# Number of training reviews per class label.
print("samples per class (training): {}".format(np.bincount(y_train)))


samples per class (training): [12500 12500]


In [5]:
# Load the test split and unpack its targets.
reviews_test = load_files('aclImdb/test')
y_test = reviews_test.target

# Same clean-up as for the training reviews: HTML line breaks become spaces.
text_test = [
    doc.replace(b"<br />", b" ")
    for doc in reviews_test.data
]


In [6]:
# Learn the vocabulary and build the document-term matrix in a single pass.
# fit_transform() returns the same matrix as fit() followed by transform(),
# but the corpus is tokenized only once instead of twice.
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)  # sparse document-term matrix
print("X_train:\n{}".format(repr(X_train)))


X_train:
<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3431196 stored elements in Compressed Sparse Row format>


In [7]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old name only on
# releases that predate the new method.
if hasattr(vect, "get_feature_names_out"):
    # The new method returns an ndarray; .tolist() yields plain Python strings
    # so the slices below print exactly like the old list-based output.
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

print("number of features: {}".format(len(feature_names)))
print("First 15 features: {}".format(feature_names[:15]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))



number of features: 74849
First 15 features: ['00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830', '006', '007', '0079', '0080', '0083', '0093638', '00am']
Features 20010 to 20030:
['dratted', 'draub', 'draught', 'draughts', 'draughtswoman', 'draw', 'drawback', 'drawbacks', 'drawer', 'drawers', 'drawing', 'drawings', 'drawl', 'drawled', 'drawling', 'drawn', 'draws', 'draza', 'dre', 'drea']


In [8]:
# A token that appears in only a handful of documents is unlikely to show up
# in the test set, so it is not helpful: require every token to occur in at
# least 5 reviews (min_df=5).
vect = CountVectorizer(min_df=5)
# fit_transform() gives the same matrix as fit() followed by transform(),
# but tokenizes the corpus once instead of twice.
X_train = vect.fit_transform(text_train)
print("X_train with min_df:{}".format(repr(X_train)))

X_train with min_df:<25000x27271 sparse matrix of type '<class 'numpy.int64'>'
	with 3354014 stored elements in Compressed Sparse Row format>


In [9]:
# Remove a hand-picked list of stop words on top of the min_df=5 cut-off.
remove_these = ['the', 'and', 'is', 'of']
vect = CountVectorizer(min_df=5, stop_words=remove_these)
# fit_transform() gives the same matrix as fit() followed by transform(),
# but tokenizes the corpus once instead of twice.
X_train = vect.fit_transform(text_train)
print("X_train with stop words:\n{}".format(repr(X_train)))

#representing text data as bag of words:

X_train with stop words:
<25000x27267 sparse matrix of type '<class 'numpy.int64'>'
	with 3258904 stored elements in Compressed Sparse Row format>


In [10]:
# Two-sentence toy corpus used to show what the vectorizer learns.
bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows himself to be a fool",
]

# fit() returns the vectorizer itself, so construction and fitting can be chained.
vect = CountVectorizer().fit(bards_words)

# vocabulary_ maps each learned token to its column index.
print("Vocabulary size: {}".format(len(vect.vocabulary_)))
print("Vocabulary content:\n {}".format(vect.vocabulary_))


Vocabulary size: 13
Vocabulary content:
 {'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}


In [11]:
# Disabled: these two lines would transform bards_words with the vectorizer
# fitted in the previous cell and print the repr of the result.
# Uncomment both lines to run them.
#bag_of_words = vect.transform(bards_words)
#print("bag_of_words: {}".format(repr(bag_of_words)))