In [1]:
%matplotlib inline
from preamble import *

## Vectorization

### Example application: Sentiment analysis of movie reviews
#### source: http://ai.stanford.edu/~amaas/data/sentiment/

In [9]:
from sklearn.datasets import load_files

reviews_train = load_files("data/aclImdb/train/")
# load_files returns a bunch, containing training texts and training labels
text_train, y_train = reviews_train.data, reviews_train.target
print("type of text_train: {}".format(type(text_train)))
print("length of text_train: {}".format(len(text_train)))
print("type of text_train[6]: {}".format(type(text_train[6])))
print("text_train[6]:\n{}".format(text_train[6]))


type of text_train: <class 'list'>
length of text_train: 25000
type of text_train[6]: <class 'bytes'>
text_train[6]:
b"This movie has a special way of telling the story, at first i found it rather odd as it jumped through time and I had no idea whats happening.<br /><br />Anyway the story line was although simple, but still very real and touching. You met someone the first time, you fell in love completely, but broke up at last and promoted a deadly agony. Who hasn't go through this? but we will never forget this kind of pain in our life. <br /><br />I would say i am rather touched as two actor has shown great performance in showing the love between the characters. I just wish that the story could be a happy ending."


In [10]:
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]

In [11]:
np.unique(y_train)

array([0, 1])

In [12]:
print("Samples per class (training): {}".format(np.bincount(y_train)))

Samples per class (training): [12500 12500]


In [13]:
reviews_test = load_files("data/aclImdb/test/")
text_test, y_test = reviews_test.data, reviews_test.target
print("Number of documents in test data: {}".format(len(text_test)))
print("Samples per class (test): {}".format(np.bincount(y_test)))
text_test = [doc.replace(b"<br />", b" ") for doc in text_test]

Number of documents in test data: 25000
Samples per class (test): [12500 12500]


### Representing text data as Bag of Words

### Bag-of-word for movie reviews

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
print("X_train:\n{}".format(repr(X_train)))
#print(X_train)

X_train:
<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3431196 stored elements in Compressed Sparse Row format>


In [16]:
feature_names = vect.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

Number of features: 74849
First 20 features:
['00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830', '006', '007', '0079', '0080', '0083', '0093638', '00am', '00pm', '00s', '01', '01pm', '02']
Features 20010 to 20030:
['dratted', 'draub', 'draught', 'draughts', 'draughtswoman', 'draw', 'drawback', 'drawbacks', 'drawer', 'drawers', 'drawing', 'drawings', 'drawl', 'drawled', 'drawling', 'drawn', 'draws', 'draza', 'dre', 'drea']
Every 2000th feature:
['00', 'aesir', 'aquarian', 'barking', 'blustering', 'bête', 'chicanery', 'condensing', 'cunning', 'detox', 'draper', 'enshrined', 'favorit', 'freezer', 'goldman', 'hasan', 'huitieme', 'intelligible', 'kantrowitz', 'lawful', 'maars', 'megalunged', 'mostey', 'norrland', 'padilla', 'pincher', 'promisingly', 'receptionist', 'rivals', 'schnaas', 'shunning', 'sparse', 'subset', 'temptations', 'treatises', 'unproven', 'walkman', 'xylophonist']


In [29]:
from sklearn.linear_model import LogisticRegression
LR=LogisticRegression(random_state=10).fit(X_train, y_train)
X_test = vect.transform(text_test)
print("Logistic Regression Score after applying default vectorization: {:.2f}".format(LR.score(X_test, y_test)))

Logistic Regression Score after applying default vectorization: 0.87


In [30]:
vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)
print("X_train with min_df: {}".format(repr(X_train)))

X_train with min_df: <25000x27271 sparse matrix of type '<class 'numpy.int64'>'
	with 3354014 stored elements in Compressed Sparse Row format>


In [23]:
feature_names = vect.get_feature_names()

print("First 50 features:\n{}".format(feature_names[:50]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 700th feature:\n{}".format(feature_names[::700]))

First 50 features:
['00', '000', '10', '100', '11', '12', '13', '13th', '14', '15', '16', '17', '18', '19', '1930', '1930s', '1933', '1936', '1939', '1940', '1940s', '1944', '1945', '1950', '1950s', '1953', '1959', '1960', '1960s', '1968', '1969', '1970', '1970s', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1980s', '1981', '1982', '1983', '1984', '1985', '1986']
Features 20010 to 20030:
[]
Every 700th feature:
['00', 'blind', 'creation', 'expectation', 'hoffman', 'luckily', 'people', 'rope', 'sung', 'westerns']


In [31]:
LR=LogisticRegression(random_state=10).fit(X_train, y_train)
X_test = vect.transform(text_test)
LR.score(X_test, y_test)
print("Logistic Regression Score after applying min_df=5: {:.2f}".format(LR.score(X_test, y_test)))

Logistic Regression Score after applying min_df=5: 0.86


### Stop-words

In [32]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print("Number of stop words: {}".format(len(ENGLISH_STOP_WORDS)))
print("Every 10th stopword:\n{}".format(list(ENGLISH_STOP_WORDS)[::10]))

Number of stop words: 318
Every 10th stopword:
['everything', 'hereafter', 'please', 'why', 'your', 'detail', 'whereby', 'thus', 'enough', 'most', 'mine', 'full', 'bottom', 'once', 'nobody', 'than', 'wherein', 'with', 'even', 'ours', 'upon', 'their', 'him', 'hereupon', 'below', 'again', 'already', 'along', 'get', 'who', 'anyway', 'this']


In [33]:
# Specifying stop_words="english" uses the built-in list.
# We could also augment it and pass our own.
vect = CountVectorizer(min_df=5, stop_words="english").fit(text_train)
X_train = vect.transform(text_train)
print("X_train with stop words:\n{}".format(repr(X_train)))

X_train with stop words:
<25000x26966 sparse matrix of type '<class 'numpy.int64'>'
	with 2149958 stored elements in Compressed Sparse Row format>


In [37]:
LR=LogisticRegression(random_state=10).fit(X_train, y_train)
X_test = vect.transform(text_test)
LR.score(X_test, y_test)
print("Logistic Regression Score after applying min_df=5 and removing STOP_WORDS: {:.2f}".format(LR.score(X_test, y_test)))

Logistic Regression Score after applying min_df=5 and removing STOP_WORDS: 0.86
