In [1]:
import os
import numpy as np
import pandas as pd
os.chdir('D:/project/dl/Stanford/data/aclImdb')

## Data Sources : http://ai.stanford.edu/~amaas/data/sentiment/
## download the data

from sklearn.datasets import load_files

In [47]:
"""
http://ai.stanford.edu/~amaas/data/sentiment/

download the data from above site. train and test both folders has two sub-folders : pos and neg
pos - > positive review
neg - > negative review
"""

'\nhttp://ai.stanford.edu/~amaas/data/sentiment/\n\ndownload the data from above site. train and test both folders has two sub-folders : pos and neg\npos - > positive review\nneg - > negative review\n'

In [7]:
# load train data
reviews_train = load_files('./train/')
text_train, y_train = reviews_train.data, reviews_train.target

In [8]:
print('type of text_train: {}'.format(type(text_train)))
print('length of text_train: {}'.format(len(text_train)))
print('access text_train[1]: \n{}'.format(text_train[1]))
print('\nlable[1]: {}'.format(y_train[1]))
print('\nunique label: {}'.format(set(y_train)))

type of text_train: <class 'list'>
length of text_train: 25000
access text_train[1]: 
b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were so stupid. "Executive

In [9]:
print('Samples per class (training): {}'.format(np.bincount(y_train)))

Samples per class (training): [12500 12500]


In [10]:
## load test data
reviews_test = load_files('./test/')
text_test, y_test = reviews_test.data, reviews_test.target
print('type of text_train: {}'.format(type(text_train)))
print('length of text_train: {}'.format(len(text_train)))
print('access text_train[1]: \n{}'.format(text_train[1]))
print('\nlable[1]: {}'.format(y_train[1]))
print('\nunique label: {}'.format(set(y_train)))

type of text_train: <class 'list'>
length of text_train: 25000
access text_train[1]: 
b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were so stupid. "Executive

In [38]:
'''
Computing bag of words for a corpus of documents,consists of following steps:

1. Tokenization- split each documents into the words
2. Vocabulary building 
3. Encoding - for each document,count how often each of the words appear in the document

Below simple example to explain bag of words method
'''

sentence = ['In fact, every other movie in the world is better than this one. I would not watch it again']

from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
vector.fit(sentence)

print('Vocabulary size: {}'.format(len(vector.vocabulary_)))
print('Vocabulary content:\n {}'.format(vector.vocabulary_))

Vocabulary size: 17
Vocabulary content:
 {'in': 4, 'fact': 3, 'every': 2, 'other': 10, 'movie': 7, 'the': 12, 'world': 15, 'is': 5, 'better': 1, 'than': 11, 'this': 13, 'one': 9, 'would': 16, 'not': 8, 'watch': 14, 'it': 6, 'again': 0}


In [40]:
bag_of_words = vector.transform(sentence)
print('bag_of_words: {}'.format(repr(bag_of_words))) # difference between str() and repr() if we print string using repr() 
# function then it prints with a pair of quotes and if we calculate a value we get more precise value than str() function.

bag_of_words: <1x17 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>


In [41]:
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
print('X_train: \n {}'.format(repr(X_train)))

X_train: 
 <25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3445861 stored elements in Compressed Sparse Row format>


In [43]:
features_name = vect.get_feature_names()
print('Number of features: {}'.format(len(features_name)))
print('\nFirst 10 features: {}'.format(features_name[:20]))
print('\nEvery 5000th features: {}'.format(features_name[::5000]))


Number of features: 74849

First 10 features: ['00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830', '006', '007', '0079', '0080', '0083', '0093638', '00am', '00pm', '00s', '01', '01pm', '02']

Every 5000th features: ['00', 'augustine', 'bête', 'cost', 'draper', 'fleece', 'hasan', 'jardine', 'maars', 'nathaniel', 'pincher', 'replica', 'shunning', 'swordmen', 'unproven']


In [44]:
## let's fit model without any other cleaning process
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print('Mean cross validation accuracy: {:.2f}'.format(np.mean(scores)))

Mean cross validation accuracy: 0.88


In [50]:
## Predict sentiment for test data
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1,10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train,y_train)

print('Best cross-validation score: {:.2f}'.format(grid.best_score_))
print('Best parameters: ', grid.best_params_)

X_test = vect.transform(text_test)
print(" test score: {:.2f}".format(grid.score(X_test,y_test)))

Best cross-validation score: 0.89
Best parameters:  {'C': 0.1}
 test score: 0.88
