In [1]:
# Shell escape: list the aclImdb dataset folder, limited to two directory levels.
!tree -L 2 aclImdb

aclImdb
├── data
├── imdbEr.txt
├── imdb.vocab
├── movie_data
│   ├── full_test.txt
│   └── full_train.txt
├── README
├── sentoken
│   ├── neg
│   ├── pos
│   ├── test_neg
│   ├── test_pos
│   ├── train_neg
│   └── train_pos
├── test
│   ├── neg
│   └── pos
└── train
    ├── neg
    └── pos

15 directories, 5 files


In [18]:
# Standard library
from pprint import pprint

# Third-party
import numpy as np
import scipy
from sklearn.datasets import load_files
from sklearn.model_selection import GridSearchCV

# Load the training reviews from the aclImdb/train folder.
reviews_train = load_files("aclImdb/train")

In [19]:
# Split the loaded training bunch into the raw documents and their labels.
text_train, y_train = reviews_train.data, reviews_train.target
print('type of text_train:{}'.format(type(text_train)))
print('length of text_train:{}'.format(len(text_train)))
# This line prints the label array itself, so the label says so (it used to claim "type of").
print('reviews_train.target: {}'.format(y_train))
# Show one raw document to get a feel for the data.
pprint('text_train[1]:\n{}'.format(text_train[1]))

type of text_train:<class 'list'>
length of text_train:25000
type of reviews_train.target: [1 0 1 ..., 0 0 0]
('text_train[1]:\n'
 "b'Words can\\'t describe how bad this movie is. I can\\'t explain it by "
 'writing only. You have too see it for yourself to get at grip of how '
 'horrible a movie really can be. Not that I recommend you to do that. There '
 'are so many clich\\xc3\\xa9s, mistakes (and all other negative things you '
 'can imagine) here that will just make you cry. To start with the technical '
 "first, there are a LOT of mistakes regarding the airplane. I won\\'t list "
 "them here, but just mention the coloring of the plane. They didn\\'t even "
 'manage to show an airliner in the colors of a fictional airline, but instead '
 'used a 747 painted in the original Boeing livery. Very bad. The plot is '
 'stupid and has been done many times before, only much, much better. There '
 'are so many ridiculous moments here that i lost count of it really early. '
 "Also, I was on

In the block of code above, we observe that text_train contains 25,000 documents, a mix of positive and negative
reviews, and that the label of each document is given by reviews_train.target

In [20]:
# Replace the HTML line-break tags in the raw training reviews with a single space.
text_train = [review.replace(b"<br />", b" ") for review in text_train]



Now we load the test data in the same manner:

In [21]:
# Load the held-out test reviews the same way as the training set.
reviews_test = load_files('aclImdb/test')
# Take the labels from the test bunch (not reviews_train) so they stay aligned with text_test.
text_test, y_test = reviews_test.data, reviews_test.target
# Replace the HTML line-break tags with a single space, as done for the training text.
text_test = [doc.replace(b"<br />", b" ") for doc in text_test]


text_train <- contains all documents (read from the .txt files) in the training folder

y_train <- a list of 0s and 1s categorizing the documents in text_train as positive or negative

Currently the data is not in a format that ML algorithms can take as input. 
We will transform it into a bag-of-words model:

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Build a bag-of-words encoding of the training reviews and report its size.
vect = CountVectorizer().fit(text_train) # tokenize the training data and build the vocabulary
print("Vocabulary size: {}".format(len(vect.vocabulary_))) # number of distinct terms (74849 in the recorded run)
print("Vocabulary content:\n {}".format(vect.vocabulary_)) # vocabulary_ maps each term to its column index in the feature matrix, not to a count
X_train = vect.transform(text_train)  # encode the training documents as a sparse document-term count matrix
print("X_train representation: \n{}".format(repr(X_train)))

Vocabulary size: 74849
Vocabulary content:
X_train representation: 
<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3431196 stored elements in Compressed Sparse Row format>


We observe that the bag-of-words representation of the training data is a 25,000 x 74,849 matrix,
where each row of the matrix represents a document and each column represents a word in the vocabulary

In [7]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# Logistic regression on tf-idf features, before any feature selection.
# In the recorded run this scored 0.88 on the test data, with C=10 and an
# l2 penalty selected by the grid search.
np.random.seed(42)

# Chain the vectorizer and the classifier so each CV fold refits the tf-idf
# statistics on its own training portion.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=5, norm='l2'),
    LogisticRegression(solver='liblinear'),
)

# Parameter names are prefixed with the lowercase step name that
# make_pipeline assigns to the classifier.
param_grid = {
    'logisticregression__C': [0.001, 0.1, 1, 10],
    'logisticregression__penalty': ['l2', 'l1'],
}

grid = GridSearchCV(pipeline, param_grid, cv=5)
grid.fit(text_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...ty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'logisticregression__C': [0.001, 0.1, 1, 10], 'logisticregression__penalty': ['l2', 'l1']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [8]:
# Score the fitted grid search on the test reviews.
print('test score: {:.2f}'.format(grid.score(text_test,y_test)))
# Show the full configuration of the best pipeline found by the search.
pprint("Best estimator: \n{}".format(grid.best_estimator_))

test score: 0.88
('Best estimator: \n'
 'Pipeline(memory=None,\n'
 "     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', "
 "binary=False, decode_error='strict',\n"
 "        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',\n"
 '        lowercase=True, max_df=1.0, max_features=None, min_df=5,\n'
 "        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...ty='l2', "
 "random_state=None, solver='liblinear',\n"
 '          tol=0.0001, verbose=0, warm_start=False))])')


In [9]:
# Hyper-parameter combination selected by the grid search.
pprint(grid.best_params_)

{'logisticregression__C': 10, 'logisticregression__penalty': 'l2'}


In [10]:
# Pull the fitted tf-idf step out of the best pipeline and re-encode the training text with it.
vectorizer = grid.best_estimator_.named_steps['tfidfvectorizer']
X_train = vectorizer.transform(text_train)
# Largest tf-idf value each term reaches in any single training document.
max_value = X_train.max(axis=0).toarray().ravel()
# Term indices ordered from smallest to largest maximum tf-idf.
sorted_by_tfidf = max_value.argsort()
# get_feature_names() was removed in scikit-learn 1.2. Use get_feature_names_out()
# when it exists and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())
print('Features with lowest tfidf:\n{}'.format(feature_names[sorted_by_tfidf[:20]]))
print("Features with highest tfidf: \n{}".format(feature_names[sorted_by_tfidf[-20:]]))

Features with lowest tfidf:
['suplexes' 'gauche' 'hypocrites' 'oncoming' 'songwriting' 'galadriel'
 'emerald' 'mclaughlin' 'sylvain' 'oversee' 'cataclysmic' 'pressuring'
 'uphold' 'thieving' 'inconsiderate' 'ware' 'denim' 'reverting' 'booed'
 'spacious']
Features with highest tfidf: 
['gadget' 'sucks' 'zatoichi' 'demons' 'lennon' 'bye' 'dev' 'weller'
 'sasquatch' 'botched' 'xica' 'darkman' 'woo' 'casper' 'doodlebops'
 'smallville' 'wei' 'scanners' 'steve' 'pokemon']


In [None]:
import mglearn
# Plot the 30 largest and 30 smallest logistic-regression coefficients with their terms.
# The fitted weights are stored in `coef_` (trailing underscore); `coef` does not exist
# and raises AttributeError.
mglearn.tools.visualize_coefficients(
    grid.best_estimator_.named_steps['logisticregression'].coef_,
    feature_names, n_top_features=30)



In [None]:
# Disabled experiment: the triple-quoted string below holds an SGDClassifier grid search
# over penalty, learning_rate, alpha and eta0. Because it is a string literal, none of it runs.
# NOTE(review): the "LOGISTIC REGRESSION ... 88%" header inside the string looks copied from
# the logistic-regression cell — confirm before re-enabling.
'''
import numpy as np
np.random.seed(42)
##LOGISTIC REGRESSION BEFORE FEATURE SELECTION: 88% withC=0.1 on test data
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
pipeline = make_pipeline(TfidfVectorizer(min_df = 5, norm='l2'), SGDClassifier(loss = 'log') )
param_grid = {'sgdclassifier__penalty':['l2','l1'],
              'sgdclassifier__learning_rate':['optimal', 'adaptive'],
              'sgdclassifier__alpha':[0.001,0.01,0.1,1,10],
              'sgdclassifier__eta0': [0.00001,0.0001,0.1, 10, 100]}                       
grid = GridSearchCV(pipeline, param_grid, cv =5) 
grid.fit(text_train, y_train)
'''




In [None]:
# Score the current `grid` on the test reviews and show its best pipeline.
# NOTE(review): the SGD search in the preceding cell is wrapped in a string and never runs,
# so `grid` here is whichever search was fitted most recently — confirm this is intended.
print('test score: {:.2f}'.format(grid.score(text_test,y_test)))
pprint("Best estimator: \n{}".format(grid.best_estimator_))


In [None]:
# SVM model: tf-idf features fed into a support vector classifier,
# tuned over gamma and C with 5-fold cross-validation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

np.random.seed(42)

pipe = make_pipeline(
    TfidfVectorizer(min_df=5, norm='l2'),
    SVC(),
)

# Six candidate values per hyper-parameter, spaced by factors of ten.
param_grid = {
    'svc__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
    'svc__C': [0.001, 0.01, 0.1, 1, 10, 100],
}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)


In [None]:
# Score the fitted grid search on the test reviews and show its best pipeline.
print('test score: {:.2f}'.format(grid.score(text_test,y_test)))
pprint("Best estimator: \n{}".format(grid.best_estimator_))
# Predict the test labels and tabulate them against y_test.
pred = grid.predict(text_test)
print("Confusion Matrix: \n{}".format(confusion_matrix(y_test, pred)))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

# AdaBoost on tf-idf features, tuning the number of boosting rounds with 5-fold CV.
pipe = make_pipeline(TfidfVectorizer(min_df = 5, norm = 'l2'), AdaBoostClassifier())
# The candidate values must be a list (or array): scikit-learn rejects a set,
# and a set has no defined order.
param_grid = {'adaboostclassifier__n_estimators': [50, 100, 150, 200, 250]}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)



In [None]:
# NOTE(review): looks like a leftover scratch cell; it only prints a fixed string.
print('hello')