# Extracting the compressed data

In [2]:
import tarfile
# Unpack the gzip-compressed IMDb archive into the current working directory.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this on an archive obtained from a trusted source.
with tarfile.open(name='aclImdb_v1.tar.gz', mode='r:gz') as archive:
    archive.extractall()

In [3]:
import pyprind
import pandas as pd
import os
# change the `basepath` to the directory of the
# unzipped movie dataset
basepath = 'aclImdb'

# Map each class sub-directory name to its integer sentiment label.
labels = {'pos': 1, 'neg': 0}

# Read every review file of each split into memory and build one DataFrame
# per split. Rows are first collected in a plain list and the frame is built
# once at the end: calling DataFrame.append() per file copies the whole frame
# on every iteration (quadratic time) and the method no longer exists in
# pandas >= 2.0.
frames = {}
for s, split_name in (('train', 'training'), ('test', 'testing')):
    pbar = pyprind.ProgBar(25000)  # no. of samples per split
    records = []
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                records.append((infile.read(), labels[l]))
            pbar.update()
    frames[s] = pd.DataFrame(records, columns=['review', 'sentiment'])
    print("done with importing " + split_name)

df_train = frames['train']
df_test = frames['test']

0% [##############################] 100% | ETA: 00:00:00

done with importing training



Total time elapsed: 00:02:46
0% [##############################] 100% | ETA: 00:00:00

done with importing testing



Total time elapsed: 00:02:46


In [4]:
import numpy as np
# Persist both splits as UTF-8 CSV files, without the DataFrame index column.
for frame, csv_name in ((df_train, 'movie_data_train.csv'),
                        (df_test, 'movie_data_test.csv')):
    frame.to_csv(csv_name, index=False, encoding='utf-8')

# Better method: online learning with mini-batches, `partial_fit`, and `HashingVectorizer`

In [5]:
import pyprind
import pandas as pd
import os
# change the `basepath` to the directory of the
# unzipped movie dataset
basepath = 'aclImdb'

# Map each class sub-directory name to its integer sentiment label.
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)  # no. of samples (both splits)

# Collect (review text, label) rows in a list and build the DataFrame once.
# Appending to the frame per file copies it on every iteration (quadratic
# time), and DataFrame.append() no longer exists in pandas >= 2.0.
records = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                records.append((infile.read(), labels[l]))
            pbar.update()
df = pd.DataFrame(records, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:11:03


In [6]:
import numpy as np
# Shuffle the rows with a fixed seed so the order is reproducible, then
# write the shuffled reviews to disk without the index column.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [7]:
import pyprind
import pandas as pd
import os
import numpy as np

In [8]:
# Reload the shuffled reviews from disk and preview the first three rows.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(n=3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [14]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    """Clean a raw review and split it into tokens with stop words removed.

    The text is stripped of HTML tags, lowercased, and every run of non-word
    characters is replaced by a single space. Emoticons found in the review
    are appended after the words (with any '-' nose removed) so they survive
    the punctuation stripping.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    list of str
        Tokens in order of appearance (words first, then emoticons),
        excluding every token contained in the module-level ``stop``
        collection.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Look for emoticons in the original-case text: the pattern's `D` and `P`
    # alternatives can never match once the text has been lowercased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'\W+', ' ', text.lower())
    # An explicit separator keeps the first emoticon from being fused to the
    # last word when the cleaned text ends in a word character.
    text = words + ' ' + ' '.join(emoticons).replace('-', '')
    return [w for w in text.split() if w not in stop]

In [15]:
# Generator that reads the CSV lazily and returns one (review, label) pair at a time
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The file is parsed with the standard ``csv`` reader, so quoted fields
    (embedded commas, doubled quotes, embedded newlines) are decoded
    properly and a final line without a trailing newline is handled.
    The first row is treated as the header and skipped.

    Parameters
    ----------
    path : str
        Path to a UTF-8 CSV file whose last two columns are the review
        text and the integer sentiment label.

    Yields
    ------
    tuple of (str, int)
        The review text and its label, one document at a time, so the
        whole file never has to be held in memory.

    Raises
    ------
    ValueError
        If a non-empty row has fewer than two fields or its last field is
        not an integer.
    """
    import csv  # local import: keeps this function self-contained

    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(path, 'r', encoding='utf-8', newline='') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            if not row:
                continue  # blank line
            if len(row) < 2:
                raise ValueError('expected at least 2 fields, got %r' % (row,))
            yield row[-2], int(row[-1])

In [16]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from a ``(text, label)`` iterator.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs; it is advanced in place.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)`` as two parallel lists. If the stream runs out part-way
        through, the documents collected so far are returned as a shorter
        batch instead of being thrown away. ``(None, None)`` is returned only
        when the stream was already exhausted and nothing could be read.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Stream ended early: signal exhaustion only if nothing was collected,
        # otherwise fall through and hand back the partial batch.
        if not docs:
            return None, None
    return docs, y

In [24]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Lazy stream over the shuffled reviews; nothing is read until the first batch
# is requested.
doc_stream = stream_docs(path='movie_data.csv')

# Hashing vectoriser with 2**21 feature columns, using the custom tokenizer.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

# Linear classifier trained incrementally with SGD.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' --
# confirm against the installed version before upgrading.
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

In [25]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])

# Train on 45 mini-batches of 1000 documents each (45000 reviews in total).
# `classes` is passed on every call so partial_fit knows both labels up front.
for batch_idx in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        # No batch was returned: stop training early.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(
        X_train,
        y_train,
        classes=classes,
    )
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:14


# Accuracy

In [26]:
# The next 5000 documents from the stream are used as the test set.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
accuracy = clf.score(X_test, y_test)
print('Accuracy: %.3f' % accuracy)

Accuracy: 0.867
