### Dataset Processing

In [None]:
!tar xvzf aclImdb_small.tgz > /dev/null

In [7]:
import os
import sys

import nltk
from nltk import word_tokenize
nltk.download('punkt')
import torch

#Sparse matrix implementation
from scipy.sparse import csr_matrix
import scipy.sparse as sp
import numpy as np
from collections import Counter

# Seed NumPy's global RNG so that the np.random.shuffle call in
# IMDBdata.__init__ produces the same permutation on every run.
np.random.seed(1)

class Vocab:
    """Two-way mapping between word strings and consecutive integer IDs.

    IDs are handed out in order of first appearance, starting at 0.  Once the
    vocabulary is locked (see ``Lock``), unknown words are no longer added and
    ``GetID`` returns -1 for them instead.

    Attributes:
        locked:  True once ``Lock`` has been called.
        nextId:  The ID that will be assigned to the next new word.
        word2id: dict mapping word (str) -> ID (int).
        id2word: dict mapping ID (int) -> word (str).
    """

    def __init__(self, vocabFile=None):
        """Create an empty vocabulary, or load one written by ``SaveVocab``.

        Args:
            vocabFile: Optional path to a file with one ``word<TAB>id`` entry
                per line.  When omitted (or falsy) the vocabulary starts empty.
        """
        self.locked = False
        self.nextId = 0
        self.word2id = {}
        self.id2word = {}
        if vocabFile:
            with open(vocabFile) as fIn:
                for line in fIn:
                    line = line.rstrip('\n')
                    (word, wid) = line.split('\t')
                    # Convert once so both dicts use int IDs, exactly like the
                    # entries created by GetID.
                    wid = int(wid)
                    self.word2id[word] = wid
                    self.id2word[wid] = word
                    self.nextId = max(self.nextId, wid + 1)

    def GetID(self, word):
        """Return the ID of ``word``, assigning a new one if needed.

        Returns -1 (the UNK marker) when the word is unknown and the
        vocabulary is locked.
        """
        if word not in self.word2id:
            if self.locked:
                return -1        #UNK token is -1.
            self.word2id[word] = self.nextId
            self.id2word[self.nextId] = word
            self.nextId += 1
        return self.word2id[word]

    def HasWord(self, word):
        """Return True if ``word`` already has an ID."""
        return word in self.word2id

    def HasId(self, wid):
        """Return True if ``wid`` is an assigned ID."""
        return wid in self.id2word

    def GetWord(self, wid):
        """Return the word for ID ``wid``; raises KeyError if it is unknown."""
        return self.id2word[wid]

    def SaveVocab(self, vocabFile):
        """Write the vocabulary to ``vocabFile`` as ``word<TAB>id`` lines."""
        # The context manager guarantees the data is flushed and the handle
        # closed even if a write fails.
        with open(vocabFile, 'w') as fOut:
            for (word, wid) in self.word2id.items():
                fOut.write("%s\t%s\n" % (word, wid))

    def GetVocabSize(self):
        """Return the number of IDs handed out so far (``nextId``)."""
        return self.nextId

    def GetWords(self):
        """Return a view of all known words."""
        return self.word2id.keys()

    def Lock(self):
        """Freeze the vocabulary: unknown words now map to -1 in ``GetID``."""
        self.locked = True

class IMDBdata:
    """Bag-of-words view of a directory of positive/negative review files.

    Attributes (all shuffled with the same random permutation):
        vocab:     The Vocab used to map tokens to IDs; locked after loading.
        X:         scipy csr_matrix with one row per file and one column per
                   vocabulary ID, holding per-file token counts.
        Y:         numpy array of labels, +1.0 for files under ``pos`` and
                   -1.0 for files under ``neg``.
        XwordList: list of torch.LongTensor, the in-order word IDs per file.
        XfileList: list of file names, one per file.
    """

    def __init__(self, directory, vocab=None):
        """ Reads in data into sparse matrix format

        Args:
            directory: Path containing ``pos`` and ``neg`` sub-directories,
                each holding one text file per document.
            vocab: Optional existing Vocab to reuse.  When omitted a new Vocab
                is created and populated from this data.  In both cases the
                vocab is locked once loading has finished.
        """
        pFiles = os.listdir("%s/pos" % directory)
        nFiles = os.listdir("%s/neg" % directory)

        self.vocab = vocab if vocab else Vocab()

        #For csr_matrix (see http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.sparse.csr_matrix.html#scipy.sparse.csr_matrix)
        X_values = []
        X_row_indices = []
        X_col_indices = []
        Y = []

        XwordList = []
        XfileList = []

        # Positive files come first, then negative files; the row index is
        # simply the number of documents read so far.
        for (subdir, files, label) in (("pos", pFiles, +1.0), ("neg", nFiles, -1.0)):
            for f in files:
                wordList = self._readWordIds("%s/%s/%s" % (directory, subdir, f))
                # Exactly one entry per file in every per-document container,
                # so XwordList/XfileList stay aligned with the rows of X and
                # with Y even for multi-line or empty files.
                row = len(Y)
                XwordList.append(wordList)
                XfileList.append(f)
                for (wordId, count) in Counter(wordList).items():
                    X_row_indices.append(row)
                    X_col_indices.append(wordId)
                    X_values.append(count)
                Y.append(label)

        self.vocab.Lock()

        #Create a sparse matrix in csr format
        # Use the document count for the row dimension: deriving it from the
        # largest populated row would drop trailing documents without any
        # known word and would fail outright when nothing was read.
        self.X = csr_matrix((X_values, (X_row_indices, X_col_indices)), shape=(len(Y), self.vocab.GetVocabSize()))
        self.Y = np.asarray(Y)

        #Randomly shuffle
        index = np.arange(self.X.shape[0])
        np.random.shuffle(index)
        self.X = self.X[index,:]
        self.XwordList = [torch.LongTensor(XwordList[i]) for i in index]  #Two different sparse formats, csr and lists of IDs (XwordList).
        self.XfileList = [XfileList[i] for i in index]
        self.Y = self.Y[index]

    def _readWordIds(self, path):
        """Return the in-order vocabulary IDs of all known tokens in ``path``.

        Each line is tokenized with ``word_tokenize`` and lower-cased; tokens
        for which the vocab returns a negative ID (unknown word in a locked
        vocab) are skipped.
        """
        wordIds = []
        with open(path) as fIn:
            for line in fIn:
                for w in word_tokenize(line):
                    wid = self.vocab.GetID(w.lower())
                    if wid >= 0:
                        wordIds.append(wid)
        return wordIds

[nltk_data] Downloading package punkt to /Users/Sandy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
self.X.shape

NameError: name 'self' is not defined

In [12]:
# Load the training split first: with no vocab argument, IMDBdata creates a
# fresh Vocab and fills it from the training files.
train = IMDBdata("aclImdb_small/train")
# IMDBdata.__init__ already locks its vocab; this call just makes the intent
# explicit before the vocab is shared with the other splits.
train.vocab.Lock()
# dev/test reuse the locked training vocab, so tokens not seen in training
# get ID -1 from Vocab.GetID and are filtered out while reading.
dev = IMDBdata("aclImdb_small/dev", vocab=train.vocab)
test = IMDBdata("aclImdb_small/test", vocab=train.vocab)

In [24]:
train.X.shape

(7222, 57205)

In [26]:
train.Y.shape

(7222,)

In [28]:
train.X

  (0, 4)	5
  (0, 9)	2
  (0, 12)	8
  (0, 14)	16
  (0, 16)	12
  (0, 17)	1
  (0, 20)	3
  (0, 21)	8
  (0, 22)	2
  (0, 23)	15
  (0, 30)	5
  (0, 31)	12
  (0, 33)	15
  (0, 35)	3
  (0, 36)	7
  (0, 40)	32
  (0, 43)	1
  (0, 46)	4
  (0, 48)	1
  (0, 50)	14
  (0, 60)	1
  (0, 68)	4
  (0, 72)	1
  (0, 75)	18
  (0, 82)	7
  :	:
  (0, 40422)	1
  (0, 40423)	1
  (0, 40424)	1
  (0, 40425)	1
  (0, 40426)	1
  (0, 40427)	1
  (0, 40428)	1
  (0, 40429)	2
  (0, 40430)	1
  (0, 40431)	1
  (0, 40432)	1
  (0, 40433)	1
  (0, 40434)	2
  (0, 40435)	1
  (0, 40436)	1
  (0, 40437)	1
  (0, 40438)	1
  (0, 40439)	1
  (0, 40440)	1
  (0, 40441)	1
  (0, 40442)	1
  (0, 40443)	1
  (0, 40444)	1
  (0, 40445)	1
  (0, 40446)	1
