## Read Dataset

In [38]:
from pathlib import Path

import pandas as pd

# Build the data paths from components instead of a backslash-separated
# string literal so the notebook also runs on non-Windows machines.
DATA_DIR = Path('..') / '0.data' / 'raw'

# Training split: report the row count and preview the first rows.
train = pd.read_csv(DATA_DIR / 'imdb_train.csv')
print(len(train))
print(train.head())

# Test split: same report as for the training split.
test = pd.read_csv(DATA_DIR / 'imdb_test.csv')
print(len(test))
print(test.head())

17500
      id  labels                                               text
0   1288       0  We saw this on the shelf at the local video st...
1   2064       0  Well, you'd better if you plan on sitting thro...
2  18997       1  This is my favorite Jackie Chan movie and in a...
3  10448       0  The long list of "big" names in this flick (in...
4  16133       1  The great and underrated Marion Davies shows h...
7500
      id  labels                                               text
0  20594       1  I am decidedly not in the target audience for ...
1    602       0  Detective Russell Logan(Lou Diamond Phillips)h...
2     29       0  I had some expectation for the movie, since it...
3  20342       1  I think that this movie is very neat. You eith...
4   6230       0  Well I just gave away 95 minutes and 47 second...


## To convert string data into numerical data, one can use the following methods
1. Bag of words
2. TFIDF
3. Word2Vec

####

1. Remove noisy data - text file headers, footers, HTML, XML, markup data - using BeautifulSoup or regex.
2. Tokenization
3. Normalization

## Clean Data (Tokenization, Lemmatization, Punctuation Removal and Lower Case)

In [40]:
from nltk.tokenize import RegexpTokenizer
from nltk import stem

def clean_paragraph(para):
    """Normalize one paragraph of text into a space-joined token string.

    The paragraph is split into runs of word characters (so punctuation is
    not kept), every token is lower-cased and passed through the WordNet
    lemmatizer, and the results are joined with single spaces.

    Args:
        para: The raw text of one document.

    Returns:
        The cleaned text as a single string.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = stem.WordNetLemmatizer()

    cleaned_tokens = []
    for raw_token in word_tokenizer.tokenize(para):
        cleaned_tokens.append(lemmatizer.lemmatize(raw_token.lower()))

    return ' '.join(cleaned_tokens)

In [41]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the
# system clock is adjusted while the cell runs.
t0 = time.perf_counter()

# Replace the raw training text with its cleaned form, one document at a time.
# NOTE: this overwrites the column in place, so re-running the cell cleans
# already-cleaned text again.
train['text'] = [clean_paragraph(para) for para in train['text']]

# Elapsed seconds for the cleaning pass.
print(time.perf_counter() - t0)

15.79221796989441


## Build Count Vector on Training Data

In [42]:
import sklearn, nltk
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
# Count-based bag-of-words model: tokens come from nltk.word_tokenize, and
# min_df=2 ignores any token that appears in fewer than two documents.
vectorizer = CountVectorizer(
    tokenizer=nltk.word_tokenize,
    min_df=2,
)

# Learn the vocabulary from the cleaned training text and build the sparse
# document-term count matrix in one pass.
training_set_counts = vectorizer.fit_transform(train.text)

### Test to see what really is happening

'''
The count vectorizer extracts all unique tokens
and builds a matrix of 17500 rows and 58531 unique tokens
(with min_df=2, as used above, the matrix has 34325 columns instead).

Each row is the vector representation of that document.
'''

In [45]:
'''This will work for minimum doc freq Not Set'''
index = 0

# Tokenize the selected document and report its total token count
# (95 in the recorded run).
tempTok = nltk.word_tokenize(train.text[index])
print(len(tempTok))

# Dictionary keys are unique, so keying a dict by token collapses the list to
# its distinct tokens (64 in the recorded run).
validationDict = dict.fromkeys(tempTok, 1)
print(len(validationDict))

# Display the matching row of the count matrix; its stored-element count is
# the number of distinct vocabulary tokens in this document. The two numbers
# agree only when the vectorizer keeps every token: with min_df set, tokens
# below the threshold are dropped and the row can store fewer entries
# (63 in the recorded run).
training_set_counts[index]

95
64


<1x34325 sparse matrix of type '<class 'numpy.int64'>'
	with 63 stored elements in Compressed Sparse Row format>

In [46]:
'''
17500 docs, 34325 unique tokens
'''
# Shape of the document-term matrix: (documents, vocabulary size).
print(training_set_counts.shape)

# Column index assigned to one sample token (None if it is not in the vocabulary).
print(vectorizer.vocabulary_.get('underrated'))

# NOTE(review): `tokDict` was not defined in any visible cell, so this cell
# would raise NameError on a fresh kernel (Restart & Run All). It is rebuilt
# here on the assumption that it held every distinct token of the training
# text, i.e. the same kind of unique-token dictionary as validationDict but
# over all documents and without min_df filtering -- TODO confirm.
tokDict = {}
for para in train.text:
    for tok in nltk.word_tokenize(para):
        tokDict[tok] = 1
print(len(tokDict))

(17500, 34325)
31896
58535


## Running the model on test data

In [47]:
'''Clean test Data'''
# Apply the same cleaning used on the training text to every test document.
# NOTE: this overwrites the column in place.
test.text = list(map(clean_paragraph, test.text))

# Encode the test documents with the vocabulary already learned from the
# training data (transform only -- the vectorizer is not re-fitted).
test_set_counts = vectorizer.transform(test.text)

In [48]:
# Import accuracy_score explicitly: a bare `import sklearn` does not itself
# load the `sklearn.metrics` submodule, so `sklearn.metrics.accuracy_score`
# only resolves if some other import happened to load it first.
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training count matrix.
clf = MultinomialNB()
clf.fit(training_set_counts, train.labels)

# Predict labels for the test matrix and report the fraction that match the
# true labels (the last expression is the cell's displayed output).
test_label_predictions = clf.predict(test_set_counts)
accuracy_score(test.labels, test_label_predictions)

0.8402666666666667