In [1]:
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [4]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [5]:
import random
random.shuffle(documents)
documents[0:5]

[(['"', 'showgirls', '"', 'is', 'the', 'first', 'big', ...], 'neg'),
 (['for', 'better', 'or', 'worse', ',', 'the', ...], 'neg'),
 (['"', 'gordy', '"', 'is', 'not', 'a', 'movie', ',', ...], 'neg'),
 (['poster', 'boy', 'for', 'co', '-', 'dependency', ...], 'neg'),
 (['barely', 'scrapping', 'by', 'playing', 'at', 'a', ...], 'pos')]

In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [7]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [8]:
from nltk import pos_tag
w = "better"
pos_tag([w])

[('better', 'RBR')]

In [9]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [10]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [11]:
documents = [(clean_review(document), category) for document, category in documents]

# Count Vectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
train_set = {"the sky sky is blue", "the sun is bright"}
count_vec = CountVectorizer(max_features = 3)

## it does the tokenizatin , choosing the best words and converting each document into frequency based array where column name are ## features and count in row represents frequency
a = count_vec.fit_transform(train_set)
# (a) is called as a sparse matrix, 
a.todense()

matrix([[1, 0, 1],
        [1, 2, 1]], dtype=int64)

In [15]:
count_vec.get_feature_names()

['is', 'sky', 'the']

In [16]:
a = ["ad", "is"]
" ".join(a)

'ad is'

In [17]:
categories = [category for document, category in documents]

In [18]:
text_documents = [" ".join(document) for document, category in documents]

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [21]:
count_vec = CountVectorizer(max_features = 2000)
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 2, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '17',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '50',
 '54',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adapt',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'afraid',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'aid',
 'aim',
 'air',
 'airplane',
 'al',
 'ala',
 'alan',
 'alex',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'annie',
 'annoy',
 'another',
 'answer',
 'anthony',
 'anti',
 'anyone',
 'anything',
 'anyway',
 'apart',
 'apar

In [23]:
x_test_features = count_vec.transform(x_test)

In [24]:
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 84674 stored elements in Compressed Sparse Row format>