In [14]:
from nltk.corpus import movie_reviews

In [15]:
movie_reviews.categories()

['neg', 'pos']

In [16]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [17]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [18]:
import random
random.shuffle(documents)
documents[0:5]

[(['my', 'filmcritic', '.', 'com', 'colleague', 'norm', ...], 'pos'),
 (['the', 'event', 'of', 'events', 'is', 'upon', 'us', ...], 'pos'),
 (['the', 'army', 'comedy', 'genre', 'has', 'never', ...], 'neg'),
 (['in', 'my', 'reviews', 'i', 'try', 'to', 'make', ...], 'pos'),
 (['all', 'through', 'its', 'production', 'and', 'into', ...], 'neg')]

In [19]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [20]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [21]:
from nltk import pos_tag
w = "better"
pos_tag([w])

[('better', 'RBR')]

In [22]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [23]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [24]:
documents = [(clean_review(document), category) for document, category in documents]

# Count Vectorizer
Used to convert data to format sklearn requires

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
train_set = {"the sky sky is blue", "the sun is bright"}
count_vec = CountVectorizer(max_features = 3)

## it does the tokenizatin , choosing the best words and converting each document into frequency based array where column name are ## features and count in row represents frequency
a = count_vec.fit_transform(train_set)
# (a) is called as a sparse matrix, 
a.todense() ## converts sparse matrix to dense

matrix([[1, 2, 1],
        [1, 0, 1]], dtype=int64)

In [27]:
count_vec.get_feature_names()

['is', 'sky', 'the']

In [28]:
a = ["ad", "is"]
" ".join(a)

'ad is'

# Y

In [29]:
categories = [category for document, category in documents] ## Y

# X

In [30]:
text_documents = [" ".join(document) for document, category in documents] ## X

# Split to test and train

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [33]:
count_vec = CountVectorizer(max_features = 2000)
x_train_features = count_vec.fit_transform(x_train) ## X matrix of training data required by sklearn 
x_train_features.todense() 

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [34]:
# count_vec.get_feature_names()

In [35]:
x_test_features = count_vec.transform(x_test) ## X matrix of testing data required by sklearn 

In [36]:
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 85081 stored elements in Compressed Sparse Row format>

# Sklearn

In [37]:
from sklearn.svm import SVC

In [38]:
svc = SVC()
svc.fit(x_train_features,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [39]:
svc.score(x_test_features,y_test)

0.81