In [1]:
#importing data set from nltk

import nltk
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\prince\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [2]:
#testing the imported data
movie_reviews.words(movie_reviews.fileids()[8])

['call', 'it', 'a', 'road', 'trip', 'for', 'the', ...]

In [3]:
#creating document as a array of tuples containing document and the category 
documents=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[2]

(['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg')

In [4]:
#creating stopwords and punctuation array
from nltk.corpus import stopwords
import string
punct=list(string.punctuation)
stop=stopwords.words('english')
stop=stop+punct

In [5]:
#importing pos_tag, lemmatizer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
lemmatizer=WordNetLemmatizer()

In [6]:
#function which return pos_tag as require in the format by lemmatizer
from nltk.corpus import wordnet
def get_simple_pos_tag(w):
    if w.startswith('J'):
        return wordnet.ADJ
    elif w.startswith('V'):
        return wordnet.VERB
    elif w.startswith('N'):
        return wordnet.NOUN
    elif w.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
  

In [7]:
#function which lemmatize our documents and also remove stopwords
def clean_reviews(words):
    output_words=[]
    for w in words:
        if w.lower() not in stop:
            pos=pos_tag([w])
            clean_word=lemmatizer.lemmatize(w,pos=get_simple_pos_tag(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [8]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\prince\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prince\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# calling function for data cleaning
documents=[(clean_reviews(document),category) for document,category in documents]

In [10]:
#shuffling documents
import random
random.shuffle(documents)

In [11]:
#splitting document into train and test
training_document=documents[0:1700]
testing_document=documents[1700:]


In [12]:
#adding all the words in a array from train documents to create feature set
all_words=[]
for doc in training_document:
    all_words+=doc[0]
len(all_words)

602369

In [13]:
#using top k frequency words as features
import nltk
frequency=nltk.FreqDist(all_words)
k=3000
common=frequency.most_common(k)
feature_set=[i[0] for i in common]

In [14]:
# function for creating feature for a doc 
def feature_dict(doc):
    feature_dict={}
    words=set(doc)
    for w in feature_set:
        feature_dict[w]= w in words
    return feature_dict

In [15]:
#calling function for train and test to create feature 
x_train=[(feature_dict(document),category) for document,category in training_document]
x_test=[(feature_dict(document),category) for document,category in testing_document]

In [16]:
#applying naivebayesclassifier
from nltk import NaiveBayesClassifier
classifier=NaiveBayesClassifier.train(x_train)
nltk.classify.accuracy(classifier,x_test)

0.84

In [17]:
#import svm
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [18]:
#applying svm
svc=SVC()
classifier_sklearn=SklearnClassifier(svc)
classifier_sklearn.train(x_train)
nltk.classify.accuracy(classifier_sklearn,x_test)

0.8833333333333333

In [19]:
#pplying randomforest
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()
classifier_sklearn1=SklearnClassifier(rfc)
classifier_sklearn1.train(x_train)
nltk.classify.accuracy(classifier_sklearn1,x_test)

0.8733333333333333

In [20]:
#creating a array for the category as Y
cat=[category for document,category in documents]

In [21]:
#creating a document array which contains words join by space as X
text_doc=[" ".join(document) for document,category in documents]

In [22]:
#splitting the document
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(text_doc,cat)

In [23]:
#applying countvectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vec=CountVectorizer(max_features=2000)
x_train_feature=count_vec.fit_transform(x_train)
x_train_feature.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 7, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
x_test_feature=count_vec.transform(x_test)

In [25]:
# apply svm
svc2=SVC()
svc2.fit(x_train_feature,y_train)
svc2.score(x_test_feature,y_test)

0.816

In [26]:
#using countvectorizer with ngram
from sklearn.feature_extraction.text import CountVectorizer
count_vec2=CountVectorizer(max_features=2000, ngram_range=(2,3))
x_train_feature=count_vec2.fit_transform(x_train)
x_train_feature.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [27]:
x_test_feature=count_vec2.transform(x_test)

In [28]:
#applying svm 
svc2=SVC()
svc2.fit(x_train_feature,y_train)
svc2.score(x_test_feature,y_test)

0.74

In [29]:
#using tfidvectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
count_vec2=TfidfVectorizer(max_features=2000)
x_train_feature=count_vec2.fit_transform(x_train)
x_train_feature.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.32642951, 0.        , ..., 0.        , 0.03186018,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [30]:
x_test_feature=count_vec2.transform(x_test)

In [31]:
#applying svm
svc2=SVC()
svc2.fit(x_train_feature,y_train)
svc2.score(x_test_feature,y_test)

0.812