<a href="https://colab.research.google.com/github/saloniasrani/sentimentanalysis/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import nltk
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords

In [5]:
movie_reviews.categories()

['neg', 'pos']

Importing dataset into a pandas framework

In [6]:
import pandas as pd


In [7]:
documents = []
for category in movie_reviews.categories():
  for fileid in movie_reviews.fileids(category):
    documents.append((movie_reviews.words(fileid),category))

documents[0:5]    

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [8]:
import random
random.shuffle(documents)
documents[0:5]

[(['wyatt', 'earp', 'details', 'thirty', '-', 'five', ...], 'neg'),
 (['the', 'rich', 'man', "'", 's', 'wife', 'is', 'one', ...], 'pos'),
 (['in', '"', 'the', '13th', 'warrior', ',', '"', 'arab', ...], 'neg'),
 (['in', 'tim', 'burton', "'", 's', '`', 'sleepy', ...], 'pos'),
 (['i', 'can', 'imagine', 'how', 'good', 'krippendorf', ...], 'neg')]

In [9]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
  if tag.startswith('J'):
    return wordnet.ADJ
  elif tag.startswith('V'):
    return wordnet.VERB
  elif tag.startswith('N'):
    return wordnet.NOUN
  elif tag.startswith('R'):
    return wordnet.ADV
  else:
    return wordnet.NOUN

  

In [None]:

import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)
stops

In [11]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [12]:
from nltk import pos_tag
def clean_review(words):
  output_words =[]
  for w in words:
    if w.lower() not in stops:
      pos = pos_tag([w])
      clean_word = lemmatizer.lemmatize(w, pos=get_simple_pos(pos[0][1]))
      output_words.append(clean_word.lower())
  return output_words

In [13]:
documents = [(clean_review(document), category) for document, category in documents]

In [None]:
documents[0]

In [16]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [17]:
all_words = []
for doc in training_documents:
  all_words+=doc[0]

In [21]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]


In [None]:
features

**Making a feature dictionary for each review**

In [25]:
def get_feature_dict(words):
  current_features = {}
  word_set = set(words)
  for w in features:
    current_features[w] = w in word_set
  return current_features


In [None]:
get_feature_dict(training_documents[0][0])

**Model Training and Testing**

In [27]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_documents]

In [28]:
testing_data = [(get_feature_dict(doc), category) for doc, category in testing_documents]

In [39]:
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(training_data)

In [40]:
nltk.classify.accuracy(classifier,testing_data)

0.812

In [31]:
classifier.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     10.8 : 1.0
             wonderfully = True              pos : neg    =      9.4 : 1.0
                   anger = True              pos : neg    =      8.7 : 1.0
               ludicrous = True              neg : pos    =      8.5 : 1.0
                  welles = True              neg : pos    =      8.0 : 1.0
               stupidity = True              neg : pos    =      7.9 : 1.0
             magnificent = True              pos : neg    =      7.6 : 1.0
                   inept = True              neg : pos    =      7.5 : 1.0
                 stiller = True              pos : neg    =      7.4 : 1.0
                  poorly = True              neg : pos    =      6.9 : 1.0
                    lame = True              neg : pos    =      6.8 : 1.0
                 idiotic = True              neg : pos    =      6.4 : 1.0
                   damon = True              pos : neg    =      6.2 : 1.0

In [32]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
classifier_sklearn = SklearnClassifier(rfc)

In [35]:
classifier_sklearn.train(training_data)

<SklearnClassifier(RandomForestClassifier())>

In [38]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.804