# Sentiment Analysis

## Initialize library

In [1]:
import pandas as pd
import nltk
from nltk import pos_tag, FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
from math import pow

## Basic naive bayes

In [2]:
def make_to_list(dataframe):    # Convert dataframe into list
    documents = []
    for index, row in dataframe.iterrows():
        tokens = nltk.word_tokenize(row['Comment']) # Tokenize each document
        documents.append((tokens, row['Label']))    # Add the tokenized document into the list, paired with the label
    return documents

def feature_extraction(document, word_features):
    stop_words = set(nltk.corpus.stopwords.words('english'))    # Create the list of stop words

    filtered_sentence = [w for w in document if not w in stop_words] # Filter the sentence by removing stop words
    document_words = set(filtered_sentence)     # Remove repeating word
    features = {}
    for word in word_features:  # Loop through all the word features
        # Enter the feature (Extract the feature from each document) to the dictionary
        features['contains({})'.format(word)] = (word in document_words)
    return features

def feature_selection(documents):
    word_list = []
    for docs, label in documents:   # Loop through all documents
        word_list.extend(docs)  # Append all the word into the list
    document_set = set(word_list)   # Remove repeating word
    # Get the distribution of the word
    all_words = nltk.FreqDist(word.lower()for word in document_set)
    # Get the top 2000 most frequent words as feature
    word_features = list(all_words)[:2000]
    return word_features

In [3]:
dataframe = pd.read_csv('amazon_cells_labelled.csv')    # Read the file as dataframe
docs = make_to_list(dataframe)  # Get the dataframe as a list

In [4]:
feature = feature_selection(docs)   #Select the feature
# Extract the selected features from documents
featuresets = [(feature_extraction(d, feature), c) for (d, c) in docs]
train_set, test_set = featuresets[200:], featuresets[:200]  # Split test and train data
print("Data points in training set = {}, Data points in test set = {}".format(len(train_set), len(test_set)))

Data points in training set = 800, Data points in test set = 200


In [5]:
classifier1 = nltk.NaiveBayesClassifier.train(train_set) # Train the naive bayes classifier
# Get the accuracy
print("Accuracy is : {}%".format(accuracy(classifier1, test_set)*100)) 
print(classifier1.show_most_informative_features(10))  # Get 10 of the most informative features in the classifier

Accuracy is : 75.5%
Most Informative Features
         contains(works) = True                1 : 0      =     15.1 : 1.0
         contains(great) = True                1 : 0      =      9.5 : 1.0
         contains(money) = True                0 : 1      =      9.0 : 1.0
         contains(price) = True                1 : 0      =      8.6 : 1.0
         contains(happy) = True                1 : 0      =      7.7 : 1.0
         contains(first) = True                0 : 1      =      7.6 : 1.0
          contains(fine) = True                1 : 0      =      7.0 : 1.0
          contains(best) = True                1 : 0      =      5.8 : 1.0
         contains(calls) = True                0 : 1      =      5.6 : 1.0
   contains(comfortable) = True                1 : 0      =      5.0 : 1.0
None


## Naive Bayes + Pos tagging

In [6]:
# Dictionary for nltk pos tag groupings
POS_TAG_GROUPINGS = {
    "JJ": "Adjective", "JJR": "Adjective", "JJS": "Adjective",
    "NN": "Noun", "NNS": "Noun", "NNP": "Noun", "NNPS": "Noun",
    "PRP": "Pronoun", "PRP$": "Pronoun",
    "RB": "Adverb", "RBR": "Adverb", "RBS": "Adverb",
    "VB": "Verb", "VBD": "Verb", "VBG": "Verb", "VBN": "Verb", "VBP": "Verb", "VBZ": "Verb",
    "VH": "Verb", "VHD": "Verb", "VHG": "Verb", "VHN": "Verb", "VHP": "Verb", "VHZ": "Verb",
    "VV": "Verb", "VVD": "Verb", "VVG": "Verb", "VVN": "Verb", "VVP": "Verb", "VVZ": "Verb",
}


In [7]:
def make_to_list(dataframe): # Convert dataframe into list
    documents = []
    for index, row in dataframe.iterrows():
        word_tokens = word_tokenize(row['Comment']) # Tokenize each document
        tags = pos_tag(word_tokens) # Get the tag for each tokenized word
        word_set = []
        for word, tag in zip(word_tokens, tags):    # Loop through the tokens and tag pair
            if tag[1] not in POS_TAG_GROUPINGS.keys():  # If the tag is not in the group we consider
                # Append the word in a tuple together with the tag as 'Others'
                word_set.append(tuple([word.lower(), 'Others']))
            else:   # If the tag is in the group we consider
                # Append the word in a tuple together with the tag group
                word_set.append(tuple([word.lower(), POS_TAG_GROUPINGS[tag[1]]]))
        # Append the whole set of row (document) into the main documents list
        documents.append((word_set, row['Label']))
    return documents

def feature_extraction(document, word_features):    # Get the feature for the data row (document)
    stop_words = set(stopwords.words('english'))    # Create the list of stop words
    filtered_sentence = [(w,t) for w, t in document if not w in stop_words] # Filter the sentence by removing stop words
    document_words = set(filtered_sentence)     # Remove repeating word set in the document
    features = {}
    for word_set in word_features:   # Loop through all the features selected
        # Enter the feature (Extract the feature from each document) to the dictionary
        features['{} as {}'.format(word_set[0], word_set[1])] = \
            (tuple([word_set[0], word_set[1]]) in document_words)
    return features


def feature_selection(documents):
    word_list = []
    for docs, label in documents:   # Loop through all document
        word_list.extend(docs)  # Append all the word-tag pair
    document_set = set(word_list)   # Remove repeating pair
    all_words = FreqDist((word.lower(), tag) for word, tag in document_set) #Get the distribution
    word_features = list(all_words)[:2000]  # Get the 2000 most common pair
    return word_features

In [8]:
dataframe = pd.read_csv('amazon_cells_labelled.csv')    # Read the file as dataframe
docs = make_to_list(dataframe)  # Get the dataframe as a list

In [9]:
features = feature_selection(docs)      # Select the features to use
# Extract the selected features from the documents
featuresets = [(feature_extraction(d, features), c) for (d, c) in docs]
train_set, test_set = featuresets[200:], featuresets[:200]  # Split the train and test data
print("Data point in training set = {}, Data point in test set = {}".format(len(train_set), len(test_set)))

Data point in training set = 800, Data point in test set = 200


In [10]:
classifier2 = NaiveBayesClassifier.train(train_set)  # Train the naive bayes classifier
print("Accuracy is : {}%".format(accuracy(classifier2, test_set)*100))    # Get the accuracy
print(classifier2.show_most_informative_features(10))    # Get 10 of the most informative features in the classifier

Accuracy is : 80.5%
Most Informative Features
           works as Verb = True                1 : 0      =     15.1 : 1.0
  excellent as Adjective = True                1 : 0      =     14.4 : 1.0
      great as Adjective = True                1 : 0      =     10.2 : 1.0
           price as Noun = True                1 : 0      =      9.0 : 1.0
           money as Noun = True                0 : 1      =      9.0 : 1.0
      happy as Adjective = True                1 : 0      =      7.7 : 1.0
           piece as Noun = True                0 : 1      =      7.6 : 1.0
comfortable as Adjective = True                1 : 0      =      7.0 : 1.0
       best as Adjective = True                1 : 0      =      6.6 : 1.0
      first as Adjective = True                0 : 1      =      6.3 : 1.0
None


## Naive Bayes + Sentiment Polarity

In [11]:
# Score / weight for each group of tag
POS_TAG_SCORING = {
    "Adjective": 0.5,
    "Verb": 0.333,
    "Noun": 0.2,
    "Adverb": 0,
    "Pronoun": 0.2
}

In [12]:
def make_to_list(dataframe): # Convert dataframe into list
    documents = []
    for index, row in dataframe.iterrows():
        word_tokens = word_tokenize(row['Comment']) # Tokenize each document
        tags = pos_tag(word_tokens) # Get the tag for each tokenized word
        word_set = []
        for word, tag in zip(word_tokens, tags):    # Loop through the tokens and tag pair
            if tag[1] not in POS_TAG_GROUPINGS.keys():  # If the tag is not in the group we consider
                # Append the word in a tuple together with the tag as 'Others'
                word_set.append(tuple([word.lower(), 'Others']))
            else:   # If the tag is in the group we consider
                # Append the word in a tuple together with the tag group
                word_set.append(tuple([word.lower(), POS_TAG_GROUPINGS[tag[1]]]))
        # Append the whole set of row (document) into the main documents list
        documents.append((word_set, row['Label']))
    return documents


def sentiment_polarity_dictionary_creation(documents):  # Create a sentiment polarity dictionary
    list_of_all_words = []  # Prepare a list to store all the word set (Word with the group tag)
    dictionary_count = {0: {}, 1: {}}   # Create a dictionary for their count based on classification label
    for word_sets, label in documents:   # Loop through the whole data row (Documents)
        non_repeat = set(word_sets)  # Remove repeating word set
        for word_and_tag in non_repeat:    # Loop through the non repeating word set
            list_of_all_words.append(word_and_tag)    # Append the word set into the main list
            if word_and_tag not in dictionary_count[label]: # If the word set is not in the dictionary
                # Make a dictionary for both classification with 0 as the initial count
                dictionary_count[0][word_and_tag] = 0
                dictionary_count[1][word_and_tag] = 0
                dictionary_count[label][word_and_tag] = 1   # Set value as 1 for the label where that word is found
            else:   # If it already existed
                dictionary_count[label][word_and_tag] += 1    # Add 1 to the previous count value
    non_repeating_list = set(list_of_all_words) # Remove repeating word set in the main list
    # Create a polarity score dictionary
    polarity_dictionary = polarity_calculation(dictionary_count, non_repeating_list)
    return polarity_dictionary


def polarity_calculation(dictionary_count, list_of_all_words):  # Create a polarity score dictionary
    # Create the dictionary for all word set with 0 as the initial value
    polarity_dictionary = {word_set: 0 for word_set in list_of_all_words}
    for word, tag in list_of_all_words: # Loop through all the wordset in the documents
        negative_count = dictionary_count[0][(word, tag)]   # Get the count of this word set appear in negative document
        positive_count = dictionary_count[1][(word, tag)]   # Get the count of this word set appear in positive document
        # Get the score for the group tag, 0 for those not in the POS_SCORING
        tag_score = POS_TAG_SCORING[tag] if tag in POS_TAG_SCORING.keys() else 0
        # Calculate the polarity score
        polarity_score = (pow(tag_score, negative_count+1) - pow(tag_score, positive_count+1)) / (1-tag_score)
        polarity_dictionary[(word, tag)] = polarity_score   # Insert the polarity score for the word set
    return polarity_dictionary


def feature_extraction(document, word_features):    # Get the feature for the data row (document)

    stop_words = set(stopwords.words('english'))    # Create the list of stop words

    filtered_sentence = [(w,t) for w, t in document if not w in stop_words] # Filter the sentence by removing stop words
    document_words = set(filtered_sentence)     # Remove repeating word set in the document
    features = {}
    for word_set, value in word_features:   # Loop through all the features selected
        # Enter the feature (Extract the feature from each document) to the dictionary
        features['{} as {}'.format(word_set[0], word_set[1])] = \
            (tuple([word_set[0], word_set[1]]) in document_words)
    return features


def feature_selection(polarity_dictionary, feature_no = 1, positive_n = 750, negative_n = 850):
    # Select features to use
    # Sort the polarity scores in ascending order
    sorted_polarity = sorted(polarity_dictionary.items(), key=lambda kv: (kv[1], kv[0]))
    # Get the n_negative number of most negative polarity
    most_negative_features = sorted_polarity[:negative_n]
    print("Highest in most negative features selected ", most_negative_features[-1])
    # Get the n_positive number of most positive polarity
    most_positive_features = sorted_polarity[-positive_n:]
    print("Lowest in most positive features selected ", most_positive_features[0])

    features1 = []
    features1.extend(most_negative_features)    # Add the negative features into the main list of features
    features1.extend(most_positive_features)    # Add the positive features into the main list of features
    print("Features 1 ({} most positive + {} most negative: {}"
          .format(len(most_positive_features), len(most_negative_features), len(features1)))
    # Feature 2, get all polarity that is not 0 as features
    features2 = [(word_set, score) for word_set, score in sent_dict.items() if score != 0]
    print("Features 2 (Non zero polarity):", len(features2))
    features_selected = []
    if feature_no == 1:  # Select features 1 as the features
        features_selected = features1
        print("Feature 1 is selected")
    elif feature_no == 2:   # Select features 2 as the features
        features_selected = features2
        print("Feature 2 is selected")
    return features_selected

In [13]:
dataframe = pd.read_csv('amazon_cells_labelled.csv')    # Read the file as dataframe
docs = make_to_list(dataframe)  # Get the dataframe as a list

In [14]:
sent_dict = sentiment_polarity_dictionary_creation(docs)    # Get the sentiment polarity score
features = feature_selection(sent_dict, 1)      # Select the features to use
# Extract the selected features from the documents
featuresets = [(feature_extraction(set, features), c) for (set, c) in docs]
train_set, test_set = featuresets[200:], featuresets[:200]  # Split the train and test data
print("Data point in training set = {}, Data point in test set = {}".format(len(train_set), len(test_set)))

Highest in most negative features selected  (('kind', 'Noun'), -0.049600000000000005)
Lowest in most positive features selected  (('car', 'Noun'), 1.5994880000000004e-05)
Features 1 (750 most positive + 850 most negative: 1600
Features 2 (Non zero polarity): 1702
Feature 1 is selected
Data point in training set = 800, Data point in test set = 200


In [15]:
classifier3 = NaiveBayesClassifier.train(train_set)  # Train the naive bayes classifier
print("Accuracy is : {}%".format(accuracy(classifier3, test_set)*100))    # Get the accuracy
print(classifier3.show_most_informative_features(10))    # Get 10 of the most informative features in the classifier

Accuracy is : 85.5%
Most Informative Features
           works as Verb = True                1 : 0      =     15.1 : 1.0
  excellent as Adjective = True                1 : 0      =     14.4 : 1.0
      great as Adjective = True                1 : 0      =     10.2 : 1.0
           price as Noun = True                1 : 0      =      9.0 : 1.0
           money as Noun = True                0 : 1      =      9.0 : 1.0
      happy as Adjective = True                1 : 0      =      7.7 : 1.0
comfortable as Adjective = True                1 : 0      =      7.0 : 1.0
       best as Adjective = True                1 : 0      =      6.6 : 1.0
      first as Adjective = True                0 : 1      =      6.3 : 1.0
             buy as Verb = True                0 : 1      =      5.6 : 1.0
None
