In [113]:
from google.colab import drive
from sys import path
import pandas as pd   
import math

# Project folder on the mounted Google Drive; the data paths used later in
# this notebook are built from it and it is also appended to sys.path.
root = '/content/drive/My Drive/nlp-lab'

# Dataset to evaluate: leave exactly one of the assignments below active.
#dataset = "amazon-english"
#dataset = "organic-train"
dataset = "organic-test"
# NOTE(review): no loading branch for "german2" is visible in this notebook;
# confirm one exists before selecting it.
#dataset = "german2"

In [114]:
# Mount Google Drive so the project folder under `root` is reachable.
drive.mount('/content/drive', force_remount=True)
# Put the project folder on the import path. The membership check keeps the
# cell idempotent: re-running it does not pile up duplicate sys.path entries.
if root not in path:
  path.append(root)

Mounted at /content/drive


In [115]:
# Load the selected dataset into `data_df`. The organic datasets are
# normalised to the column names used by the rest of the notebook:
# "sentence_text" (input) and "comment_sentiment" (label).
if dataset == "amazon-english":
  data_location = root + '/data/processed-data/amazon-english'
  data_df = pd.read_json(data_location + '/processed_data_without_embeddings_no_outliers.json')

elif dataset == "organic-train":
  data_location = root + "/data/original-datasets/annotated-organic-dataset/train"
  data_df = pd.read_csv(data_location + '/dataframe.csv', sep='|')
  # Rows without text or label cannot be scored or evaluated.
  data_df = data_df.dropna(subset=['Sentence', 'Sentiment'])
  data_df = data_df.rename(columns={"Sentence": "sentence_text", "Sentiment": "comment_sentiment"})

elif dataset == "organic-test":
  data_location = root + "/data/original-datasets/annotated-organic-dataset/test"
  # NOTE(review): read with the default separator, unlike the train split
  # which uses '|' -- confirm this matches the file on disk.
  data_df = pd.read_csv(data_location + '/dataframe.csv')
  data_df = data_df.dropna(subset=['Sentence', 'Sentiment'])
  data_df = data_df.rename(columns={"Sentence": "sentence_text", "Sentiment": "comment_sentiment"})

else:
  # Fail here rather than with a NameError on `data_df` in a later cell.
  raise ValueError(
      "Unsupported dataset %r; expected one of: "
      "'amazon-english', 'organic-train', 'organic-test'" % (dataset,))

In [116]:
##############
#Prepare data#
##############
# Binary task: keep only the rows whose label is not the neutral class "0".
# The vectorized comparison yields a boolean mask directly, without a
# Python-level call per row.
data_no_neutral_sentiment_df = data_df[data_df['comment_sentiment'] != "0"]

data_no_neutral_sentiment_x = data_no_neutral_sentiment_df["sentence_text"].values.tolist()
data_no_neutral_sentiment_y = data_no_neutral_sentiment_df["comment_sentiment"].values.tolist()

# Multi-class task: every row, neutral ones included.
data_x = data_df["sentence_text"].values.tolist()
data_y = data_df["comment_sentiment"].values.tolist()

### Binary (without neutral sentiment)

In [119]:
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# Fetch the tokenizer, POS-tagger and lexicon data used by the scorer below.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'sentiwordnet'):
    nltk.download(_nltk_resource)

# Shared lemmatizer instance used by swn_polarity.
lemmatizer = WordNetLemmatizer()

def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag to the matching WordNet POS constant.

    The first letter of the tag decides the category: J -> adjective,
    N -> noun, R -> adverb, V -> verb. Any other tag yields None.
    """
    # (tag prefix, name of the attribute on `wn`); looked up only on a match.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wn, attr_name)
    return None

def swn_polarity(text):
    """
    Classify `text` as positive or negative using SentiWordNet scores.

    Every noun, adjective and adverb in the text is lemmatized and mapped to
    its first WordNet synset; that synset's (positive - negative)
    SentiWordNet score is added to a running total.

    Args:
        text: Sentence or comment to classify. Values that are not strings
            (e.g. None or NaN coming from a DataFrame) and blank strings are
            treated as containing no scorable words.

    Returns:
        "p" when at least one word was scored and the total is >= 0,
        "n" otherwise (including when nothing could be scored).
    """
    # Missing or blank text has nothing to score; return the same label the
    # "no scorable tokens" case gets instead of handing a non-string to the
    # tokenizer, which expects a string.
    if not isinstance(text, str) or not text.strip():
        return "n"

    sentiment = 0.0
    tokens_count = 0

    for word, tag in pos_tag(word_tokenize(text)):
        wn_tag = penn_to_wn(tag)

        # Only nouns, adjectives and adverbs contribute; verbs and all other
        # tags are skipped.
        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
            continue

        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        if not lemma:
            continue

        synsets = wn.synsets(lemma, pos=wn_tag)
        if not synsets:
            continue

        # Take the first sense, the most common
        swn_synset = swn.senti_synset(synsets[0].name())

        sentiment += swn_synset.pos_score() - swn_synset.neg_score()
        tokens_count += 1

    # A total of exactly 0 counts as positive; no scored token counts as negative.
    if tokens_count and sentiment >= 0:
        return "p"
    return "n"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [120]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import multilabel_confusion_matrix

# Score every non-neutral sentence with the binary classifier.
predictions = list(map(swn_polarity, data_no_neutral_sentiment_x))

print(f"accuracy:{accuracy_score(data_no_neutral_sentiment_y, predictions)}")
print(f"f1_micro:{f1_score(data_no_neutral_sentiment_y, predictions, average='micro')}")
print(f"f1_macro:{f1_score(data_no_neutral_sentiment_y, predictions, average='macro')}")

# average=None reports one value per class instead of a single aggregate.
print(f"precision:{precision_score(data_no_neutral_sentiment_y, predictions, average=None)}")
print(f"recall:{recall_score(data_no_neutral_sentiment_y, predictions, average=None)}")

#multilabel_confusion_matrix(data_y, predictions)

accuracy:0.5851851851851851
f1_micro:0.5851851851851851
f1_macro:0.56209453197405
precision:[0.46601942 0.65868263]
recall:[0.45714286 0.66666667]


### Multiple classes (with neutral sentiment)

In [121]:
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# Fetch the tokenizer, POS-tagger and lexicon data used by the scorer below.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'sentiwordnet'):
    nltk.download(_nltk_resource)

# Shared lemmatizer instance used by swn_polarity.
lemmatizer = WordNetLemmatizer()

# Half-width of the neutral band: a summed score within [-alpha, alpha] is
# labelled "0" by swn_polarity.
alpha = 0.3

def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag to the matching WordNet POS constant.

    The first letter of the tag decides the category: J -> adjective,
    N -> noun, R -> adverb, V -> verb. Any other tag yields None.
    """
    # (tag prefix, name of the attribute on `wn`); looked up only on a match.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wn, attr_name)
    return None

def swn_polarity(text, alpha=None):
    """
    Classify the sentiment of `text` using SentiWordNet scores.

    Every noun, adjective and adverb in the text is lemmatized and mapped to
    its first WordNet synset; that synset's (positive - negative)
    SentiWordNet score is added to a running total.

    Args:
        text: Sentence or comment to classify. Values that are not strings
            (e.g. None or NaN coming from a DataFrame) and blank strings are
            treated as containing no scorable words.
        alpha: Half-width of the neutral band. When given, a total within
            [-alpha, alpha] (or no scorable word at all) yields "0".
            When None (the default), no neutral class is produced and the
            function behaves as a binary classifier, so one-argument calls
            such as `swn_polarity(text)` keep working after this definition
            replaces an earlier one-argument version.

    Returns:
        With `alpha`: "0", "p" or "n".
        Without `alpha`: "p" when at least one word was scored and the total
        is >= 0, otherwise "n".
    """
    use_neutral = alpha is not None
    # Label used when there is nothing to score.
    empty_label = "0" if use_neutral else "n"

    # Missing or blank text has nothing to score; skip the tokenizer, which
    # expects a string.
    if not isinstance(text, str) or not text.strip():
        return empty_label

    sentiment = 0.0
    tokens_count = 0

    for word, tag in pos_tag(word_tokenize(text)):
        wn_tag = penn_to_wn(tag)

        # Only nouns, adjectives and adverbs contribute; verbs and all other
        # tags are skipped.
        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
            continue

        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        if not lemma:
            continue

        synsets = wn.synsets(lemma, pos=wn_tag)
        if not synsets:
            continue

        # Take the first sense, the most common
        swn_synset = swn.senti_synset(synsets[0].name())

        sentiment += swn_synset.pos_score() - swn_synset.neg_score()
        tokens_count += 1

    if not tokens_count:
        return empty_label

    # Including neutral sentiment: totals close to zero are labelled "0".
    if use_neutral and -alpha <= sentiment <= alpha:
        return "0"

    return "p" if sentiment >= 0 else "n"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [122]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import multilabel_confusion_matrix

# Score every sentence, allowing the neutral class via the alpha band.
predictions = [swn_polarity(sentence, alpha) for sentence in data_x]

print(f"accuracy:{accuracy_score(data_y, predictions)}")
print(f"f1_micro:{f1_score(data_y, predictions, average='micro')}")
print(f"f1_macro:{f1_score(data_y, predictions, average='macro')}")

# average=None reports one value per class instead of a single aggregate.
print(f"precision:{precision_score(data_y, predictions, average=None)}")
print(f"recall:{recall_score(data_y, predictions, average=None)}")

# Last expression of the cell: shown as the cell output (one 2x2 matrix per class).
multilabel_confusion_matrix(data_y, predictions)

accuracy:0.3847826086956522
f1_micro:0.3847826086956522
f1_macro:0.3769094989599991
precision:[0.40957447 0.27692308 0.45070423]
recall:[0.40526316 0.34285714 0.38787879]


array([[[159, 111],
        [113,  77]],

       [[261,  94],
        [ 69,  36]],

       [[217,  78],
        [101,  64]]])