In [74]:
from google.colab import drive
from sys import path
import pandas as pd   
import math
from sklearn.model_selection import train_test_split

root = '/content/drive/My Drive/nlp-lab'

#dataset = "train:amazon-english -> test:amazon-english"
#dataset = "train:organic -> test:organic"
#dataset = "train:amazon-english -> test:organic"
dataset = "train:german2 -> test:german2"

In [34]:
!pip install unidecode
!pip install twython



In [75]:
drive.mount('/content/drive', force_remount=True)
path.append(root)

Mounted at /content/drive


In [76]:
def balance_dataset(group, k=3):
    if len(group) < k:
        return group
    return group.sample(k)

In [77]:
if dataset == "train:amazon-english -> test:amazon-english":
  data_location = root + '/data/processed-data/amazon-english'
  data_df = pd.read_json(data_location + '/processed_data_without_embeddings_no_outliers.json')
  #data_df = data_df.head(100)

  #Balance data
  k = data_df['comment_sentiment'].value_counts().min()
  data_df = data_df.groupby('comment_sentiment').apply(balance_dataset,k=k).reset_index(drop=True)

  #Split data
  train_data_df, test_data_df = train_test_split(data_df, test_size=0.3,shuffle = True)

elif dataset == "train:organic -> test:organic":
  data_location = root + "/data/original-datasets/annotated-organic-dataset/train"
  data_df = pd.read_csv(data_location + '/dataframe.csv', sep='|')
  data_df = data_df.dropna(subset=['Sentence', 'Sentiment'])
  data_df = data_df.rename(columns={"Sentence": "sentence_text", "Sentiment": "comment_sentiment"})
  train_data_df = data_df

  data_location = root + "/data/original-datasets/annotated-organic-dataset/test"
  data_df = pd.read_csv(data_location + '/dataframe.csv')
  data_df = data_df.dropna(subset=['Sentence', 'Sentiment'])
  data_df = data_df.rename(columns={"Sentence": "sentence_text", "Sentiment": "comment_sentiment"})
  test_data_df = data_df

elif dataset == "train:amazon-english -> test:organic":
  data_location = root + '/data/processed-data/amazon-english'
  data_df = pd.read_json(data_location + '/processed_data_without_embeddings_no_outliers.json')
  train_data_df = data_df

  data_location = root + "/data/original-datasets/annotated-organic-dataset/test"
  data_df = pd.read_csv(data_location + '/dataframe.csv')
  data_df = data_df.dropna(subset=['Sentence', 'Sentiment'])
  data_df = data_df.rename(columns={"Sentence": "sentence_text", "Sentiment": "comment_sentiment"})
  test_data_df = data_df

elif dataset == "train:german2 -> test:german2":
  data_location = root + '/data/processed-data/amazon-german2'
  data_df = pd.read_json(data_location + '/processed_data_without_embeddings_no_outliers.json')
  #data_df = data_df.head(100)

  #Balance data
  k = data_df['comment_sentiment'].value_counts().min()
  data_df = data_df.groupby('comment_sentiment').apply(balance_dataset,k=k).reset_index(drop=True)

  #Split data
  train_data_df, test_data_df = train_test_split(data_df, test_size=0.3,shuffle = True)


In [78]:
##############
#Prepare data#
##############

#Binary (no neutral sentiment)
train_data_no_neutral_sentiment_df = train_data_df[train_data_df['comment_sentiment'].apply(lambda x: x != "0")]
test_data_no_neutral_sentiment_df = test_data_df[test_data_df['comment_sentiment'].apply(lambda x: x != "0")]

train_data_no_neutral_sentiment_x = train_data_no_neutral_sentiment_df["sentence_text"].values.tolist()
train_data_no_neutral_sentiment_y = train_data_no_neutral_sentiment_df["comment_sentiment"].values.tolist()

test_data_no_neutral_sentiment_x = test_data_no_neutral_sentiment_df["sentence_text"].values.tolist()
test_data_no_neutral_sentiment_y = test_data_no_neutral_sentiment_df["comment_sentiment"].values.tolist()

#Multiple Classes (with Neutral sentiment)
train_data_x = train_data_df["sentence_text"].values.tolist()
train_data_y = train_data_df["comment_sentiment"].values.tolist()

test_data_x = test_data_df["sentence_text"].values.tolist()
test_data_y = test_data_df["comment_sentiment"].values.tolist()

###Binary (without neutral sentiment)

In [79]:
from unidecode import unidecode
from nltk import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, mark_negation

sentim_analyzer = SentimentAnalyzer()

all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in train_data_no_neutral_sentiment_x])

unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)

sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

training_set = sentim_analyzer.apply_features(train_data_no_neutral_sentiment_x)
test_set = sentim_analyzer.apply_features(test_data_no_neutral_sentiment_x)

trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, list(zip(training_set, train_data_no_neutral_sentiment_y)))

zipped = list(zip(test_set, test_data_no_neutral_sentiment_y))

score = sentim_analyzer.evaluate(zipped)
print(score)
print("Accuracy: ", score['Accuracy'])

Training classifier
Evaluating NaiveBayesClassifier results...
{'Accuracy': 0.5797346881174146, 'Precision [p]': 0.5736708860759494, 'Recall [p]': 0.6365168539325843, 'F-measure [p]': 0.6034620505992011, 'Precision [n]': 0.5873724489795918, 'Recall [n]': 0.5224049914917753, 'F-measure [n]': 0.5529870909636745}
Accuracy:  0.5797346881174146


In [80]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import multilabel_confusion_matrix

predictions = [sentim_analyzer.classify(text) for text in test_data_no_neutral_sentiment_x]
 
print ("accuracy:" + str(accuracy_score(test_data_no_neutral_sentiment_y, predictions)))
print ("f1_micro:" + str(f1_score(test_data_no_neutral_sentiment_y, predictions,average='micro')))
print ("f1_macro:" + str(f1_score(test_data_no_neutral_sentiment_y, predictions,average='macro')))

print ("precision:" + str(precision_score(test_data_no_neutral_sentiment_y, predictions,average=None)))
print ("recall:" + str(recall_score(test_data_no_neutral_sentiment_y, predictions,average=None)))

#multilabel_confusion_matrix(data_y, predictions)

accuracy:0.5797346881174146
f1_micro:0.5797346881174146
f1_macro:0.5782245707814377
precision:[0.58737245 0.57367089]
recall:[0.52240499 0.63651685]


###Multiple Classes (with Neutral sentiment)

In [82]:
from unidecode import unidecode
from nltk import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, mark_negation

sentim_analyzer = SentimentAnalyzer()

all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in train_data_x])

unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)

sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

training_set = sentim_analyzer.apply_features(train_data_x)
test_set = sentim_analyzer.apply_features(test_data_x)

trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, list(zip(training_set, train_data_y)))

zipped = list(zip(test_set, test_data_y))

score = sentim_analyzer.evaluate(zipped)
print(score)
print("Accuracy: ", score['Accuracy'])

Training classifier
Evaluating NaiveBayesClassifier results...
{'Accuracy': 0.40612592172433354, 'Precision [p]': 0.4759898904802022, 'Recall [p]': 0.31741573033707865, 'F-measure [p]': 0.3808560835861139, 'Precision [0]': 0.36915204678362573, 'Recall [0]': 0.5784650630011455, 'F-measure [0]': 0.45069165551093265, 'Precision [n]': 0.41947291361639827, 'Recall [n]': 0.32501418037436186, 'F-measure [n]': 0.36625119846596355}
Accuracy:  0.40612592172433354


In [83]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import multilabel_confusion_matrix

predictions = [sentim_analyzer.classify(text) for text in test_data_x]
 
print ("accuracy:" + str(accuracy_score(test_data_y, predictions)))
print ("f1_micro:" + str(f1_score(test_data_y, predictions,average='micro')))
print ("f1_macro:" + str(f1_score(test_data_y, predictions,average='macro')))

print ("precision:" + str(precision_score(test_data_y, predictions,average=None)))
print ("recall:" + str(recall_score(test_data_y, predictions,average=None)))

#multilabel_confusion_matrix(data_y, predictions)

accuracy:0.40612592172433354
f1_micro:0.40612592172433354
f1_macro:0.39926631252100336
precision:[0.36915205 0.41947291 0.47598989]
recall:[0.57846506 0.32501418 0.31741573]
