In [28]:
from google.colab import drive
from sys import path
import pandas as pd   
import math
from sklearn.model_selection import train_test_split

root = '/content/drive/My Drive/nlp-lab'

#dataset = "train:amazon-english -> test:amazon-english"
#dataset = "train:organic -> test:organic"
#dataset = "train:amazon-english -> test:organic"
dataset = "train:german2 -> test:german2"

In [29]:
drive.mount('/content/drive', force_remount=True)
path.append(root)

Mounted at /content/drive


In [30]:
def balance_dataset(group, k=3):
    if len(group) < k:
        return group
    return group.sample(k)

In [31]:
if dataset == "train:amazon-english -> test:amazon-english":
  data_location = root + '/data/processed-data/amazon-english'
  data_df = pd.read_json(data_location + '/processed_data_without_embeddings_no_outliers.json')
  #data_df = data_df.head(100)

  #Balance data
  k = data_df['comment_sentiment'].value_counts().min()
  data_df = data_df.groupby('comment_sentiment').apply(balance_dataset,k=k).reset_index(drop=True)

  #Split data
  train_data_df, test_data_df = train_test_split(data_df, test_size=0.3,shuffle = True)

elif dataset == "train:organic -> test:organic":
  data_location = root + "/data/original-datasets/annotated-organic-dataset/train"
  data_df = pd.read_csv(data_location + '/dataframe.csv', sep='|')
  data_df = data_df.dropna(subset=['Sentence', 'Sentiment'])
  data_df = data_df.rename(columns={"Sentence": "sentence_text", "Sentiment": "comment_sentiment"})
  train_data_df = data_df

  data_location = root + "/data/original-datasets/annotated-organic-dataset/test"
  data_df = pd.read_csv(data_location + '/dataframe.csv')
  data_df = data_df.dropna(subset=['Sentence', 'Sentiment'])
  data_df = data_df.rename(columns={"Sentence": "sentence_text", "Sentiment": "comment_sentiment"})
  test_data_df = data_df

elif dataset == "train:amazon-english -> test:organic":
  data_location = root + '/data/processed-data/amazon-english'
  data_df = pd.read_json(data_location + '/processed_data_without_embeddings_no_outliers.json')
  train_data_df = data_df

  data_location = root + "/data/original-datasets/annotated-organic-dataset/test"
  data_df = pd.read_csv(data_location + '/dataframe.csv')
  data_df = data_df.dropna(subset=['Sentence', 'Sentiment'])
  data_df = data_df.rename(columns={"Sentence": "sentence_text", "Sentiment": "comment_sentiment"})
  test_data_df = data_df

elif dataset == "train:german2 -> test:german2":
  data_location = root + '/data/processed-data/amazon-german2'
  data_df = pd.read_json(data_location + '/processed_data_without_embeddings_no_outliers.json')
  #data_df = data_df.head(100)

  #Balance data
  k = data_df['comment_sentiment'].value_counts().min()
  data_df = data_df.groupby('comment_sentiment').apply(balance_dataset,k=k).reset_index(drop=True)

  #Split data
  train_data_df, test_data_df = train_test_split(data_df, test_size=0.3,shuffle = True)

In [32]:
##############
#Prepare data#
##############

#Binary (no neutral sentiment)
train_data_no_neutral_sentiment_df = train_data_df[train_data_df['comment_sentiment'].apply(lambda x: x != "0")]
test_data_no_neutral_sentiment_df = test_data_df[test_data_df['comment_sentiment'].apply(lambda x: x != "0")]

train_data_no_neutral_sentiment_x = train_data_no_neutral_sentiment_df["sentence_text"].values.tolist()
train_data_no_neutral_sentiment_y = train_data_no_neutral_sentiment_df["comment_sentiment"].values.tolist()

test_data_no_neutral_sentiment_x = test_data_no_neutral_sentiment_df["sentence_text"].values.tolist()
test_data_no_neutral_sentiment_y = test_data_no_neutral_sentiment_df["comment_sentiment"].values.tolist()

#Multiple Classes (with Neutral sentiment)
train_data_x = train_data_df["sentence_text"].values.tolist()
train_data_y = train_data_df["comment_sentiment"].values.tolist()

test_data_x = test_data_df["sentence_text"].values.tolist()
test_data_y = test_data_df["comment_sentiment"].values.tolist()

###Binary (without neutral sentiment)

In [33]:
#Unigram classifier
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.base import TransformerMixin
 
 
clf = Pipeline([
    ('vectorizer', CountVectorizer(analyzer="word",
                                   tokenizer=word_tokenize,         # ! Comment line to include mark_negation and uncomment next line
                                   #tokenizer=lambda text: mark_negation(word_tokenize(text)), 
                                   preprocessor=lambda text: text.replace("<br />", " "),
                                   max_features=10000) ),
    ('classifier', LinearSVC())
])
 
clf.fit(train_data_no_neutral_sentiment_x, train_data_no_neutral_sentiment_y)
clf.score(test_data_no_neutral_sentiment_x, test_data_no_neutral_sentiment_y)

#Bigram classifier
bigram_clf = Pipeline([
    ('vectorizer', CountVectorizer(analyzer="word",
                                   ngram_range=(2, 2),
                                   tokenizer=word_tokenize, 
                                   # tokenizer=lambda text: mark_negation(word_tokenize(text)),
                                   preprocessor=lambda text: text.replace("<br />", " "),)),
    ('classifier', LinearSVC())
])
 
bigram_clf.fit(train_data_no_neutral_sentiment_x, train_data_no_neutral_sentiment_y)
bigram_clf.score(test_data_no_neutral_sentiment_x, test_data_no_neutral_sentiment_y)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0.6907913263869333

In [34]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import multilabel_confusion_matrix

#Unigram
print('Unigram results:')
predictions = clf.predict(test_data_no_neutral_sentiment_x)
 
print ("accuracy:" + str(accuracy_score(test_data_no_neutral_sentiment_y, predictions)))
print ("f1_micro:" + str(f1_score(test_data_no_neutral_sentiment_y, predictions,average='micro')))
print ("f1_macro:" + str(f1_score(test_data_no_neutral_sentiment_y, predictions,average='macro')))

print ("precision:" + str(precision_score(test_data_no_neutral_sentiment_y, predictions,average=None)))
print ("recall:" + str(recall_score(test_data_no_neutral_sentiment_y, predictions,average=None)))

#Bigram
print('################')
print('Bigram results:')

predictions = bigram_clf.predict(test_data_no_neutral_sentiment_x)

print ("accuracy:" + str(accuracy_score(test_data_no_neutral_sentiment_y, predictions)))
print ("f1_micro:" + str(f1_score(test_data_no_neutral_sentiment_y, predictions,average='micro')))
print ("f1_macro:" + str(f1_score(test_data_no_neutral_sentiment_y, predictions,average='macro')))

print ("precision:" + str(precision_score(test_data_no_neutral_sentiment_y, predictions,average=None)))
print ("recall:" + str(recall_score(test_data_no_neutral_sentiment_y, predictions,average=None)))

Unigram results:
accuracy:0.7293720078851028
f1_micro:0.7293720078851028
f1_macro:0.7293650539995276
precision:[0.72484714 0.73401826]
recall:[0.73672316 0.72206625]
################
Bigram results:
accuracy:0.6907913263869333
f1_micro:0.6907913263869333
f1_macro:0.6903569529393154
precision:[0.67554859 0.70861332]
recall:[0.73050847 0.65131948]


###Multiple Classes (with Neutral sentiment)

In [35]:
#Unigram classifier
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.base import TransformerMixin
 
 
clf = Pipeline([
    ('vectorizer', CountVectorizer(analyzer="word",
                                   tokenizer=word_tokenize,         # ! Comment line to include mark_negation and uncomment next line
                                   #tokenizer=lambda text: mark_negation(word_tokenize(text)), 
                                   preprocessor=lambda text: text.replace("<br />", " "),
                                   max_features=10000) ),
    ('classifier', LinearSVC())
])
 
clf.fit(train_data_x, train_data_y)
clf.score(test_data_x, test_data_y)

#Bigram classifier
bigram_clf = Pipeline([
    ('vectorizer', CountVectorizer(analyzer="word",
                                   ngram_range=(2, 2),
                                   tokenizer=word_tokenize, 
                                   # tokenizer=lambda text: mark_negation(word_tokenize(text)),
                                   preprocessor=lambda text: text.replace("<br />", " "),)),
    ('classifier', LinearSVC())
])
 
bigram_clf.fit(train_data_x, train_data_y)
bigram_clf.score(test_data_x, test_data_y)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




0.48232179996218566

In [36]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import multilabel_confusion_matrix

#Unigram
print('Unigram results:')
predictions = clf.predict(test_data_x)
 
print ("accuracy:" + str(accuracy_score(test_data_y, predictions)))
print ("f1_micro:" + str(f1_score(test_data_y, predictions,average='micro')))
print ("f1_macro:" + str(f1_score(test_data_y, predictions,average='macro')))

print ("precision:" + str(precision_score(test_data_y, predictions,average=None)))
print ("recall:" + str(recall_score(test_data_y, predictions,average=None)))

#Bigram
print('################')
print('Bigram results:')

predictions = bigram_clf.predict(test_data_x)

print ("accuracy:" + str(accuracy_score(test_data_y, predictions)))
print ("f1_micro:" + str(f1_score(test_data_y, predictions,average='micro')))
print ("f1_macro:" + str(f1_score(test_data_y, predictions,average='macro')))

print ("precision:" + str(precision_score(test_data_y, predictions,average=None)))
print ("recall:" + str(recall_score(test_data_y, predictions,average=None)))

Unigram results:
accuracy:0.49971639251276234
f1_micro:0.49971639251276234
f1_macro:0.4982491107747479
precision:[0.42541766 0.51870748 0.54894538]
recall:[0.41024166 0.51694915 0.56990455]
################
Bigram results:
accuracy:0.48232179996218566
f1_micro:0.48232179996218566
f1_macro:0.47984192479769944
precision:[0.41410256 0.47922849 0.5483304 ]
recall:[0.3716916  0.54745763 0.52554745]
