In [1]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

import json
import pandas as pd 
import gzip 
import altair as alt # for plotting data
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import seaborn as sns
import torch
from sklearn.metrics import classification_report



### Pretrained Sentiment Analysis Bert Model

In [5]:
# Build the sentiment-analysis pipeline.
# The checkpoint is named explicitly (it is the one the library would otherwise pick
# as its default), so the notebook does not depend on that default staying the same.
# device=0 selects the first CUDA GPU; fall back to the CPU (-1) when CUDA is not
# available instead of failing while constructing the pipeline.
pipe = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    device=0 if torch.cuda.is_available() else -1,
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [6]:
# Load the tab-separated review table and preview its first rows.
dict_result = pd.read_table('review_corpus_dict_sentiment.tsv')
dict_result.head()

Unnamed: 0,ratings,review,dict_sentiment,rating_category,sent_category,match
0,1.0,Let's face it. Nintendo only made one good sys...,0.333333,0.0,1.0,False
1,1.0,I enjoy the Wii sports games and when I purcha...,0.333333,0.0,1.0,False
2,1.0,This Game Sucks The First Was Better. It Has M...,-0.4,0.0,0.0,True
3,1.0,"First, there is nothing about this to really w...",-0.153846,0.0,0.0,True
4,1.0,This was much better on Nintendo 64. This is j...,0.75,0.0,2.0,False


In [7]:
reviews = dict_result['review'].tolist()
default_predict = pipe(reviews, num_workers=0, truncation=True, padding=True) # note that num_workers is passed here, at call time
# Unfortunately a lot of text gets truncated; there must be a better way to do this so we don't lose too much information.
# Meanwhile it is acceptable, because the pipeline is not used for training here -- the model is already trained.
#default_predict = pipe.preprocess(reviews, truncation=True, padding=True)

In [8]:
def get_labels(predict, threshold=0.75):
    """Map sentiment-pipeline outputs to three-way class labels.

    Args:
        predict: iterable of dicts, each with a 'label' key ('NEGATIVE' or
            'POSITIVE') and a 'score' key holding the score of that label.
        threshold: minimum score for a prediction to count as confidently
            negative or positive. Defaults to 0.75, the value that used to be
            hard-coded, so existing calls behave as before.

    Returns:
        A list with one int per prediction: 0 for a confident NEGATIVE,
        2 for a confident POSITIVE, and 1 (neutral) for everything else.
    """
    confident_class = {'NEGATIVE': 0, 'POSITIVE': 2}
    labels = []
    for result in predict:
        sentiment_class = confident_class.get(result['label'])
        # The score is only consulted for the two known labels.
        if sentiment_class is not None and result['score'] >= threshold:
            labels.append(sentiment_class)
        else:
            labels.append(1)
    return labels
# It is hard to pick the threshold, because the output is not the "sentiment score" we defined in the dictionary-based model.
# One could still try to combine the probabilities into a sentiment score, say POSITIVE probability - NEGATIVE probability.
# Not sure whether there would be a scaling problem, but our focus is to build the sentiment classifier ourselves later.
sent_pipe_labels = get_labels(default_predict)

In [9]:
# Print, one per line, how many reviews were labelled 0, 1 and 2.
print(*(sent_pipe_labels.count(class_id) for class_id in (0, 1, 2)), sep="\n")

2361
128
2011


In [10]:
# Store the pipeline labels as a new column of the review table.
dict_result['sent_pipe_labels'] = sent_pipe_labels

# classification_report takes (y_true, y_pred): rating_category is the reference
# and this report scores the sent_category column against it.
target_names = ["negative", "neutral", "positive"]
print(
    classification_report(dict_result['rating_category'], dict_result['sent_category'], target_names=target_names)
)


              precision    recall  f1-score   support

    negative       0.68      0.52      0.59      1500
     neutral       0.41      0.49      0.44      1500
    positive       0.61      0.64      0.63      1500

    accuracy                           0.55      4500
   macro avg       0.57      0.55      0.55      4500
weighted avg       0.57      0.55      0.55      4500



In [11]:
# Same report, this time scoring the pipeline labels (sent_pipe_labels) against rating_category.
target_names = ["negative", "neutral", "positive"]
print(
    classification_report(dict_result['rating_category'], dict_result['sent_pipe_labels'], target_names=target_names)
)

              precision    recall  f1-score   support

    negative       0.60      0.95      0.74      1500
     neutral       0.55      0.05      0.09      1500
    positive       0.66      0.89      0.76      1500

    accuracy                           0.63      4500
   macro avg       0.60      0.63      0.53      4500
weighted avg       0.60      0.63      0.53      4500



In [12]:
# Save the table, now including the pipeline labels, as a TSV file.
# DataFrame.to_csv writes the file itself (UTF-8 by default). This avoids building
# the whole file as one string and avoids the platform-dependent default encoding
# and newline translation of a text-mode open(); pd.read_table also defaults to UTF-8.
dict_result.to_csv("datav3_dict_pipe.tsv", index=False, sep="\t")

Experienced a lot of problems with packages in the conda env:
1. Was never able to install transformers and pytorch properly.
2. Found out that pip does not install into the activated env (why???).
3. Some conda installations don't work properly...
4. As long as the CUDA version of the GPU is up to date with the PyTorch one, CUDA should be able to run (this failed).
5. Eventually, important packages are better installed from the conda env prompt opened from Anaconda Navigator (right-click the play sign).

1. The transformers pipeline is very easy to initialize and use.
2. Be careful with num_workers: it cannot be passed when the pipeline is initialized; it can only be passed as an argument when the pipeline is called.

### Finetune BERT Model

In [3]:
# Reload the saved table (it now includes the sent_pipe_labels column) and preview it.
dict_result = pd.read_table('datav3_dict_pipe.tsv')
dict_result.head()

Unnamed: 0,ratings,review,dict_sentiment,rating_category,sent_category,match,sent_pipe_labels
0,1.0,Let's face it. Nintendo only made one good sys...,0.333333,0.0,1.0,False,0
1,1.0,I enjoy the Wii sports games and when I purcha...,0.333333,0.0,1.0,False,0
2,1.0,This Game Sucks The First Was Better. It Has M...,-0.4,0.0,0.0,True,0
3,1.0,"First, there is nothing about this to really w...",-0.153846,0.0,0.0,True,0
4,1.0,This was much better on Nintendo 64. This is j...,0.75,0.0,2.0,False,0


In [4]:
# Check whether PyTorch reports an available CUDA device.
torch.cuda.is_available()

True

In [8]:
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
# Use the GPU only when PyTorch reports that CUDA is available.
cuda_available = torch.cuda.is_available()
# Options handed to ClassificationModel through its args= parameter.
# NOTE(review): output_dir and best_model_dir point to the same folder -- confirm that
# the model written at the end of training does not overwrite the best checkpoint.
model_args = {
    "max_seq_length": 512,
    "evaluate_during_training": True,
    "num_train_epochs": 20,
    "train_batch_size": 10,
    "eval_batch_size": 10,
    "output_dir": "./model/",
    "best_model_dir": "./model/",
    "dataloader_num_workers": 0,
    "use_multiprocessing": False,
    "logging_steps": 50
}

In [6]:
# "cased" means the checkpoint covers both upper- and lower-case text.
# "uncased" means case is irrelevant (the tokenizer also lower-cases everything for you, so you don't have to lower-case the data).


my_model = ClassificationModel("distilbert", "distilbert-base-uncased",  use_cuda=cuda_available, args=model_args, cuda_device=0, num_labels=3)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [7]:
# Split the reviews and their rating-derived labels 80/20, stratified on the label.
# random_state pins the shuffle: without it every run produces a different held-out
# set, so a model that is reloaded from disk could not be scored on the rows it was
# not trained on.
x = dict_result['review']
y = dict_result['rating_category']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)
# Frames with "text" and "labels" columns, later passed to train_model / eval_model.
train_df = pd.DataFrame({"text": x_train, "labels": y_train})
eval_df = pd.DataFrame({"text": x_test, "labels": y_test})

In [7]:
# Train the model
# my_model.train_model(train_df, eval_df=eval_df)

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 2.97 GiB already allocated; 0 bytes free; 2.98 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [9]:
# Load the locally saved model from the "model" directory (same args as above).

my_model = ClassificationModel("distilbert", "model",  use_cuda=cuda_available, args=model_args, cuda_device=0, num_labels=3)


# Evaluate the model
# result, model_outputs, wrong_predictions = my_model.eval_model(eval_df)

In [34]:
# Predict a label for every review in x.
# NOTE(review): x is the full review column, i.e. it contains the rows of x_train as well
# as x_test, so metrics computed from final_prediction are not held-out metrics -- confirm
# that this is intended.
final_prediction, _ = my_model.predict(x.tolist())

  0%|          | 9/4500 [00:07<1:00:12,  1.24it/s]
100%|██████████| 450/450 [03:03<00:00,  2.46it/s]


In [37]:
# Score the fine-tuned model's predictions for all rows against rating_category.
target_names = ["negative", "neutral", "positive"]
print(
    classification_report(dict_result['rating_category'], final_prediction, target_names=target_names)
)

              precision    recall  f1-score   support

    negative       0.95      0.94      0.95      1500
     neutral       0.90      0.91      0.91      1500
    positive       0.93      0.92      0.92      1500

    accuracy                           0.93      4500
   macro avg       0.93      0.93      0.93      4500
weighted avg       0.93      0.93      0.93      4500



## TODO: create yet another data set for fine-tuning
Then fine-tune the language model (or wrap up directly with an n-gram analysis).

In [2]:
# Read the larger corpus file into a list with one entry per line.
with open("bigdata.txt") as f:
    corpus = f.read().splitlines()

# Preview the first five entries.
print(corpus[:5])

["I enjoyed this game but then again I am addicted to the Dash type games.  The game is exactly like Diner Dash except you are a white bear.  The elements are the same..you seat your customers trying to match color, you take their orders, serve them then clean the tables.  Just like Diner Dash you can tie similiar actions together to get a bonus.  Granted there isn't much originality but it is a cute game.", 'Very pleased with product', 'my kids like it', 'Remember Crash Bandicoot, Spyro, Banjo Kazooie, Super Mario Bros?  If you like those you\'ll like this.  Remember there are plenty of great games out there without 100 button combos.  Game has great graphics, critics say no "next gen" but fail to notice the high about of particle effects, high resolution/fps, and the almost complete lack of jagged lines.  Game is good for kids and parents alike.  It isn\'t so easy that you should be able to steamroll through it, but not hard enough you\'ll want to throw your remote out the window.  I

In [10]:
# Predict a sentiment label for every line of the corpus; the second return value is discarded.
big_corpus_prediction, _ = my_model.predict(corpus)

  0%|          | 20/10000 [00:09<1:17:38,  2.14it/s]
100%|██████████| 1000/1000 [06:52<00:00,  2.43it/s]


In [11]:
# Pair each corpus text with its predicted label ('sent').
big_corpus = pd.DataFrame({'text': corpus, 'sent': big_corpus_prediction})

In [13]:
# Bar chart of how many corpus texts were predicted for each sentiment class.
convert = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}

# Number of rows whose predicted label is 0, 1 and 2, in that order.
counting = []
for i in convert:
    count = len(big_corpus.loc[big_corpus['sent'] == i])
    counting.append(count)

plotdf = pd.DataFrame({'count': counting, 'Sent': list(convert.values())})

bars = alt.Chart(plotdf).mark_bar()
chart = bars.encode(x='Sent', y='count')
chart

In [9]:
# import packages for preparing the text for bigram analysis
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.util import bigrams

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abcd8\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# processed review for bigram analysis

# Build the stop-word set once. The previous version called stopwords.words('english')
# for every single token and then searched the returned list, which dominates the
# run time on a large corpus; a frozenset gives constant-time membership tests.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def text_processing(review):
    """Tokenise a review into lower-cased, alphabetic, non-stop-word tokens.

    Args:
        review: the raw review text.

    Returns:
        A list of tokens in their original order. Tokens that are not purely
        alphabetic (numbers, punctuation, contractions such as "n't") and
        English stop words are dropped.
    """
    words = []
    for sentence in sent_tokenize(review):
        for w in word_tokenize(sentence):
            w = w.lower()
            if w.isalpha() and w not in ENGLISH_STOPWORDS:
                words.append(w)
    return words

processed_text = [text_processing(review) for review in big_corpus['text'].tolist()]
big_corpus['processed_text'] = processed_text

In [55]:
# redo from load
import ast  # only needed here, to parse the token lists read back from the TSV file


def _as_token_list(cell):
    """Return one 'processed_text' cell as a list of tokens.

    Straight after text_processing the column holds real lists; after a round
    trip through the TSV file it holds the string form of each list. Both are
    accepted so that this cell works in either state.
    """
    if isinstance(cell, str):
        # literal_eval parses the Python list literal directly, so no quote
        # replacement is needed before parsing.
        return ast.literal_eval(cell)
    return list(cell)


processed_text = [_as_token_list(text) for text in big_corpus['processed_text']]

# One list of "first_second" bigram strings per text.
all_bigrams = [['_'.join(pair) for pair in bigrams(tokens)] for tokens in processed_text]

# Group the per-text bigram lists by predicted sentiment label.
negative_bigrams = []
neutral_bigrams = []
positive_bigrams = []
bigrams_by_label = {0: negative_bigrams, 1: neutral_bigrams, 2: positive_bigrams}

for score, text_bigrams in zip(big_corpus['sent'].tolist(), all_bigrams):
    if score in bigrams_by_label:
        bigrams_by_label[score].append(text_bigrams)

In [42]:
# save result so far
# DataFrame.to_csv writes the file itself (UTF-8 by default), which avoids the
# platform-dependent default encoding and newline translation of a text-mode open()
# and matches the UTF-8 default pd.read_table uses when the file is read back.
big_corpus.to_csv("final_result/big_corpus_result.tsv", index=False, sep="\t")

In [3]:
# Reload the saved corpus results. Note: after this, 'processed_text' holds strings, not lists.
big_corpus = pd.read_table('final_result/big_corpus_result.tsv')


In [57]:
# log_likelihood comes from the `keyness` module, which is not part of this notebook.
from keyness import log_likelihood

# For each sentiment class, compare its bigrams against the bigrams of all texts and
# keep the first 150 entries of the result.
# NOTE(review): judging by the column names used further down, each entry looks like
# (bigram, log-likelihood, frequency, frequency in the class) -- confirm against keyness.
positive_keys = log_likelihood(all_bigrams, positive_bigrams)[:150]
negative_keys = log_likelihood(all_bigrams, negative_bigrams)[:150]
neutral_keys = log_likelihood(all_bigrams, neutral_bigrams)[:150]

In [58]:
# Display the entries kept for the negative class.
negative_keys

[('waste_money', 40.478, 42, 28),
 ('highly_recommend', 30.981, 137, 0),
 ('waste_time', 30.6, 38, 23),
 ('worst_game', 24.791, 22, 16),
 ('love_game', 24.382, 286, 9),
 ('tes_series', 23.1, 12, 12),
 ('works_great', 22.368, 154, 2),
 ('great_game', 22.048, 595, 34),
 ('one_best', 20.372, 290, 11),
 ('internet_connection', 19.884, 36, 18),
 ('construction_set', 19.25, 10, 10),
 ('game_story', 18.996, 84, 0),
 ('get_used', 18.77, 114, 1),
 ('buy_game', 17.638, 193, 48),
 ('one_favorite', 17.413, 77, 0),
 ('th_loading', 17.325, 9, 9),
 ('well_worth', 16.692, 104, 1),
 ('save_money', 16.57, 30, 15),
 ('boring_game', 15.841, 20, 12),
 ('top_spin', 15.4, 8, 8),
 ('grand_theftouto', 15.4, 8, 8),
 ('game_crashes', 15.28, 14, 10),
 ('customer_support', 15.205, 11, 9),
 ('one_worst', 14.761, 18, 11),
 ('install_game', 14.761, 18, 11),
 ('mega_man', 13.795, 61, 0),
 ('gamepad_tablet', 13.475, 7, 7),
 ('water_damage', 13.475, 7, 7),
 ('customer_service', 13.385, 20, 11),
 ('voice_acting', 13.035,

In [70]:
from collections import Counter
def frequency_base(bigram_list, top_n=15):
    """Return the most frequent bigrams across a collection of bigram lists.

    Args:
        bigram_list: iterable of lists of bigram strings (one list per text).
        top_n: how many of the most frequent bigrams to return. Defaults to 15,
            the cutoff that used to be hard-coded.

    Returns:
        A list of (bigram, count) tuples, most frequent first.
    """
    # Count directly instead of first flattening everything into one big list
    # (the old local was also named `re`, shadowing the standard-library module name).
    counts = Counter()
    for text_bigrams in bigram_list:
        counts.update(text_bigrams)
    return counts.most_common(top_n)

In [72]:
frequency_base(negative_bigrams)

# The point here is that we still need keyness to judge how important each bigram is:
# if we only look at the most frequent bigrams in the negative data, it is hard to find any insight.


[('game_play', 65),
 ('play_game', 64),
 ('playing_game', 50),
 ('buy_game', 48),
 ('wii_u', 37),
 ('video_game', 35),
 ('game_like', 35),
 ('game_would', 35),
 ('great_game', 34),
 ('game_ever', 32),
 ('pretty_much', 32),
 ('bought_game', 31),
 ('like_game', 31),
 ('good_game', 30),
 ('xbox_one', 30)]

In [39]:
# Turn each class's key list into a frame tagged with its sentiment, then stack them.
key_columns = ['Bigram', 'Loglikelihood', 'Frequency', 'Frequency in Sent']

pos = pd.DataFrame(positive_keys, columns=key_columns).assign(Sent='Positive')
neutral = pd.DataFrame(neutral_keys, columns=key_columns).assign(Sent='Neutral')
neg = pd.DataFrame(negative_keys, columns=key_columns).assign(Sent='Negative')

key_data = pd.concat([pos, neutral, neg])


In [43]:
# Save the combined key table as a TSV file.
# DataFrame.to_csv writes the file itself (UTF-8 by default), which avoids the
# platform-dependent default encoding and newline translation of a text-mode open()
# and matches the UTF-8 default pd.read_table uses when the file is read back.
key_data.to_csv("final_result/key.tsv", index=False, sep="\t")

In [4]:
# Reload the saved key table.
key_data = pd.read_table('final_result/key.tsv')

In [41]:

import altair as alt

# Interactive scatter plot of the key bigrams: log-likelihood against the frequency
# within the class, coloured by sentiment, with the bigram details in the tooltip.
alt.Chart(key_data).mark_circle(size=60).encode(
    x='Loglikelihood',
    y='Frequency in Sent',
    color='Sent',
    tooltip=['Bigram', 'Loglikelihood', 'Frequency', 'Frequency in Sent']
).interactive()

In [50]:

import ast  # only needed here, to parse token lists that were read back from the TSV file


def _joined_texts(label):
    """Return the processed texts of one sentiment label as a single string.

    Each text becomes one line with its tokens separated by spaces. Cells that
    were read back from the TSV file are strings (the text form of a list) and
    are parsed first; joining such a string directly would put a space between
    every single character instead of between tokens.
    """
    cells = big_corpus[big_corpus['sent'] == label]['processed_text'].tolist()
    token_lists = [
        ast.literal_eval(cell) if isinstance(cell, str) else cell
        for cell in cells
    ]
    return "\n".join(" ".join(tokens) for tokens in token_lists)


positive_texts = _joined_texts(2)
negative_texts = _joined_texts(0)
neutral_texts = _joined_texts(1)

def make_text_corpus(text):
    """Tokenise `text` with nltk.word_tokenize and wrap the tokens in an nltk.Text."""
    tokens = nltk.word_tokenize(text)
    return nltk.Text(tokens)

positive_text = make_text_corpus(positive_texts)
negative_text = make_text_corpus(negative_texts)
neutral_text = make_text_corpus(neutral_texts)

In [62]:
# Candidate words for concordance checks.
# NOTE(review): "great" and "best" are listed twice, and the list itself is not
# referenced in this cell -- only 'best' is looked up below.
words_to_check = [
    "word",
    "great",
    "game",
    "love",
    "works",
    "highly",
    "recommend",
    "best",
    "product",
    "awesome",
    "best",
    "ever",
    "great",
    "excellent"
]
# Show up to 100 concordance lines (100 characters wide) for 'best' in the negative texts.
negative_text.concordance('best', width=100, lines=100)

Displaying 100 of 112 matches:
endo ipad etc fairly simple apply simple remove best part remove leave behind glue residue would nig
art quit trying incursion even worse worst part best way get gear dark zone guess full douchebags tr
l stuff narrative focuses new cast ghostbusters best summed cartoon characters personality beyond on
ed player monsters terrible terrible ai results best pvp action ever encountered well however game q
 amazing love auto attack targeting removed far best combat mmorpg played played lot thrill get dyna
server merge grinding grinding mention grinding best way level still repeat grind quests key areas q
ng awesome controller breaks far easy joysticks best quality came pulls game camera left constantly 
e block great light still shines various places best solution black acrylic paint controller atop bl
led bars location charging controllers probably best charging solution find dual shock moment using 
date locked pc reboot chkdsk needed uninstalled best game ti

In [55]:
# Display the entries kept for the negative class again, next to the concordance.
negative_keys

[('waste_money', 40.478, 42, 28),
 ('highly_recommend', 30.981, 137, 0),
 ('waste_time', 30.6, 38, 23),
 ('worst_game', 24.791, 22, 16),
 ('love_game', 24.382, 286, 9),
 ('tes_series', 23.1, 12, 12),
 ('works_great', 22.368, 154, 2),
 ('great_game', 22.048, 595, 34),
 ('one_best', 20.372, 290, 11),
 ('internet_connection', 19.884, 36, 18),
 ('construction_set', 19.25, 10, 10),
 ('game_story', 18.996, 84, 0),
 ('get_used', 18.77, 114, 1),
 ('buy_game', 17.638, 193, 48),
 ('one_favorite', 17.413, 77, 0),
 ('th_loading', 17.325, 9, 9),
 ('well_worth', 16.692, 104, 1),
 ('save_money', 16.57, 30, 15),
 ('boring_game', 15.841, 20, 12),
 ('top_spin', 15.4, 8, 8),
 ('grand_theftouto', 15.4, 8, 8),
 ('game_crashes', 15.28, 14, 10),
 ('customer_support', 15.205, 11, 9),
 ('one_worst', 14.761, 18, 11),
 ('install_game', 14.761, 18, 11),
 ('mega_man', 13.795, 61, 0),
 ('water_damage', 13.475, 7, 7),
 ('gamepad_tablet', 13.475, 7, 7),
 ('customer_service', 13.385, 20, 11),
 ('voice_acting', 13.035,