<a href="https://colab.research.google.com/github/salsabillaflsft/ABSApublic/blob/main/AspectBasedSentimentAnalysis_DASH_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installation

In [1]:
!pip install jupyter-dash
!pip install dash-bootstrap-components
!pip install dash-bootstrap-templates
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jupyter-dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl (23 kB)
Collecting dash (from jupyter-dash)
  Downloading dash-2.11.0-py3-none-any.whl (10.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
Collecting retrying (from jupyter-dash)
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Collecting ansi2html (from jupyter-dash)
  Downloading ansi2html-1.8.0-py3-none-any.whl (16 kB)
Collecting Werkzeug<2.3.0 (from dash->jupyter-dash)
  Downloading Werkzeug-2.2.3-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting dash-html-components==2.0.0 (from dash->jupyter-dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-core-components==2.0.0 (from dash->jupyter-

## Import

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import dash
import dash_bootstrap_components as dbc
import plotly.express as px
import plotly.graph_objects as go
from dash import html, dcc
from dash.dependencies import Input, Output, State
from jupyter_dash import JupyterDash
from dash_bootstrap_templates import ThemeChangerAIO, template_from_url

import torch
from torch import optim, nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import Dataset, DataLoader

import plotly.express as px
import plotly.graph_objects as go

from tqdm import tqdm
from transformers import BertTokenizer, BertConfig, BertForPreTraining, BertPreTrainedModel, BertModel

import itertools
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
# from pyngrok import ngrok

In [3]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/TA/Dataset

Mounted at /content/drive
/content/drive/My Drive/TA/Dataset


In [4]:
if torch.cuda.is_available():
  device = torch.device('cuda')

  print('there are %d GPU(s) available.' % torch.cuda.device_count())

  print('we will use the GPU: ', torch.cuda.get_device_name(0))

else:
  print("No GPU available, using the CPU instead")
  device = torch.device("cpu")

there are 1 GPU(s) available.
we will use the GPU:  Tesla T4


## Load

### model

In [5]:
class BertForMultiLabelClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels_list

        self.bert = BertModel(config)
        # self.dropout = nn.Dropout(config.hidden_dropout_prob) #0.1
        self.dropout = nn.Dropout(p=0.5)
        self.classifiers = nn.ModuleList([nn.Linear(config.hidden_size, num_label) for num_label in self.num_labels])

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        subword_to_word_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = self.dropout(outputs[1])
        logits = []
        for classifier in self.classifiers:
            logit = classifier(sequence_output)
            logits.append(logit)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            total_loss = 0
            for i, (logit, num_label) in enumerate(zip(logits, self.num_labels)):
                label = labels[:,i]
                loss = loss_fct(logit.view(-1, num_label), label.view(-1))
                total_loss += loss
            outputs = (total_loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)

### dataset

In [6]:
class AspectBasedSentimentAnalysisDataset(Dataset):
    # Static constant variable
    ASPECT_DOMAIN = ['umum','layanan','menu','harga','fasilitas','suasana']
    LABEL2INDEX = {'none': 0, 'negative': 1, 'positive':2, 'neutral': 3}
    INDEX2LABEL = { 0:'none', 1:'negative', 2: 'positive', 3: 'neutral'}
    NUM_LABELS = [4, 4, 4, 4, 4, 4]
    NUM_ASPECTS = 6

    def load_dataset(self, path):
        df = pd.read_csv(path)
        for aspect in self.ASPECT_DOMAIN:
            df[aspect] = df[aspect].apply(lambda sen: self.LABEL2INDEX[sen])
        return df

    def __init__(self, dataset_path, tokenizer, no_special_token=False, *args, **kwargs):
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer
        self.no_special_token = no_special_token

    def __getitem__(self, index):
        data = self.data.loc[index,:]
        sentence, labels = data['review_text'], [data[aspect] for aspect in self.ASPECT_DOMAIN]
        subwords = self.tokenizer.encode(sentence, add_special_tokens=not self.no_special_token)
        return np.array(subwords), np.array(labels), data['review_text']

    def __len__(self):
        return len(self.data)

class AspectBasedSentimentAnalysisDataLoader(DataLoader):
    def __init__(self, dataset, max_seq_len=512, *args, **kwargs):
        super(AspectBasedSentimentAnalysisDataLoader, self).__init__(dataset=dataset, *args, **kwargs)
        self.num_aspects = dataset.NUM_ASPECTS
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len

    def _collate_fn(self, batch):
        batch_size = len(batch)
        max_seq_len = max(map(lambda x: len(x[0]), batch))
        max_seq_len = min(self.max_seq_len, max_seq_len)

        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        label_batch = np.zeros((batch_size, self.num_aspects), dtype=np.int64)

        seq_list = []

        for i, (subwords, label, raw_seq) in enumerate(batch):
            subwords = subwords[:max_seq_len]
            subword_batch[i,:len(subwords)] = subwords
            mask_batch[i,:len(subwords)] = 1
            label_batch[i,:] = label

            seq_list.append(raw_seq)

        return subword_batch, mask_batch, label_batch, seq_list

### saved data & model

In [7]:
# Load Tokenizer, Config, Model 1
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')

config.num_labels = max(AspectBasedSentimentAnalysisDataset.NUM_LABELS)
config.num_labels_list = AspectBasedSentimentAnalysisDataset.NUM_LABELS
model = BertForMultiLabelClassification.from_pretrained('indobenchmark/indobert-base-p1',config=config)

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json: 0.00B [00:00, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForMultiLabelClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifiers.0.bias', 'classifiers.5.bias', 'classifiers.3.weight', 'classifiers.2.weight', 'classifiers.4.weight', 'classifiers.1.weight', 'classifiers.5.weight', 'classifiers.3.bias', 'classifiers.0.weight', 'classifiers.4.bias', 'classifiers.2.bias', 'classifiers.1.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Load Tokenizer, Config, Model 2
tokenizer2 = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
config2 = BertConfig.from_pretrained('bert-base-multilingual-cased')

config2.num_labels = max(AspectBasedSentimentAnalysisDataset.NUM_LABELS)
config2.num_labels_list = AspectBasedSentimentAnalysisDataset.NUM_LABELS
multi_model = BertForMultiLabelClassification.from_pretrained('bert-base-multilingual-cased',config=config2)

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMultiLabelClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultiLabelClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultiLabelClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiLabelClassification were not initialized from the model checkpoint at bert-base-mult

In [9]:
df = pd.read_csv('test_result_11.csv')

In [10]:
state_dict = torch.load('modelmono11.pt')

print(state_dict.keys())
model.load_state_dict(state_dict)

odict_keys(['bert.embeddings.position_ids', 'bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weigh

<All keys matched successfully>

In [11]:
state_dict2 = torch.load('modelmulti22.pt')

print(state_dict2.keys())
multi_model.load_state_dict(state_dict2)

odict_keys(['bert.embeddings.position_ids', 'bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weigh

<All keys matched successfully>

In [12]:
w2i, i2w = AspectBasedSentimentAnalysisDataset.LABEL2INDEX, AspectBasedSentimentAnalysisDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'none': 0, 'negative': 1, 'positive': 2, 'neutral': 3}
{0: 'none', 1: 'negative', 2: 'positive', 3: 'neutral'}


## Dashboard

In [13]:
# Stylesheet
dbc_css = "https://cdn.jsdelivr.net/gh/AnnMarieW/dash-bootstrap-templates/dbc.min.css"

# Define app
app = JupyterDash(__name__, external_stylesheets=[dbc.themes.FLATLY, dbc_css])
server = app.server

# Define the layout of the app
# Title
header = html.H3('Aspect-based Sentiment Analysis Dashboard', className="bg-primary text-white p-2 mb-2 text-center")

# Dropdown menu for selecting the cafe name
dropdown = html.Div(
    [
        dbc.Label("Select cafe and coworking space"),
        dcc.Dropdown(
            id='cafe-dropdown',
            options=[{'label': name, 'value': name} for name in df['name'].unique()],
            value=df['name'].iloc[0]
        )
    ],
    className="mb-4"
)

# Graph for showing the sentiment polarity
graph1 = dcc.Graph(id='polarity-graph')

# Graph for showing the aspect-based sentiment analysis
graph2 = dcc.Graph(id='sentiment-graph')

# Text field and button for predicting new review - IndoBERT
review = html.Div(
    [
        html.H5('New Review – IndoBERT', className='mb-3'),
        dcc.Input(
            id='new-review-input',
            type='text',
            placeholder='Type your review here...',
            style={'width': '100%', 'display': 'inline-block'}
        ),
        html.Br(),
        dbc.Button('Submit', id='new-review-button', color='primary', className='mt-2'),
        html.Br(),
        html.Div(id='new-review-output')
    ],
    style={'width': '100%', 'margin-bottom': '50px'}
)

# Text field and button for predicting new review - mBERT
review2 = html.Div(
    [
        html.H5('New Review – m-BERT', className='mb-3'),
        dcc.Input(
            id='new-review-input2',
            type='text',
            placeholder='Type your review here...',
            style={'width': '100%', 'display': 'inline-block'}
        ),
        html.Br(),
        dbc.Button('Submit', id='new-review-button2', color='primary', className='mt-2'),
        html.Br(),
        html.Div(id='new-review-output2')
    ],
    style={'width': '100%', 'margin-bottom': '50px'}
)

tab1 = dbc.Tab([graph2], label="Aspect-Sentiment Graph")
tab2 = dbc.Tab([graph1], label="Overall Chart")
tabs = dbc.Tabs([tab1, tab2])

card1 = dbc.Card([dropdown, tabs], body=True)
card2 = dbc.Card([review], body=True)
card3 = dbc.Card([review2], body=True)

space = html.Br(style={'margin-bottom': '20px'})

app.layout = dbc.Container(
    [
        header,
        dbc.Row(
            [
                dbc.Col([card1], width=8),
                dbc.Col([card2, space, card3], width=4),
            ]
        ),
    ],
    fluid=True,
    className="dbc",
)


In [14]:
# Define the callback function to update the polarity graph
@app.callback(
    Output('polarity-graph', 'figure'),
    Input('cafe-dropdown', 'value')
)
def update_polarity_graph(selected_cafe):
    filtered_df = df[df['name'] == selected_cafe]
    sentiment_counts = filtered_df[['pred_umum', 'pred_layanan', 'pred_menu', 'pred_harga', 'pred_fasilitas', 'pred_suasana']].stack().value_counts()
    sentiment_counts = sentiment_counts[sentiment_counts.index != 'none']  # Exclude 'none' sentiment
    color_dict = {'positive': '#00CC96', 'negative': '#EF553B', 'neutral': '#636EFA'}
    colors = [color_dict[sentiment] for sentiment in sentiment_counts.index]

    polarity_fig = px.pie(
        values=sentiment_counts.values,
        names=sentiment_counts.index,
        title='Overall Sentiment Polarity Distribution for {}'.format(selected_cafe),
        color_discrete_sequence=colors
    )
    return polarity_fig


# Define the callback function to update the aspect-based sentiment analysis graph
@app.callback(
    Output('sentiment-graph', 'figure'),
    Input('cafe-dropdown', 'value')
)
def update_sentiment_graph(selected_cafe):
    filtered_df = df[df['name'] == selected_cafe]
    aspect_scores = filtered_df.melt(
        id_vars=['name'],
        value_vars=['pred_umum', 'pred_layanan', 'pred_menu', 'pred_harga', 'pred_fasilitas', 'pred_suasana'],
        var_name='aspect',
        value_name='score'
    )
    aspect_scores['aspect'] = aspect_scores['aspect'].apply(lambda x: x[5:].capitalize() if x.startswith('pred_') else x)

    aspect_scores = aspect_scores[aspect_scores['score'] != 'none']  # Exclude 'none' sentiment

    aspect_counts = aspect_scores.groupby(['aspect', 'score']).size().reset_index(name='count')
    aspect_order = ['Umum', 'Layanan', 'Menu', 'Harga', 'Fasilitas', 'Suasana']
    aspect_counts['aspect'] = pd.Categorical(aspect_counts['aspect'], categories=aspect_order, ordered=True)
    aspect_counts = aspect_counts.sort_values('aspect')

    fig = go.Figure()
    for sentiment, color in zip(['positive', 'negative', 'neutral'], ['#00CC96', '#EF553B', '#636EFA']):
        sentiment_counts = aspect_counts[(aspect_counts['score'] == sentiment) & (aspect_counts['score'] != 'none')]
        fig.add_trace(go.Bar(
            x=sentiment_counts['aspect'],
            y=sentiment_counts['count'],
            name=sentiment,
            marker_color=color
        ))

    fig.update_layout(
        title='Aspect-Based Sentiment Analysis for {}'.format(selected_cafe),
        xaxis_title='Aspect',
        yaxis_title='Number of Reviews',
        barmode='group'
    )
    return fig

# Define the callback function to predict aspect-based sentiment analysis for a new review
@app.callback(
    Output('new-review-output', 'children'),
    Input('new-review-button', 'n_clicks'),
    State('new-review-input', 'value')
)
def predict_new_review_sentiment_analysis(n_clicks, value):
    if n_clicks is None:
        return ''
    if not value:
        return ''

    subwords = tokenizer.encode(value)
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

    logits = model(subwords)[0]
    labels = [torch.topk(logit, k=1, dim=-1)[1].squeeze().item() for logit in logits]

    output = []
    output.append(html.Label('Sentimen umum: {}'.format(i2w[labels[0]])))
    output.append(html.Br())
    for i in range(1, len(labels)):
        if i2w[labels[i]] != 'none':
            output.append(html.Label('Aspek {}: {}'.format(AspectBasedSentimentAnalysisDataset.ASPECT_DOMAIN[i], i2w[labels[i]])))
            output.append(html.Br())

    return output

# Define the callback function to predict aspect-based sentiment analysis for a new review
@app.callback(
    Output('new-review-output2', 'children'),
    Input('new-review-button2', 'n_clicks'),
    State('new-review-input2', 'value')
)
def predict_new_review_sentiment_analysis2(n_clicks, value):
    if n_clicks is None:
        return ''
    if not value:
        return ''

    subwords = tokenizer2.encode(value)
    subwords = torch.LongTensor(subwords).view(1, -1).to(multi_model.device)

    logits = multi_model(subwords)[0]
    labels = [torch.topk(logit, k=1, dim=-1)[1].squeeze().item() for logit in logits]

    output = []
    output.append(html.Label('Sentimen umum: {}'.format(i2w[labels[0]])))
    output.append(html.Br())
    for i in range(1, len(labels)):
        if i2w[labels[i]] != 'none':
            output.append(html.Label('Aspek {}: {}'.format(AspectBasedSentimentAnalysisDataset.ASPECT_DOMAIN[i], i2w[labels[i]])))
            output.append(html.Br())

    return output

app.run_server(mode='external')

<IPython.core.display.Javascript object>

Dash app running on:


<IPython.core.display.Javascript object>