<a href="https://colab.research.google.com/github/salsabillaflsft/ABSApublic/blob/main/AspectBasedSentimentAnalysis_DASH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installation

In [None]:
!pip install jupyter-dash
!pip install dash-bootstrap-components  
!pip install dash-bootstrap-templates
!pip install pyngrok
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jupyter-dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl (23 kB)
Collecting dash
  Downloading dash-2.8.1-py3-none-any.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
Collecting ansi2html
  Downloading ansi2html-1.8.0-py3-none-any.whl (16 kB)
Collecting retrying
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Collecting nest-asyncio
  Downloading nest_asyncio-1.5.6-py3-none-any.whl (5.2 kB)
Collecting dash-core-components==2.0.0
  Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Collecting dash-html-components==2.0.0
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-table==5.0.0
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━

## Import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import dash
import dash_bootstrap_components as dbc
import plotly.express as px
import plotly.graph_objects as go
from dash import html, dcc
from dash.dependencies import Input, Output, State
from jupyter_dash import JupyterDash
from dash_bootstrap_templates import ThemeChangerAIO, template_from_url

import torch
from torch.utils.data import Dataset, DataLoader
from torch import optim, nn

import plotly.express as px
import plotly.graph_objects as go

from pyngrok import ngrok

from transformers import BertTokenizer, BertConfig, BertForPreTraining, BertPreTrainedModel, BertModel

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Later cells open 'test_result_idbert.csv' and 'model.pt' by relative name,
# so make the dataset folder the working directory.
%cd /content/drive/My Drive/TA/Dataset

Mounted at /content/drive
/content/drive/My Drive/TA/Dataset


In [None]:
# Choose the torch device once: CUDA when a GPU is visible, otherwise the CPU.
if not torch.cuda.is_available():
    print("No GPU available, using the CPU instead")
    device = torch.device("cpu")
else:
    device = torch.device('cuda')

    # Report how many GPUs are visible and the name of the first one.
    print('there are %d GPU(s) available.' % torch.cuda.device_count())

    print('we will use the GPU: ', torch.cuda.get_device_name(0))

there are 1 GPU(s) available.
we will use the GPU:  Tesla T4


## Load 

### model

In [None]:
class BertForMultiLabelClassification(BertPreTrainedModel):
    """BERT encoder followed by one independent linear classifier per label group.

    ``config.num_labels_list`` must be an iterable of ints; its i-th entry is the
    number of classes predicted by the i-th classifier head.
    """

    def __init__(self, config):
        super().__init__(config)
        # NOTE: unlike stock Hugging Face heads, ``num_labels`` is a *list* here
        # (one class count per head), not a single int.
        self.num_labels = config.num_labels_list

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifiers = nn.ModuleList(
            [nn.Linear(config.hidden_size, num_label) for num_label in self.num_labels]
        )

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        subword_to_word_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and every classifier head.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds: forwarded unchanged to ``BertModel``.
            subword_to_word_ids: accepted for signature compatibility; not used.
            labels: optional tensor indexed as ``labels[:, i]`` for head ``i``.
                When given, the summed cross-entropy loss over all heads is
                prepended to the returned tuple.

        Returns:
            Tuple ``(loss,) + (logits, *extra)`` where ``logits`` is a list with
            one tensor per head and ``extra`` is whatever the encoder returned
            after its first two outputs. ``loss`` is present only when
            ``labels`` is not None.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # The second encoder output is BertModel's pooled sentence
        # representation (see the Hugging Face BertModel docs); every head
        # classifies from that same vector.
        pooled_output = self.dropout(outputs[1])
        logits = [classifier(pooled_output) for classifier in self.classifiers]

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            # `nn` is the only loss namespace imported by this notebook; the bare
            # name `CrossEntropyLoss` used previously raised NameError.
            loss_fct = nn.CrossEntropyLoss()
            total_loss = 0
            for i, (logit, num_label) in enumerate(zip(logits, self.num_labels)):
                label = labels[:, i]
                total_loss += loss_fct(logit.view(-1, num_label), label.view(-1))
            outputs = (total_loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)

### dataset

In [None]:
class AspectBasedSentimentAnalysisDataset(Dataset):
    """CSV-backed dataset of reviews with one sentiment label per aspect.

    The CSV must contain a ``review_text`` column plus one column per entry of
    ``ASPECT_DOMAIN`` whose values are keys of ``LABEL2INDEX``.
    """

    # Class-level constants shared with the loader, the model config and the dashboard.
    ASPECT_DOMAIN = ['umum','layanan','menu','harga','fasilitas','suasana']
    LABEL2INDEX = {'negative': 0, 'neutral': 1, 'positive': 2}
    INDEX2LABEL = {0: 'negative', 1: 'neutral', 2: 'positive'}
    NUM_LABELS = [3, 3, 3, 3, 3, 3]
    NUM_ASPECTS = 6

    def __init__(self, dataset_path, tokenizer, no_special_token=False, *args, **kwargs):
        """Read ``dataset_path`` and keep the tokenizer; extra args are ignored."""
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer
        self.no_special_token = no_special_token

    def load_dataset(self, path):
        """Return the CSV at ``path`` with every aspect column mapped to label indices.

        An aspect value that is not a key of ``LABEL2INDEX`` raises ``KeyError``.
        """
        frame = pd.read_csv(path)

        def to_index(sentiment):
            return self.LABEL2INDEX[sentiment]

        for column in self.ASPECT_DOMAIN:
            frame[column] = frame[column].apply(to_index)
        return frame

    def __len__(self):
        """Number of reviews in the loaded frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_ids, aspect_labels, raw_text)`` for the row labelled ``index``."""
        row = self.data.loc[index, :]
        text = row['review_text']
        aspect_labels = [row[name] for name in self.ASPECT_DOMAIN]
        # Special tokens are added unless the dataset was built with no_special_token=True.
        with_special = not self.no_special_token
        token_ids = self.tokenizer.encode(text, add_special_tokens=with_special)
        return np.array(token_ids), np.array(aspect_labels), row['review_text']

class AspectBasedSentimentAnalysisDataLoader(DataLoader):
    """DataLoader that pads each batch of token-id sequences into fixed-width arrays.

    ``dataset`` must expose ``NUM_ASPECTS``; each item is expected to be a
    ``(token_ids, labels, raw_text)`` triple.
    """

    def __init__(self, dataset, max_seq_len=512, *args, **kwargs):
        super().__init__(dataset=dataset, *args, **kwargs)
        self.num_aspects = dataset.NUM_ASPECTS
        # Always use the padding collate below, replacing whatever the base class set.
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len

    def _collate_fn(self, batch):
        """Pad/truncate a list of items into numpy batches.

        Returns ``(token_ids, mask, labels, texts)``: zero-padded int64 ids,
        a float32 mask that is 1 over real tokens, int64 labels with one column
        per aspect, and the list of raw strings.
        """
        n_rows = len(batch)
        # Width is the longest sequence in the batch, capped at max_seq_len.
        longest = max(len(item[0]) for item in batch)
        width = min(self.max_seq_len, longest)

        token_ids = np.zeros((n_rows, width), dtype=np.int64)
        mask = np.zeros((n_rows, width), dtype=np.float32)
        labels = np.zeros((n_rows, self.num_aspects), dtype=np.int64)
        texts = []

        for row, (subwords, label, raw_seq) in enumerate(batch):
            clipped = subwords[:width]
            used = len(clipped)
            token_ids[row, :used] = clipped
            mask[row, :used] = 1
            labels[row, :] = label
            texts.append(raw_seq)

        return token_ids, mask, labels, texts

### Forward function

In [None]:
# Forward function for sequence multilabel classification
def forward_sequence_multi_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    """Run one batch through ``model`` and decode predictions and gold labels.

    Args:
        model: callable taking ``(subwords, attention_mask=, token_type_ids=,
            labels=)`` and returning a sequence whose first two items are the
            loss and a list of per-head logit tensors.
        batch_data: ``(subwords, mask, labels)`` or
            ``(subwords, mask, token_types, labels)``. A batch that carries a
            trailing list of raw texts must have it stripped by the caller.
        i2w: mapping from label index to label name.
        is_test: accepted for signature compatibility; not used.
        device: device string or ``torch.device`` the tensors are moved to.
        **kwargs: ignored.

    Returns:
        ``(loss, list_hyp, list_label)`` where the two lists hold, per sample,
        the predicted and gold label names for every head.

    Raises:
        ValueError: if ``batch_data`` has neither 3 nor 4 elements.
    """
    # Unpack batch data
    if len(batch_data) == 3:
        subword_batch, mask_batch, label_batch = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        subword_batch, mask_batch, token_type_batch, label_batch = batch_data
    else:
        raise ValueError(
            'batch_data must have 3 or 4 elements, got {}'.format(len(batch_data))
        )

    # Normalise so that 'cuda', 'cuda:0' and torch.device objects all work; the
    # previous `device == "cuda"` test silently kept tensors on the CPU for
    # anything but that exact string.
    device = torch.device(device)

    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch).to(device)
    mask_batch = torch.FloatTensor(mask_batch).to(device)
    if token_type_batch is not None:
        token_type_batch = torch.LongTensor(token_type_batch).to(device)
    label_batch = torch.LongTensor(label_batch).to(device)

    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]  # logits: one tensor of per-class scores per head

    # Highest-scoring class index per head, as one plain list per head.
    head_predictions = [torch.topk(logit, 1)[1].view(-1).tolist() for logit in logits]
    gold_rows = label_batch.cpu().tolist()

    list_hyp = [
        [i2w[head[sample]] for head in head_predictions]
        for sample in range(len(gold_rows))
    ]
    list_label = [[i2w[index] for index in row] for row in gold_rows]

    return loss, list_hyp, list_label

### saved data & model

In [None]:
# Tokenizer and base configuration come from the same pretrained IndoBERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')

# The multi-head model reads `num_labels_list` (one class count per aspect);
# `num_labels` is set to the largest of those counts.
config.num_labels_list = AspectBasedSentimentAnalysisDataset.NUM_LABELS
config.num_labels = max(AspectBasedSentimentAnalysisDataset.NUM_LABELS)

# Encoder weights come from the checkpoint; the classifier heads start freshly
# initialised (see the warning printed below this cell).
model = BertForMultiLabelClassification.from_pretrained(
    'indobenchmark/indobert-base-p1',
    config=config,
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForMultiLabelClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifiers.3.weight', 'classifiers.4.bias', 'classifiers.5.bias', 'classifiers.4.weight', 'classifiers.5.weight', 'classifiers.2.bias', 'classifiers.3.bias', 'classifiers.1.weight', 'classifiers.0.weight', 'classifiers.1.bias', 'classifiers.2.weight', 'classifiers.0.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Test-set reviews with their predictions; this frame feeds the dashboard charts.
df = pd.read_csv('test_result_idbert.csv')

# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime too.
state_dict = torch.load('model.pt', map_location=device)
load_result = model.load_state_dict(state_dict)

# The model is only used for inference from here on: move it to the selected
# device and switch to eval mode so dropout is disabled and predictions are
# deterministic.
model.to(device)
model.eval()

# Show the key-matching report instead of dumping every parameter name.
load_result

odict_keys(['bert.embeddings.position_ids', 'bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weigh

<All keys matched successfully>

In [None]:
# Sanity check: list the columns, non-null counts and dtypes of the loaded results frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385 entries, 0 to 384
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           385 non-null    int64  
 1   name                 385 non-null    object 
 2   location_link        385 non-null    object 
 3   reviews_link         385 non-null    object 
 4   reviews              385 non-null    int64  
 5   rating               385 non-null    float64
 6   review_text          385 non-null    object 
 7   review_link          385 non-null    object 
 8   review_rating        385 non-null    int64  
 9   review_datetime_utc  385 non-null    object 
 10  umum                 385 non-null    object 
 11  layanan              385 non-null    object 
 12  menu                 385 non-null    object 
 13  harga                385 non-null    object 
 14  fasilitas            385 non-null    object 
 15  suasana              385 non-null    obj

In [None]:
# Short aliases for the label-name <-> label-index tables defined on the dataset class.
w2i = AspectBasedSentimentAnalysisDataset.LABEL2INDEX
i2w = AspectBasedSentimentAnalysisDataset.INDEX2LABEL
print(w2i, i2w, sep='\n')

{'negative': 0, 'neutral': 1, 'positive': 2}
{0: 'negative', 1: 'neutral', 2: 'positive'}


## Dashboard

In [None]:
# Extra stylesheet from dash-bootstrap-templates; NOTE(review): presumably it
# styles the components inside the className="dbc" container below — confirm
# against the dash-bootstrap-templates docs.
dbc_css = "https://cdn.jsdelivr.net/gh/AnnMarieW/dash-bootstrap-templates/dbc.min.css"

# Define app
app = JupyterDash(__name__, external_stylesheets=[dbc.themes.FLATLY, dbc_css])
server = app.server

# Title
header = html.H3(
    'Aspect-based Sentiment Analysis Dashboard',
    className="bg-primary text-white p-2 mb-2 text-center",
)

# Dropdown menu for selecting the cafe name; defaults to the first cafe in `df`.
dropdown = html.Div(
    [
        dbc.Label("Select cafe and coworking space"),
        dcc.Dropdown(
            id='cafe-dropdown',
            options=[{'label': name, 'value': name} for name in df['name'].unique()],
            value=df['name'].iloc[0],
        ),
    ],
    className="mb-4",
)

# Graph for showing the sentiment polarity
graph1 = html.Div([dcc.Graph(id='polarity-graph')])

# Graph for showing the aspect-based sentiment analysis
graph2 = html.Div([dcc.Graph(id='sentiment-graph')])

# Text field and button for predicting new review.
# Dash style dictionaries use camelCase CSS property names.
review = html.Div(
    [
        html.Label('Try a new review'),
        dcc.Input(
            id='new-review-input',
            type='text',
            placeholder='Type your review here...',
            style={'width': '100%'},
        ),
        html.Br(),
        dbc.Button('Submit', id='new-review-button', color='primary', className='mt-2'),
        html.Br(),
        html.Div(id='new-review-output'),
    ],
    style={
        'width': '100%',
        'display': 'inline-block',
        'verticalAlign': 'top',
        'marginTop': '20px',
        'marginBottom': '50px',
    },
)

# Swatch of the Bootstrap theme colours. It is built here but is not placed in
# `app.layout` below.
theme_colors = [
    "primary",
    "secondary",
    "success",
    "warning",
    "danger",
    "info",
    "light",
    "dark",
    "link",
]

colors = html.Div(
    [dbc.Button(f"{color}", color=f"{color}", size="sm") for color in theme_colors]
)
colors = html.Div(["Theme Colors:", colors], className="mt-2")

# Two chart tabs driven by the cafe dropdown.
tab1 = dbc.Tab([graph1], label="Polarity Chart")
tab2 = dbc.Tab([graph2], label="Sentiment Graph")
tabs = dbc.Card(dbc.Tabs([tab1, tab2]))

card1 = dbc.Card(
    [dropdown, tabs],
    body=True,
)
card2 = dbc.Card(
    [review],
    body=True,
)

# Page: header on top, charts in the wide left column, review form on the right.
app.layout = dbc.Container(
    [
        header,
        dbc.Row(
            [
                dbc.Col([card1], width=8),
                dbc.Col([card2], width=4),
            ]
        ),
    ],
    fluid=True,
    className="dbc",
)

In [None]:
# Define the callback function to update the polarity graph
@app.callback(
    Output('polarity-graph', 'figure'),
    Input('cafe-dropdown', 'value')
)
def update_polarity_graph(selected_cafe):
    """Build the pie chart of general ('pred_umum') sentiment shares for one cafe.

    Args:
        selected_cafe: value of the cafe dropdown, or None when it is cleared.

    Returns:
        A plotly figure, or ``dash.no_update`` when no cafe is selected so the
        chart already on screen is kept.
    """
    if selected_cafe is None:
        return dash.no_update

    # Fixed sentiment -> colour mapping (same colours as the aspect bar chart).
    # A plain colour sequence would be applied in frequency order, so a given
    # sentiment would change colour from one cafe to the next.
    sentiment_colors = {
        'positive': '#00CC96',
        'negative': '#EF553B',
        'neutral': '#636EFA',
    }

    filtered_df = df[df['name'] == selected_cafe]
    polarity_counts = filtered_df['pred_umum'].value_counts(normalize=True)
    polarity_fig = px.pie(
        values=polarity_counts.values,
        names=polarity_counts.index,
        color=polarity_counts.index,
        color_discrete_map=sentiment_colors,
        title='General Sentiment Polarity Distribution for {}'.format(selected_cafe),
    )
    return polarity_fig

# Define the callback function to update the aspect-based sentiment analysis graph
@app.callback(
    Output('sentiment-graph', 'figure'),
    Input('cafe-dropdown', 'value')
)
def update_sentiment_graph(selected_cafe):
    """Build the grouped bar chart of review counts per aspect and sentiment.

    Args:
        selected_cafe: value of the cafe dropdown, or None when it is cleared.

    Returns:
        A plotly figure with one bar trace per sentiment, or ``dash.no_update``
        when no cafe is selected so the chart already on screen is kept.
    """
    if selected_cafe is None:
        return dash.no_update

    sentiment_colors = {
        'positive': '#00CC96',
        'negative': '#EF553B',
        'neutral': '#636EFA',
    }
    aspect_columns = ['pred_layanan', 'pred_menu', 'pred_harga', 'pred_fasilitas', 'pred_suasana']

    filtered_df = df[df['name'] == selected_cafe]
    # Long format: one row per (review, aspect) with the predicted sentiment in 'score'.
    aspect_scores = filtered_df.melt(
        id_vars=['name'],
        value_vars=aspect_columns,
        var_name='aspect',
        value_name='score',
    )
    # Drop the 'pred_' prefix and capitalize the aspect name for display.
    aspect_scores['aspect'] = (
        aspect_scores['aspect']
        .str.replace('^pred_', '', regex=True)
        .str.capitalize()
    )
    aspect_counts = aspect_scores.groupby(['aspect', 'score']).size().reset_index(name='count')

    fig = go.Figure()
    for sentiment, color in sentiment_colors.items():
        sentiment_counts = aspect_counts[aspect_counts['score'] == sentiment]
        fig.add_trace(go.Bar(
            x=sentiment_counts['aspect'],
            y=sentiment_counts['count'],
            name=sentiment,
            marker_color=color,
        ))

    fig.update_layout(
        title='Aspect-Based Sentiment Analysis for {}'.format(selected_cafe),
        xaxis_title='Aspect',
        yaxis_title='Number of Reviews',
        barmode='group'
    )
    return fig

# Define the callback function to predict aspect-based sentiment analysis for a new review
@app.callback(
    Output('new-review-output', 'children'),
    Input('new-review-button', 'n_clicks'),
    State('new-review-input', 'value')
)
def predict_new_review_sentiment_analysis(n_clicks, value):
    """Classify the typed review and render one line per aspect.

    Args:
        n_clicks: click count of the submit button (None before the first click).
        value: current text of the review input.

    Returns:
        ``''`` when nothing was submitted or the text is blank; otherwise a list
        of ``html.Label`` / ``html.Br`` components, the general sentiment first
        and then every remaining aspect.
    """
    if n_clicks is None or not value or not value.strip():
        return ''

    # Cut the input at the encoder's position-embedding limit; a longer
    # sequence would make the forward pass fail.
    subwords = tokenizer.encode(
        value,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        logits = model(subwords)[0]
    labels = [logit.argmax(dim=-1).item() for logit in logits]

    aspects = AspectBasedSentimentAnalysisDataset.ASPECT_DOMAIN
    output = [html.Label('sentimen umum: {}'.format(i2w[labels[0]])), html.Br()]
    # The first head is the general sentiment shown above; the rest are per-aspect.
    for aspect, label in zip(aspects[1:], labels[1:]):
        output.append(html.Label('aspek {}: {}'.format(aspect, i2w[label])))
        output.append(html.Br())

    return output


# Start the Dash server for the app defined above.
# NOTE(review): mode='external' presumably serves the app at a URL to be opened
# in a separate browser tab — confirm against the jupyter-dash documentation.
app.run_server(mode='external')

Dash app running on:


<IPython.core.display.Javascript object>

In [None]:
# from pyngrok import ngrok

# public_url = ngrok.connect(port='8050')
# print(f'Open this URL in your browser: {public_url}')

In [None]:
# ngrok.disconnect(public_url)