# Dashboard Visualisations

In [71]:
import sys
import os

import pandas as pd
import plotly.express as px
import numpy as np
from bertopic import BERTopic

ModuleNotFoundError: No module named 'visualise_topics'

In [68]:
# Relative path of the visualisation sources, echoed as the cell output.
# A single join is enough: wrapping it in a second one-argument join is a no-op.
os.path.join(os.path.pardir, "src/visualisation/")

'../src/visualisation/'

In [4]:
# Load the predictions workbook that the cells below work from.
data = pd.read_excel(io="predictions_lbl2vec_2.xlsx")

In [74]:
# Topic labels passed to the top-words visualisation further down.
labels = [
    "drinks",
    "snacks",
    "ingredients",
    "baked goods",
    "spreads",
    "noodles & pasta",
    "pets",
]

# Keep the preview as the cell's last expression so the notebook renders it;
# as a non-final statement its result was silently discarded.
data.head()

In [8]:
def reformat_data(data):
    """Return a copy of ``data`` with readable sentiments and parsed dates.

    The ``sentiment`` column is mapped from 1/0 to "positive"/"negative" and
    the ``date`` column is converted with ``pd.to_datetime``. The input frame
    itself is left untouched.
    """
    sentiment_names = {1: "positive", 0: "negative"}

    return data.assign(
        sentiment=data.sentiment.map(sentiment_names),
        date=pd.to_datetime(data["date"]),
    )

In [9]:
# Chart-ready copy of the predictions (sentiment names, parsed dates).
cleaned_data = reformat_data(data=data)

## Sentiment Page

In [41]:
# Pin each sentiment to a fixed colour. With only a colour sequence the pie
# hands out colours by slice order, so "positive" could end up with a different
# colour here than in the other sentiment charts.
sentiment_colors = {
    "positive": px.colors.qualitative.Safe[0],
    "negative": px.colors.qualitative.Safe[1],
}
fig = px.pie(
    cleaned_data,
    names="sentiment",
    color="sentiment",
    color_discrete_map=sentiment_colors,
    title="Overall Sentiments",
)
fig.show()

In [34]:
# Number of reviews per date and sentiment.
freq_df = cleaned_data.groupby(["date", "sentiment"], as_index=False).size()

# Map colours by label rather than relying on a reversed palette: a colour
# sequence is consumed in order of first appearance, so the colours would swap
# whenever the first row of freq_df happens to be "positive".
sentiment_colors = {
    "positive": px.colors.qualitative.Safe[0],
    "negative": px.colors.qualitative.Safe[1],
}
fig = px.line(
    freq_df,
    x="date",
    y="size",
    color="sentiment",
    color_discrete_map=sentiment_colors,
    title="Sentiments over Time",
)
fig.show()

In [39]:
# Number of reviews per topic and sentiment.
freq_df = cleaned_data.groupby(["pred_topic_label", "sentiment"], as_index=False).size()

# Percentage of each sentiment within its topic. transform("sum") broadcasts
# the per-topic total back onto every row, so the result stays aligned with
# freq_df without going through groupby.apply.
topic_totals = freq_df.groupby("pred_topic_label")["size"].transform("sum")
freq_df["pct"] = (freq_df["size"] * 100 / topic_totals).round(1)

# Map colours by label so they do not depend on which sentiment appears first.
sentiment_colors = {
    "positive": px.colors.qualitative.Safe[0],
    "negative": px.colors.qualitative.Safe[1],
}
fig = px.bar(
    freq_df,
    x="pred_topic_label",
    y="pct",
    color="sentiment",
    color_discrete_map=sentiment_colors,
)
fig.update_layout(xaxis={"dtick": 1})
fig.show()

## Topics Page

In [None]:
# Most Positive Topics / Most Negative Topics
# Share of positive reviews per topic, highest first: the head of the series
# holds the most positive topics and the tail the most negative ones.
# (The bare groupby() call that stood here raised a TypeError because no
# grouping key was given.)
positive_share = (
    cleaned_data["sentiment"]
    .eq("positive")
    .groupby(cleaned_data["pred_topic_label"])
    .mean()
    .sort_values(ascending=False)
)
positive_share

In [45]:
# Number of reviews per date and predicted topic, drawn as an area chart.
freq_df = cleaned_data.groupby(
    ["date", "pred_topic_label"],
    as_index=False,
).size()
fig = px.area(
    freq_df,
    x="date",
    y="size",
    color="pred_topic_label",
    title="Topics over Time",
    color_discrete_sequence=px.colors.qualitative.Safe,
)
fig.show()

In [47]:
# Frequency by topic.
# TODO: replace top keyword by topic (put in playground); rename x ticks to topic name.
fig = px.pie(
    cleaned_data,
    names="pred_topic_label",
    title="Frequency of Topics",
    color_discrete_sequence=px.colors.qualitative.Safe,
)
fig.show()

In [109]:
import itertools

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text


def get_top_words(corpus, custom_sw=None, top_n=6):
    """Rank the vocabulary of ``corpus`` by summed TF-IDF weight.

    Parameters
    ----------
    corpus : iterable of str
        Documents to analyse.
    custom_sw : iterable of str, optional
        Extra stop words, excluded in addition to scikit-learn's built-in
        English stop-word list.
    top_n : int, default 6
        Number of highest-scoring words to keep.

    Returns
    -------
    pandas.DataFrame
        Columns "top words" and "tf-idf score", ordered by descending score.
        Empty (but with both columns) when ``corpus`` holds no documents.
    """
    columns = ["top words", "tf-idf score"]

    # An empty corpus cannot be vectorised; return an empty table so callers
    # can still draw an (empty) chart instead of crashing.
    documents = list(corpus)
    if not documents:
        return pd.DataFrame(columns=columns)

    extra_stop_words = [] if custom_sw is None else custom_sw
    stop_words = list(text.ENGLISH_STOP_WORDS.union(extra_stop_words))

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    # fit_transform learns the vocabulary and weights the documents in one pass.
    weights = vectorizer.fit_transform(documents)
    # Column sums: one aggregated score per vocabulary entry.
    totals = weights.sum(axis=0)

    scored = [(word, totals[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return pd.DataFrame(scored[:top_n], columns=columns)

def visualise_top_words(df, topics, specific=False, custom_sw=None):
    """Build a bar-chart figure of the top TF-IDF words per topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "pred_topic_label" and "cleaned_text".
    topics : sequence of str
        Topic labels to plot. In ``specific`` mode only ``topics[0]`` is used.
    specific : bool, default False
        When true, draw a single chart for ``topics[0]``; otherwise draw one
        subplot per topic on a grid four columns wide.
    custom_sw : iterable of str, optional
        Extra stop words forwarded to ``get_top_words``.

    Returns
    -------
    plotly.graph_objects.Figure
        The styled figure; it is returned, not shown.
    """
    # Normalise here so get_top_words always receives a real collection.
    extra_stop_words = [] if custom_sw is None else custom_sw

    palette = itertools.cycle(["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"])

    if specific:
        topic = topics[0]
        topic_texts = df.loc[df["pred_topic_label"] == topic, "cleaned_text"]
        freq_df = get_top_words(topic_texts, extra_stop_words)
        fig = px.bar(freq_df, x="tf-idf score", y="top words")

        # Applied through update_layout below; previously the shared layout
        # title replaced this topic-specific one.
        title_text = f"Top Words for {topic}"
        rows = 1
        columns = 1
    else:
        title_text = "Top Words"
        columns = 4
        rows = int(np.ceil(len(topics) / columns))
        fig = make_subplots(rows=rows,
                            cols=columns,
                            shared_xaxes=False,
                            horizontal_spacing=.1,
                            vertical_spacing=.4 / rows if rows > 1 else 0,
                            subplot_titles=list(topics))

        for position, topic in enumerate(topics):
            # Fill the grid left to right, top to bottom (plotly is 1-based).
            row_offset, column_offset = divmod(position, columns)
            topic_texts = df.loc[df["pred_topic_label"] == topic, "cleaned_text"]
            freq_df = get_top_words(topic_texts, extra_stop_words)

            fig.add_trace(
                go.Bar(x=freq_df["tf-idf score"],
                       y=freq_df["top words"],
                       orientation='h',
                       marker_color=next(palette)),
                row=row_offset + 1, col=column_offset + 1)

    fig.update_layout(
        template="plotly_white",
        showlegend=False,
        title={
            'text': title_text,
            'x': .5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        # A single chart gets a narrow canvas; the grid gets the full width.
        width=1000 if columns > 1 else 400,
        height=250 * rows if rows > 1 else 250 * 1.3,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
    )

    return fig

# Extra stop words handed to the top-words helpers, on top of scikit-learn's
# English stop-word list.
custom_sw = [
    "great",
    "taste",
    "good",
    "like",
    "product",
    "flavor",
    "love",
    "really",
    "buy",
    "tastes",
    "better",
    "best",
    "tried",
    "use",
    "eat",
    "food",
    "make",
]
# Combined list: built-in English stop words plus the custom ones above.
my_stop_words = list(text.ENGLISH_STOP_WORDS.union(custom_sw))

In [110]:
# Grid of top words, one subplot per topic label.
visualise_top_words(df=data, topics=labels, custom_sw=custom_sw)

## Topic Playground: Topic 2

In [113]:
#representative docs


["I'm a fan of all-natural anything, especially when it's a tasty and healthy alternative to something I've enjoyed my whole life, but I know is 'bad' for me. Regular soda is one of those 'bad' things. Loaded with artificial colors, sweeteners, corn syrup...yuck. Hardly seems worth the risk when I'm also getting 150 empty calories on average. And now Switch has come along. True to its claim, there's nothing artificial in it. Just juice, sparkling water and other 'all natural' ingredients. That's great. So I chose this particular flavor (orange tangerine) because I wanted the strong citrus taste I love. This drink, however, failed to meet their promise. Yes, I can 'taste' the orangy citrus, but as much I can taste the apple and grape juices used as well. Moreover, this doesn't have that bold 'pop' to it as other carbonated beverages have. I've had mineral water with more bite. But the worse part about this 'good for me' drink is in clocks in at 140 calories (the same as a regular can of

In [111]:
# Single-topic view: top words for the "snacks" topic only.
visualise_top_words(df=data, topics=["snacks"], specific=True, custom_sw=custom_sw)

In [50]:
# Pin each sentiment to a fixed colour. The overall sentiment pie and this one
# were given the palette in opposite orders, so the same sentiment could be
# drawn in different colours on the two charts.
sentiment_colors = {
    "positive": px.colors.qualitative.Safe[0],
    "negative": px.colors.qualitative.Safe[1],
}
fig = px.pie(
    cleaned_data.query("pred_topic_label == 'drinks'"),
    names="sentiment",
    color="sentiment",
    color_discrete_map=sentiment_colors,
    title="Sentiment Distribution of drinks",
)
fig.show()