In [1]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from bertopic import BERTopic
from dash import dcc, html, dash_table
from dash.dependencies import Input, Output, State
from jupyter_dash import JupyterDash


In [2]:
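# colorlover is imported inside discrete_background_color_bins (defined below).
# Uncomment to install it into the kernel's environment:
# %pip install colorlover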



In [3]:
# Location of the subjects/questions CSV. The default is the original
# machine-specific path; set the SUBJECTS_QUESTIONS_CSV environment variable
# to point at the file on another machine instead of editing this cell.
csv_file = os.environ.get(
    "SUBJECTS_QUESTIONS_CSV",
    r"C:\Users\gprak\Downloads\youtubeVideos\datasets\subjects-questions\subjects-questions.csv",
)
df = pd.read_csv(csv_file)

In [4]:
# Display the column labels of the loaded frame (shown in the cell output).
df.columns

Index(['eng', 'Subject'], dtype='object')

In [5]:
# Pull the two columns out as plain Python lists: `text` holds the values of
# the 'eng' column and `target` the values of the 'Subject' column.
text, target = df['eng'].tolist(), df['Subject'].tolist()

In [8]:
# Fit a BERTopic model on the 'eng' texts. verbose=True is what produces the
# progress log shown in this cell's output.
docs = text
topic_model = BERTopic(verbose=True)
# Label-guided alternative: pass the 'Subject' labels as `y`.
# topics, probs = topic_model.fit_transform(docs,y=target)
# Active call: no `y` is passed, so the labels in `target` are not used here.
# NOTE(review): this call was previously labelled "semisupervised"; with no
# labels supplied it looks like a plain unsupervised fit - confirm against the
# BERTopic documentation.
topics, probs = topic_model.fit_transform(docs)


Batches:   0%|          | 0/3829 [00:00<?, ?it/s]

2022-08-06 16:21:32,724 - BERTopic - Transformed documents to Embeddings
2022-08-06 16:22:33,071 - BERTopic - Reduced dimensionality
2022-08-06 16:22:41,018 - BERTopic - Clustered reduced embeddings


In [9]:
# Fetch the per-topic summary table and print its first ten rows; the output
# shows one row per topic with the columns Topic, Count and Name.
# NOTE(review): the Topic -1 row has by far the largest Count; presumably this
# is BERTopic's outlier bucket - confirm.
topic_df = topic_model.get_topic_info()
print(f"First 10 relevant topics are :- \n{topic_df.head(10)}")

First 10 relevant topics are :- 
   Topic  Count                                        Name
0     -1  45647                            -1_h2_true_h3_o2
1      0    734                      0_tan_cot_cos_1leftcos
2      1    651                         1_z1_complex_z2_arg
3      2    597     2_matrix_1endarrayright_matrices_mathbf
4      3    579                3_mean_median_deviation_data
5      4    575               4_angle_angles_measure_60circ
6      5    548  5_particle_acceleration_graph_velocitytime
7      6    546           6_min_rate_reaction_concentration
8      7    531                         7_theta_tan_cos_sin
9      8    495                        8_int0pi_int_xcos_xc


In [10]:
def data_bars(df, column):
    """Build DataTable conditional styles that draw an in-cell bar for `column`.

    The value range of ``df[column]`` (min to max) is split into 100
    equal-width bins. One style rule is produced per bin; each rule matches
    the cells whose value falls in that bin and paints a left-to-right blue
    gradient whose length is the bin's upper bound expressed as a percentage
    of the range.

    Args:
        df: Object indexable by `column` whose result exposes ``max()`` and
            ``min()`` (a pandas DataFrame in this notebook).
        column: Name of the column to style; also used as the ``column_id``
            and inside each ``filter_query``.

    Returns:
        A list of 100 style dicts with the keys ``if``, ``background``,
        ``paddingBottom`` and ``paddingTop``.
    """
    n_bins = 100
    bounds = [i * (1.0 / n_bins) for i in range(n_bins + 1)]
    # Read the column extremes once instead of re-scanning the column for
    # every bound (same approach as data_bars_diverging).
    col_max = df[column].max()
    col_min = df[column].min()
    ranges = [((col_max - col_min) * i) + col_min for i in bounds]
    styles = []
    for i in range(1, len(bounds)):
        min_bound = ranges[i - 1]
        max_bound = ranges[i]
        max_bound_percentage = bounds[i] * 100
        styles.append({
            'if': {
                # The last bin has no upper limit so that the column maximum
                # itself is matched by a rule.
                'filter_query': (
                    '{{{column}}} >= {min_bound}' +
                    (' && {{{column}}} < {max_bound}' if (i < len(bounds) - 1) else '')
                ).format(column=column, min_bound=min_bound, max_bound=max_bound),
                'column_id': column
            },
            'background': (
                """
                    linear-gradient(90deg,
                    #0074D9 0%,
                    #0074D9 {max_bound_percentage}%,
                    white {max_bound_percentage}%,
                    white 100%)
                """.format(max_bound_percentage=max_bound_percentage)
            ),
            'paddingBottom': 2,
            'paddingTop': 2
        })

    return styles


def data_bars_diverging(df, column, color_above='#3D9970', color_below='#FF4136'):
    """Build DataTable conditional styles drawing bars that diverge from the centre.

    The range of ``df[column]`` is cut into 100 equal-width bins and one style
    rule is emitted per bin. Bins whose upper edge lies above the midpoint of
    the range get a bar growing rightwards from the 50% mark in `color_above`;
    all other bins get a bar reaching leftwards to the 50% mark in
    `color_below`.

    Args:
        df: Frame supplying ``df[column].max()`` and ``df[column].min()``.
        column: Column name used for the filter queries and ``column_id``.
        color_above: CSS colour for bins above the midpoint.
        color_below: CSS colour for the remaining bins.

    Returns:
        A list of 100 style dicts (keys ``if``, ``paddingBottom``,
        ``paddingTop``, ``background``).
    """
    n_bins = 100
    fractions = [step * (1.0 / n_bins) for step in range(n_bins + 1)]
    col_max = df[column].max()
    col_min = df[column].min()
    edges = [((col_max - col_min) * fraction) + col_min for fraction in fractions]
    midpoint = (col_max + col_min) / 2.

    styles = []
    for idx, (low, high) in enumerate(zip(edges, edges[1:]), start=1):
        # Every bin is half-open except the last one, which is left unbounded
        # above so the column maximum is matched.
        query = f'{{{column}}} >= {low}'
        if idx < n_bins:
            query += f' && {{{column}}} < {high}'

        # The whitespace inside the gradient templates is kept exactly as-is.
        if high > midpoint:
            gradient = """
                    linear-gradient(90deg,
                    white 0%,
                    white 50%,
                    {color_above} 50%,
                    {color_above} {max_bound_percentage}%,
                    white {max_bound_percentage}%,
                    white 100%)
                """.format(
                max_bound_percentage=fractions[idx] * 100,
                color_above=color_above,
            )
        else:
            gradient = """
                    linear-gradient(90deg,
                    white 0%,
                    white {min_bound_percentage}%,
                    {color_below} {min_bound_percentage}%,
                    {color_below} 50%,
                    white 50%,
                    white 100%)
                """.format(
                min_bound_percentage=fractions[idx - 1] * 100,
                color_below=color_below,
            )

        styles.append({
            'if': {'filter_query': query, 'column_id': column},
            'paddingBottom': 2,
            'paddingTop': 2,
            'background': gradient,
        })

    return styles

def discrete_background_color_bins(df, n_bins=5, columns='all'):
    """Build heat-map style rules and a colour legend for numeric columns.

    The overall min/max across the selected columns is split into `n_bins`
    equal-width bins. Each bin is assigned the matching colour from
    colorlover's sequential 'Blues' scale for that number of bins, and a style
    rule is generated per (bin, column) pair. Bins in the upper half switch the
    text colour to white.

    Args:
        df: Source DataFrame.
        n_bins: Number of colour bins; also selects the colorlover scale via
            ``colorlover.scales[str(n_bins)]``.
        columns: ``'all'`` to use every numeric column (excluding ``'id'`` when
            that column is present), otherwise a column selection passed to
            ``df[...]``.

    Returns:
        A ``(styles, legend)`` tuple: the list of style dicts and an
        ``html.Div`` holding one colour swatch plus lower-bound label per bin.
    """
    import colorlover

    fractions = [step * (1.0 / n_bins) for step in range(n_bins + 1)]

    if columns == 'all':
        numeric = df.select_dtypes('number')
        if 'id' in df:
            numeric = numeric.drop(['id'], axis=1)
    else:
        numeric = df[columns]

    overall_max = numeric.max().max()
    overall_min = numeric.min().min()
    edges = [((overall_max - overall_min) * fraction) + overall_min for fraction in fractions]

    styles = []
    legend = []
    for idx, (low, high) in enumerate(zip(edges, edges[1:]), start=1):
        swatch = colorlover.scales[str(n_bins)]['seq']['Blues'][idx - 1]
        # Darker (upper-half) bins need light text to stay readable.
        text_color = 'white' if idx > len(fractions) / 2. else 'inherit'
        is_last_bin = idx >= len(fractions) - 1

        for column in numeric:
            # Only the last bin is left open-ended above.
            query = f'{{{column}}} >= {low}'
            if not is_last_bin:
                query += f' && {{{column}}} < {high}'
            styles.append({
                'if': {'filter_query': query, 'column_id': column},
                'backgroundColor': swatch,
                'color': text_color,
            })

        color_strip = html.Div(
            style={
                'backgroundColor': swatch,
                'borderLeft': '1px rgb(50, 50, 50) solid',
                'height': '10px'
            }
        )
        bound_label = html.Small(round(low, 2), style={'paddingLeft': '2px'})
        legend.append(
            html.Div(
                style={'display': 'inline-block', 'width': '60px'},
                children=[color_strip, bound_label],
            )
        )

    return (styles, html.Div(legend, style={'padding': '5px 0 5px 0'}))



In [11]:
def create_dash_bar_charts_for_each_topic(topic_model):
    """Build a JupyterDash app for browsing the topics of a fitted topic model.

    The layout contains a heading, a dropdown listing the topic ids, a
    container that shows a bar chart of the selected topic's terms and their
    scores, and a sortable table of ``topic_model.get_topic_info()`` with data
    bars on the ``Count`` column.

    Args:
        topic_model: Fitted model exposing ``get_topic_info()`` (a DataFrame
            with at least ``Topic`` and ``Count`` columns) and
            ``get_topic(topic_id)`` (an iterable of ``(term, score)`` pairs).

    Returns:
        The configured ``JupyterDash`` app; the server is not started here.
    """
    top_relevant_topics = topic_model.get_topic_info()

    # Offer the model's actual topic ids. Row positions (0..n-1) do not line up
    # with the ids when the table starts at topic -1: the last position is then
    # not a topic at all and -1 can never be selected.
    topic_ids = top_relevant_topics['Topic'].tolist()
    # Start on the first non-negative topic (id 0 in the usual case), falling
    # back to whatever is available.
    default_topic = next(
        (topic_id for topic_id in topic_ids if topic_id >= 0),
        topic_ids[0] if topic_ids else None,
    )

    app = JupyterDash()
    app.layout = html.Div([
        html.H1('Semi Supervised and Unsupervised Topic Modelling Visualization using Dash!'),
        dcc.Dropdown(topic_ids, default_topic, id='demo-dropdown_2'),
        html.Div(id="barplot-container_2", children=[]),
        html.Div(id='dd-output-container_2'),
        dash_table.DataTable(
            data=top_relevant_topics.to_dict('records'),
            sort_action='native',
            columns=[{'name': i, 'id': i} for i in top_relevant_topics.columns],
            style_data_conditional=(
                data_bars(top_relevant_topics, 'Count')
            ),
            style_cell={
                'width': '100px',
                'minWidth': '100px',
                'maxWidth': '100px',
                'overflow': 'hidden',
                'textOverflow': 'ellipsis',
            },
            page_size=20
        ),
    ])

    @app.callback(
        Output(component_id="barplot-container_2", component_property="children"),
        Input(component_id="demo-dropdown_2", component_property="value")
    )
    def update_output(value):
        """Render the bar chart for the topic chosen in the dropdown."""
        # The dropdown reports None once its selection is cleared, and
        # BERTopic documents get_topic as returning False for an unknown id;
        # show an empty container in both cases instead of raising.
        term_scores = topic_model.get_topic(value) if value is not None else None
        if not term_scores:
            return []
        cols_x = [term for (term, _score) in term_scores]
        cols_y = [score for (_term, score) in term_scores]
        fig_2 = px.bar(x=cols_x, y=cols_y, title=f"Most common features of topic number {value}")

        return [dcc.Graph(figure=fig_2)]

    return app
    
    

In [12]:
# Build the Dash app (layout plus dropdown callback) for the fitted model.
# This only constructs the app; the server is started in a later cell.
app = create_dash_bar_charts_for_each_topic(topic_model)


In [13]:
# Start the app's server on port 8020; the cell output reports the URL
# http://127.0.0.1:8020/.
# NOTE(review): mode="external" presumably serves the app outside the notebook
# output area - confirm against the jupyter_dash documentation.
app.run_server(mode="external",port=8020)

Dash app running on http://127.0.0.1:8020/
