In [9]:
import pandas as pd
import plotly.express as px
# Load the paper dataset that every figure cell below reads from.
# low_memory=False asks pandas to parse the file in one pass rather than in
# internal chunks (see the pandas read_csv documentation).
df = pd.read_csv(
    'csur-DEF-final-revision.csv',
    sep=',',
    low_memory=False,
)

In [23]:
# Fig. 3. Publication type by year (vertical bars, coloured by publication type)
# Figure-label reference: https://plotly.com/python/figure-labels/

# Number of papers per (year, publication type). size() counts the rows of each
# group directly, matching how the domain figures in this notebook build their counts.
year_pub_type = df.groupby(['year', 'publication_type']).size().reset_index(name='count')

fig = px.bar(year_pub_type, x='year', y='count',
    color='publication_type',
    text_auto=True,  # print each bar's count on the bar itself
    color_discrete_sequence=["#bcd6c9", "#256875"],
    opacity=0.8,
    labels={"publication_type": "Publication type"})

# Treat years as ordered categories so only years present in the data get a tick.
fig.update_xaxes(type='category', categoryorder='category ascending', title='')
# The counts are printed on the bars, so the y tick labels are hidden (grid lines stay).
fig.update_yaxes(title='', showgrid=True, showticklabels=False)
fig.update_traces(textfont_size=14, textangle=0)
fig.update_layout(
    # NOTE(review): this positions the continuous colour bar; with a categorical
    # colour column it presumably has no visible effect - confirm before removing.
    coloraxis_colorbar_y=-0.15,
    plot_bgcolor='rgba(0, 0, 0, 0)',  # transparent plot background
)
fig.show()
fig.write_image("img/type-by-year.pdf", width=1024, height=500)





In [None]:
# Fig. 4 (pie chart part). The distribution of domains by year and the corresponding
# percentage distribution in the selected papers set.

# One row per domain with the number of papers in that domain.
grouped_domain = (
    df.groupby(['Domain'])
    .size()
    .reset_index(name='count')
)
fig = px.pie(grouped_domain, names='Domain', values='count')

# Each slice carries its own percentage and domain label, so the legend is switched off.
fig.update_traces(
    textinfo='percent+label',
    textposition='inside',
    marker={'colors': ['#bcd6c9', '#256875', '#5796a4', '#87c6d4']},
)
fig.update_layout(title_text='', showlegend=False)
fig.show()
fig.write_image("img/pie-chart-domain.pdf", width=400, height=300)

In [19]:
# Fig. 4 (bar graph part). The distribution of domains by year and the corresponding
# percentage distribution in the selected papers set.

# Number of papers per (year, domain) pair.
grouped = df.groupby(['year', 'Domain']).size().reset_index(name='count')
# Wide form for Plotly Express: one row per year, one column per domain.
# Year/domain pairs with no papers are left as NaN by pivot().
pivot = grouped.pivot(index='year', columns='Domain', values='count')
fig = px.bar(
    pivot,
    orientation='h',
    text_auto=True,  # print each bar's count on the bar itself
    color_discrete_sequence=["#bcd6c9", "#256875","#5796a4","#87c6d4"],
    # In wide form the 'value' column holds the paper counts. Labelling it 'Domain'
    # made the counts appear under the domain label in hover text; the legend title
    # already comes from the pivot's column name ('Domain').
    labels={'value': 'Papers'}
)
fig.update_layout(legend=dict(font=dict(size=16)),plot_bgcolor='rgba(0, 0, 0, 0)')
# The counts are printed on the bars, so the x tick labels are hidden (grid lines stay).
fig.update_xaxes(title='', showgrid=True, showticklabels=False)
# Treat years as ordered categories so only years present in the data get a tick.
fig.update_yaxes(title='', type='category', categoryorder='category ascending')
fig.update_traces(textfont=dict(size=16),textangle=0)
fig.show()
fig.write_image("img/domain-by-year.pdf", width=900, height=600)





In [20]:
import pandas as pd
import plotly.express as px
from plotly.colors import hex_to_rgb

def get_text_color(hex_color):
    """
    Determine whether to use white or black text based on the brightness of the background color.

    Parameters:
    - hex_color: Background colour as a hex string, either "#rrggbb" or the
      shorthand "#rgb" (the leading "#" is optional).

    Returns:
    - "white" if the weighted brightness (0-255 scale) is below 128, else "black".

    Raises:
    - ValueError: if the string is not made of exactly 3 or 6 hexadecimal digits.
    """
    digits = hex_color.lstrip("#")
    if len(digits) == 3:
        # Expand shorthand ("fff" -> "ffffff"). Parsing each single digit as a whole
        # channel would give values in 0-15 and make every shorthand colour look dark.
        digits = "".join(ch * 2 for ch in digits)
    if len(digits) != 6:
        raise ValueError(f"Expected a 3- or 6-digit hex colour, got {hex_color!r}")

    r, g, b = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
    # Weighted sum of the channels (weights add up to 1000, so the result stays in 0-255).
    brightness = (r * 299 + g * 587 + b * 114) / 1000
    return "white" if brightness < 128 else "black"

# Function to generate a bubble chart for a given theme file
def generate_bubble_chart(file_path, max_size=45, output_file=None, width=None, height=None):
    """
    Generate a bubble chart for a given theme file, display it, and optionally save it as a PDF.

    The chart has one facet column per value of the "final subtheme" column, with
    "type" on the x-axis, "design" on the y-axis, and the reference list drawn
    inside each bubble.

    Parameters:
    - file_path: Path to the CSV file for a theme. The columns read here are
      "paper_count", "concatenated_refs", "final subtheme", "type" and "design".
    - max_size: Maximum bubble size (passed to Plotly Express as size_max)
    - output_file: Optional, path to save the resulting PDF. When None, the chart
      is only displayed and nothing is written.
    - width: Optional width passed to write_image for the exported PDF
    - height: Optional height passed to write_image for the exported PDF

    Returns:
    - None. The figure is shown (at the fixed 900x450 layout size set below) and,
      if output_file is given, exported.
    """
    # Load the theme data. Default NaN detection is switched off with an empty
    # na_values list - presumably so text cells are always kept as literal strings.
    theme_data = pd.read_csv(file_path, na_values=[], keep_default_na=False)

    # Bubble size grows with the paper count and with the length (in characters)
    # of the reference text that has to fit inside the bubble.
    theme_data['bubble_size'] = theme_data['paper_count'] * 50 + theme_data['concatenated_refs'].str.len() * 10

    # Unique subthemes (in order of first appearance) and the paper types seen in each.
    # Both are captured before the display names below gain their <br> line breaks.
    subthemes = theme_data['final subtheme'].unique()
    subtheme_types = {
        subtheme: theme_data.loc[theme_data["final subtheme"] == subtheme, "type"].unique()
        for subtheme in subthemes
    }

    # Single colour used for every bubble.
    design_colors = ['#87c6d4',]

    # Insert HTML line breaks into the long subtheme names so the facet titles wrap.
    theme_data["final subtheme"] = theme_data["final subtheme"].replace({
        "Improving Privacy Policy Visual Presentation": "Improving Privacy Policy<br> Visual Presentation",
        "Improving Privacy Policy Readability": "Improving Privacy <br>Policy Readability",
        "Advancing Privacy Policy Assessment": "Advancing Privacy<br> Policy Assessment",
        "Assessing Risk and Nudging Privacy Behaviour": "Assessing Risk and <br>Nudging Privacy Behaviour",
        "Visualising Disclosed Information": "Visualising Disclosed<br>Information"
    })

    fig = px.scatter(
        theme_data,
        x="type",  # X-axis: Type of paper
        y="design",  # Y-axis: Design of paper
        size="bubble_size",  # Bubble size computed above
        text="concatenated_refs",  # Text inside the bubbles
        facet_col="final subtheme",  # Separate subthemes into columns
        hover_data={"concatenated_refs": True, "paper_count": True},  # Tooltip information
        labels={"type": "", "design": ""},  # Blank axis labels
        title="",  # No figure title
        facet_col_wrap=4,  # At most four facet columns per row
        size_max=max_size,  # Limit the maximum bubble size
        color_discrete_sequence=design_colors,
    )

    # Update the x-axis of each subplot to only show the types relevant to it.
    # NOTE(review): the facet is addressed as row=1, col=position, which assumes all
    # subthemes fit in a single row (at most facet_col_wrap=4 of them) - confirm.
    for facet_idx, subtheme in enumerate(subthemes, start=1):
        relevant_types = subtheme_types[subtheme]
        # matches=None unlinks this x-axis from the other facets.
        fig.update_xaxes(matches=None, tickvals=relevant_types, row=1, col=facet_idx, tickangle=45)

        if len(relevant_types) == 2:
            # Fixed x-axis range applied when the facet has exactly two categories.
            fig.update_xaxes(
                range=[-0.8, 3],
                row=1,
                col=facet_idx
            )

    # Add dashed vertical separator lines between subplots.
    for i in range(1, len(subthemes)):  # Start at 1 to avoid adding a line before the first subplot
        x_position = i / len(subthemes)  # Even split of the paper width, one slot per subtheme
        fig.add_shape(
            type="line",
            xref="paper",  # Reference the full plot width
            yref="paper",  # Reference the full plot height
            x0=x_position, x1=x_position,  # Vertical line position
            y0=0, y1=1,  # Line spans the entire height
            line=dict(color="black", width=1, dash="dash")  # Style the line
        )

    # Remove the "final subtheme=" prefix in subplot titles
    fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

    # Style the bubbles and the text drawn inside them
    fig.update_traces(
        textfont_size=10,  # Font size for text inside bubbles
        textposition="middle center",  # Center the text inside the bubbles
        marker=dict(
            opacity=0.6,  # Set bubble transparency
            line=dict(width=1, color="black")  # Add a black border around bubbles
        )
    )

    fig.update_layout(
        title_font_size=20,  # Title font size
        xaxis_title="",  # No x-axis title
        yaxis_title="",  # No y-axis title
        xaxis=dict(showgrid=True),  # Show grid on the first x-axis
        plot_bgcolor="white",  # Set background color
        width=900,  # Figure size used when displaying
        height=450,
        font=dict(size=16)
    )

    # Show the plot
    fig.show()

    # Export only when a destination was given: output_file defaults to None,
    # and write_image cannot write to None.
    if output_file is not None:
        fig.write_image(output_file, format="pdf", width=width, height=height)


# Fig 9. Papers categorised under the subthemes of the theme Improving Privacy Policies
# that include user participation.
generate_bubble_chart(
    "grouped_papers_with_refs-improving-pp.csv",
    max_size=35,
    output_file="img/improving-privacy-policies-bubble.pdf",
    width=1200,
    height=415,
)

# Fig 10. Papers categorised under the subthemes of the theme Raising Privacy Awareness
# that include user participation.
generate_bubble_chart(
    "grouped_papers_with_refs-raising.csv",
    max_size=45,
    output_file="img/raising-privacy-awareness-bubble.pdf",
    width=1000,
    height=450,
)

# Fig 11. Papers categorised under the subthemes of the theme Controlling Information
# Disclosure that include user participation.
generate_bubble_chart(
    "grouped_papers_with_refs-controlling-id.csv",
    max_size=50,
    output_file="img/controlling-information-disclosure-bubble.pdf",
    width=1400,
    height=470,
)












