
# Homestuck Census Data Analysis Notebook

A CUDA-accelerated notebook for visualizing and analyzing Homestuck fandom census data.

```bash
# Required environment
uv pip install torch scikit-learn scipy "umap-learn>=0.5.4,<0.5.7" "numba>=0.55,<0.56" "llvmlite<0.39" hdbscan plotly kaleido altair wordcloud matplotlib seaborn upsetplot jupyter notebook ipywidgets widgetsnbextension pandas-profiling
```


In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
import hdbscan
import umap
from sklearn.manifold import TSNE
import plotly.express as px
import re
import altair as alt
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Settings
pd.set_option('future.no_silent_downcasting', True)

# Load data
csv_path = 'census_results.csv'
df = pd.read_csv(csv_path, low_memory=False)

# Drop the first five rows (test data)
df.drop(index=[0, 1, 2, 3, 4], inplace=True)

# Columns to remove: private data and columns that are useless for display (duplicated columns and such)
columns_to_drop = [
    # Private columns
    "Leave your email here if you want to be contacted about the projects above",
    # Useless for display
    "Are you participating or have you ever participated in one of /r/homestuck's group rereads?",
    "Would you be interested in giving track art to an official Homestuck album that doesn't have it? (i.e. Beyond Canon Art Anthology)",
    "Would you be interested in contributing a track to a fanmusic album? (i.e. Land of Fans and Music) ",
    "Would you be interested in contributing track art to a fanmusic album? (i.e. Land of Fans and Music) ",
    "If you have any feedback to improve future surveys, here's your chance!",
    # Removed or changed at some point (only had data for the previous columns)
    "Do you want to order a full list or lazily select your top 3 characters and call it a day?",
    "Which of these Homestuck franchiselets have you perused? [Bard Quest]",
    "Which of these Homestuck franchiselets have you perused? [Namco High]",
    "Favorite major characters",
    "Select your 3 least favorite characters!.1",
    "Which 3 characters had the best complete arc?.1",
    "Which 3 characters had the worst arc?.1",
    "What's your favorite ship?.1"
]

# Drop private columns inplace if they exist
df.drop(columns=columns_to_drop, inplace=True, errors="ignore")

# Remove duplicate/troll answers based on manual check.
# NOTE: every entry must be followed by a comma. Python silently concatenates
# adjacent string literals, which would merge two IDs into one bogus value and
# leave both submissions in the data.
submission_ids_to_drop = [
    # Explicitly dropped (trolls, worse duplicates, etc.)
    "R7Z8OK", "o4A1AV", "rxrxAp", "5yzDOM", "rxdxeM", "4g8YRA", "9zWyYK",
    "ed68xo", "Xxx9VzY", "Nppg5Gb", "QKKOxJG",
    # Others from confirmed duplicate IDs
    "Q6DVbX", "YLQvAN", "0pxY79", "XKDajY", "E4XOvq", "d5Dyvq", "2M4j4M",
    "XKqM7g", "E4XoEo", "Pa04xB", "BPXrgN", "GNpoLp", "b0MzME", "o48L7P",
    "M9jkD8", "M9kN8Y", "44XgVPB", "jd10Yx", "ob6a4zN", "Np5V1qN", "eqZL7bo",
    "BO908Y", "R706OQ", "ZpNKqB", "gR4YbD", "OpQ4Qg", "444KDlk", "5BKgEKd",
    "q2Yjad", "11YBAl",
]

# Filter out the rejected submissions, then normalise the ID columns.
# Everything here is skipped when the export has no "Submission ID" column.
if "Submission ID" in df.columns:
    # Keep only the rows whose ID is not on the manual drop list.
    df = df[~df["Submission ID"].isin(submission_ids_to_drop)]
    df.rename(columns={"Submission ID": "submission_id"}, inplace=True)
    # "Respondent ID" is dropped when present; a missing column is ignored.
    df.drop(columns=["Respondent ID"], errors="ignore", inplace=True)

df.sample(5)

In [None]:
def prepare_bar_data(series, top_n=9, other_label="Other"):
    """
    Count the values of a categorical Series and shape them for a bar chart.

    Missing values are ignored. When the Series has more than `top_n`
    distinct values, only the `top_n` most frequent are kept and the counts
    of all remaining values are summed into a bucket named `other_label`.
    If `other_label` is itself one of the kept values, the summed remainder
    is added to its count instead of creating a second entry.

    Returns a DataFrame with the columns "category" (as str) and "count".
    """
    tally = series.value_counts(dropna=True)

    # Bucketing is only needed when there are more categories than requested.
    if len(tally) > top_n:
        remainder = tally.iloc[top_n:].sum()
        tally = tally.iloc[:top_n].copy()
        if other_label in tally.index:
            # Merge into the existing category rather than duplicating it.
            tally.loc[other_label] += remainder
        else:
            tally[other_label] = remainder

    frame = tally.reset_index()
    frame.columns = ["category", "count"]
    frame["category"] = frame["category"].astype(str)
    return frame


def apply_fig_aesthetics(figure):
    """
    Apply the shared look to a figure via its `update_layout()` method:
    empty axis labels and title, fully transparent plot and paper
    backgrounds, black size-40 TYPOSTUCK text, and zero margins.
    The figure is updated in place; nothing is returned.
    """
    transparent = "rgba(0, 0, 0, 0)"
    layout_overrides = {
        "xaxis_title": "",   # No X-axis label.
        "yaxis_title": "",   # No Y-axis label.
        "plot_bgcolor": transparent,
        "paper_bgcolor": transparent,
        "font": {"family": "TYPOSTUCK", "color": "black", "size": 40},
        "title_text": "",    # No title in the layout.
        "margin": {"l": 0, "r": 0, "t": 0, "b": 0},
    }
    figure.update_layout(**layout_overrides)

# Multi-select answers arrive as one comma-separated string; keep only the
# first listed answer, with surrounding whitespace removed.
def keep_first_choice(text):
    first, _, _ = text.partition(",")
    return first.strip()

import re
import plotly.express as px

def create_bar_chart(df, label, top_n=11, rename_dict=None, output_html=False, output_image=True, sorted_flag=False, show_title=False):
    """
    Build a horizontal Plotly Express bar chart for one survey column and
    optionally save it as HTML and/or PNG.

    The column is cast to strings, missing answers are dropped, and only the
    first comma-separated choice of each answer is kept. Counts are bucketed
    by `prepare_bar_data()`, categories are optionally renamed and sorted,
    then upper-cased and padded for display. Output files are named after
    `label` with every non-alphanumeric character replaced by "_" and are
    written to the "graphs/" directory, which is created when missing.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing survey data.
        label (str): The column name containing the survey answers.
        top_n (int): Maximum number of top categories to keep (default 11);
            the rest are bucketed by `prepare_bar_data()`.
        rename_dict (dict): Optional mapping of category value -> display
            value, applied before upper-casing.
        output_html (bool): If True, save "graphs/<normalized label>.html".
        output_image (bool): If True, save "graphs/<normalized label>.png".
        sorted_flag (bool): If True, order the categories by their numeric
            value. Labels that do not parse as numbers (e.g. ">10") get the
            fallback sort key 11.
        show_title (bool/str): If truthy, the title is shown; a string is
            used as the title text instead of `label`. When the title text
            contains "[...]", only the first bracketed part is displayed.

    Returns:
        fig (plotly.graph_objs._figure.Figure): The generated Plotly figure.

    Note:
        Relies on the module-level helpers `keep_first_choice()`,
        `prepare_bar_data()` and `apply_fig_aesthetics()`.
    """
    import os  # Local import: only needed to create the output directory.

    # File-system friendly stem for the output files.
    normalized_label = re.sub(r"[^a-zA-Z0-9]", "_", label)

    source_series = df[label].astype("string").dropna().apply(keep_first_choice)

    # Prepare the aggregated survey data.
    survey_data = prepare_bar_data(source_series, top_n=top_n)

    # Rename categories if a renaming dictionary is provided.
    if rename_dict:
        for key, value in rename_dict.items():
            survey_data.loc[survey_data["category"] == key, "category"] = value

    # Optionally sort the data. The sort key is derived from the undecorated
    # labels so the numeric parse never depends on how the parser treats the
    # display padding added further down.
    if sorted_flag:
        survey_data["sort_val"] = pd.to_numeric(survey_data["category"], errors="coerce").fillna(11)
        # A stable sort keeps tied entries in their count order.
        survey_data = survey_data.sort_values(by="sort_val", kind="stable").drop(columns="sort_val")

    # Display form: all uppercase, plus two trailing spaces (hack for spacing
    # between the axis labels and the bars).
    survey_data["category"] = survey_data["category"].str.upper() + "  "

    # Create the bar chart using Plotly Express.
    fig = px.bar(
        survey_data,
        x="count",
        y="category",
        color="category",
        title=label,  # The title is passed here but later removed from the layout.
        template="plotly_white"  # Light mode template.
    )

    # Configure the bar chart traces.
    fig.update_traces(
        texttemplate="%{x}",
        textposition="inside",
        orientation="h",
        showlegend=False,
        textfont=dict(size=30)  # Increase text size for numbers.
    )

    apply_fig_aesthetics(fig)

    if show_title:
        if isinstance(show_title, str):
            label = show_title
        # For matrix-style questions ("Question [Option]") show only the option.
        text_between_brackets = re.findall(r'\[(.*?)\]', label)
        if text_between_brackets:
            label = text_between_brackets[0]
        fig.update_layout(
            title=label,
            margin=dict(l=0, r=0, t=60, b=0)  # Leave room for the title.
        )

    # Make sure the output directory exists before anything is written to it.
    if output_html or output_image:
        os.makedirs("graphs", exist_ok=True)

    if output_html:
        html_filename = f"graphs/{normalized_label}.html"
        fig.write_html(html_filename, include_plotlyjs="cdn", full_html=False)

    if output_image:
        image_filename = f"graphs/{normalized_label}.png"
        fig.write_image(image_filename, width=1500, height=800)
    return fig


In [None]:
# Chart: where respondents found the survey.
label = "Where did you find this survey?"
normalized_label = re.sub(r"[^a-zA-Z0-9]", "_", label)

# NOTE(review): these intermediates are not passed to create_bar_chart() below,
# which derives its own series and counts from `df` and `label`.
source_series = df[label].dropna().apply(keep_first_choice)
survey_data = prepare_bar_data(source_series, top_n=11)

# Shorten the two long option labels for display.
rename_dict = {
    "Homestuck Discord (the one currently doing a reread)": "Homestuck Discord",
    "Twitter-based HS Discord (e.g. Homestuck Anonymous)": "Twitter-based HS Discord",
}

fig = create_bar_chart(
    df,
    label="Where did you find this survey?",
    top_n=11,
    rename_dict=rename_dict
)

In [None]:
# The answers to these two questions are merged into one virtual
# "gender_identity" value per respondent.
gender_column = "Which gender do you identify the most with?"
transgender_column = "Are you transgender?"

def compute_gender_identity(row):
    """
    Build the composite gender identity label for one respondent.

    `row` must offer `.get(column, default)`. The gender answer is
    normalised to "Male", "Female" or "Nonbinary" (case-insensitive,
    whitespace ignored); any other answer becomes "Other". A "yes"/"no"
    transgender answer adds the prefix "Trans"/"Cis"; any other answer
    adds no prefix. Examples: "Trans Male", "Cis Female", "Nonbinary".

    Returns pd.NA when either answer is missing.
    """
    raw_gender = row.get(gender_column, None)
    raw_trans = row.get(transgender_column, None)

    if pd.isna(raw_gender) or pd.isna(raw_trans):
        return pd.NA

    known_genders = {"male": "Male", "female": "Female", "nonbinary": "Nonbinary"}
    status_prefixes = {"yes": "Trans", "no": "Cis"}

    # Unrecognised gender answers fall back to "Other"; unrecognised
    # transgender answers fall back to "no prefix".
    gender_standard = known_genders.get(raw_gender.strip().lower(), "Other")
    trans_prefix = status_prefixes.get(raw_trans.strip().lower(), "")

    return f"{trans_prefix} {gender_standard}" if trans_prefix else gender_standard

# Add the composite column only when both source questions are present.
if gender_column in df.columns and transgender_column in df.columns:
    df["gender_identity"] = df.apply(compute_gender_identity, axis=1)

# No category renames are needed for this chart. Passing an explicit empty
# dict keeps the cell runnable on its own instead of depending on a variable
# that happens to be left over from another cell.
fig = create_bar_chart(
    df,
    label="gender_identity",
    top_n=11,
    rename_dict={}
)

In [None]:
import plotly.express as px

# Define the column to plot.
transgender_column = "Are you transgender?"

# Filter out missing responses.
trans_series = df[transgender_column].dropna()

# Map the responses: "Yes" becomes "Trans" and "No" becomes "Cis".
trans_series = trans_series.replace({"Yes": "Trans", "No": "Cis"})

# Compute counts for each mapped response.
trans_counts = trans_series.value_counts().reset_index()
trans_counts.columns = ["response", "count"]

# Create the pie chart using Plotly Express.
fig_pie = px.pie(
    trans_counts,
    names="response",
    values="count",
    title="Trans vs Cis",
    template="plotly_white"
)

# Update the pie chart trace so the labels appear inside the slices with both label and percentage.
fig_pie.update_traces(
    textposition='inside',
    textinfo='label+percent',
    showlegend=False
)

# Apply common figure aesthetics.
apply_fig_aesthetics(fig_pie)

# Save the pie chart to disk as a PNG image.
fig_pie.write_image("graphs/trans_pie.png", width=400, height=400)

In [None]:
label = "Which orientation do you identify the most with?"
create_bar_chart(
    df,
    label=label,
    top_n=11,
    rename_dict={}
)

In [None]:
label = "What's your relationship status?"

source_series = df[label]
survey_data = prepare_bar_data(source_series, top_n=11)

create_bar_chart(
    df,
    label=label,
    top_n=11,
    rename_dict={}
)

In [None]:
label = "What's your relationship status?"
create_bar_chart(
    df,
    label=label,
    top_n=11,
    rename_dict={}
)

In [None]:
label = "Which ethnicity or race/s do you identify the most with?\n"
create_bar_chart(
    df,
    label=label,
    top_n=11,
    rename_dict={}
)

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
from upsetplot import UpSet, from_memberships
import seaborn as sns

def reduce_categories(df, col, top_n=9, other_label="Other"):
    """
    Collapse the rare values of `df[col]` into a single `other_label` bucket.

    The `top_n` most frequent non-missing values are kept as they are (all of
    them when there are fewer than `top_n`); every other non-missing value is
    replaced by `other_label`. Missing values are left untouched.

    Note:
        The column is overwritten on the passed-in `df`; pass a copy when the
        original frame must stay unchanged. No dtype conversion is performed.

    Returns:
        pd.Series: the updated column.
    """
    # Frequencies of the non-missing values, most frequent first.
    freq = df[col].value_counts(dropna=True)
    # A set gives constant-time membership tests in the per-row check below.
    kept = set(freq.index[:top_n].tolist())

    df[col] = df[col].apply(lambda x: x if pd.isna(x) or x in kept else other_label)
    return df[col]

def create_upset_plot(
        df,
        label="Which ethnicity or race/s do you identify the most with?\n",
):
    """
    Draw an UpSet plot of the multi-choice answers in `df[label]` and save it
    to "graphs/ethnicity_upset.png" at 500 dpi.

    Plot settings used: subset counts shown, subsets sorted by cardinality,
    green face color, TYPOSTUCK font (set globally through
    `plt.rcParams`), 10x6 inch figure.

    Parameters:
        df (pd.DataFrame): The survey DataFrame.
        label (str): The column name for the ethnicity/race question.
    """
    # Turn every non-missing answer into a list of its comma-separated
    # choices, each stripped of surrounding whitespace.
    answered = df[label].copy().dropna()
    memberships = answered.apply(
        lambda answer: [choice.strip() for choice in answer.split(",")]
    )

    # Build the upset data; when the resulting index has repeated entries,
    # sum them up across all index levels.
    upset_data = from_memberships(memberships).copy()
    if not upset_data.index.is_unique:
        every_level = list(range(upset_data.index.nlevels))
        upset_data = upset_data.groupby(level=every_level).sum().copy()

    # Global matplotlib font setting.
    plt.rcParams["font.family"] = "TYPOSTUCK"

    # Render onto a fresh figure and write it to disk.
    fig = plt.figure(figsize=(10, 6))
    plot = UpSet(upset_data, show_counts=True, sort_by="cardinality", facecolor="green")
    plot.plot(fig=fig)
    plt.tight_layout()
    plt.savefig("graphs/ethnicity_upset.png", dpi=500)

label = "Which ethnicity or race/s do you identify the most with?\n"
df[label] = reduce_categories(df.copy(), label, top_n=13, other_label="Other")
# create_upset_plot(df, label=label)


In [None]:
label = "Are you employed?"
create_bar_chart(
    df,
    label=label,
    top_n=11,
    rename_dict={}
)

In [None]:
from upsetplot import UpSet, from_indicators

col_system   = "\"I'm belong to/I am/I have...\" (I'm part of a System (DID/MPD/etc))"
col_adhd     = "\"I'm belong to/I am/I have...\" (I have ADHD/attention issues)"
col_autistic = "\"I'm belong to/I am/I have...\" (I'm autistic/on the spectrum)"
col_neurodiv = "\"I'm belong to/I am/I have...\" (I'm neurodivergent in another way (PTSD, OCD, BPD, etc))"
col_furry    = "\"I'm belong to/I am/I have...\" (I'm a furry)"

# Create a new DataFrame with just those columns and rename them.
selected = df[[col_system, col_adhd, col_autistic, col_neurodiv, col_furry]].copy()
selected = selected.rename(columns={
    col_system:   "system",
    col_adhd:     "adhd",
    col_autistic: "autistic",
    col_neurodiv: "neurodiv",
    col_furry:    "furry"
})

# Convert the columns to booleans.
# A plain astype(bool) turns NaN into True, which would count every
# respondent who skipped the question as a member of every set, so missing
# answers are forced to False. Non-missing values keep their usual truthiness.
for col in ["system", "adhd", "autistic", "neurodiv", "furry"]:
    selected[col] = selected[col].notna() & selected[col].astype(bool)

# Create upset data from the boolean indicator DataFrame.
upset_data = from_indicators(selected)

# Use the TYPOSTUCK font for all matplotlib text.
plt.rcParams["font.family"] = "TYPOSTUCK"

# Build the UpSet plot and save it to disk.
fig = plt.figure(figsize=(10, 6))
upset = UpSet(upset_data, show_counts=True, sort_by="cardinality", facecolor="green", subset_size="count")
upset.plot(fig=fig)
plt.tight_layout()
plt.savefig("graphs/identity_upset.png", dpi=500)
# plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

econ_col = "Economic scale (1 Left to 5 Right)"
soc_col  = "Social scale (1 Authoritarian to 5 Libertarian)"

# Create a new DataFrame with only the two columns (drop rows where either is missing)
compass = df[[econ_col, soc_col]].dropna()

# Set up the plot
plt.figure(figsize=(8, 8))

# Plot the scatter points (with low alpha so the density shows up clearly)
sns.scatterplot(data=compass, x=econ_col, y=soc_col, color="blue", alpha=0.2, s=20)

# Overlay a filled 2D kernel density estimation
sns.kdeplot(
    data=compass,
    x=econ_col,
    y=soc_col,
    fill=True,
    cmap="binary",
    alpha=1.0,
    thresh=0.05
)

# Draw dotted lines at the midpoint (3) for both axes to delineate quadrants.
plt.axhline(3, color="black", linestyle="--", linewidth=1)
plt.axvline(3, color="black", linestyle="--", linewidth=1)

# Blank out both axis labels (no title is set).
plt.xlabel("", fontsize=12, fontname="TYPOSTUCK", color="black")
plt.ylabel("", fontsize=12, fontname="TYPOSTUCK", color="black")

plt.gca().set_ylim(6, 0)
plt.gca().set_xlim(0, 6)
plt.gca().patch.set_alpha(0.0)
plt.gcf().patch.set_alpha(0.0)

plt.tight_layout()
plt.savefig("graphs/compass_density.png", dpi=500)
# plt.show()


In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # required for 3D plotting
from scipy.stats import gaussian_kde
import matplotlib.image as mpimg

# -------------------------------
# 1. Data Preparation
# -------------------------------
# Assume these are the columns you are using:
econ_col = "Economic scale (1 Left to 5 Right)"
soc_col  = "Social scale (1 Authoritarian to 5 Libertarian)"

# Extract data and drop rows with missing values.
compass = df[[econ_col, soc_col]].dropna()
x = compass[econ_col].values
y = compass[soc_col].values

# -------------------------------
# 2. Compute the 2D Density
# -------------------------------
# Stack the x and y arrays
values = np.vstack([x, y])
kde = gaussian_kde(values)

# Create a regular grid over the range [1, 5] for both axes.
x_grid = np.linspace(1, 5, 100)
y_grid = np.linspace(1, 5, 100)
xx, yy = np.meshgrid(x_grid, y_grid)

# Evaluate the KDE on the grid.
grid_positions = np.vstack([xx.ravel(), yy.ravel()])
density = np.reshape(kde(grid_positions), xx.shape)

# -------------------------------
# 3. Load the Texture Image
# -------------------------------
img = mpimg.imread("texturemap.png")
# Convert to float format (so that the RGBA values are in [0,1]) if necessary.
if img.dtype.kind != 'f':
    img = img.astype(np.float32) / 255.0
# If the image is RGB (3 channels) add an alpha channel.
if img.shape[2] == 3:
    alpha_channel = np.ones((img.shape[0], img.shape[1], 1))
    img = np.dstack([img, alpha_channel])

# -------------------------------
# 4. Compute Texture Mapping Coordinates
# -------------------------------
# Since our x and y ranges are [1,5] (Economic and Social scales),
# we want to map these values to [0,1] for texture coordinates.
# Here we do NOT invert the axes.
u = (xx - 1) / (5 - 1)   # Economic axis mapping: (x - 1) / 4.
v = (yy - 1) / (5 - 1)   # Social axis mapping: (y - 1) / 4.
# Clip u and v into [0,1] to avoid indexing errors.
u_clip = np.clip(u, 0, 1)
v_clip = np.clip(v, 0, 1)

# Determine the image dimensions.
nrows, ncols, _ = img.shape
# Map (u,v) coordinates to the pixel indices of the texture image (no rotation is applied).
row_idx = (v_clip * (nrows - 1)).astype(np.int32)
col_idx = (u_clip * (ncols - 1)).astype(np.int32)
# Use these indices to sample the image and get RGBA facecolors.
facecolors = img[row_idx, col_idx, :]

# -------------------------------
# 5. Plot the 3D Density Surface with the Texture Overlay
# -------------------------------
fig = plt.figure(figsize=(10, 8), facecolor="none")
ax = fig.add_subplot(111, projection='3d', facecolor="none")
fig.patch.set_alpha(0.0)
# Set the axis pane colors to transparent.
ax.xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.grid(False)

# Plot the surface. We disable shading so that our facecolors (the texture) display unmodified.
surf = ax.plot_surface(xx, yy, density, rstride=1, cstride=1, facecolors=facecolors,
                       edgecolor='none', shade=False, alpha=0.8)

# Set the labels and title.
ax.set_xlabel("Economic (1 = Left, 5 = Right)", fontname="TYPOSTUCK", color="black")
ax.set_ylabel("Social (1 = Authoritarian, 5 = Libertarian)", fontname="TYPOSTUCK", color="black")
ax.set_zlabel("Density", fontname="TYPOSTUCK", color="black")
ax.set_title("3D Density Plot with Texture Overlay", fontname="TYPOSTUCK", color="black")

# Optionally, add a color bar.
fig.colorbar(surf, ax=ax, shrink=0.5, aspect=10)

plt.tight_layout()
plt.savefig("graphs/compass_3d_density.png", dpi=500)
# plt.show()

In [None]:
from collections import Counter

fav_col = "Select your 3 favorite characters!"
least_col = "Select your 3 least favorite characters!"

# Helper: tally how often each character is named in a comma-separated answer column.
def count_characters(df, column):
    """
    Return a Counter mapping character name -> number of mentions in
    `df[column]`. Missing answers are skipped; each answer is split on
    commas, names are stripped of whitespace, and empty names are ignored.
    """
    stripped_names = (
        piece.strip()
        for entry in df[column].dropna()
        for piece in entry.split(",")
    )
    return Counter(name for name in stripped_names if name)

# Count frequencies
fav_counts = count_characters(df, fav_col)
least_counts = count_characters(df, least_col)

# Union of all character names
all_chars = set(fav_counts.keys()).union(set(least_counts.keys()))

# Compute controversy metrics
rows = []
for char in all_chars:
    fav = fav_counts.get(char, 0)
    least = least_counts.get(char, 0)
    total = fav + least
    nci = (2 * min(fav, least)) / total if total > 0 else 0
    rows.append({
        "character": char,
        "favorite_count": fav,
        "least_favorite_count": least,
        "normalized_controversy_index": round(nci, 3),
        "total_mentions": total
    })

# Sort by NCI first, then total mentions
controversy_df = pd.DataFrame(rows).sort_values(
    by=["normalized_controversy_index", "total_mentions"],
    ascending=[False, False]
)

controversy_df

In [None]:
import pandas as pd
orig_area = [
  {"orig_area": "STEM", "count": "1737"},
  {"orig_area": "Arts", "count": "1303"},
  {"orig_area": "Retail", "count": "653"},
  {"orig_area": "Humanities", "count": "369"},
  {"orig_area": "Social Sciences", "count": "300"},
  {"orig_area": "Skilled Trades", "count": "290"},
  {"orig_area": "Undecided", "count": "279"},
  {"orig_area": "Health", "count": "279"},
  {"orig_area": "Education", "count": "258"},
  {"orig_area": "Business & Finance", "count": "229"},
  {"orig_area": "Public Service", "count": "209"},
  {"orig_area": "Other", "count": "104"},
  {"orig_area": "Communications", "count": "77"},
  {"orig_area": "Law", "count": "70"}
]

df_orig = pd.DataFrame(orig_area)

df_orig["count"] = df_orig["count"].astype(int)

responses = []
for _, row in df_orig.iterrows():
    responses.extend([row["orig_area"]] * row["count"])

df_responses = pd.DataFrame({"orig_area": responses})

create_bar_chart(
    df_responses,
    label="orig_area",  # Column name containing our responses.
    top_n=11,           # For example, to show top 11 categories.
    rename_dict={}
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
import numpy as np

# Columns (dates in string format)
last_read_col = "Approximately when did you last \"read\" the comic?"
join_col = "Approximately when did you join the Homestuck fandom?"

# Convert the date columns to datetime.
df['join_date'] = pd.to_datetime(df[join_col], errors='coerce')
df['last_read_date'] = pd.to_datetime(df[last_read_col], errors='coerce')

# Define a complete date range.
full_range = pd.date_range(start='2009-04-13', end='2025-04-01')

# Compute daily counts for each column.
join_counts = df['join_date'].value_counts().sort_index()
last_read_counts = df['last_read_date'].value_counts().sort_index()

# Reindex counts over the full date range (filling missing dates with 0).
join_series = join_counts.reindex(full_range, fill_value=0)
last_read_series = last_read_counts.reindex(full_range, fill_value=0)

# Smooth the counts with a 30-day rolling average.
join_smooth = join_series.rolling(window=30, min_periods=1).mean()
last_read_smooth = last_read_series.rolling(window=30, min_periods=1).mean()

# Add an offset to handle zeros when using log scale.
join_smooth_offset = join_smooth + 1
last_read_smooth_offset = last_read_smooth + 1

plt.figure(figsize=(12, 6))

# Plot join events.
plt.plot(full_range, join_smooth_offset, label='Joined Fandom', color='blue')
plt.fill_between(full_range, join_smooth_offset, color='blue', alpha=0.3)

# Plot last read events.
plt.plot(full_range, last_read_smooth_offset, label='Last Read Comic', color='red')
plt.fill_between(full_range, last_read_smooth_offset, color='red', alpha=0.3)

# Set logarithmic y-axis.
plt.yscale('log')

# Draw a horizontal line at the offset value (1) for reference.
plt.axhline(1, color='black', linestyle='--', linewidth=1)

plt.xlabel("Date", fontsize=12, fontname="TYPOSTUCK", color="black")
plt.ylabel("Number of Responses (30-day avg.)", fontsize=12, fontname="TYPOSTUCK", color="black")
plt.title("Timeline of Joining vs. Last Reading (Smoothed)", fontsize=14, fontname="TYPOSTUCK", color="black")
plt.legend(fontsize=12)

# --- Annotate Peaks ---
# Define our date range of interest.
lower_bound = pd.Timestamp("2010-01-01")
upper_bound = pd.Timestamp("2024-01-01")

# Create masks for the valid date range.
mask_join = (full_range >= lower_bound) & (full_range < upper_bound)
mask_read = (full_range >= lower_bound) & (full_range < upper_bound)

# Restrict unsmoothed series to the valid range.
join_series_valid = join_series[mask_join]
last_read_series_valid = last_read_series[mask_read]

# Find peaks on the raw (unsmoothed) data.
peaks_join, _ = find_peaks(join_series_valid.values, prominence=0.5)
peaks_read, _ = find_peaks(last_read_series_valid.values, prominence=0.5)

# Retrieve corresponding dates from the valid indices.
valid_join_dates = join_series_valid.index
valid_read_dates = last_read_series_valid.index

# Build DataFrames of peaks for each series.
df_peaks_join = pd.DataFrame({
    "date": valid_join_dates[peaks_join],
    "value": join_series_valid.values[peaks_join],
    "type": "join"
})
df_peaks_read = pd.DataFrame({
    "date": valid_read_dates[peaks_read],
    "value": last_read_series_valid.values[peaks_read],
    "type": "read"
})

# Select top 5 joined peaks (by raw count) and top 10 read peaks.
df_top_join = df_peaks_join.sort_values(by="value", ascending=False).head(5)
df_top_read = df_peaks_read.sort_values(by="value", ascending=False).head(10)

# Combine the two sets.
df_top_peaks = pd.concat([df_top_join, df_top_read], ignore_index=True)
# If duplicates exist for the same date, drop them.
df_top_peaks = df_top_peaks.drop_duplicates(subset=["date"])

# Annotations are anchored on the plotted (smoothed + offset) curves, so this
# helper looks up the curve height at a given date.
def get_annotated_value(date, typ):
    """
    Return the value at `date` of `join_smooth_offset` when `typ` is "join",
    otherwise of `last_read_smooth_offset`.
    """
    curve = join_smooth_offset if typ == "join" else last_read_smooth_offset
    return curve.loc[date]

# Annotate each selected peak.
# Instead of multiplying the y-value, we use an offset in "offset points" to ensure the
# annotation appears clearly above the peak.
for _, row in df_top_peaks.iterrows():
    date = row["date"]
    typ = row["type"]
    y_val = get_annotated_value(date, typ)
    date_str = date.strftime("%Y-%m-%d")
    plt.annotate(date_str,
                 xy=(date, y_val),
                 xytext=(0,20),          # 20 points above the arrow tip
                 textcoords="offset points",
                 arrowprops=dict(arrowstyle="->", color='black'),
                 fontsize=10, fontname="TYPOSTUCK", color="black")

plt.tight_layout()
plt.savefig("graphs/timeline_density_smoothed.png", dpi=500, transparent=True)
# plt.show()


In [None]:
import pandas as pd

# Ensure the join_date column is in datetime format.
df['join_date'] = pd.to_datetime(df["Approximately when did you join the Homestuck fandom?"], errors='coerce')

# Drop any rows where join_date is NaT (if you want to ignore them).
df_valid = df.dropna(subset=['join_date']).copy()

# Extract the year from the join_date.
df_valid['join_year'] = df_valid['join_date'].dt.year

# Count the number of respondents per join year.
year_counts = df_valid['join_year'].value_counts().sort_index()

# Calculate the total number of valid join entries.
total = year_counts.sum()

# Compute percentages and round to (say) 1 decimal place.
percentages = (year_counts / total * 100).round(1)

# Create a new DataFrame for output.
result_df = pd.DataFrame({
    'Fandom Join Year': year_counts.index,
    'Count': year_counts.values,
    'Percentage': percentages.values
})

result_df = result_df.sort_values(by='Fandom Join Year')

# Display the resulting table.
print(result_df.to_string(index=False))


In [None]:
col_how_many_times = "How many times have you \"read\" Homestuck?"
col_hardcore = "On a scale of 0 (accidentally started this survey) to 10 (has put thousands of hours into the comic/fandom) how hardcore a fan of Homestuck are you?"

df_howmany = df.copy()
# floor value to int
# df_howmany[col_how_many_times] = df_howmany[col_how_many_times].dropna().apply(lambda x: x if x <= 10 else 11)
# df_howmany[col_how_many_times] = df_howmany[col_how_many_times].dropna().astype("int").astype("string")
# df_howmany[col_hardcore] = df_howmany[col_hardcore].dropna().astype("int").astype("string")

# Map float-looking category strings ("1.0" ... "10.0") to plain integers for
# display. Presumably needed because the numeric column is stringified as
# floats -- TODO confirm against the data.
numeric_rename_dict = {"1.0": "1", "2.0": "2", "3.0": "3", "4.0": "4", "5.0": "5", "6.0": "6", "7.0": "7", "8.0": "8", "9.0": "9", "10.0": "10"}
create_bar_chart(
    df_howmany,
    label=col_how_many_times,  # Column name containing our responses.
    top_n=9,           # Keep the 9 most frequent answers; the rest are bucketed together.
    rename_dict=numeric_rename_dict,
    sorted_flag=True   # Order the bars by numeric value.
)

In [None]:
# Chart: self-reported "how hardcore a fan" scale.
create_bar_chart(
    df_howmany,
    label=col_hardcore,  # Column name containing our responses.
    top_n=10,           # Keep the 10 most frequent answers; the rest are bucketed together.
    rename_dict=numeric_rename_dict,
    sorted_flag=True    # Order the bars by numeric value.
)

In [None]:
learn_of_hs_col = "How did you first learn of Homestuck?"

create_bar_chart(
    df,
    label = learn_of_hs_col,  # Column name containing our responses.
    top_n = 11,  # For example, to show top 11 categories.
    rename_dict = {}
)

In [None]:
import pandas as pd
import plotly.express as px
import re

# Define the desired order as a list.
order_list = [
    "Act 1", "Act 2", "Act 3", "Intermission 1", "Act 4",
    "Act 5 Act 1 (Hivebent)", "Act 5 Act 2", "Intermission 2",
    "A6A1", "A6I1", "A6A2", "A6I2", "A6A3", "A6I3 (Meenah)", "A6A4", "A6I4", "A6A5 (Trickster)", "A6I5",
    "A6A6A1", "A6A6I1", "A6A6A2", "A6A6I2", "A6A6A3", "A6A6I3", "A6A6A4", "A6A6I4 (Retcon)", "A6A6A5", "A6A6I5 (Lilypad)",
    "A6A6A6 (Collide)", "Act 7"
]

groups = {
    "A1-A5": {"Act 1", "Act 2", "Act 3", "Intermission 1", "Act 4", "Act 5 Act 1 (Hivebent)", "Act 5 Act 2", "Intermission 2"},
    "A6A1-5": {"A6A1", "A6I1", "A6A2", "A6I2", "A6A3", "A6I3 (Meenah)", "A6A4", "A6I4", "A6A5 (Trickster)", "A6I5"},
    "A6A6 (All)": {"A6A6A1", "A6A6I1", "A6A6A2", "A6A6I2", "A6A6A3", "A6A6I3", "A6A6A4", "A6A6I4 (Retcon)", "A6A6A5", "A6A6I5 (Lilypad)", "A6A6A6 (Collide)"},
}

groups["A6 (All)"] = groups["A6A1-5"].union(groups["A6A6 (All)"])

# Color rules: each (fill, stroke) pair is listed once together with every
# act or act group that uses it.
_ACT_COLOR_RULES = (
    # Early acts, early Act 6 acts and late Act 6 intermissions: white fill, black stroke.
    (("white", "black"), (
        "Act 1", "Act 2", "Act 3", "Act 4", "A1-A5",
        "A6A1", "A6A2", "A6A3", "A6A4", "A6A5 (Trickster)",
        "A6A6I1", "A6A6I2", "A6A6I3", "A6A6I4 (Retcon)", "A6A6I5 (Lilypad)",
    )),
    # Intermissions 1 and 2 plus the aggregated Act 6 groups.
    (("green", "green"), (
        "Intermission 1", "Intermission 2",
        "A6 (All)", "A6A1-5", "A6A6 (All)",
    )),
    (("blue", "blue"), ("Act 5 Act 1 (Hivebent)",)),
    (("red", "red"), ("Act 5 Act 2",)),
    # Early Act 6 intermissions: light green.
    (("#90EE90", "#90EE90"), (
        "A6I1", "A6I2", "A6I3 (Meenah)", "A6I4", "A6I5",
    )),
    # Late Act 6 acts: dark green.
    (("#006400", "#006400"), (
        "A6A6A1", "A6A6A2", "A6A6A3", "A6A6A4", "A6A6A5",
    )),
    (("black", "black"), ("A6A6A6 (Collide)", "Act 7")),
)

# Flattened lookup: act name -> (fill, stroke).
_ACT_COLORS = {
    act_name: colors
    for colors, act_names in _ACT_COLOR_RULES
    for act_name in act_names
}


def get_colors(act):
    """
    Return the (fill, stroke) color pair for an act or act group name.
    Names without a rule get ("gray", "gray").
    """
    return _ACT_COLORS.get(act, ("gray", "gray"))

# Tally how many respondents named each act as their favourite.
act_counts = df["What is your favorite Act?"].value_counts().reset_index()
act_counts.columns = ["act", "count"]
act_counts["count"] = act_counts["count"].astype(int)

# Look each act's colour pair up once, then split it into the two columns.
_act_palette = [get_colors(act) for act in act_counts["act"]]
act_counts["fill"] = [fill for fill, _ in _act_palette]
act_counts["stroke"] = [stroke for _, stroke in _act_palette]

# Sort key used to lay the bars out in the order given by order_list.
def get_order(act):
    """Return the index of act in order_list, or 999 when it is not listed."""
    if act in order_list:
        # A lower index means the act appears earlier on the axis.
        return order_list.index(act)
    # Unlisted labels sort after every listed act.
    return 999

act_counts["order"] = act_counts["act"].map(get_order)

# Summary bars (one per entry of 'groups') are ordered after every individual act.
group_order_start = act_counts["order"].max() + 1
group_rows = []
for i, (group_name, act_set) in enumerate(groups.items()):
    # Total of the counts of the acts that belong to this group.
    in_group = act_counts["act"].isin(act_set)
    group_count = act_counts.loc[in_group, "count"].sum()
    group_fill, group_stroke = get_colors(group_name)
    group_rows.append({
        "act": group_name,
        "count": group_count,
        "fill": group_fill,
        "stroke": group_stroke,
        "order": group_order_start + i,
    })

# Append the summary rows and put everything in display order.
group_df = pd.DataFrame(group_rows)
act_counts = pd.concat([act_counts, group_df], ignore_index=True).sort_values("order")

# Vertical bars: act labels on x, respondent counts on y and printed on each bar.
fig = px.bar(act_counts, x="act", y="count", text="count", title="")

# Transparent canvas, TYPOSTUCK font, and no titles, margins or legend.
fig.update_layout(
    title_text="",
    xaxis_title="",
    yaxis_title="",
    showlegend=False,
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
    font={"family": "TYPOSTUCK", "color": "black", "size": 25},
    plot_bgcolor="rgba(0, 0, 0, 0)",
    paper_bgcolor="rgba(0, 0, 0, 0)",
)

# Colour each bar individually from the fill/stroke columns (rows are already in display order).
bar_fills = list(act_counts["fill"])
bar_strokes = list(act_counts["stroke"])
fig.update_traces(
    marker={"color": bar_fills, "line": {"color": bar_strokes, "width": 2}},
    texttemplate="%{text}",
)

# Output name: the question text with every non-alphanumeric character replaced by "_".
normalized_label = re.sub(r"[^a-zA-Z0-9]", "_", "What is your favorite Act?")
# html_filename = f"{normalized_label}.html"
# fig.write_html(html_filename, include_plotlyjs="cdn", full_html=False)

image_filename = f"graphs/{normalized_label}.png"
fig.write_image(image_filename, width=1500, height=800)

# fig.show()


In [None]:
import pandas as pd
import plotly.express as px
import re

# Coerce the age answers to numbers in df itself; unparseable answers become NaN.
age_column = "How old are you?"
df[age_column] = pd.to_numeric(df[age_column], errors='coerce')

# Independent copy holding only the respondents with a usable age.
df_age = df[df[age_column].notna()].copy()

# Age -> chart label.
def bucket_age(age):
    """Return "<10" below ten, the truncated whole-number age as text up to 42, else ">42"."""
    if age < 10:
        return "<10"
    return str(int(age)) if age <= 42 else ">42"

# Label every respondent with their age bucket.
df_age["age_bucket"] = df_age["How old are you?"].map(bucket_age)

# Respondents per bucket, as an (age_bucket, count) table with integer counts.
age_counts = (
    df_age["age_bucket"]
    .value_counts()
    .reset_index()
    .set_axis(["age_bucket", "count"], axis=1)
    .astype({"count": int})
)

# Sort key for the age bucket labels.
def bucket_order(bucket):
    """Return the position an age bucket label should take on the axis.

    "<10" maps to 0 and ">42" to 43; a numeric label such as "17" maps to its
    integer value. Anything that cannot be converted to an integer maps to 999
    so it sorts after every real bucket.
    """
    if bucket == "<10":
        return 0
    if bucket == ">42":
        return 43
    try:
        return int(bucket)  # numeric buckets, "10" through "42"
    except (TypeError, ValueError, OverflowError):
        # Catch conversion failures only; a bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        return 999

# Attach a numeric sort key to every bucket and put the rows in age order.
age_counts["order"] = age_counts["age_bucket"].apply(bucket_order)
age_counts = age_counts.sort_values("order")

# One bar per age bucket, coloured per bucket, with the count printed on the bar.
fig = px.bar(
    age_counts,
    x="age_bucket",
    y="count",
    text="count",
    title="Age Distribution",
    color="age_bucket"
)

# Transparent background and TYPOSTUCK font; title_text="" blanks the title set above.
fig.update_layout(
    xaxis_title="Age Bucket",
    yaxis_title="Count",
    plot_bgcolor="rgba(0, 0, 0, 0)",
    paper_bgcolor="rgba(0, 0, 0, 0)",
    font=dict(family="TYPOSTUCK", color="black", size=25),
    title_text="",
    margin=dict(l=0, r=0, t=0, b=0),
    showlegend=False
)

# Most bucket labels are numeric-looking strings ("10", "11", ...). Pin the
# axis to categorical, in the sorted order, so Plotly's axis-type
# auto-detection cannot pick a linear axis and lose the "<10" / ">42" bars.
fig.update_xaxes(
    type="category",
    categoryorder="array",
    categoryarray=age_counts["age_bucket"].tolist(),
)

apply_fig_aesthetics(fig)

# Output name: the question text with every non-alphanumeric character replaced by "_".
normalized_label = re.sub(r"[^a-zA-Z0-9]", "_", "How old are you?")
# html_filename = f"{normalized_label}.html"
# fig.write_html(html_filename, include_plotlyjs="cdn", full_html=False)

image_filename = f"graphs/{normalized_label}.png"
fig.write_image(image_filename, width=1500, height=800)

# fig.show()


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import re

# Helper function to extract the country name from a response.
def extract_country(text):
    """Return the country named in a response, or None when there is none.

    If the response contains "Flag", the country is the rest of that line
    after the first "Flag"; otherwise it is the whole response. Surrounding
    whitespace is stripped, and the long official UK name is shortened to
    "United Kingdom".

    Returns None for missing values and for responses that yield an empty
    name (for example when nothing follows "Flag"), so that a later
    ``dropna`` removes them instead of counting a blank country.
    """
    if pd.isna(text):
        return None
    _, marker, after = text.partition("Flag")
    if marker:
        # splitlines() is empty when nothing follows "Flag"; avoid indexing into it.
        lines = after.splitlines()
        country = lines[0].strip() if lines else ""
    else:
        country = text.strip()
    if not country:
        return None
    if country == "United Kingdom of Great Britain and Northern Ireland":
        return "United Kingdom"
    return country

# Add a cleaned country column to df and keep the rows where a country was found.
df["Respondent_country"] = df["Respondent's country"].apply(extract_country)
df_valid = df.dropna(subset=["Respondent_country"]).copy()

# Aggregate counts per country.
country_counts = df_valid["Respondent_country"].value_counts().reset_index()
country_counts.columns = ["country", "count"]

# Share of respondents per country; "percent" is the 1-decimal value shown on hover.
total = country_counts["count"].sum()
exact_percent = country_counts["count"] / total * 100
country_counts["percent"] = exact_percent.round(1)

# Log scale for the colour axis. Take the log of the UNROUNDED share: a share
# below 0.05% rounds to 0.0, and log10(0) is -inf, which would break both the
# tick positions and the colour scale.
country_counts["log_percent"] = np.log10(exact_percent)

# Five evenly spaced ticks in log space, labelled with the percentage they stand for.
ticks = np.linspace(country_counts["log_percent"].min(), country_counts["log_percent"].max(), 5)
# One decimal as before for ordinary values; two significant digits below 0.1%
# so that small shares are not all labelled "0.0%".
tick_labels = [f"{p:.1f}%" if p >= 0.1 else f"{p:.2g}%" for p in 10 ** ticks]

# World map shaded by the log-transformed percentage; hover shows the raw count and percent.
fig = px.choropleth(
    country_counts,
    locations="country",
    locationmode="country names",
    color="log_percent",
    color_continuous_scale="Greens",
    hover_data=["count", "percent"],
    title="",
)

# Transparent canvas, TYPOSTUCK font, and a colour bar labelled with the precomputed ticks.
percent_colorbar = {
    "title": "Percentage",
    "tickvals": ticks,
    "ticktext": tick_labels,
}
fig.update_layout(
    font={"family": "TYPOSTUCK", "color": "black"},
    showlegend=False,
    coloraxis_colorbar=percent_colorbar,
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
)

# Light-mode map styling.
fig.update_geos(
    showcountries=True,
    countrycolor="rgba(200,200,200,1)",  # light country borders
    landcolor="rgba(240,240,240,1)",     # light gray land
    lakecolor="rgba(255,255,255,1)",     # white lakes
    bgcolor="rgba(0,0,0,0)",             # transparent map container
)

# fig.show()

apply_fig_aesthetics(fig)

# Write the map as a 1500 x 800 PNG (the HTML export is disabled).
normalized_label = re.sub(r"[^a-zA-Z0-9]", "_", "Respondent's country")
# html_filename = f"{normalized_label}.html"
# fig.write_html(html_filename, include_plotlyjs="cdn", full_html=False)

image_filename = f"graphs/{normalized_label}.png"
fig.write_image(image_filename, width=1500, height=800)


In [None]:
# Show the per-country table (country, count, percent, log_percent) as the cell output.
country_counts

In [None]:
# Survey question whose answers are charted below.
col_platforms = "What platform did you last \"read\" Homestuck on (including active rereads)?"

create_bar_chart(
    df,
    label = col_platforms,  # Column name containing our responses.
    top_n = 4,  # Category cutoff passed to create_bar_chart (presumably the 4 most common answers — TODO confirm).
    rename_dict = {}
)


In [None]:
# Inspect the distinct answers (and their frequencies) for the main-comic franchiselet column.
df["Which of these Homestuck franchiselets have you perused? [Homestuck]"].value_counts()

In [None]:

# Every franchiselet column shares one question stem and differs only in the bracketed title.
_FRANCHISE_QUESTION = "Which of these Homestuck franchiselets have you perused?"
franchise_columns = [
    f"{_FRANCHISE_QUESTION} [{title}]"
    for title in (
        "Homestuck",
        "HS Epilogues",
        "SBaHJ",
        "Problem Sleuth",
        "Jailbreak",
        "Paradox Space",
        "HSBC/HS^2",
        "Hiveswap Act 1",
        "Hiveswap Act 2",
        "Friendsim",
        "Pesterquest",
        "Skaianet Systems",
        "Psycholonials",
    )
]

# Answer options, spelled exactly as they appear in the responses.
response_categories = [
    "Finished",
    "Started",
    "Dropped",
    "Didn't start",
    "Never heard of",
]

# Column header -> short franchiselet name.
def extract_area(col_name):
    """Return the text inside the first [...] of col_name, or col_name unchanged when there is none."""
    bracketed = re.search(r"\[(.+?)\]", col_name)
    return bracketed.group(1) if bracketed else col_name

# One dict per franchiselet: a count for every answer option plus the short name.
agg_list = []
for col in franchise_columns:
    tally = df[col].dropna().value_counts()
    agg_list.append(
        {
            # Options nobody picked are reported as 0.
            **{cat: tally.get(cat, 0) for cat in response_categories},
            "franchiselet": extract_area(col),
        }
    )

# Wide table: one row per franchiselet, one column per answer option.
df_franchise = pd.DataFrame(agg_list)

# Long table: one row per (franchiselet, answer option) pair.
df_long = df_franchise.melt(
    id_vars=["franchiselet"],
    value_vars=response_categories,
    var_name="Response",
    value_name="Count",
)

# Colour of each answer option in the stacked bars.
color_map = {
    "Dropped": "red",
    "Started": "#90EE90",        # light green
    "Finished": "#006400",       # dark green
    "Didn't start": "lightgrey",
    "Never heard of": "white",
}

# One stacked bar per franchiselet, one segment per answer option, counts printed on the segments.
fig = px.bar(
    df_long,
    x="franchiselet",
    y="Count",
    text="Count",
    color="Response",
    color_discrete_map=color_map,
    barmode="stack",
    title="",
)

# Transparent canvas, TYPOSTUCK font, no titles or margins; the legend stays visible.
fig.update_layout(
    title_text="",
    xaxis_title="",
    yaxis_title="",
    showlegend=True,
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
    font={"family": "TYPOSTUCK", "color": "black", "size": 25},
    plot_bgcolor="rgba(0,0,0,0)",
    paper_bgcolor="rgba(0,0,0,0)",
)

# Slant the franchiselet names.
fig.update_xaxes(tickangle=45)

# Write the chart as a 1500 x 800 PNG (the HTML export is disabled).
normalized_label = re.sub(r"[^a-zA-Z0-9]", "_", "Which_of_these_Homestuck_franchiselets_have_you_perused")
# html_filename = f"{normalized_label}.html"
# fig.write_html(html_filename, include_plotlyjs="cdn", full_html=False)

image_filename = f"graphs/{normalized_label}.png"
fig.write_image(image_filename, width=1500, height=800)


In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import re

# Rating questions in display order; each is "How do you rate <subject>?".
rating_columns = [
    f"How do you rate {subject}?"
    for subject in (
        "Homestuck overall",
        "Homestuck's ending",
        "the Homestuck Epilogues",
        "Problem Sleuth",
        "Paradox Space",
        "Homestuck: Beyond Canon/Homestuck^2",
        "Hiveswap Act 1",
        "Hiveswap Act 2",
        "Hiveswap Friendsim",
        "Pesterquest",
        "Psycholonials",
    )
]

# Work on a copy of df with every rating column coerced to numbers
# (answers that are not numeric become NaN).
df_ratings = df.copy()
for col in rating_columns:
    df_ratings[col] = pd.to_numeric(df_ratings[col], errors="coerce")

# Drop only the respondents who rated nothing at all. how="all" is required:
# the default (how="any") would discard everyone who skipped even one of the
# rating questions.
df_ratings = df_ratings.dropna(subset=rating_columns, how="all")

# Long format: one row per (question, rating) actually given. Unanswered
# questions are removed here so they do not reach the plot or the averages.
df_long = df_ratings.melt(
    value_vars=rating_columns,
    var_name="Question",
    value_name="Rating"
).dropna(subset=["Rating"])

# Define a function to simplify question text.
def simplify_question(q):
    """Turn a rating question into a short axis label.

    The Beyond Canon question maps to the fixed label "HSBC/HS^2". Otherwise
    the leading "How do you rate" (plus a following standalone "the", if any)
    and one trailing "?" are removed, and the result is stripped of
    surrounding whitespace.
    """
    if q == "How do you rate Homestuck: Beyond Canon/Homestuck^2?":
        return "HSBC/HS^2"
    # ``\b`` keeps the optional article from matching the start of a longer
    # word, e.g. "theories" must not be shortened to "ories".
    q = re.sub(r"^How do you rate(?:\s+the\b)?\s*", "", q)
    q = re.sub(r"\?$", "", q)
    return q.strip()

# Short label for every question, and the mean rating (1 decimal) per label.
df_long["Question_Simple"] = df_long["Question"].map(simplify_question)
avg_per_question = df_long.groupby("Question_Simple")["Rating"].mean().round(1)

# Horizontal box plot: ratings along x, one coloured box per question on y, no individual points.
fig = px.box(
    df_long,
    x="Rating",
    y="Question_Simple",
    color="Question_Simple",
    orientation="h",
    points=False,
    title="",
)

# Transparent canvas, TYPOSTUCK font, no axis titles or legend.
fig.update_layout(
    xaxis_title="",
    yaxis_title="",
    showlegend=False,
    margin={"l": 100, "r": 50, "t": 50, "b": 50},
    font={"family": "TYPOSTUCK", "color": "black", "size": 30},
    plot_bgcolor="rgba(0,0,0,0)",
    paper_bgcolor="rgba(0,0,0,0)",
)

# Print each question's average as a text label positioned at that average (mode="text", no markers).
avg_trace = go.Scatter(
    x=list(avg_per_question.values),
    y=list(avg_per_question.index),
    text=avg_per_question,
    mode="text",
    name="Average Rating",
)
fig.add_trace(avg_trace)

# Keep the y-axis categories in the order of rating_columns.
ordered_questions = list(map(simplify_question, rating_columns))
fig.update_yaxes(categoryorder="array", categoryarray=ordered_questions)

# Write the chart as a 1500 x 800 PNG (the HTML export is disabled).
normalized_label = re.sub(r"[^a-zA-Z0-9]", "_", "Ratings_of_Homestuck_and_Related")
# html_filename = f"{normalized_label}.html"
# fig.write_html(html_filename, include_plotlyjs="cdn", full_html=False)

apply_fig_aesthetics(fig)

image_filename = f"graphs/{normalized_label}.png"
fig.write_image(image_filename, width=1500, height=800)


In [None]:
import plotly.express as px

# Question to chart, with unanswered rows removed.
lunar_column = "What is your Lunar Sway?"
lunar_series = df[lunar_column].dropna()

# Respondents per answer, as a (response, count) table.
lunar_counts = (
    lunar_series
    .value_counts()
    .reset_index()
    .set_axis(["response", "count"], axis=1)
)

# Prospit is drawn in gold, Derse in purple.
color_map = dict(Prospit="gold", Derse="purple")

# Pie chart with one slice per answer.
fig_pie = px.pie(
    lunar_counts,
    names="response",
    values="count",
    color="response",
    color_discrete_map=color_map,
    title="",
)

# Shared figure styling.
apply_fig_aesthetics(fig_pie)

# Put the label and percentage inside each slice and hide the legend.
fig_pie.update_traces(
    showlegend=False,
    textinfo='label+percent',
    textposition='inside',
)

# Write the pie as a 400 x 400 PNG.
fig_pie.write_image("graphs/moon_pie.png", width=400, height=400)

In [None]:
import pandas as pd
import plotly.express as px
import re

# Fanwork kinds, i.e. the bracketed part of each survey column name, in display order.
fanwork_types = [
    "Fanart",
    "Fanmusic",
    "Fanvideos",
    "Fanfiction/adventures",
    "Cosplay",
    "Theories",
    "Roleplay",
    "Fangames/mods",
    "Other (i.e. wikis)",
]

# Shorter display names for the longest labels.
rename_dict = {
    "Fanfiction/adventures": "Fanfiction",
    "Fangames/mods": "Fangames",
    "Other (i.e. wikis)": "Wiki/other",
}

# One (response, count, Fanwork_Type) table per fanwork kind.
agg_list = []
for fanwork in fanwork_types:
    col_label = f"How often have you created fanwork for the Homestuck fandom, and in which ways? [{fanwork}]"
    counts = (
        df[col_label]
        .dropna()
        .value_counts()
        .reset_index()
        .set_axis(["response", "count"], axis=1)
        .assign(Fanwork_Type=fanwork)
    )
    agg_list.append(counts)

# Stack the per-kind tables, then switch to the short display names.
df_fanwork = pd.concat(agg_list, ignore_index=True)
df_fanwork["Fanwork_Type"] = df_fanwork["Fanwork_Type"].replace(rename_dict)

# Facet order: the order of fanwork_types, expressed in the display names that
# the Fanwork_Type column holds once rename_dict has been applied. Looking the
# renamed values up in fanwork_types directly never matches, which would push
# "Fanfiction", "Fangames" and "Wiki/other" to the end of the row.
present_types = set(df_fanwork["Fanwork_Type"].unique())
facet_order = [
    display_name
    for display_name in (rename_dict.get(name, name) for name in fanwork_types)
    if display_name in present_types
]

# One facet per fanwork kind, one bar per response, counts printed on the bars.
fig = px.bar(
    df_fanwork,
    x="response",
    y="count",
    color="response",
    text="count",
    facet_col="Fanwork_Type",
    category_orders={"Fanwork_Type": facet_order},
    barmode="stack",
    title=""  # Remove title.
)

# Keep only the text after "=" in each facet annotation, i.e. drop the column-name prefix.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1].strip()))

# Update layout: transparent backgrounds, custom TYPOSTUCK font.
fig.update_layout(
    plot_bgcolor="rgba(0, 0, 0, 0)",
    paper_bgcolor="rgba(0, 0, 0, 0)",
    font=dict(family="TYPOSTUCK", color="black", size=20),
    margin=dict(l=50, r=50, t=50, b=50),
    showlegend=True  # We want a legend here.
)

# Rotate x-axis tick labels (response labels) so they are diagonal.
fig.update_xaxes(tickangle=45)

# Write the chart as a 1500 x 800 PNG (the HTML export is disabled).
normalized_label = re.sub(r"[^a-zA-Z0-9]", "_", "Fanwork_Creation")
# html_filename = f"{normalized_label}.html"
# fig.write_html(html_filename, include_plotlyjs="cdn", full_html=False)

image_filename = f"graphs/{normalized_label}.png"
fig.write_image(image_filename, width=1500, height=800)

In [None]:
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import re

def deduplicate_response(text):
    """
    Lowercase the text, split it into words (dropping punctuation), and return
    the distinct words as a space-separated string.

    Words keep the order of their first appearance. A plain ``set`` would
    reorder them differently on every run, which makes the output
    irreproducible and scrambles the neighbouring-word pairs that a word
    cloud with collocations enabled relies on.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    # dict.fromkeys removes duplicates while preserving insertion order.
    return " ".join(dict.fromkeys(words))

# Answers to the two free-text questions, each reduced to its distinct words.
old_texts = [
    deduplicate_response(resp)
    for resp in df["What's your favorite \"old\" fanwork?"].dropna().astype(str).tolist()
]
current_texts = [
    deduplicate_response(resp)
    for resp in df["What's your favorite \"current\" fanwork?"].dropna().astype(str).tolist()
]

# One text for the cloud: the "current" answers followed by the "old" ones.
combined_text = " ".join(current_texts + old_texts)

# wordcloud's built-in stopwords plus words specific to these questions.
custom_stopwords = set(STOPWORDS) | {
    "fanwork", "work", "favorite", "old", "current", "homestuck",
    "the", "and", "of", "in", "a", "to", "https", "goes", "idk",
    "dunno", "know", "s", "stuff", "fanfic", "fanfiction",
    "fic", "anything", "really", "yet", "probably", "none",
    "youtube", "youtu", "archiveofourown", "org", "ao3", "called",
}

# Build the cloud (1500 x 800, white background, collocations enabled).
wc = WordCloud(
    width=1500,
    height=800,
    background_color="white",
    colormap="viridis",
    font_path="fontstuck-extended.ttf",
    stopwords=custom_stopwords,
    collocations=True,
).generate(combined_text)

# Render without axes or padding, save as PNG, then display.
plt.figure(figsize=(15, 8))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig("graphs/fanwork_wordcloud.png", format="png", dpi=300, transparent=True)
plt.show()


In [None]:
# Survey question whose answers are charted below.
quest_bedding_col = "Would you buy quest bed bedding?"
create_bar_chart(
    df,
    label = quest_bedding_col,  # Column name containing our responses.
    top_n = 4,  # Category cutoff passed to create_bar_chart (presumably the 4 most common answers — TODO confirm).
    rename_dict = {},
    show_title="Quest Bedding"
)

In [None]:
# Survey question whose answers are charted below (the column name ends with a space).
wizard_col = "Would you buy a physical SBURB guide/lorebook à la Wizardology? "
create_bar_chart(
    df,
    label = wizard_col,  # Column name containing our responses.
    top_n = 4,  # Category cutoff passed to create_bar_chart (presumably the 4 most common answers — TODO confirm).
    rename_dict = {},
    show_title="SBURB guide"
)

In [None]:
# Survey question whose answers are charted below.
streaming_col = "Do you want more official Homestuck musicians to individually add all their Homestuck songs to streaming, à la Bowmanstuck?"
create_bar_chart(
    df,
    label = streaming_col,  # Column name containing our responses.
    top_n = 4,  # Category cutoff passed to create_bar_chart (presumably the 4 most common answers — TODO confirm).
    rename_dict = {},
    show_title="Homestuck Music on Streaming"
)

In [None]:
# NOTE(review): this rebinds streaming_col, the name the previous cell uses for the
# streaming question, to the physical-soundtrack question (the column name ends with a space).
streaming_col = "Would you buy a physical copy of the Homestuck Soundtrack? "
create_bar_chart(
    df,
    label = streaming_col,  # Column name containing our responses.
    top_n = 3,  # Category cutoff passed to create_bar_chart (presumably the 3 most common answers — TODO confirm).
    rename_dict = {},
    show_title="Physical Soundtrack"
)

In [None]:
# Survey question whose answers are charted below.
paradox_col = "Would you buy a new physical book of official non-canon Homestuck side comics à la Paradox Space?"
create_bar_chart(
    df,
    label = paradox_col,  # Column name containing our responses.
    top_n = 4,  # Category cutoff passed to create_bar_chart (presumably the 4 most common answers — TODO confirm).
    rename_dict = {},
    show_title="Paradox Space-esque book"
)

In [None]:
# Survey question whose answers are charted below.
books_cont_col = "Would you buy a continuation of the Homestuck books from Cascade onwards?"
create_bar_chart(
    df,
    label = books_cont_col,  # Column name containing our responses.
    top_n = 4,  # Category cutoff passed to create_bar_chart (presumably the 4 most common answers — TODO confirm).
    rename_dict = {},
    show_title="More Homestuck books"
)

In [None]:
# Survey question whose answers are charted below.
new_albums_col = "Would you buy/stream new official Homestuck music albums?"
create_bar_chart(
    df,
    label = new_albums_col,  # Column name containing our responses.
    top_n = 4,  # Category cutoff passed to create_bar_chart (presumably the 4 most common answers — TODO confirm).
    rename_dict = {},
    show_title="More Homestuck albums"
)

In [None]:
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import re

# Free-text question to turn into a word cloud.
col = "Share your merch wishlist beyond the above"

# Every non-missing answer, as a plain list of strings.
responses = list(df[col].dropna().astype(str))

def deduplicate_response(text):
    """
    Process a single response: lower-case it, split it into words (dropping
    punctuation), and return the distinct words as a space-separated string.

    Words keep the order of their first appearance, so the result is the same
    on every run; a plain ``set`` would order them arbitrarily.
    """
    # Lower-case the text and keep only the alphanumeric words.
    words = re.findall(r'\b\w+\b', text.lower())
    # dict.fromkeys removes duplicates while preserving insertion order.
    return " ".join(dict.fromkeys(words))

# Reduce every response to its distinct words, so a word repeated within one response counts once.
processed_responses = list(map(deduplicate_response, responses))

# Single text handed to the word cloud.
aggregated_text = " ".join(processed_responses)

# wordcloud's built-in stopwords plus a hand-picked list for this question.
# I can't use something more advanced because so much of Homestuck is
# common words, like "god" for god tier or "kind" for kind abstrata
custom_stopwords = set(STOPWORDS) | {
    "merch", "wishlist", "the", "and", "of", "in", "a", "to", "for",
    "any", "things", "i", "you", "it", "on", "with", "or", "this", "that",
    "my", "just", "more", "like", "but", "is", "are", "was", "were",
    "https", "http", "com", "org", "www", "official", "please", "buy",
    "anything", "homestuck", "much", "im", "stuff", "zendaya", "love",
    "know", "think", "though", "will", "fucking", "put", "really",
    "literally", "kill", "now", "etc", "whatever", "dont", "never",
    "still", "shit", "way", "nice", "funny", "cute", "release",
    "m", "oh", "wish", "miss", "thing", "give", "lot", "people",
    "little", "especially", "new", "actual", "version", "many",
    "look", "actually", "alway", "even", "wanted", "every",
    "good", "real", "pretty", "long", "well", "sell", "honestly",
    "come", "don", "available", "make", "man", "got", "used",
    "tbh", "take", "u", "something", "related", "themed", "ass",
    "ve", "us", "find", "two", "bought", "sick", "d", "wanna",
    "able", "stupid", "beg", "idea", "better", "great", "id",
    "always", "begging", "bring", "cool", "maybe", "money",
    "everything", "fuck", "ll", "care", "lol", "go", "mostly",
    "idk", "re", "option", "style", "pl", "full", "need",
    "want", "guy", "subtle", "sweet", "one", "quality",
    "main", "back", "buying", "different", "e", "probably",
    "might", "form", "already", "specifically", "let",
    "basically", "bad", "see", "work", "broke", "ones",
    "happen", "say", "getting", "piece", "thank", "guess",
    "doesn", "use", "exist", "including", "s", "possible",
    "making", "definitely", "least", "h", "t", "year", "isn",
    "hs",
}

# Build the cloud (1500 x 800, white background); collocations are off, so only single words are drawn.
wc = WordCloud(
    width=1500,
    height=800,
    background_color="white",
    colormap="viridis",
    font_path="fontstuck-extended.ttf",
    stopwords=custom_stopwords,
    collocations=False,
).generate(aggregated_text)

# Render without axes or padding, save as PNG, then display.
plt.figure(figsize=(15, 8))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig("graphs/merch_wordcloud_unique.png", format="png", dpi=300, transparent=True)
plt.show()
