# Analysing the February 2020 mybinder.org user survey responses

From February to March 2020, the mybinder.org operating team ran a survey to collect feedback from our userbase.
It comprised three questions: two were multiple choice and one was a free-form text response.
The survey was advertised in a banner along the top of the mybinder.org homepage and was optional and anonymous to complete.
Other than the responses to the questions, the only data collected were the date and time of survey completion.
The free-form responses were checked for identifying features before being made available for analysis.

This notebook analyses the responses from this survey using Natural Language Processing.

In [None]:
# Import libraries

import nltk
import spacy
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics.pairwise import cosine_similarity

# spaCy pipeline; calling nlp(text).vector below yields the embedding used
# to cluster the free-form responses
nlp = spacy.load("en_core_web_md")
# NLTK English stop words, filtered out of tokenised responses in clean_text
stop = stopwords.words("english")
# Punctuation characters stripped from responses in clean_text
exclude = set(string.punctuation)

# IPython magic (notebook only, not plain Python)
%matplotlib inline

np.random.seed(123)  # Set a random seed

In [None]:
# Load in the data
# Keep the path in one place so both reads below always target the same file
survey_csv = "data/mybinder.org-user-survey-feb-2020.csv"

# header=0 marks the first line as a header row, and `names` replaces the
# verbose question text found there with short column labels
resps = pd.read_csv(
    survey_csv,
    header=0,
    names=["Timestamp", "Q1", "Q2", "Q3"],
)

# Get questions for plot titles
# Let pandas parse the header row (nrows=0 reads the column names only)
# instead of splitting the raw line on quote characters, which breaks when a
# field is unquoted or contains an escaped quote
titles = [
    column
    for column in pd.read_csv(survey_csv, nrows=0).columns
    if column != "Timestamp"
]

# Calculate total number of responses
total_resps = len(resps)
print(f"Total number of responses to the survey: {total_resps}")

## Q1. Would you recommend mybinder.org to a friend?

This question is multiple choice: `Yes`, `No`, or `Maybe`.

In [None]:
# Count the Yes/No/Maybe responses for Question 1
# Missing answers are dropped before counting. Deleting the np.nan key from
# the counter afterwards only works when the stored NaN is the very same
# object as np.nan, and fails silently otherwise.
bar_plot_dict = dict(Counter(resps["Q1"].dropna()))

# Create a bar plot of results, most frequent answer first
bar_plot_df = pd.Series(bar_plot_dict).sort_values(ascending=False)
bar_plot_df.plot(kind="bar", title=titles[0])

# Calculate percentage of answered responses for each option
percentages = 100 * (bar_plot_df.values / bar_plot_df.sum())

## Q2. If you could change one thing about Binder, what would it be?

This question was a free-form response and we've applied Natural Language Processing to identify commonly recurring words and topics from the data.

In [None]:
# Define some functions


def get_closest_to_centroid(vectors, centroid, n_closest=3):
    """Rank responses by closeness to centroid and return the closest ones

    Parameters
    ----------
    vectors
        Sequence of 1-D arrays, one per response (each must support
        ``reshape``).
    centroid
        1-D array to compare every vector against.
    n_closest
        Maximum number of entries to return. Defaults to 3.

    Returns
    -------
    list
        Up to ``n_closest`` ``[index, similarity]`` pairs, where ``index`` is
        the position in ``vectors`` and ``similarity`` is the cosine
        similarity to ``centroid``, ordered from most to least similar.
    """
    # cosine_similarity expects 2-D inputs; the centroid row never changes,
    # so reshape it once rather than on every comparison
    centroid_row = centroid.reshape(1, -1)

    rank = [
        [idx, cosine_similarity(centroid_row, vector.reshape(1, -1))[0][0]]
        for idx, vector in enumerate(vectors)
    ]

    rank.sort(key=lambda pair: pair[1], reverse=True)

    return rank[:n_closest]


def clean_text(text):
    """Lowercase a response, strip punctuation and return its non-stop-word tokens"""
    lowered = text.lower()
    # Drop every character listed in the module-level `exclude` set
    no_punct = "".join(filter(lambda ch: ch not in exclude, lowered))
    tokens = nltk.word_tokenize(no_punct)
    # Note: the result is a list of tokens, not a single string
    return [token for token in tokens if token not in stop]

In [None]:
# Embed every usable Question 2 response, then draw a word cloud of their text
answer_embs = []

for index, row in resps.iterrows():
    answer = row["Q2"]
    # Skip non-string values (e.g. unanswered rows) and replies of 3
    # characters or fewer
    if type(answer) is not str or len(answer) <= 3:
        continue
    answer_emb = nlp(answer).vector
    # Each entry is [row index, raw answer text, embedding vector]
    answer_embs.append([index, answer, answer_emb])

# The word cloud is built from the raw answer text joined into one string
comment_words = " ".join(entry[1] for entry in answer_embs)

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color="white",
    min_font_size=10,
).generate(comment_words)

plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0);

In [None]:
# Group the embedded responses into topic clusters with k-means

n_clusters = 10

# Stack the embedding vectors (third item of each entry) into one array
X = np.array([entry[2] for entry in answer_embs])
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)

# One record per cluster: its centroid plus the answers/vectors assigned to it
clusters = {
    x: {"centroid": center, "answers": [], "vectors": []}
    for x, center in enumerate(kmeans.cluster_centers_)
}

# kmeans.labels_[x] is the cluster assigned to the x-th embedded response
for x, entry in enumerate(answer_embs):
    answer, vector = entry[1], entry[2]
    label = kmeans.labels_[x]
    members = clusters[label]
    members["answers"].append(answer)
    members["vectors"].append(vector)

In [None]:
# For every cluster, show the responses nearest its centroid

for cluster, infos in clusters.items():
    answers, vectors, centroid = (
        infos["answers"],
        infos["vectors"],
        infos["centroid"],
    )

    # Most frequent cleaned tokens in this cluster
    # (computed here but not currently displayed)
    comment_words = [
        word for tokens in map(clean_text, answers) for word in tokens
    ]
    most_common = Counter(comment_words).most_common(5)

    # Each ranked entry is [position in `answers`, similarity]
    central_ids = get_closest_to_centroid(vectors, centroid)
    central_answers = [
        answers[position] for position, _similarity in central_ids
    ]

    for answ in central_answers:
        print(answ, "\n")

    remaining = len(answers) - len(central_answers)
    print("---> other", remaining, "messages around the same topic", "\n\n")

In the next cells, a search query can be defined to return responses that mention a specific topic.
In the example, the search term is for the R programming language.

In [None]:
# Define a search query function


def search_query(q, text):
    """Search for a query within the text

    Matching is case-insensitive. A query containing a space is treated as a
    phrase and matched as a substring of ``text``. A single-word query must
    equal one of the tokens produced by ``clean_text(text)``, so it will not
    match inside a longer word.

    Parameters
    ----------
    q : str
        The search term or phrase.
    text : str
        The response to search.

    Returns
    -------
    bool
        True if the query is found, False otherwise.
    """
    needle = q.lower()

    # For multiword queries: search the text passed in, not a notebook
    # global, so the function works wherever it is called from
    if len(q.split(" ")) > 1:
        return needle in text.lower()

    # Transform to lowercase and remove punctuation
    # then search for a perfect match among the tokens
    return needle in clean_text(text)

In [None]:
# Try it yourself!
# Define a query below and re-run the cell
query = "R"

for index, row in resps.iterrows():
    answer = row["Q2"]

    # Only search string answers longer than 3 characters
    if type(answer) is not str or len(answer) <= 3:
        continue

    check_relevance = search_query(query, answer)
    if check_relevance is True:
        print(f"{answer}\n")

## Q3. What do you (mainly) use mybinder.org for?

This question was multiple choice; its purpose was to identify how mybinder.org is being used by the community.
The categories available for this question were:

- Reproducible publishing
- Pre-university teaching
- University teaching
- Workshops/training courses
- Demos and talks
- Documentation and examples
- Sharing and collaborating with a team
- Other

In [None]:
# Count categorical responses to Question 3
# Missing answers are dropped before counting. Deleting the np.nan key from
# the counter afterwards only works when the stored NaN is the very same
# object as np.nan, and fails silently otherwise.
raw_dict = dict(Counter(resps["Q3"].dropna()))

# Filter out the defined categories: any answer given at least 10 times is
# treated as one of the predefined options rather than a free-text entry
categories = [key for key, value in raw_dict.items() if value >= 10]

# Construct a dictionary with non-specified answers concatenated into "Other"
concat_dict = {"Other": 0}
for key, value in raw_dict.items():
    if key in categories:
        # Accumulate rather than assign: the predefined "Other" option shares
        # its key with the running total of free-text answers and must not
        # overwrite it
        concat_dict[key] = concat_dict.get(key, 0) + value
    else:
        # Add the number of respondents who gave this answer, not 1 per
        # distinct answer, so repeated free-text answers are fully counted
        concat_dict["Other"] += value

# Create pie plot of results, largest share first
concat_df = pd.Series(concat_dict).sort_values(ascending=False)
concat_df.plot(kind="pie", title=titles[2])
plt.ylabel("")

# Calculate percentage of answered responses for each category
percentages = 100 * (concat_df.values / concat_df.sum())