In [25]:
import pandas as pd
from ast import literal_eval
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()  # for pandas progress_apply()

import re
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP
from nltk.corpus import stopwords
# English stopword list from NLTK; consulted by remove_stopwords() further down.
stop_words = stopwords.words('english')
import string
# ASCII punctuation plus two extra symbols ("•" and "’"); these characters are stripped
# from the captions before POS tagging.
punctuation = string.punctuation + "•’"

**to use 'en_core_web_sm' run:**
`python -m spacy download en_core_web_sm`

In [26]:
import spacy
# Shared spaCy pipeline; supplies the POS tags and dependency heads used by the
# word-counting section and is also the pipeline the sentiment component is added to.
nlp = spacy.load("en_core_web_sm")

Required for the sentiment analysis (can comment out if not needed)
`pip install spacytextblob`
`pip install spacy[transformers]`

In [41]:
from spacytextblob.spacytextblob import SpacyTextBlob # required to run  nlp.add_pipe('spacytextblob')
# Adds the TextBlob component to the shared pipeline; the sentiment cell later reads
# doc._.polarity, doc._.subjectivity and doc._.assessments from documents processed by it.
# NOTE(review): this cell's execution count (41) is higher than that of the cells below it,
# i.e. it was run out of order. On a fresh top-to-bottom run the component is already active
# while the POS-tagging/parsing cells execute — confirm that is acceptable.
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x255f047d988>

In [27]:
# Notebook-wide seaborn settings: a 20x6 figure size and fonts scaled by 1.5.
sns.set(rc={'figure.figsize': (20, 6)})
sns.set(font_scale=1.5)

In [28]:
# read df
# Columns that were stored as stringified Python literals are parsed back with literal_eval;
# "timestamp" is parsed with pd.to_datetime and used as the index.
# NOTE(review): "album_images" and "comments_flat" are not among the columns shown in the
# output below — confirm these two converter entries are still needed.
converters = {"album_images": literal_eval, "hashtags": literal_eval, "comments_flat": literal_eval,
              "timestamp": pd.to_datetime, "image_label": literal_eval}
df = pd.read_csv("DepotBoijmans.csv", converters=converters, index_col="timestamp")
df

Unnamed: 0_level_0,shortcode,owner_username,MicroRole,MacroRole,likes,comment_count,location,owner_followercount,interactions,caption_en,hashtags,comment_en,num_images,image_label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-01-01 14:34:39,postaf1cd604a7f645109218b27f109b3fea1b6720faee...,user8392bf148647eddd2fc3e3edf65add3268736213ed...,Individual,Private,27,2,,93,29,#rotterdam#nederland#netherlands #holland#depo...,"[rotterdam, nederland, netherlands, holland, d...","['great photo henk, Happy new year to you and ...",1,[Outside]
2020-01-03 12:30:28,post3b75c80595d506a8d022c1751aa23c83faf0b8ebb9...,userc0337aa6f789320dd6040cefdd4261e0c8f98ed61c...,Individual,Private,11,0,Museum Boijmans Van Beuningen,176,11,#depotBoijmans #underconstruction #fascinating...,"[depotBoijmans, underconstruction, fascinating...",[],5,"[Outside, Outside, Outside, Outside]"
2020-01-03 14:53:16,post6a8292ba8694c8073b2fb6a4bad92a97afcdf023d7...,userf493e1a39259129fa2658ec2474b4e09689b2ba265...,Enterprise,Private,40,0,Westkop,960,40,Start the first weekend of the new year right ...,"[westkop, rotterdam, museumpark, bier, weekend...",[],1,[Misc]
2020-01-03 21:35:55,post7bfd20688f33426e1e5eaf767124178580f10b0182...,usereb1e6224cc3371fe9262695619be88cfa23095ea4e...,Individual,Private,9,1,,43,10,#depot#museum#boijmans@user#january2020,"[depot, museum, boijmans, january2020]","['Wij waren ook in Rotterdam, hadden we nog ku...",1,[Misc]
2020-01-04 01:35:44,post8135f43902591b4a073e72b39531776c7cdf006b7d...,user59216ece376ad65994c9ccab8edb8b681c82e4ea41...,Individual,Private,309,6,Museum Boijmans Van Beuningen,4280,315,#Depot #boijmansvanbeuningen #museum #undercon...,"[Depot, boijmansvanbeuningen, museum, undercon...","['⚓️😍', '😃 @user', 'Nice fine contrast on this...",1,[Outside]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-31 16:02:27,post7c1c3c1db2129fe79a0226bbfe1356229e866a16d0...,userd6740396be33cad354155dda1a167b51ae1170d6f9...,Individual,Private,37,5,Rotterdam Centrum,190,42,Rotterdam's new ICON #skylinerotterdam #bearde...,"[depot, depotboijmansvanbeuningen, rotterdam, ...","['Mooi, wij moeten er binnenkort ook maar eens...",3,"[Outside, Outside]"
2021-12-31 18:47:37,post9d57b38dcb71de96f81340a67516acf4248f33ddf4...,user0982559bde8b51afab37ecbb54dc2c29a77cf56190...,Individual,Private,39,0,"Rotterdam, Netherlands",998,39,"Especially photos of the village where I live,...","[rotterdam, rottergram, rottergram010, rotterd...",[],1,[Misc]
2021-12-31 19:15:19,post464bb11a852053bb173d88a348cab76b7cbcd2e672...,user5bbe3e9245c38f2e8cc6e672213fedfdd5534c6968...,Individual,Private,14,0,Depot Boijmans van Beuningen,345,14,#batiment #architectural #musee #museum #build...,"[batiment, architectural, musee, museum, build...",[],1,[Outside]
2021-12-31 19:21:02,post52587efabf903274d65d9108c20c43b50a2de85539...,user4622de33452f1db17290f13143765b249548921c77...,Individual,Private,2,0,,97,2,#depotboijmansvanbeuningen 31 december 2021,[depotboijmansvanbeuningen],[],1,[Outside]


# Dataset notes
The data was originally scraped from Instagram using a third-party scraper from rapidapi.com.
The dataset was pre-processed and cleaned in the following ways (not included in this notebook because they're manual or time-consuming):
- shortcode/owner_username: has been anonymized (hashed) for the publication of this dataset
- MicroRole/MacroRole: 107 'high-interaction' users were reviewed by two researchers (pre-anonymization) and categorized into 6 roles (see the paper for more details):
    - Public side: Affiliated Enterprise, Depot Boijmans Designer, Public Institution
    - Private side: Enterprise, Individual, Private Tourism Outlet
    All other users were classified as 'Individual'
- caption_en/comment_en: post captions and comments were translated into English using https://pypi.org/project/deep-translator/ (not included in this notebook because it's time-consuming)
- image_label:  was set as the result of clustering+manual post-processing of clusters as well as labeling, see below in this notebook

# Topic Modeling with Bertopic

Some pre-processing. BERTopic works on natural language and does not need lemmatization etc. Though we found that removing mentions and stopwords improved the topic coherence.

In [29]:
def remove_stopwords(words):
    """
    Filters stopwords out of a sequence of tokens.
    :param words: iterable of tokens
    :return: list of the tokens not contained in the module-level `stop_words`, in their original order
    """
    kept = []
    for token in words:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def remove_mentions(text):
    """
    Deletes every @mention ('@' followed by one or more word characters) from the text.
    Whitespace around the mention is left as it is.
    :param text: input string
    :return: text without mentions
    """
    mention_pattern = re.compile(r"@(\w+)")
    return mention_pattern.sub("", text)

def bertopic_topics_to_df(model, max_topics=10):
    """
    Collects the words of the topics 0 .. max_topics-1 into one table.
    :param model: BERTopic model (only its get_topic(i) method is used)
    :param max_topics: number of topic ids to query, starting at 0
    :return: dataframe with one column per topic id for which get_topic returned something truthy;
             each column holds the first element (the word) of that topic's entries
    """
    words_by_topic = {}
    for topic_id in range(max_topics):
        entries = model.get_topic(topic_id)
        if not entries:
            # nothing returned for this id -> no column
            continue
        words_by_topic[topic_id] = [entry[0] for entry in entries]

    return pd.DataFrame(words_by_topic)

def format_bracket(val1, val2, percent=False, round_to=1, stdev=False):
    """
    Formats as 'val1 (val2)', 'val1 (val2%)' or 'val1 (±val2)'
    :param val1: leading number; rounded to `round_to` digits and written with thousands separators
    :param val2: bracketed number; with percent=True it is treated as a fraction and
                 rendered as a percentage with one decimal (it is not rounded beforehand)
    :param percent: render val2 as a percentage (takes precedence over stdev)
    :param round_to: number of decimals for rounding; 0 rounds to whole numbers
    :param stdev: prefix the bracketed value with '±'
    :return: formatted string
    """
    # round(x, None) returns an int, which avoids a trailing '.0' in the output
    if round_to == 0:
        round_to = None
    if percent:
        str2 = " ({:.1%})"
    elif stdev:
        str2 = " (±{:,})"
        val2 = round(val2, round_to)
    else:
        str2 = " ({:,})"
        val2 = round(val2, round_to)

    return "{:,}".format(round(val1, round_to)) + str2.format(val2)

def split_by_trailing_hashtags(text, punctuation_string=""):
    """
    Splits text into two parts:
    head: text including hashtags in the text
    tail: all trailing hashtags/mentions
    The text is tokenised on single spaces and scanned from the end. A token belongs to the
    tail if, after dropping any leading characters listed in `punctuation_string`, it is
    empty or starts with '#' or '@'. The scan stops at the first token that does not qualify.
    :param text: text to split
    :param punctuation_string: characters in this string will be ignored when splitting
    e.g. "text .#bla #blub" will be split into "text" ".#bla #blub" (note the 'lost' space in the middle, have to add that when merging back)
    :return: (head, tail) tuple of strings
    """
    tokens = text.split(" ")
    cut = len(tokens)  # tokens[cut:] is the tail found so far
    while cut > 0:
        # lstrip with an empty character set leaves the token unchanged, so the default
        # argument behaves as before: only empty tokens and '#'/'@' tokens are moved to the tail
        core = tokens[cut - 1].lstrip(punctuation_string)
        if core == "" or core.startswith(("#", "@")):
            cut -= 1
        else:
            break

    head = " ".join(tokens[:cut])
    tail = " ".join(tokens[cut:])

    return head, tail

# Missing captions become empty strings so the string helpers below can run on every row.
df["caption_en"] = df["caption_en"].fillna("")
# Caption without its trailing run of hashtags/mentions (first element of the split).
df["caption_en_head"] = df["caption_en"].apply(lambda x: split_by_trailing_hashtags(x)[0])

# Lower-cased, space-tokenised variants of both columns with stopwords removed.
df["caption_en_nostopwords"] = df["caption_en"].apply(lambda s: " ".join(remove_stopwords(s.lower().split(" "))))
df["caption_en_head_nostopwords"] = df["caption_en_head"].apply(
    lambda s: " ".join(remove_stopwords(s.lower().split(" "))))

# Distinct MicroRole values; iterated below to fit one topic model per actor group.
micro_roles = list(df["MicroRole"].unique())

Quick topic model of the whole dataset

In [30]:
# BERTopic with default settings, fitted on the English captions of all posts.
# NOTE(review): unlike the per-role models further down (which pass a UMAP with random_state=42),
# no seed is set here, so the topics presumably vary between runs — confirm.
model = BERTopic(verbose=True)
texts = df["caption_en"]
topics, probabilities = model.fit_transform(texts)
model.visualize_topics()

Batches:   0%|          | 0/258 [00:00<?, ?it/s]

2022-12-26 16:16:40,308 - BERTopic - Transformed documents to Embeddings
2022-12-26 16:16:52,136 - BERTopic - Reduced dimensionality with UMAP
2022-12-26 16:16:52,497 - BERTopic - Clustered UMAP embeddings with HDBSCAN


Topics over time
**Result**: The only real spike is during the Eurovision Song Contest, otherwise the topics are fairly stable (barring the silver opening and inauguration)

In [31]:
# The dataframe index (post timestamps) supplies the time axis; nr_bins=10 is passed for the binning.
# Relies on `model`, `texts` and `topics` from the previous cell.
timestamps = df.index.to_list()
topics_over_time = model.topics_over_time(texts, topics, timestamps, nr_bins=10)
model.visualize_topics_over_time(topics_over_time, top_n_topics=6)

10it [00:01,  5.00it/s]


Topic Model for each actor group

In [32]:
# topic_output_folder_bertopic = "results/topic_modeling/bertopic"
# os.makedirs(topic_output_folder_bertopic, exist_ok=True)

# No pre-processing needed:
# https://maartengr.github.io/BERTopic/faq.html

for micro_role in micro_roles:
    print("-----{}-------".format(micro_role))
    # fpath_fig = os.path.join(topic_output_folder_bertopic, "bert_{}.png".format(micro_role))
    # fpath_csv = os.path.join(topic_output_folder_bertopic, "bert_{}.csv".format(micro_role))

    # Posts of the current actor group (named role_df so the builtin `slice` is not shadowed)
    role_df = df[df["MicroRole"] == micro_role]

    # tweaks:
    #   remove stopwords
    #   remove mentions
    #   diversity=0.5
    #   hdbscan: min_cluster_size=3, min_samples=5; for Individuals: min_cluster_size=10
    texts = role_df["caption_en_head_nostopwords"].apply(remove_mentions)

    if micro_role == "Individual":
        hdbscan_model = HDBSCAN(min_cluster_size=10)
    else:
        hdbscan_model = HDBSCAN(min_cluster_size=3, min_samples=5)

    # default umap model with random state for reproducibility
    umap_model = UMAP(n_neighbors=15, n_components=5,
                      min_dist=0.0, metric='cosine', random_state=42)

    model = BERTopic(verbose=True, diversity=0.5, hdbscan_model=hdbscan_model, umap_model=umap_model)
    topics, probabilities = model.fit_transform(texts)
    try:
        fig = model.visualize_barchart(top_n_topics=20)
        fig.show()
        # fig.write_image(fpath_fig)
    except ValueError as err:
        # Plotting stays best-effort (presumably fails when a small group yields no topics),
        # but the skip is reported instead of being swallowed silently.
        print("No topic barchart for '{}': {}".format(micro_role, err))

    # Top words of the first 20 topics; only needed for the (disabled) CSV export below
    topwords_df = bertopic_topics_to_df(model, max_topics=20)
    # topwords_df.to_csv(fpath_csv, index=False)

    # print(topwords_df.to_string())

-----Individual-------


Batches:   0%|          | 0/231 [00:00<?, ?it/s]

2022-12-26 16:17:09,426 - BERTopic - Transformed documents to Embeddings
2022-12-26 16:17:23,999 - BERTopic - Reduced dimensionality with UMAP
2022-12-26 16:17:24,255 - BERTopic - Clustered UMAP embeddings with HDBSCAN


-----Enterprise-------


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

2022-12-26 16:17:36,311 - BERTopic - Transformed documents to Embeddings
2022-12-26 16:17:38,153 - BERTopic - Reduced dimensionality with UMAP
2022-12-26 16:17:38,177 - BERTopic - Clustered UMAP embeddings with HDBSCAN


-----Affiliated enterprise-------


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2022-12-26 16:17:46,996 - BERTopic - Transformed documents to Embeddings
2022-12-26 16:17:47,972 - BERTopic - Reduced dimensionality with UMAP
2022-12-26 16:17:47,979 - BERTopic - Clustered UMAP embeddings with HDBSCAN


-----Public institution-------


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2022-12-26 16:17:56,067 - BERTopic - Transformed documents to Embeddings
2022-12-26 16:17:56,873 - BERTopic - Reduced dimensionality with UMAP
2022-12-26 16:17:56,883 - BERTopic - Clustered UMAP embeddings with HDBSCAN


-----Private tourism outlet-------


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-12-26 16:18:05,048 - BERTopic - Transformed documents to Embeddings
2022-12-26 16:18:05,772 - BERTopic - Reduced dimensionality with UMAP
2022-12-26 16:18:05,778 - BERTopic - Clustered UMAP embeddings with HDBSCAN


-----Depot Boijmans designer-------


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-12-26 16:18:13,767 - BERTopic - Transformed documents to Embeddings
2022-12-26 16:18:14,506 - BERTopic - Reduced dimensionality with UMAP
2022-12-26 16:18:14,513 - BERTopic - Clustered UMAP embeddings with HDBSCAN


**Result:**
The topic models for actor groups with few posts are very small and extremely difficult to compare to other groups qualitatively.
This issue arises because BERTopic needs ~1000 documents to produce coherent topic models, as indicated by the author of BERTopic.
Another issue was that named entities such as 'Depot' 'Boijmans' 'Van' 'Beuningen' dominated the topic model though removing them did not make the topic model more coherent.

Other topic modeling algos like NMF and LDA were also considered but could not provide easily interpretable results either.
Though I'm no expert on natural language processing and it's possible that with a different topic modeling algo or better pre-processing the results could be improved.



For a quantitative comparison of the topics discussed by the actor groups, we turn to a simpler method: word frequency counting.

# Topic Analysis with Word Counting

Pre-processing: part-of-speech tagging and dependency parsing
Analysis: we are looking at top:
- nouns -> not that insightful
- adjectives -> missing some context, i.e. what the adjective refers to
- adjective-noun combinations -> solving the context problem from above

In [33]:
def remove_punctuation(text, punctuation_str):
    """
    Drops every character of `text` that occurs in `punctuation_str`.
    :param text: input string
    :param punctuation_str: string whose characters are removed
    :return: text without those characters
    """
    kept_chars = filter(lambda char: char not in punctuation_str, text)
    return "".join(kept_chars)

def spacy_pos_tag(text):
    """
    Runs the module-level spaCy pipeline `nlp` on the text.
    :param text: input string
    :return: list of (token text, POS tag) tuples, one per token
    """
    return [(token.text, token.pos_) for token in nlp(text)]

# Input for POS tagging: caption head with punctuation characters removed, lower-cased.
caption_cleaned = df["caption_en_head"].apply(lambda s: remove_punctuation(s, punctuation).lower())
# POS tagging
df["caption_en_pos"] = caption_cleaned.progress_apply(spacy_pos_tag)

# Creating spacy NLP objects for dependency parsing
# Unlike the POS input above, this text keeps its punctuation but has @mentions removed.
# NOTE(review): the chained assignment also binds the same Series to `sentences`, which is not
# used in any cell visible here — confirm whether it can be dropped.
df["nlp_text"] = sentences = df["caption_en_head"].apply(lambda s: remove_mentions(s).lower())
df["nlp"] = df["nlp_text"].progress_apply(nlp)

100%|██████████| 8228/8228 [00:40<00:00, 200.77it/s]
100%|██████████| 8228/8228 [00:44<00:00, 184.66it/s]


In [34]:
def get_top_pos(pos_corpus, target_pos):
    """
    Collects every word of a POS-tagged corpus whose tag equals the target POS.
    Duplicates are kept and the corpus order is preserved.
    :param pos_corpus: iterable of documents, each a list of (word, POS tag) tuples
    :param target_pos: https://universaldependencies.org/u/pos/
    :return: flat list of the matching words
    """
    return [
        entry[0]
        for tagged_doc in pos_corpus
        for entry in tagged_doc
        if entry[1] == target_pos
    ]


def get_pos_count_df(df, group_column, target_pos, text_column="caption_en_pos"):
    """
    Counts, separately for every group, the words carrying the target POS tag.
    :param df: dataframe
    :param group_column: name of the column whose distinct values define the groups
    :param target_pos: https://universaldependencies.org/u/pos/
    :param text_column: column holding lists of (word, POS tag) tuples
    :return: dataframe with two columns per group ('<group>' and '<group>_count') listing
             the 20 most frequent matching words and their counts
    """
    result = pd.DataFrame()
    groups = df[group_column].unique()
    for group in groups:
        group_rows = df[df[group_column] == group]
        matches = get_top_pos(group_rows[text_column], target_pos=target_pos)

        # value_counts() is ordered by frequency; reset_index() turns word and count into columns
        counts = pd.Series(matches).value_counts().reset_index()
        counts.columns = [group, group + "_count"]

        top_counts = counts[:20]
        result = pd.concat([result, top_counts], axis=1)
    return result


def get_top_adjnouns(docs):
    """
    Gets all ADJ-NOUN pairs in a corpus (docs) by parsing the dependency tree.
    For every adjective the chain of syntactic heads is followed upwards until the first
    NOUN is found; adjectives without a NOUN ancestor are skipped.
    :param docs: iterable of spacy nlp() objects
    :return: list of "<adjective> <noun>" strings, one per adjective that has a NOUN ancestor
    """
    collected = []
    for doc in docs:
        for token in doc:
            if token.pos_ != "ADJ":
                continue
            head = token
            # At most len(doc) steps, so the walk terminates even on circular dependencies
            for _ in range(len(doc)):
                head = head.head
                # Stop at the first noun, or at the root of the tree (a root is its own head).
                # The previous check compared the dependency label with the token text, which
                # never detected the root and could abort the walk on a word equal to its label.
                if head.pos_ == "NOUN" or head.head == head:
                    break
            if head.pos_ == "NOUN":
                collected.append("{} {}".format(token.text, head.text))
    return collected


def get_adjnoun_count_df(df, group_column, docs_column):
    """
    Counts, separately for every group, the ADJ-NOUN combinations found in the parsed documents.
    :param df: dataframe
    :param group_column: name of the column whose distinct values define the groups
    :param docs_column: column that contains spacy NLP objects
    :return: dataframe with two columns per group ('<group>' and '<group>_count') listing
             the 20 most common ADJ-NOUN combinations and their counts
    """
    # lazy named entity removal
    excluded_pairs = ["️marieke diemen", "winy maas"]

    result = pd.DataFrame()
    for group in df[group_column].unique():
        group_rows = df[df[group_column] == group]

        # empty documents are skipped before the pair extraction
        non_empty_docs = [doc for doc in group_rows[docs_column] if len(doc) > 0]
        pairs = [pair for pair in get_top_adjnouns(non_empty_docs) if pair not in excluded_pairs]

        counts = pd.Series(pairs).value_counts().reset_index()
        counts.columns = [group, group + "_count"]

        result = pd.concat([result, counts[:20]], axis=1)

    return result


def format_bracket(val1, val2, percent=False, round_to=1, stdev=False):
    """
    Formats as 'val1 (val2)', 'val1 (val2%)' or 'val1 (±val2)'.
    val1 is rounded to `round_to` digits (0 means whole numbers) and written with
    thousands separators. val2 is rendered as a one-decimal percentage when percent=True,
    otherwise it is rounded like val1 and, with stdev=True, prefixed by '±'.
    """
    digits = None if round_to == 0 else round_to
    leading = "{:,}".format(round(val1, digits))

    if percent:
        # percentages take the raw fraction; no rounding beforehand
        return leading + " ({:.1%})".format(val2)

    template = " (±{:,})" if stdev else " ({:,})"
    return leading + template.format(round(val2, digits))

def format_count_df(df):
    """
    Replaces the raw counts of a count table by 'count (share%)' strings.
    Columns are expected to alternate between an item column (even positions) and its
    count column (odd positions); only the count columns are rewritten. The share is the
    count divided by the sum of its own column.
    :param df: count table
    :return: formatted copy; the input dataframe is not modified
    """
    count_df = df.copy()
    # iterate through the odd-positioned (count) columns
    for i in range(1, len(count_df.columns), 2):
        counts = count_df.iloc[:, i]
        # column total computed once per column instead of once per cell
        total = counts.sum()
        count_df.iloc[:, i] = counts.apply(
            lambda v, total=total: format_bracket(v, v / total, percent=True))
    return count_df

Top nouns

In [35]:
# Most frequent nouns per MacroRole; the percentage is each count's share of the summed
# counts listed for that role (i.e. of the top-20 rows kept by get_pos_count_df).
res = get_pos_count_df(df, "MacroRole", "NOUN")
format_count_df(res)

Unnamed: 0,Private,Private_count,Public,Public_count
0,depot,"4,121 (24.7%)",depot,364 (22.7%)
1,user,"2,432 (14.6%)",user,237 (14.8%)
2,boijmans,"1,525 (9.1%)",boijmans,115 (7.2%)
3,museum,"1,508 (9.0%)",art,113 (7.0%)
4,art,"1,471 (8.8%)",museum,103 (6.4%)
5,building,974 (5.8%),collection,78 (4.9%)
6,city,549 (3.3%),building,71 (4.4%)
7,collection,435 (2.6%),bag,69 (4.3%)
8,depotboijmansvanbeuningen,384 (2.3%),opening,61 (3.8%)
9,opening,363 (2.2%),shopping,56 (3.5%)


Top adjectives

In [36]:
# Most frequent adjectives per MacroRole; the percentage is each count's share of the summed
# counts listed for that role.
res = get_pos_count_df(df, "MacroRole", "ADJ")
format_count_df(res)

Unnamed: 0,Private,Private_count,Public,Public_count
0,new,864 (18.6%),new,106 (21.1%)
1,beautiful,636 (13.7%),accessible,48 (9.5%)
2,first,351 (7.5%),first,35 (7.0%)
3,more,325 (7.0%),more,32 (6.4%)
4,accessible,285 (6.1%),open,26 (5.2%)
5,open,235 (5.0%),public,24 (4.8%)
6,nice,230 (4.9%),complete,22 (4.4%)
7,great,211 (4.5%),special,20 (4.0%)
8,last,203 (4.4%),next,19 (3.8%)
9,public,166 (3.6%),last,19 (3.8%)


Top adjective-noun combinations

In [37]:
# Most frequent adjective-noun combinations per MacroRole, taken from the parsed docs in df["nlp"].
adjnoun = get_adjnoun_count_df(df, "MacroRole", "nlp")
format_count_df(adjnoun)

Unnamed: 0,Private,Private_count,Public,Public_count
0,new depot,268 (19.9%),first depot,28 (12.7%)
1,first depot,167 (12.4%),new bag,26 (11.8%)
2,accessible depot,127 (9.4%),accessible depot,24 (10.9%)
3,first facility,85 (6.3%),new icon,13 (5.9%)
4,new building,81 (6.0%),complete collection,12 (5.5%)
5,beautiful building,79 (5.9%),new type,11 (5.0%)
6,complete collection,63 (4.7%),sturdy room,10 (4.5%)
7,entire collection,60 (4.5%),new depot,10 (4.5%)
8,new icon,54 (4.0%),accessible room,10 (4.5%)
9,accessible facility,50 (3.7%),official opening,9 (4.1%)


# Sentiment Analysis

In [38]:
def remove_newlines(text, replace_with=""):
    """
    Replaces every newline character in the text.
    :param text: input string
    :param replace_with: replacement for each newline (default: newlines are removed)
    :return: text without newline characters
    """
    pieces = text.split('\n')
    return replace_with.join(pieces)

def remove_hashtag_mention_symbols(text):
    """
    Strips the '#' and '@' characters from the text; the words that followed them stay.
    Example: I went to the #mall -> I went to the mall
    """
    for symbol in ("#", "@"):
        text = re.sub(symbol, "", text)
    return text

def preprocess_for_sentiments(text):
    """
    Prepares a caption for sentiment scoring: lower-cases it, turns newlines into
    spaces and drops the '#' / '@' symbols (keeping the words behind them).
    :param text: raw caption
    :return: cleaned caption
    """
    lowered = text.lower()
    # lowered = convert_emojis_to_word(lowered, preface="") # TODO: add back in
    single_line = remove_newlines(lowered, replace_with=" ")
    return remove_hashtag_mention_symbols(single_line)

In [42]:
# https://spacy.io/universe/project/spacy-textblob
"""
The polarity score is a float within the range [-1.0, 1.0].
The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.
"""

# Step 1: clean every caption head (lower-case, newlines -> spaces, '#'/'@' symbols dropped).
sentiment_texts = []
for text in tqdm(df["caption_en_head"]):
    sentiment_texts.append(preprocess_for_sentiments(text))
df["sentiment_text"] = sentiment_texts

# Step 2: run the spaCy pipeline on each cleaned caption and collect the three extension
# attributes; this requires the 'spacytextblob' component to have been added to `nlp`.
sentiments = []
subjectivities = []
assessments = []

for text in tqdm(df["sentiment_text"]):
    doc = nlp(text)
    sentiments.append(doc._.polarity)
    subjectivities.append(doc._.subjectivity)
    assessments.append(doc._.assessments)

# Step 3: store the scores as new columns, aligned with the row order of df.
df["sentiment"] = sentiments
df["subjectivity"] = subjectivities
df["sent_assessment"] = assessments

100%|██████████| 8228/8228 [00:00<00:00, 249355.01it/s]
100%|██████████| 8228/8228 [00:44<00:00, 184.96it/s]


Top positive/negative sentiment captions

In [43]:
# Ascending sort: the most negative captions come first, the most positive ones last.
df[["sentiment_text", "sentiment", "subjectivity", "sent_assessment"]].sort_values("sentiment", ascending=True)

Unnamed: 0_level_0,sentiment_text,sentiment,subjectivity,sent_assessment
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-03-26 10:56:08,rotterdam's newest spaceship the boijmans va...,-1.000000,1.000000,"[([insane], -1.0, 1.0, None)]"
2021-12-16 07:17:15,insane pivot doors in the,-1.000000,1.000000,"[([insane], -1.0, 1.0, None)]"
2021-12-03 20:06:34,terrible weather all week but nothing can ruin...,-1.000000,1.000000,"[([terrible], -1.0, 1.0, None)]"
2021-02-26 13:14:26,"my jacket says it, so s💜m🙃i💜l🙂e💜 rokjesdag ma...",-0.780000,1.000000,"[([very, cold], -0.78, 1.0, None)]"
2020-08-29 20:17:37,🎶 jump around! jump around! jump around! jump ...,-0.741747,0.288889,"[([down, !, !, !, !, !, !, !], -0.741746690538..."
...,...,...,...,...
2020-07-01 14:02:33,reflection inception depot boijmansvanbeuninge...,1.000000,1.000000,"[([breathtaking], 1.0, 1.0, None)]"
2021-04-06 09:01:18,"even with a hailstorm, the depot is beautiful....",1.000000,1.000000,"[([beautiful, !], 1.0, 1.0, None)]"
2021-12-03 22:54:26,something incredible!,1.000000,0.900000,"[([incredible, !], 1.0, 0.9, None)]"
2021-12-17 12:21:47,"“did you see that dude? yes, isn't it?!?” deli...",1.000000,1.000000,"[([delicious], 1.0, 1.0, None), ([beautiful, !..."


Looking at these captions, the sentiment analysis is easily distracted by certain keywords...

Top words for positive/negative sentiments

In [50]:
# os.makedirs("results/nlp_sentiment_analysis", exist_ok=True)
# 100 captions with the lowest polarity
neg = df.sort_values("sentiment", ascending=True)[["sentiment_text", "sentiment", "subjectivity"]][:100]
# neg.to_csv("results/nlp_sentiment_analysis/top_negative.csv")

# 100 captions with the highest polarity (used by the following cell)
pos = df.sort_values("sentiment", ascending=False)[["sentiment_text", "sentiment", "subjectivity"]][:100]
# pos.to_csv("results/nlp_sentiment_analysis/top_positive.csv")

# get_pos_count_df needs a group column, so a constant one is added to put all rows in one group.
# .assign() returns a new frame instead of writing into a filtered view of df, which avoids
# pandas' SettingWithCopyWarning.
# NOTE(review): rows are selected by timestamp, so posts sharing a timestamp with one of the
# 100 selected posts are included as well — confirm timestamps are unique.
df_neg = df[df.index.isin(neg.index)].assign(word="word")
get_pos_count_df(df_neg, "word", "ADJ", text_column="caption_en_pos")

Unnamed: 0,word,word_count
0,broken,18
1,cold,10
2,crazy,8
3,mockup,4
4,bio,4
5,last,4
6,impossible,4
7,weird,3
8,new,3
9,complicated,2


In [51]:
# Adjectives of the 100 most positive captions (`pos` is defined in the previous cell).
# .assign() returns a new frame with the constant grouping column instead of writing into a
# filtered view of df, which avoids pandas' SettingWithCopyWarning.
df_pos = df[df.index.isin(pos.index)].assign(word="word")
get_pos_count_df(df_pos, "word", "ADJ", text_column="caption_en_pos")

Unnamed: 0,word,word_count
0,beautiful,30
1,impressive,21
2,happy,9
3,great,8
4,wonderful,7
5,best,6
6,delicious,3
7,awesome,3
8,easter,2
9,rotterdam,2


Average sentiment by actor group

In [45]:
# Mean polarity per MacroRole, rounded to two decimals.
df.groupby(["MacroRole"])["sentiment"].mean().round(2)

MacroRole
Private    0.13
Public     0.19
Name: sentiment, dtype: float64

**Result**: Public actors write captions with slightly more positive sentiments. But the sentiment analysis is not so robust that this is a reliable finding

In [46]:
# Mean polarity per MicroRole, rounded to two decimals.
df.groupby(["MicroRole"])["sentiment"].mean().round(2)

MicroRole
Affiliated enterprise      0.18
Depot Boijmans designer    0.19
Enterprise                 0.15
Individual                 0.13
Private tourism outlet     0.30
Public institution         0.20
Name: sentiment, dtype: float64