In [1]:
import plotly.io as pio
# Select the Plotly renderer once. The notebook previously assigned
# "notebook_connected" and then immediately overwrote it with this value,
# so only this assignment ever took effect.
pio.renderers.default = "iframe_connected"

In [2]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from scipy.signal import find_peaks
from tqdm import tqdm

In [3]:
# Peek at a single file to learn the column layout before loading the whole corpus.
# The path is built from parts so it resolves on any operating system, not only
# where "\" is the path separator.
p = Path("data") / "csv" / "SNP2719372X-19180101-0-0-0-0.csv"   # <-- change this to one file
sample_df = pd.read_csv(p, nrows=3, encoding="utf-8")   # three rows are enough to see the schema
print("columns:", list(sample_df.columns))
print("\npreview:")
# Last expression of the cell, so the notebook renders the rows announced above.
sample_df

columns: ['Token', 'Lemma']

preview:


In [4]:
def read_csv_header(path):
    """Return the column names of one CSV file as a tuple.

    Only the header row is read (``nrows=0``). The file is decoded as UTF-8
    first; if that fails with a ``UnicodeDecodeError`` it is retried as
    latin1. Any other failure is not retried: it is reported as a one-element
    tuple ``("ERROR: <message>",)`` so that a single bad file cannot abort the
    scan of the whole folder.
    """
    for encoding in ("utf-8", "latin1"):
        try:
            return tuple(pd.read_csv(path, nrows=0, encoding=encoding).columns)
        except UnicodeDecodeError:
            continue  # wrong encoding guess: try the next candidate
        except Exception as e:
            # Best-effort scan: record the problem instead of raising.
            return (f"ERROR: {e}",)
    return ("ERROR: header could not be decoded as utf-8 or latin1",)


# Built from parts so the path works regardless of the OS path separator.
folder = Path("data") / "csv"
csvs = sorted(folder.glob("*.csv"))
print(f"found {len(csvs)} csv files\n")

# file name -> tuple of column names (or a one-element error tuple)
headers = {f.name: read_csv_header(f) for f in csvs}

# show first 20 files and their headers
for name, cols in list(headers.items())[:20]:
    print(f"{name} -> {cols}")

found 1323 csv files

SNP27112366-19180101-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180102-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180103-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180104-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180105-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180106-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180107-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180108-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180109-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180110-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180111-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180112-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180113-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180114-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180115-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180116-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180117-0-0-0-0.csv -> ('Token', 'Lemma')
SNP27112366-19180118-0-0-0-0.csv -> ('Token

In [5]:
# Tally how many files share each header tuple; more than one distinct
# schema would mean the files cannot simply be stacked.
schema_counts = Counter(headers.values())
print("Number of distinct schemas:", len(schema_counts))
ranked_schemas = schema_counts.most_common()  # most frequent schema first
for schema, n_files in ranked_schemas:
    print(f"{n_files} files -> {schema}")

Number of distinct schemas: 1
1323 files -> ('Token', 'Lemma')


In [6]:
# Load every CSV in the folder and stack them into one long token table.
# The path is built from parts so it resolves on any operating system.
folder = Path("data") / "csv"   # <-- change this
files = sorted(folder.glob("*.csv"))
if not files:
    # Fail with the folder name instead of pandas' generic
    # "No objects to concatenate" error further down.
    raise FileNotFoundError(f"no csv files found in {folder.resolve()}")

dfs = []
for f in tqdm(files):
    # assign() adds the column on a new frame, so no notebook-level `df`
    # variable is overwritten on every iteration.
    dfs.append(
        pd.read_csv(f, encoding="utf-8")
        .assign(source_file=f.stem)  # store filename (without .csv)
    )

df_all = pd.concat(dfs, ignore_index=True)
print("Combined shape:", df_all.shape)

100%|██████████████████████████████████████████████████████████████████████████████| 1323/1323 [00:17<00:00, 73.69it/s]


Combined shape: (33185569, 3)


In [7]:
# First 20 distinct file stems, in order of first appearance
pd.unique(df_all["source_file"])[:20]

array(['SNP27112366-19180101-0-0-0-0', 'SNP27112366-19180102-0-0-0-0',
       'SNP27112366-19180103-0-0-0-0', 'SNP27112366-19180104-0-0-0-0',
       'SNP27112366-19180105-0-0-0-0', 'SNP27112366-19180106-0-0-0-0',
       'SNP27112366-19180107-0-0-0-0', 'SNP27112366-19180108-0-0-0-0',
       'SNP27112366-19180109-0-0-0-0', 'SNP27112366-19180110-0-0-0-0',
       'SNP27112366-19180111-0-0-0-0', 'SNP27112366-19180112-0-0-0-0',
       'SNP27112366-19180113-0-0-0-0', 'SNP27112366-19180114-0-0-0-0',
       'SNP27112366-19180115-0-0-0-0', 'SNP27112366-19180116-0-0-0-0',
       'SNP27112366-19180117-0-0-0-0', 'SNP27112366-19180118-0-0-0-0',
       'SNP27112366-19180119-0-0-0-0', 'SNP27112366-19180120-0-0-0-0'],
      dtype=object)

In [8]:
# All rows that come from one file share the same date, so the 8-digit date is
# parsed once per distinct file name and then broadcast to the rows, instead of
# running the regex and the datetime conversion on every token row.
unique_files = pd.Series(df_all["source_file"].unique())
date_by_file = pd.to_datetime(
    # expand=False returns a Series; the default would return a one-column DataFrame
    unique_files.str.extract(r"-(\d{8})-", expand=False),
    format="%Y%m%d",
)
date_by_file.index = unique_files  # lookup table: file stem -> Timestamp

# File names without an 8-digit date part end up as NaT.
df_all["date"] = df_all["source_file"].map(date_by_file)

# quick check
df_all[["source_file", "date"]].head()

Unnamed: 0,source_file,date
0,SNP27112366-19180101-0-0-0-0,1918-01-01
1,SNP27112366-19180101-0-0-0-0,1918-01-01
2,SNP27112366-19180101-0-0-0-0,1918-01-01
3,SNP27112366-19180101-0-0-0-0,1918-01-01
4,SNP27112366-19180101-0-0-0-0,1918-01-01


In [9]:
# Earliest and latest date covered by the corpus
corpus_dates = df_all["date"]
(corpus_dates.min(), corpus_dates.max())

(Timestamp('1918-01-01 00:00:00'), Timestamp('1919-12-31 00:00:00'))

In [10]:
# A (date, lemma) pair is kept only if the lemma occurs at least this often on that date.
MIN_DAILY_COUNT = 5

# lowercase lemmas
df_all["Lemma_lower"] = df_all["Lemma"].str.lower()

# NLTK raises LookupError when the stopword corpus has not been downloaded yet;
# fetch it once so the notebook also runs on a fresh environment.
try:
    german_stopwords = set(stopwords.words("german"))
except LookupError:
    nltk.download("stopwords", quiet=True)
    german_stopwords = set(stopwords.words("german"))

custom_stopwords = {
    "auch", "noch", "wird", "sein", "einer", "einem", "eines",
    "sich", "wurde", "wären", "kann", "können", "dass", "sind",
    "waren", "haben", "hatte", "worden", "diese",
    "dieser", "dieses", "diesen", "jahr", "zeit", "zeitung"
}
extended_stopwords = german_stopwords.union(custom_stopwords)

# Build one boolean mask and filter once, rather than materialising an
# intermediate copy of the table per condition.
lemma_lower = df_all["Lemma_lower"]
keep_row = (
    lemma_lower.str.isalpha().eq(True)        # alphabetic only; missing lemmas compare as False
    & lemma_lower.str.len().gt(2)             # longer than 2 letters
    & ~lemma_lower.isin(extended_stopwords)   # not a stopword
)
df_all_filtered = df_all[keep_row]

# calculate frequencies per lemma per date, then drop (date, lemma) pairs
# with fewer than MIN_DAILY_COUNT occurrences on that date
lemma_date_counts = (
    df_all_filtered.groupby(["date", "Lemma_lower"])
    .size()
    .reset_index(name="count")
    .loc[lambda counts: counts["count"] >= MIN_DAILY_COUNT]
)

In [11]:
# Seed vocabulary used to select the dates for the co-occurrence analysis
seed_words = [
    'grippe', 'tote', 'influenza', 'pandemie', 'epidemie', 'krankheit',
    'tag', 'hoch', 'kampf', 'kreis', 'mensch', 'täglich', 'tot',
]

In [12]:
# Distinct dates on which at least one seed word appears in the counts table
is_seed_row = lemma_date_counts['Lemma_lower'].isin(seed_words)
dates_with_flu = lemma_date_counts.loc[is_seed_row, 'date'].unique()

print(f"Number of unique dates containing flu-related words: {len(dates_with_flu)}")

Number of unique dates containing flu-related words: 700


In [13]:
# Keep every (date, lemma) row whose date is one of the flu-related dates;
# copy so later edits cannot touch lemma_date_counts.
on_flu_date = lemma_date_counts['date'].isin(dates_with_flu)
df_co = lemma_date_counts.loc[on_flu_date].copy()

# size check
print(f"Total rows in co-occurrence set: {df_co.shape[0]}")

Total rows in co-occurrence set: 348909


In [14]:
# 1. Create a set of seed words for fast lookup
seed_set = set(seed_words)

# 2. For each date, get all lemmas appearing
lemmas_by_date = df_co.groupby('date')['Lemma_lower'].apply(set)

# 3. Count co-occurrences: on every date that contains a seed word, each
#    non-seed lemma present on that date gets one count.
co_counts = Counter()
for lemmas in lemmas_by_date:
    if seed_set & lemmas:  # any seed word present
        co_counts.update(lemmas - seed_set)  # exclude seed words themselves

# 4. Convert to DataFrame and sort.
#    Ties are broken alphabetically: the counter is filled by iterating over
#    sets, whose order is not fixed between kernel sessions, so sorting by
#    count alone would make any "top N" cut of this table irreproducible.
df_cooccurrence = (
    pd.DataFrame(list(co_counts.items()), columns=['Lemma_lower', 'co_occurrence_count'])
    .sort_values(by=['co_occurrence_count', 'Lemma_lower'], ascending=[False, True])
    .reset_index(drop=True)
)

In [15]:
# Vocabulary = the top_n most frequently co-occurring lemmas followed by the seed words
top_n = 50
top_cooccurring = df_cooccurrence['Lemma_lower'].iloc[:top_n].tolist()
flu_vocab = top_cooccurring + seed_words  # include seed words

In [16]:
# Seed vocabulary for the frequency plots. NOTE(review): this rebinds the
# earlier seed_words list and differs from it only by omitting 'tag' —
# confirm that dropping it here is intentional.
seed_words = [
    'grippe', 'tote', 'influenza', 'pandemie', 'epidemie', 'krankheit',
    'hoch', 'kampf', 'kreis', 'mensch', 'täglich', 'tot',
]

In [17]:
# Restrict the counts table to seed-word rows, ordered by date and, within a
# date, from the most to the least frequent seed word.
is_seed_lemma = lemma_date_counts['Lemma_lower'].isin(seed_words)
df_flu_seed_only = (
    lemma_date_counts.loc[is_seed_lemma]
    .copy()
    .sort_values(by=['date', 'count'], ascending=[True, False])
)

# Report the size of the result
print(f"Shape of the seed-only DataFrame: {df_flu_seed_only.shape}")

Shape of the seed-only DataFrame: (2300, 3)


In [18]:
import plotly.graph_objects as go

# --- Aggregate totals for plotting ---
daily_total = (
    df_flu_seed_only.groupby('date')['count']
    .sum()
    .reset_index(name='total_abs_freq')
)

# Tag every row with the start of the week / month period its date falls in.
df_flu_seed_only['week'] = df_flu_seed_only['date'].dt.to_period('W').dt.start_time
df_flu_seed_only['month'] = df_flu_seed_only['date'].dt.to_period('M').dt.start_time

weekly_total = (
    df_flu_seed_only.groupby('week')['count']
    .sum()
    .reset_index(name='total_abs_freq')
)
monthly_total = (
    df_flu_seed_only.groupby('month')['count']
    .sum()
    .reset_index(name='total_abs_freq')
)

# One entry per time resolution: (label, totals frame, x column).
# The position in this list is the trace index the toggle buttons refer to.
resolutions = [
    ('Daily', daily_total, 'date'),
    ('Weekly', weekly_total, 'week'),
    ('Monthly', monthly_total, 'month'),
]

# --- Create Plotly figure ---
fig = go.Figure()
for position, (label, totals, x_col) in enumerate(resolutions):
    fig.add_trace(go.Scatter(
        x=totals[x_col], y=totals['total_abs_freq'],
        mode='lines', name=label,
        visible=(position == 0)  # only the daily trace is shown initially
    ))

# --- Toggle buttons ---
# Each button shows exactly one trace and updates the title to match.
buttons = [
    dict(label=resolution[0],
         method='update',
         args=[{'visible': [i == position for i in range(len(resolutions))]},
               {'title': f'Total {resolution[0]} Mentions of Flu-related Words'}])
    for position, resolution in enumerate(resolutions)
]

# --- Layout with horizontal seed word legend below the chart ---
fig.update_layout(
    updatemenus=[dict(
        type='buttons',
        direction='left',
        buttons=buttons,
        x=0.5, xanchor='center',
        y=1.15, yanchor='top'
    )],
    title='Total Mentions of Flu-related Words Over Time',
    xaxis_title='Date',
    yaxis_title='Absolute Frequency (Total)',
    template='plotly_white',
    hovermode='x unified',
    width=950,
    height=550,
    margin=dict(b=120)  # make room for the seed word legend
)

# --- Add horizontal seed word display below the X-axis ---
# Uses the seed_words list defined in an earlier cell, i.e. the same list
# df_flu_seed_only was filtered with. It is deliberately not re-declared here,
# so the caption cannot drift away from the data actually plotted.
fig.add_annotation(
    text="<b>Seed Words:</b> " + " • ".join(seed_words),
    showarrow=False,
    xref='paper', yref='paper',
    x=0.5, y=-0.25,  # below the X-axis
    xanchor='center',
    font=dict(size=12)
)

fig.show()

In [22]:
# Filled-area view of the monthly totals
px.area(
    monthly_total,
    x='month',
    y='total_abs_freq',
    title="Wave-like Public Attention",
)

In [27]:
# Split each date into calendar parts so the heatmap can lay out
# day (x) against month (y), with one panel per year.
date_parts = daily_total['date'].dt
daily_total['month'] = date_parts.month
daily_total['year'] = date_parts.year
daily_total['day'] = date_parts.day

px.density_heatmap(
    daily_total,
    x='day',
    y='month',
    z='total_abs_freq',
    facet_col='year',
    color_continuous_scale='Reds',
    title="Daily Mentions Heatmap",
)

In [20]:
# Rows of the counts table that belong to a seed word; one animation frame per date.
seed_daily_counts = lemma_date_counts[lemma_date_counts['Lemma_lower'].isin(seed_words)]

px.bar(
    seed_daily_counts,
    x='Lemma_lower', y='count', color='Lemma_lower',
    animation_frame='date',
    # Fix the y-range from the plotted seed-word rows. Taking the maximum over
    # all lemmas scaled the axis to words that are not shown and flattened the bars.
    range_y=[0, seed_daily_counts['count'].max()],
    # Pin bar order (and with it the colour assignment) so bars do not
    # reshuffle between frames when a seed word is absent on some date.
    category_orders={'Lemma_lower': seed_words},
    title="Daily Mentions of Each Seed Word Over Time"
)