# 🚀 Analyse syntaktischer N-Gramme  
Basierend auf der Dependenzannotation mit `spaCy`

## Import der Bibliotheken

In [37]:
from pathlib import Path
from typing import Dict, List, Union, Tuple
from collections import OrderedDict, Counter
from time import time
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin, Doc

In [None]:
!python -m spacy info

Laden des spaCy-Modells

In [None]:
! python -m spacy download de_core_news_sm
# Load the German pipeline; its vocab is used below to deserialize the stored Docs
nlp = spacy.load("de_core_news_sm")

## Laden der Annotationen

In [None]:
# When using a different corpus, adjust the directory name here
annotation_dir = Path("..") / "data" / "spacy"

# Only a hint is printed here; nothing is created or raised if the path is wrong
if not annotation_dir.exists():
    print("The directory does not exist, please check the path again.")

In [None]:
# Map each text ID (file name without the .spacy suffix) to its merged spaCy Doc
annotated_docs = {}

start = time()
# Collect the .spacy files first and sort them: iterdir() yields entries in
# arbitrary order, and a fixed order keeps the dictionary (and everything that
# iterates over it later) reproducible. A list also lets tqdm show a total.
spacy_files = sorted(
    fp for fp in annotation_dir.iterdir()
    if fp.is_file() and fp.suffix == '.spacy'
)
for fp in tqdm(spacy_files, desc="Reading annotated data"):
    print(f"Loading file: {fp.name}")
    # Deserialize the Docs stored in this file against the pipeline's vocab
    doc_bin = DocBin().from_disk(fp)
    chunk_docs = list(doc_bin.get_docs(nlp.vocab))
    # Doc.from_docs returns None for an empty list; skip such files so that
    # every value in annotated_docs is a usable Doc.
    if not chunk_docs:
        print(f"Skipping file without documents: {fp.name}")
        continue
    # Merge the stored chunks into one single Doc per text
    annotated_docs[fp.stem] = Doc.from_docs(chunk_docs)
took = time() - start
print(f"Loading the data took: {round(took, 4)} seconds")

In [None]:
# Preview the annotations of the first loaded text
first_id = next(iter(annotated_docs), None)
if first_id is None:
    # Nothing was loaded; avoid an IndexError on the empty dictionary
    print("No annotated documents were loaded.")
else:
    # Slicing a Doc selects tokens, hence "tokens" rather than "lines"
    print(f"Annotations of first 20 tokens of the text: {first_id}:\n")
    print("Token\tLemma\tPoS")
    for token in annotated_docs[first_id][:20]:
        print(f"{token.text}\t{token.lemma_}\t{token.pos_}")

## Metadaten einlesen

In [None]:
metadata_df = pd.read_csv("../metadata/metadata_corpus-german_language_fiction_1820-1900_50-per-decade.csv")
# Keep only the texts for which annotations were loaded. The .copy() makes the
# subset an independent frame, so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
metadata_df = metadata_df[metadata_df['ID'].isin(annotated_docs.keys())].copy()
# Convert the year column to datetime for easier downstream processing
metadata_df['year'] = pd.to_datetime(metadata_df['year'], format="%Y")

In [None]:
metadata_df_alt = pd.read_csv("../metadata/metadata_corpus-german_language_fiction_1820-1900_50-per-decade_ALT.csv")
# Keep only the texts for which annotations were loaded. The .copy() makes the
# subset an independent frame, so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
metadata_df_alt = metadata_df_alt[metadata_df_alt['ID'].isin(annotated_docs.keys())].copy()
# Convert the year column to datetime for easier downstream processing
metadata_df_alt['year'] = pd.to_datetime(metadata_df_alt['year'], format="%Y")

In [None]:
# Number of texts in sample 1 that have annotations
len(metadata_df)

In [None]:
# Number of texts in sample 2 that have annotations
len(metadata_df_alt)

In [None]:
# Show the first five metadata rows
metadata_df.head(n=5)

In [None]:
# The 20 most frequent values of the 'lastname' column
metadata_df['lastname'].value_counts().iloc[:20]

## Syntaktische N-Gramme extrahieren

In einem weiteren Schritt können wir die Adjektive extrahieren, die mit dem Nomen Luft in Verbindung stehen. Wir machen dabei Gebrauch von den Dependenzstrukturen, die sich durch das spaCy-eigene `Doc` einfach navigieren lassen. 

In [None]:
def extract_dependent_adjective_list(spacy_docs: Dict, metadata_df: pd.DataFrame,
                                     noun_input: Union[str, List[str]], top_n: int = 10) -> Tuple[pd.DataFrame, List[str]]:
    """
    Count the adjectives that depend on one or more target nouns, per document.

    A token counts as a target noun when its part of speech is 'NOUN' and its
    lower-cased lemma equals one of the lower-cased entries of noun_input.
    Every direct syntactic child of such a noun whose part of speech is 'ADJ'
    is counted, whatever its dependency label (it is not restricted to 'amod').

    Parameters:
    -----------
    spacy_docs : dict
        Dictionary with file_ids as keys and spaCy Doc objects as values
    metadata_df : pd.DataFrame
        DataFrame that provides at least the columns 'ID', 'title' and 'year'
    noun_input : str or list of str
        Single noun lemma (e.g., 'Liebe') or list of noun lemmata
        (e.g., ['Liebe', 'Leidenschaft']); matching is case-insensitive
    top_n : int
        Number of most frequent adjectives to extract (default: 10)

    Returns:
    --------
    tuple : (pd.DataFrame, list)
        - DataFrame with columns: filename, title, year, adjective, count, noun_count.
          It holds one row per top adjective for every document that has a
          metadata entry and contains at least one target noun (count may be 0).
          The columns are present even when no row was produced.
        - List of the top N adjectives (lower-cased lemmata). The ranking is
          computed over all documents in spacy_docs, including those without
          a metadata entry.
    """
    result_columns = ['filename', 'title', 'year', 'adjective', 'count', 'noun_count']

    # Convert single noun to list for uniform processing
    noun_list = [noun_input] if isinstance(noun_input, str) else noun_input
    # A set gives constant-time, case-insensitive membership tests
    target_lemmas = {noun.lower() for noun in noun_list}

    # Single pass over the corpus: collect the counts per document and sum
    # them up for the corpus-wide ranking at the same time.
    corpus_counts = Counter()
    per_document = {}
    for file_id, doc in spacy_docs.items():
        doc_counts = Counter()
        noun_count = 0
        for token in doc:
            if token.pos_ != 'NOUN' or token.lemma_.lower() not in target_lemmas:
                continue
            noun_count += 1
            for child in token.children:
                if child.pos_ == 'ADJ':
                    doc_counts[child.lemma_.lower()] += 1
        corpus_counts.update(doc_counts)
        per_document[file_id] = (noun_count, doc_counts)

    # Get top N most frequent adjectives
    top_adjectives = [adj for adj, _ in corpus_counts.most_common(top_n)]

    results = []
    for file_id, (noun_count, doc_counts) in per_document.items():
        # Documents without a metadata entry contribute to the ranking only
        meta_row = metadata_df[metadata_df['ID'] == file_id]
        if meta_row.empty:
            continue
        # An adjective can only be counted below a target noun, so a document
        # without target nouns has nothing to report.
        if noun_count == 0:
            continue

        title = meta_row['title'].values[0]
        year = meta_row['year'].values[0]
        # One row per top adjective, also when its count is 0 in this document
        for adjective in top_adjectives:
            results.append({
                'filename': file_id,
                'title': title,
                'year': year,
                'adjective': adjective,
                'count': doc_counts.get(adjective, 0),
                'noun_count': noun_count
            })

    # Passing the columns keeps the schema stable when there are no rows
    return pd.DataFrame(results, columns=result_columns), top_adjectives


def get_top_adjectives_list(adj_df: pd.DataFrame, top_n: int = 10) -> list:
    """
    Rank adjectives by their summed 'count' and return the leading ones.

    Parameters:
    -----------
    adj_df : pd.DataFrame
        DataFrame with the columns 'adjective' and 'count', e.g. the one
        returned by extract_dependent_adjective_list()
    top_n : int
        Maximum number of adjectives to return (default: 10)

    Returns:
    --------
    list
        The adjectives with the highest total count, in descending order
    """
    # Total count of every adjective across all rows
    per_adjective = adj_df.groupby('adjective')['count'].sum()
    # Highest totals first
    ranked = per_adjective.sort_values(ascending=False)
    return ranked.index[:top_n].tolist()


def plot_adjective_trends_list(adj_df: pd.DataFrame, top_adjectives: list,
                               noun_input: Union[str, List[str]]):
    """
    Create a scatter plot of adjective frequencies over time for a noun or noun list.

    Every marker stands for one row of adj_df, i.e. one adjective in one text.
    Its y value is the relative frequency count / noun_count * 100; rows whose
    noun_count is not greater than 0 are plotted at 0.

    Parameters:
    -----------
    adj_df : pd.DataFrame
        DataFrame with the columns 'adjective', 'count', 'noun_count', 'year'
        and 'title', e.g. the one returned by extract_dependent_adjective_list()
    top_adjectives : list
        List of adjectives to plot (e.g., from get_top_adjectives_list());
        one trace is added per entry, in this order
    noun_input : str or list of str
        The noun(s) being analyzed; only used for the plot title

    Returns:
    --------
    plotly.graph_objects.Figure
        The figure object (will display automatically in Jupyter)
    """
    # Filter for only the top adjectives; copy so adj_df itself stays untouched
    filtered_df = adj_df[adj_df['adjective'].isin(top_adjectives)].copy()

    # Relative frequency (per 100 noun occurrences), computed column-wise.
    # pandas does not raise on division by zero; those rows are set to 0 below.
    noun_counts = filtered_df['noun_count']
    per_hundred = (filtered_df['count'] / noun_counts) * 100
    filtered_df['rel_freq'] = per_hundred.where(noun_counts > 0, 0)

    # Create figure
    fig = go.Figure()

    # Show individual texts as scatter points, one trace per adjective
    for adj in top_adjectives:
        adj_data = filtered_df[filtered_df['adjective'] == adj]

        fig.add_trace(go.Scatter(
            x=adj_data['year'],
            y=adj_data['rel_freq'],
            mode='markers',
            name=adj,
            text=adj_data['title'],
            hovertemplate='<b>%{fullData.name}</b><br>' +
                         '<b>%{text}</b><br>' +
                         'Year: %{x}<br>' +
                         'Frequency: %{y:.2f} per 100 occurrences<br>' +
                         '<extra></extra>',
            marker=dict(size=8, opacity=0.7)
        ))

    # Create title based on input; long noun lists are abbreviated after three entries
    if isinstance(noun_input, str):
        title_noun = f'"{noun_input}"'
    else:
        noun_str = ', '.join(noun_input[:3])
        if len(noun_input) > 3:
            noun_str += f', ... ({len(noun_input)} total)'
        title_noun = f'[{noun_str}]'

    # Update layout
    fig.update_layout(
        title=f'Adjective Syntactic Dependents of {title_noun} Over Time',
        xaxis_title='Year',
        yaxis_title='Relative Frequency (per 100 noun occurrences)',
        hovermode='closest',
        height=600,
        legend=dict(
            title='Adjectives',
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99
        )
    )

    return fig


def plot_adjective_trends_moving_avg_plotly(
    adj_df: pd.DataFrame,
    top_adjectives: list,
    noun_input: Union[str, List[str]],
    window_years: int = 10,
    n_plot: int = 8,
    value_col: str = "rel_freq",
    show_points: bool = True,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Plot a centered moving average of yearly adjective frequencies with plotly.

    The values of value_col are averaged per year and adjective, smoothed with
    a centered window of window_years calendar years, and drawn as one line
    per adjective. The figure is shown via fig.show().

    Parameters:
    -----------
    adj_df : pd.DataFrame
        DataFrame with at least the columns 'year' and 'adjective', plus either
        value_col or both 'count' and 'noun_count'. If value_col is missing it
        is computed as (count / noun_count) * 100, with 0 where noun_count is 0.
        'year' may be a datetime column or numeric; rows whose year cannot be
        converted are dropped.
    top_adjectives : list
        Adjectives to keep; their order determines the order of the lines
    noun_input : str or list of str
        The noun(s) being analyzed; only used for the plot title
    window_years : int
        Width of the moving-average window in calendar years (default: 10)
    n_plot : int
        Maximum number of adjectives to draw (default: 8)
    value_col : str
        Name of the column that is averaged (default: 'rel_freq')
    show_points : bool
        Whether to draw markers on the lines (default: True)

    Returns:
    --------
    tuple : (pd.DataFrame, pd.DataFrame)
        - yearly: mean of value_col, indexed by year, one column per adjective
        - moving: the moving average of yearly, with the same index and columns

    Raises:
    -------
    ValueError
        If 'year' is missing, if value_col can neither be found nor computed,
        or if no rows remain after filtering by top_adjectives.
    """

    df = adj_df.copy()

    # --- Ensure year is integer year (avoid datetime nanoseconds weirdness) ---
    if "year" not in df.columns:
        raise ValueError("adj_df must contain a 'year' column.")

    if pd.api.types.is_datetime64_any_dtype(df["year"]):
        df["year"] = df["year"].dt.year
    else:
        df["year"] = pd.to_numeric(df["year"], errors="coerce")

    df = df.dropna(subset=["year"])
    df["year"] = df["year"].astype(int)

    # --- Compute relative frequency if needed ---
    if value_col not in df.columns:
        if not {"count", "noun_count"}.issubset(df.columns):
            raise ValueError(
                f"adj_df must contain '{value_col}' or both 'count' and 'noun_count'."
            )
        # Column-wise computation; pandas does not raise on division by zero,
        # and rows with noun_count == 0 are set to 0 afterwards.
        per_hundred = (df["count"] / df["noun_count"]) * 100
        df[value_col] = per_hundred.where(df["noun_count"] != 0, 0)

    # --- Filter adjectives ---
    df = df[df["adjective"].isin(top_adjectives)].copy()
    if df.empty:
        raise ValueError("After filtering by top_adjectives, no rows remain.")

    # --- Yearly aggregate (mean across texts) ---
    yearly = (
        df.groupby(["year", "adjective"])[value_col]
          .mean()
          .unstack("adjective")
          .sort_index()
    )

    # --- Moving average on yearly aggregates ---
    # rolling() counts rows, not years. Years without any text are therefore
    # inserted as empty rows first, so that the window really spans
    # window_years calendar years; afterwards only the observed years are kept.
    all_years = range(int(yearly.index.min()), int(yearly.index.max()) + 1)
    moving = (
        yearly.reindex(all_years)
              .rolling(window=window_years, center=True, min_periods=1)
              .mean()
              .reindex(yearly.index)
    )

    # --- Limit to n_plot adjectives, keeping the order of top_adjectives ---
    # Every column stems from a row that passed the top_adjectives filter,
    # so no fallback to other columns is needed.
    cols_to_plot = [a for a in top_adjectives if a in moving.columns][:n_plot]
    moving_plot = moving[cols_to_plot].copy()

    # --- Build title; long noun lists are abbreviated after three entries ---
    if isinstance(noun_input, str):
        title_noun = f'"{noun_input}"'
    else:
        noun_str = ", ".join(noun_input[:3])
        if len(noun_input) > 3:
            noun_str += f", ... ({len(noun_input)} total)"
        title_noun = f"[{noun_str}]"

    # --- Long format for plotly ---
    moving_long = (
        moving_plot.reset_index()
                  .melt(id_vars="year", var_name="adjective", value_name=value_col)
    )

    # --- Plotly line chart ---
    fig = px.line(
        moving_long,
        x="year",
        y=value_col,
        color="adjective",
        markers=show_points,
        title=f"Top {len(cols_to_plot)} Adjectives – {window_years}-Year Moving Average – {title_noun}",
        labels={
            "year": "Year",
            value_col: f"{value_col} (moving avg, window={window_years}y)",
            "adjective": "Adjective",
        },
    )

    # Make x axis show integer years cleanly (no scientific notation)
    fig.update_xaxes(
        tickmode="linear",
        dtick=10,          # change to 5/1 if you want denser ticks
        tickformat="d",
    )

    fig.update_layout(
        width=1000,
        height=500,
        legend_title_text="",
        hovermode="x unified",
        margin=dict(l=40, r=40, t=70, b=40),
    )

    fig.show()
    return yearly, moving

In [None]:
# Target noun lemma whose dependent adjectives are extracted and plotted below
noun = "Luft"

In [None]:
# Sample 1: per-text adjective counts and the ten most frequent adjectives
adj_df, top_adjs = extract_dependent_adjective_list(
    spacy_docs=annotated_docs,
    metadata_df=metadata_df,
    noun_input=noun,
    top_n=10,
)


In [None]:
# Sample 2: same extraction, restricted to the texts listed in the alternative metadata
adj_df_alt, top_adjs_alt = extract_dependent_adjective_list(
    spacy_docs=annotated_docs,
    metadata_df=metadata_df_alt,
    noun_input=noun,
    top_n=10,
)

## Analyse und Visualisierung

In [None]:
# Sample 1: scatter plot with one marker per text and adjective
plot_adjective_trends_list(adj_df=adj_df, top_adjectives=top_adjs, noun_input=noun)

In [39]:
# Sample 1: yearly means and their 10-year moving average as line plot
yearly_1, moving_1 = plot_adjective_trends_moving_avg_plotly(
    adj_df=adj_df,
    top_adjectives=top_adjs,
    noun_input=noun,
    window_years=10,
    n_plot=8,
)

In [None]:
# Sample 2: scatter plot with one marker per text and adjective
plot_adjective_trends_list(adj_df=adj_df_alt, top_adjectives=top_adjs_alt, noun_input=noun)

In [40]:
# Sample 2: yearly means and their 10-year moving average as line plot
yearly_2, moving_2 = plot_adjective_trends_moving_avg_plotly(
    adj_df=adj_df_alt,
    top_adjectives=top_adjs_alt,
    noun_input=noun,
    window_years=10,
    n_plot=8,
)

## Add plots for specific words that we found

TO BE CHANGED / UPDATED / CONTINUED