In [1]:
import os
import json
import spacy
from collections import Counter
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import pandas as pd
import matplotlib.patches as mpatches
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
nlp = spacy.load("en_core_web_sm")
ALIGNED_DIR = "/workspaces/Gesture-Language-Alignment-in-Instructional-Videos/00_videos/aligned_transcripts"

# Define grounded categories
emphasis_words = {"really", "always", "definitely", "super", "so", "important", "very"}
filler_words = {"uh", "um", "like", "you know", "i mean", "so", "well"}
content_pos = {"NOUN", "VERB", "ADJ", "ADV"}
negation_words = {"not", "no", "never", "n't"}
first_person_pronouns = {"i", "we"}
third_person_pronouns = {"he", "she", "it", "they"}

for filename in os.listdir(ALIGNED_DIR):
    if filename.endswith("_aligned.json"):
        path = os.path.join(ALIGNED_DIR, filename)
        with open(path, "r") as f:
            data = json.load(f)

        for segment in data:
            text = segment["text"]
            doc = nlp(text)

            imperative_type = "none"
            for token in doc:
                if token.pos_ == "VERB" and token.dep_ == "ROOT" and not any(child.dep_ == "nsubj" for child in token.children):
                    imperative_type = "procedural" if any(child.dep_ in {"dobj", "obj"} for child in token.children) else "directive"
                    break

            noun_count = sum(1 for t in doc if t.pos_ == "NOUN")
            pron_count = sum(1 for t in doc if t.pos_ == "PRON")
            noun_pron_ratio = noun_count / pron_count if pron_count > 0 else noun_count

            sub_clauses = sum(1 for t in doc if t.dep_ in {"advcl", "ccomp", "mark"})
            emphasis_used = any(t.lemma_.lower() in emphasis_words for t in doc)
            filler_count = sum(1 for t in doc if t.text.lower() in filler_words)

            words = [t for t in doc if t.is_alpha]
            content_words = [t for t in words if t.pos_ in content_pos]
            content_word_ratio = len(content_words) / len(words) if words else 0

            sentence_mood = "declarative"
            if any(t.tag_ == "MD" for t in doc) and text.strip().endswith("?"):
                sentence_mood = "interrogative"
            elif imperative_type != "none":
                sentence_mood = "imperative"

            has_negation = int(any(t.text.lower() in negation_words for t in doc))

            pronoun_type = "none"
            if any(t.text.lower() in first_person_pronouns for t in doc):
                pronoun_type = "first"
            elif any(t.text.lower() in third_person_pronouns for t in doc):
                pronoun_type = "third"

            dep_depths = [abs(t.head.i - t.i) for t in doc if t != t.head]
            avg_token_depth = sum(dep_depths) / len(dep_depths) if dep_depths else 0

            clause_count = sum(1 for t in doc if t.dep_ in {"ROOT", "ccomp", "advcl", "relcl"})

            segment.update({
                "imperative_type": imperative_type,
                "segment_length": segment["end_time"] - segment["start_time"],
                "noun_to_pronoun_ratio": noun_pron_ratio,
                "subordinate_clauses": sub_clauses,
                "uses_emphasis_word": int(emphasis_used),
                "filler_word_count": filler_count,
                "content_word_ratio": content_word_ratio,
                "sentence_mood": sentence_mood,
                "has_negation": has_negation,
                "pronoun_type": pronoun_type,
                "avg_token_depth": avg_token_depth,
                "clause_count": clause_count
            })

        with open(path, "w") as f:
            json.dump(data, f, indent=2)

        print(f"✅ Fully enriched: {filename}")

✅ Fully enriched: EasyBlueberryMuffinsRecipe_aligned.json
✅ Fully enriched: PerfectWeeknightShrimpFriedRice_aligned.json
✅ Fully enriched: PeruvianChickenRecipe_aligned.json
✅ Fully enriched: InasFavoriteThings_aligned.json


In [3]:
ALIGNED_DIR = "/workspaces/Gesture-Language-Alignment-in-Instructional-Videos/00_videos/aligned_transcripts"

def load_all_features():
    rows = []

    for filename in os.listdir(ALIGNED_DIR):
        if filename.endswith("_aligned.json"):
            video_name = filename.replace("_aligned.json", "")
            path = os.path.join(ALIGNED_DIR, filename)

            with open(path, "r") as f:
                segments = json.load(f)

            for seg in segments:
                row = {
                    "video_name": video_name,
                    "text": seg["text"],
                    "gesture_avg": seg.get("gesture_avg", 0),
                    "segment_length": seg.get("segment_length", 0),
                    "imperative_type": seg.get("imperative_type", "none"),
                    "noun_to_pronoun_ratio": seg.get("noun_to_pronoun_ratio", 0),
                    "subordinate_clauses": seg.get("subordinate_clauses", 0),
                    "uses_emphasis_word": seg.get("uses_emphasis_word", 0),
                    "filler_word_count": seg.get("filler_word_count", 0),
                    "content_word_ratio": seg.get("content_word_ratio", 0),
                    "sentence_mood": seg.get("sentence_mood", "declarative"),
                    "has_negation": seg.get("has_negation", 0),
                    "pronoun_type": seg.get("pronoun_type", "none"),
                    "avg_token_depth": seg.get("avg_token_depth", 0),
                    "clause_count": seg.get("clause_count", 0),
                }
                rows.append(row)

    return pd.DataFrame(rows)

df = load_all_features()
df.head()

Unnamed: 0,video_name,text,gesture_avg,segment_length,imperative_type,noun_to_pronoun_ratio,subordinate_clauses,uses_emphasis_word,filler_word_count,content_word_ratio,sentence_mood,has_negation,pronoun_type,avg_token_depth,clause_count
0,EasyBlueberryMuffinsRecipe,"Hey guys, I'm Dina from SimplyHomeCook.com and...",0.029776,7.1,none,2.0,0,0,0,0.421053,declarative,0,first,2.954545,2
1,EasyBlueberryMuffinsRecipe,They're loaded with a ton of blueberries and t...,0.109254,5.92,directive,2.0,0,0,0,0.529412,imperative,0,third,3.105263,1
2,EasyBlueberryMuffinsRecipe,So you want to start off by combining all of y...,0.042972,2.84,none,0.333333,0,1,1,0.461538,declarative,0,none,2.230769,1
3,EasyBlueberryMuffinsRecipe,So in a large bowl combine two cups of all-pur...,0.060303,5.32,procedural,6.0,0,1,1,0.588235,imperative,0,none,3.105263,1
4,EasyBlueberryMuffinsRecipe,"half a teaspoon of baking soda, and a quarter ...",0.05128,3.88,none,5.0,0,0,0,0.5,declarative,0,none,3.0,1


In [9]:
df_encoded = pd.get_dummies(df, columns=["imperative_type", "sentence_mood", "pronoun_type"], drop_first=False)

In [11]:
formula = (
    "gesture_avg ~ segment_length + noun_to_pronoun_ratio + subordinate_clauses + "
    "uses_emphasis_word + filler_word_count + content_word_ratio + "
    "has_negation + avg_token_depth + clause_count + "
    "imperative_type_directive + imperative_type_procedural + "
    "sentence_mood_imperative + "
    "pronoun_type_first + pronoun_type_third + "
    "C(video_name)"
)

model = smf.ols(formula=formula, data=df_encoded).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:            gesture_avg   R-squared:                       0.223
Model:                            OLS   Adj. R-squared:                  0.194
Method:                 Least Squares   F-statistic:                     7.613
Date:                Thu, 17 Apr 2025   Prob (F-statistic):           7.50e-16
Time:                        23:11:23   Log-Likelihood:                 1004.1
No. Observations:                 442   AIC:                            -1974.
Df Residuals:                     425   BIC:                            -1905.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                                       coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------