In [18]:
pip install nltk librosa pandas

Note: you may need to restart the kernel to use updated packages.


In [152]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\vinbo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
import pandas as pd
from pathlib import Path

# Path to the segmented transcript
segments_path = Path("../data/processed/commentary_segments.txt")

segments = []
with open(segments_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            # Example format: [start -> end] segment_text
            parts = line.split("] ")
            time_range = parts[0][1:]  # remove starting '['
            start, end = time_range.replace("s","").split(" -> ")
            text = parts[1]
            segments.append({"start": float(start), "end": float(end), "text": text})

df = pd.DataFrame(segments)

In [154]:
df.head(5)

Unnamed: 0,start,end,text
0,0.0,6.28,How do the Norwich players feel at this point...
1,6.28,10.72,Absolutely loving it because the fans will be...
2,10.72,15.08,"They all want first touches, give it to a yel..."
3,15.08,18.16,Jamal Lewis up against Bernardo Silva looks...\n
4,18.16,19.88,"Wow, that looks a problem, that one does.\n"


In [None]:
import librosa
import numpy as np

# Path to audio
audio_file = Path("../data/raw/commentary.wav")

# Load audio
y, sr = librosa.load(audio_file, sr=None)

# Short-time energy (simple excitement metric)
frame_length = 2048
hop_length = 512
energy = np.array([sum(abs(y[i:i+frame_length]**2)) for i in range(0, len(y), hop_length)])

# Normalise 0–1
energy = (energy - energy.min()) / (energy.max() - energy.min())

# Map segment start/end to energy
def get_segment_energy(start, end):
    start_frame = int(start * sr / hop_length)
    end_frame = int(end * sr / hop_length)
    return energy[start_frame:end_frame].mean()

df["audio_score"] = df.apply(lambda row: get_segment_energy(row["start"], row["end"]), axis=1)

In [219]:
df.sort_values(by='audio_score', ascending=False).head(5)

Unnamed: 0,start,end,text,audio_score,vader_score,keyword_score,final_score
1675,5883.16,5885.16,Norwich have done it!\n,0.496202,0.0,0.0,2.97721
363,1683.0,1691.56,It's 2-0! Norwich City have had two chances i...,0.469073,0.2714,1.0,6.085836
1676,5885.16,5887.16,That is a wonderful Carrow Road night\n,0.423164,0.5719,0.0,3.110882
248,1083.0,1091.32,"side delivers, McLean's head on, and he's in,...",0.418614,0.5267,0.0,3.038385
1674,5881.16,5883.16,Norwich putting him under pressure\n,0.377807,-0.296,0.0,1.970841


In [220]:
df["vader_score"] = df["text"].apply(lambda x: analyser.polarity_scores(x)["compound"])

In [221]:
def get_keyword_score(text):
    text_lower = text.lower()
    
    #Common key phrases when a team scores
    key_phrases = [
    "slots home",
    "slots it home",
    "tucks it away",
    "buries it",
    "beats the keeper",
    "finds the goal",
    "finds the net",
    "back of the net",
    "into the net",
    "in the net", 
    "got one back",
    "gets one back",
    "pulled one back",
    "pulls one back",
    "in the lead",
    "into the lead",
    "it's in"
    "breaks the deadlock",
    "opens the scoring",
    "doubles the lead",
    "extends the lead",
    "levels it",
    "levels the score",
    "equalizer",
    "equaliser",
    "draws level",
    "all square",
    "restored parity",
    "takes the lead",
    "regains the lead",
    "puts them ahead",
    "puts them in front",
    "from the spot",
    "converts the penalty",
    "smashes it home",
    "thunderbolt",
    "what a strike",
    "wonder goal",
    "spectacular finish",
    "clinical finish",
    "composed finish",
    "curls it in",
    "heads home",
    "heads it in",
    "nodded home",
    "taps in",
    "tap in",
    "empty net",
    "deflected in",
    "own goal",
    "puts the game to bed",
    "seals the win",
    "completes his hat-trick",
    "hat trick",
    "brace"
]
    
    # Scoreline patterns (e.g., "it's 1-0", "it's one-nil", "makes it 1-0")
    scoreline_patterns = [
        r"it's\s+\d+[-–]\d+",  # "it's 1-0"
        r"it's\s+\w+[-–]\w+",  # "it's one-nil"
        r"makes?\s+it\s+\d+[-–]\d+",  # "makes it 1-0"
        r"now\s+\d+[-–]\d+",  # "now 1-0"
        r"\d+[-–]\d+\s+now"  # "1-0 now"
    ]
    
    import re
    for pattern in scoreline_patterns:
        if re.search(pattern, text_lower):
            return 1.0
    
    # Check for key phrases
    for phrase in key_phrases:
        if phrase in text_lower:
            return 1.0
    
    return 0

df["keyword_score"] = df["text"].apply(get_keyword_score)

In [222]:
df.sort_values(by='keyword_score', ascending=False).head(10)

Unnamed: 0,start,end,text,audio_score,vader_score,keyword_score,final_score
583,2745.48,2750.28,being a dangerous lead but you know 2-0 at an...,0.039181,0.5927,1.0,3.827788
584,2750.28,2756.92,to be in it's now 2-1 and the game's back in ...,0.031053,-0.3252,1.0,2.861115
623,2940.28,2944.68,Second half has started Manchester City chasi...,0.042501,0.0,1.0,3.255007
69,248.44,252.36,"he's been closed down by Buendia, he's 25 yar...",0.068053,0.0,1.0,3.408316
425,1974.76,1978.52,Kenny McLean's already dumped header into the...,0.106633,0.0,1.0,3.639798
581,2736.76,2741.16,Tim Krull and into the net and that will be f...,0.021703,-0.4404,1.0,2.68982
342,1580.2,1583.96,Manchester City searching for a Carreroad equ...,0.037962,0.0,1.0,3.227772
370,1726.04,1731.96,simplest of tasks to pass the ball in the bac...,0.174113,0.3818,1.0,4.42648
571,2682.04,2687.56,and they have got one back Manchester City 30...,0.120786,0.0,1.0,3.724718
363,1683.0,1691.56,It's 2-0! Norwich City have had two chances i...,0.469073,0.2714,1.0,6.085836


In [227]:
alpha = 0.1  # weight for sentiment
beta = 0.65  # weight for audio
gamma = 0.25 # weight for keywords

df["final_score"] = alpha * (df["vader_score"] * 10) + beta * (df["audio_score"] * 10) + gamma * (df['keyword_score'] * 10)

In [224]:
df.sort_values(by='final_score', ascending=False).head(5)

Unnamed: 0,start,end,text,audio_score,vader_score,keyword_score,final_score
363,1683.0,1691.56,It's 2-0! Norwich City have had two chances i...,0.469073,0.2714,1.0,6.085836
370,1726.04,1731.96,simplest of tasks to pass the ball in the bac...,0.174113,0.3818,1.0,4.42648
583,2745.48,2750.28,being a dangerous lead but you know 2-0 at an...,0.039181,0.5927,1.0,3.827788
1578,5681.16,5683.16,In search of an equaliser\n,0.131489,0.0,1.0,3.788935
571,2682.04,2687.56,and they have got one back Manchester City 30...,0.120786,0.0,1.0,3.724718


In [None]:
# Select top segments
top_segments = df.sort_values("final_score", ascending=False).head(17).copy()

# Get match duration (maximum end time in the dataframe)
match_duration = df["end"].max()

# Extend each segment by 10 seconds before and after 
buffer_time = 10  

expanded_segments = top_segments.copy()
expanded_segments["start"] = expanded_segments["start"].apply(lambda x: max(0, x - buffer_time))
expanded_segments["end"] = expanded_segments["end"].apply(lambda x: min(x + buffer_time, match_duration))

# Sort by start time
expanded_segments = expanded_segments.sort_values("start").reset_index(drop=True)

output_csv = Path("../data/features/highlight_segments_with_context.csv")

# Save results
expanded_segments.to_csv(output_csv, index=False)
expanded_segments

Unnamed: 0,start,end,text,audio_score,vader_score,keyword_score,final_score
0,238.44,262.36,"he's been closed down by Buendia, he's 25 yar...",0.068053,0.0,1.0,2.942342
1,1073.0,1101.32,"side delivers, McLean's head on, and he's in,...",0.418614,0.5267,0.0,3.247692
2,1673.0,1701.56,It's 2-0! Norwich City have had two chances i...,0.469073,0.2714,1.0,5.820372
3,1716.04,1741.96,simplest of tasks to pass the ball in the bac...,0.174113,0.3818,1.0,4.013537
4,1964.76,1988.52,Kenny McLean's already dumped header into the...,0.106633,0.0,1.0,3.193114
5,2672.04,2697.56,and they have got one back Manchester City 30...,0.120786,0.0,1.0,3.285111
6,2735.48,2760.28,being a dangerous lead but you know 2-0 at an...,0.039181,0.5927,1.0,3.347379
7,2930.28,2954.68,Second half has started Manchester City chasi...,0.042501,0.0,1.0,2.776258
8,3176.36,3202.04,in the penalty area plays it to Pukian this i...,0.342157,0.6369,0.0,2.860923
9,3356.28,3381.0,Amadou really giving the ball away to Bernard...,0.085742,0.1513,1.0,3.208624


In [None]:
from moviepy import VideoFileClip, concatenate_videoclips

video_path = Path("../data/raw/NOR.mp4")
output_path = Path("../outputs/highlight_reel.mp4")

# Merge overlapping segments
merged_segments = []
for idx, row in expanded_segments.iterrows():
    start = row["start"]
    end = row["end"]
    if not merged_segments:
        merged_segments.append({"start": start, "end": end})
    else:
        last = merged_segments[-1]
        if start <= last["end"]:  # overlap or consecutive
            last["end"] = max(last["end"], end)
        else:
            merged_segments.append({"start": start, "end": end})

# Load original video
video = VideoFileClip(str(video_path))

# Extract clips
clips = []
for seg in merged_segments:
    start = seg["start"]
    end = seg["end"]
    if end - start > 0.5:  # skip very short segments
        clips.append(video.subclipped(start, end))

# Concatenate clips
highlight_reel = concatenate_videoclips(clips)

# Export final video
highlight_reel.write_videofile(str(output_path), codec="libx264", audio_codec="aac")

# Close the video
video.close()
highlight_reel.close()

MoviePy - Building video highlight_reel.mp4.
MoviePy - Writing audio in highlight_reelTEMP_MPY_wvf_snd.mp4


                                                                      

MoviePy - Done.
MoviePy - Writing video highlight_reel.mp4



                                                                           

MoviePy - Done !
MoviePy - video ready highlight_reel.mp4
