In [18]:
pip install nltk librosa pandas

Note: you may need to restart the kernel to use updated packages.


In [19]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\vinbo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [20]:
import pandas as pd

segments = []
with open("commentary_segments.txt", "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            # Example format: [start -> end] segment_text
            parts = line.split("] ")
            time_range = parts[0][1:]  # remove starting '['
            start, end = time_range.replace("s","").split(" -> ")
            text = parts[1]
            segments.append({"start": float(start), "end": float(end), "text": text})

df = pd.DataFrame(segments)


In [21]:
df.head(5)

Unnamed: 0,start,end,text
0,0.0,6.28,How do the Norwich players feel at this point...
1,6.28,10.72,Absolutely loving it because the fans will be...
2,10.72,15.08,"They all want first touches, give it to a yel..."
3,15.08,18.16,Jamal Lewis up against Bernardo Silva looks...\n
4,18.16,19.88,"Wow, that looks a problem, that one does.\n"


In [22]:
import librosa
import numpy as np

audio_file = "commentary.wav"
y, sr = librosa.load(audio_file, sr=None)

# Short-time energy (simple excitement metric)
frame_length = 2048
hop_length = 512
energy = np.array([sum(abs(y[i:i+frame_length]**2)) for i in range(0, len(y), hop_length)])

# Normalize to 0–1
energy = (energy - energy.min()) / (energy.max() - energy.min())

# Map segment start/end to energy
def get_segment_energy(start, end):
    start_frame = int(start * sr / hop_length)
    end_frame = int(end * sr / hop_length)
    return energy[start_frame:end_frame].mean()

df["audio_score"] = df.apply(lambda row: get_segment_energy(row["start"], row["end"]), axis=1)

In [23]:
df.sort_values(by='audio_score', ascending=False).head(5)

Unnamed: 0,start,end,text,audio_score
1675,5883.16,5885.16,Norwich have done it!\n,0.496202
363,1683.0,1691.56,It's 2-0! Norwich City have had two chances i...,0.469073
1676,5885.16,5887.16,That is a wonderful Carrow Road night\n,0.423164
248,1083.0,1091.32,"side delivers, McLean's head on, and he's in,...",0.418614
1674,5881.16,5883.16,Norwich putting him under pressure\n,0.377807


In [24]:
df["vader_score"] = df["text"].apply(lambda x: analyser.polarity_scores(x)["compound"])

In [25]:
df.sort_values(by='audio_score', ascending=False).head(5)

Unnamed: 0,start,end,text,audio_score,vader_score
1675,5883.16,5885.16,Norwich have done it!\n,0.496202,0.0
363,1683.0,1691.56,It's 2-0! Norwich City have had two chances i...,0.469073,0.2714
1676,5885.16,5887.16,That is a wonderful Carrow Road night\n,0.423164,0.5719
248,1083.0,1091.32,"side delivers, McLean's head on, and he's in,...",0.418614,0.5267
1674,5881.16,5883.16,Norwich putting him under pressure\n,0.377807,-0.296


In [26]:
alpha = 0.5  # weight for sentiment
beta = 0.5   # weight for audio

df["final_score"] = alpha * (df["vader_score"] * 10) + beta * (df["audio_score"] * 10) 

In [27]:
df.sort_values(by='final_score', ascending=False).head(5)

Unnamed: 0,start,end,text,audio_score,vader_score,final_score
257,1137.0,1142.84,and they get them in that box from that corne...,0.181506,0.8481,5.148031
698,3311.96,3319.0,how good are these set-up backs keep raving a...,0.161819,0.8658,5.138096
252,1109.88,1116.28,"What a goal, great delivery from the corner, ...",0.168945,0.8481,5.085225
367,1711.8,1717.0,"that's brilliant from Norwich City, what a gr...",0.15936,0.8481,5.037298
1676,5885.16,5887.16,That is a wonderful Carrow Road night\n,0.423164,0.5719,4.975318


In [28]:
# --- Select top 15 segments ---
top_segments = df.sort_values("final_score", ascending=False).head(15).copy()

# --- Include previous and next segments ---
expanded_indices = set()
for idx in top_segments.index:
    if idx - 1 in df.index: expanded_indices.add(idx - 1)
    expanded_indices.add(idx)
    if idx + 1 in df.index: expanded_indices.add(idx + 1)

expanded_segments = df.loc[sorted(expanded_indices)].drop_duplicates(subset=["start","end"]).sort_values("start").reset_index(drop=True)

# --- Save results ---
expanded_segments.to_csv("highlight_segments_with_context.csv", index=False)
expanded_segments

Unnamed: 0,start,end,text,audio_score,vader_score,final_score
0,316.76,320.44,"left-hand side, Byron should make the challen...",0.065584,0.0772,0.713921
1,320.44,324.76,"Byron does make the challenge, and wins a fre...",0.09002,0.8555,4.727599
2,324.76,329.48,"applause for the former West Ham player, and ...",0.057842,-0.3612,-1.516789
3,520.28,524.2,"the end of that one, but Norwich I don't thin...",0.109971,-0.6956,-2.928146
4,524.2,530.44,tackle. Yeah well Steeperman went in hard on ...,0.054855,0.9001,4.774777
5,530.44,534.84,"kick to Manchester City, but after the decisi...",0.048958,0.3919,2.204291
6,974.44,978.28,"deflections, and eventually falls kindly for ...",0.042176,0.4939,2.680378
7,978.28,983.16,"the play out towards the right hand side, and...",0.06779,0.8777,4.727451
8,983.16,987.8,that ball in play going back towards his own ...,0.058793,0.6486,3.536967
9,1078.04,1083.0,side but driving at them like they just did t...,0.099751,0.7096,4.046755


In [33]:
from moviepy import VideoFileClip, concatenate_videoclips

buffer_sec = 2  # seconds to add before and after each segment

# Step 1: Merge overlapping segments with buffer
merged_segments = []
for idx, row in expanded_segments.iterrows():
    start = max(0, row["start"] - buffer_sec)  # don't go below 0
    end = row["end"] + buffer_sec
    if not merged_segments:
        merged_segments.append({"start": start, "end": end})
    else:
        last = merged_segments[-1]
        if start <= last["end"]:  # overlap or consecutive
            last["end"] = max(last["end"], end)
        else:
            merged_segments.append({"start": start, "end": end})

# Step 2: Load original video
video = VideoFileClip("NOR.mp4")

# Step 3: Extract clips
clips = []
for seg in merged_segments:
    start = seg["start"]
    end = seg["end"]
    if end - start > 0.5:  # skip very short segments
        clips.append(video.subclip(start, end))

# Step 4: Concatenate clips
highlight_reel = concatenate_videoclips(clips)

# Step 5: Export final video
highlight_reel.write_videofile("highlight_reel_with_buffer.mp4", codec="libx264", audio_codec="aac")


AttributeError: 'VideoFileClip' object has no attribute 'subclip'