In [24]:
import cv2
import numpy as np
import time
from skimage.feature import hog
import librosa
import scipy

def extract_frames_from_video(video_path, take_every_n_frame = 50):
    # Открываем видеофайл
    cap = cv2.VideoCapture(video_path)
    
    # Проверяем, удалось ли открыть видеофайл
    if not cap.isOpened():
        print(f"Ошибка: не удалось открыть видеофайл {video_path}")
        return []

    frames = []
    c = 0
    
    while cap.isOpened():
        # Читаем кадр за кадром
        ret, frame = cap.read()
        c+=1
        
        if not ret:
            break
        
        # Добавляем кадр в список
        if c%take_every_n_frame == 0: frames.append(frame)
    
    # Освобождаем захват видео
    cap.release()
    
    return np.array(frames)

def get_the_most_complicated_frames(video_path, take_every_n_frame = 50, border_of_std = 0.1):
    frames = extract_frames_from_video(video_path, take_every_n_frame)
    print(f"Извлечено {len(frames)} кадров из видео")

    average_hogs = []
    for i in range(len(frames)):
        # Загрузка изображения
        image = cv2.cvtColor(frames[i],cv2.COLOR_BGR2GRAY)
        
        #Вычисление HOG
        fd = hog(image, pixels_per_cell=(16, 16), cells_per_block=(4, 4))
        # Вычисление среднего значения HOG дескрипторов
        average_hog_descriptor = np.mean(fd) #выше коэффициент - больше деталей и обьектов в изображении
        average_hogs.append(average_hog_descriptor)
        
    entropies = []
    for input_image in frames:
        # Загрузка изображения
        image = cv2.cvtColor(input_image,cv2.COLOR_RGB2GRAY)
       
        hist = cv2.calcHist([image], [0], None, [256], [0, 256])
        # Нормализация гистограммы
        hist = hist / np.sum(hist)
        
        # Вычисление меры энтропии
        entropy = -np.sum(hist * np.log2(hist + 1e-10))
        entropies.append(entropy)

    entropies_ = entropies-np.mean(entropies) 

    more_detailed_than_usual_frame_indexes = np.where((entropies_ > np.mean(entropies_)+border_of_std*np.std(entropies_)) & 
                                                      (average_hogs > np.mean(average_hogs)+border_of_std*np.std(average_hogs)))[0]
    less_detailed_than_usual_frame_indexes = np.where((entropies_ < np.mean(entropies_)-border_of_std*np.std(entropies_)) & 
                                                      (average_hogs < np.mean(average_hogs)-border_of_std*np.std(average_hogs)))[0]
    
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    mspf = 1000/fps #milliseconds per frame

    coef = []

    for ind in list(more_detailed_than_usual_frame_indexes):
        coef.append([ ( round((abs(int(ind)-1)*mspf)), round((int(ind)+1)*mspf) ), abs(float(entropies_[ind]+ average_hogs[ind])) ])

    for ind in list(less_detailed_than_usual_frame_indexes):
        coef.append([ ( round((abs(int(ind)-1)*mspf)), round((int(ind)+1)*mspf) ), -1*abs(float(entropies_[ind]+ average_hogs[ind])) ])
    
    coef.sort(key=lambda x: x[0][0])
    
    return coef

def audio_analysis(video_path, seconds_between_grouped_peaks = 1) -> list[tuple[tuple[int, int], float]]:
    audio, sr = librosa.load(video_path)
    rms = librosa.feature.rms(y=audio)[0]

    onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
    peak_indices = np.where((onset_env > np.mean(onset_env)) & (rms > np.mean(rms)))[0]

    times = librosa.times_like(onset_env, sr=sr)
    t = (seconds_between_grouped_peaks * sr * len(times)) / np.shape(audio)[0]
    clusters = scipy.cluster.hierarchy.fcluster(
        scipy.cluster.hierarchy.linkage(peak_indices[:, None], method='single'),
        t=t, criterion='distance')

    segment_energies = []
    for cluster_id in set(clusters):
        cluster_indices = peak_indices[clusters == cluster_id]
        segment_energy = np.sum(rms[cluster_indices])
        segment_energies.append(((round(times[cluster_indices[0]] * 1000),round(times[cluster_indices[-1]] * 1000)), segment_energy))

    segment_energies.sort(key=lambda x: x[0][0])

    return segment_energies

In [25]:
start_time = time.time()

video_path = '/Users/polinanazarova/Desktop/1c6bc481dd52a9938e78e755f1e5c90e.mp4'
compl_coef = get_the_most_complicated_frames(video_path)
audio_peaks = audio_analysis(video_path)

end_time = time.time()
execution_time = end_time - start_time
print(f"Время выполнения: {execution_time} секунд")

Извлечено 392 кадров из видео


  audio, sr = librosa.load(video_path)


Время выполнения: 27.673201084136963 секунд


In [5]:
compl_coef

[[(200, 267), -0.054767640844155675],
 [(233, 300), -0.0861666460581327],
 [(267, 333), -0.08184773711929304],
 [(300, 367), -0.07186339619569454],
 [(333, 400), -0.11076927859413419],
 [(367, 433), -0.45215245785121383],
 [(400, 467), -0.4694655205168657],
 [(433, 500), -0.7018261507652602],
 [(467, 533), -0.5272515793637934],
 [(500, 567), -0.2515403552424841],
 [(533, 600), -0.4532144142827558],
 [(567, 633), -0.9302975347251137],
 [(600, 667), -0.3110997779988947],
 [(633, 700), -1.001092397446911],
 [(667, 733), -0.7386433938179892],
 [(700, 767), -0.27906548215645044],
 [(733, 800), -0.07852016320381108],
 [(767, 833), -0.33106333436665303],
 [(800, 867), -0.28892091541455556],
 [(833, 900), -0.36152662710334116],
 [(867, 933), -0.11789254202200783],
 [(900, 967), 0.36495564173337636],
 [(933, 1000), 0.6177161024644745],
 [(967, 1033), 0.8068547646900242],
 [(1000, 1067), -0.23074843758205843],
 [(1033, 1100), -0.2320611152063437],
 [(1200, 1267), 0.31818822164562505],
 [(1233, 1

In [8]:
audio_peaks[0][0][0]

484352

In [28]:
overal = []

for period in audio_peaks:
    start = period[0][0]
    end = period[0][1]

    if end-start>100:
        for frame_period in compl_coef:
            fr_start = period[0][0]
            fr_end = period[0][1]
            
            if start <= fr_start and end >= fr_end:
                if fr_start - start > 0: 
                    overal.append([(start, fr_start-1), period[1]])
                    
                overal.append([(fr_start, fr_end), period[1]+frame_period[1]])
    
                if end - fr_end > 0: 
                    start = fr_end+1
                else:
                    start = end
                    break
                    
            elif start <= fr_start and end <= fr_end:
                if fr_start - start > 0: 
                    overal.append([(start, fr_start-1), period[1]])
                
                overal.append([(fr_start, end), period[1]+frame_period[1]])

                start = end
                break
                
            elif start >= fr_start and end >= fr_end:
                overal.append([(start, fr_end), period[1]+frame_period[1]])
            
                if end - fr_end > 0: start = fr_end+1
                else: 
                    start = end
                    break
                    
            if end - start < 0: break

        if end - start > 0: overal.append([(start, end), period[1]])

In [29]:
overal

[[(139, 36688), np.float32(25.655794)],
 [(37918, 38893), np.float32(0.8874819)],
 [(40658, 41030), np.float32(1.1333416)],
 [(42144, 60256), np.float32(12.2864485)],
 [(61440, 69172), np.float32(6.291669)],
 [(70426, 90999), np.float32(15.454059)],
 [(93460, 107973), np.float32(9.14404)],
 [(109018, 117052), np.float32(4.6917486)],
 [(118445, 119536), np.float32(1.0245967)],
 [(121115, 133979), np.float32(11.312343)],
 [(135117, 154668), np.float32(14.72972)],
 [(155968, 163770), np.float32(7.0094013)],
 [(165558, 166325), np.float32(0.86557394)],
 [(167369, 171108), np.float32(3.6294305)],
 [(172362, 172849), np.float32(0.7923129)],
 [(174057, 177261), np.float32(2.4005444)],
 [(178445, 188267), np.float32(7.053402)],
 [(190032, 213856), np.float32(12.93529)],
 [(215528, 224955), np.float32(8.0586)],
 [(226255, 229599), np.float32(2.8861275)],
 [(232292, 235683), np.float32(3.5988848)],
 [(236960, 247571), np.float32(8.321306)],
 [(248848, 254909), np.float32(3.7336223)],
 [(258113, 

In [36]:
470831 / 1000 /60

7.847183333333334

In [35]:
audio, sr = librosa.load(video_path)

border_seconds = 1
border_ticks = border_seconds * sr

start_time = 470831//1000*sr
end_time = 490684//1000*sr
print(start_time,end_time)

if (start_time - border_ticks >= 0): start_time-=border_ticks
else: start_time=0
if (end_time + border_ticks <= np.shape(audio)[0]): end_time+=border_ticks
else: end_time=np.shape(audio)[0]
    
Audio(audio[start_time:end_time], rate = sr)

  audio, sr = librosa.load(video_path)


10363500 10804500


In [19]:
from IPython.display import Audio
import librosa
import numpy as np

audio, sr = librosa.load(video_path)
powerful_segments = []
border_seconds = 1
border_ticks = border_seconds * sr

for timecode, _ in audio_peaks:  # Берем топ-3 сегмента
    start_time = timecode[0]/1000*sr
    end_time = timecode[1]/1000*sr
    
    if (start_time - border_ticks >= 0): start_time-=border_ticks
    else: start_time=0
    if (end_time + border_ticks <= np.shape(audio)[0]): end_time+=border_ticks
    else: end_time=np.shape(audio)[0]
        
    powerful_segments.append(audio[start_time:end_time])
    
Audio(powerful_segments[0], rate = sr)

  audio, sr = librosa.load(video_path)


0.13931972789115646 36.6875283446712
37.918185941043085 38.89342403628118
40.658140589569165 41.029659863945575
42.14421768707483 60.25578231292517
61.44 69.17224489795919
70.4261224489796 90.9990022675737
93.46031746031746 107.97278911564626
109.01768707482994 117.05179138321995
118.44498866213152 119.53632653061224
121.11528344671201 133.97913832199546
135.11691609977325 154.6681179138322
155.96843537414966 163.77034013605441
165.55827664399092 166.32453514739228
167.36943310657597 171.10784580498867
172.36172335600907 172.84934240362813
174.05678004535147 177.26113378684806
178.4453514739229 188.26739229024943
190.0321088435374 213.85578231292516
215.52761904761905 224.95492063492063
226.2552380952381 229.59891156462584
230.62058956916098 230.64380952380952
232.29242630385488 235.68253968253967
236.9596371882086 247.57115646258504
248.84825396825397 254.90866213151926
257.0448979591837 257.0448979591837
258.11301587301585 259.1114739229025
260.6207709750567 262.89632653061227
264.59

In [17]:
Audio(powerful_segments[0], rate = sr)

In [18]:
808960/22050

36.6875283446712