In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
%matplotlib inline

import cv2


In [2]:
cap = cv2.VideoCapture('video.mp4') 

arr = np.empty((0,4096), int)   #initializing 4096 dimensional array to store 'flattened' color histograms
D=dict()   #to store the original frame (array)
count=0    #counting the number of frames
start_time = time.time()
while cap.isOpened():
    
    # Read the video file.
    ret, frame = cap.read()
    
    # If we got frames.
    if ret == True:
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  #since cv reads frame in bgr order so rearraning to get frames in rgb order
        D[count] = frame_rgb   #storing each frame (array) to D , so that we can identify key frames later 
        hist = cv2.calcHist(frame_rgb, [0, 1, 2], None, [16, 16, 16], [0, 256, 0, 256, 0, 256])  #finding histograms
        hist1= hist.flatten()  #flatten the hist to one-dimensinal vector         
        arr =np.vstack((arr, hist1))  #appending each one-dimensinal vector to generate N*M matrix (where N is number of frames
          #and M is 4096)
        count+=1
    else:
        break

print("--- %s seconds ---" % (time.time() - start_time))

final_arr = arr.transpose()   #transposing so that i will have all frames in columns i.e M*N dimensional matrix 
#where M is 4096 and N is number of frames

print(final_arr.shape)
print(count)


--- 40.23987436294556 seconds ---
(4096, 1832)
1832


In [3]:
U, S, V = np.linalg.svd(final_arr) #SVD of Final_arr matrix

In [4]:
print(U.shape, S.shape, V.shape)

(4096, 4096) (1832,) (1832, 1832)


In [5]:
S1= S[:15]  #top 15 singular values from 44152 to 2045

In [6]:
v1= V[:15, :]  # 15 vectors from right singular vector corresponding to 15 largest singular value

In [8]:
v1_t = v1.transpose()

In [9]:
projections = v1_t @ np.diag(S1) #the column vectors i.e the frame histogram data has been projected onto the orthonormal basis 
#formed by vectors of the left singular matrix U .The coordinates of the frames in this space are given by v1_t @ np.diag(S1)
#So we can see that , now we need only 15 dimensions to represent each column/frame 
print(projections.shape)

(1832, 15)


In [10]:
#dynamic clustering of projected frame histograms to find which all frames are similar i.e make shots
f=projections
C = dict() #to store frames in respective cluster
for i in range(f.shape[0]):
    C[i] = np.empty((0,15), int)
    
#adding first two projected frames in first cluster i.e Initializaton    
C[0] = np.vstack((C[0], f[0]))   
C[0] = np.vstack((C[0], f[1]))

E = dict() #to store centroids of each cluster
for i in range(projections.shape[0]):
    E[i] = np.empty((0,15), int)
    
E[0] = np.mean(C[0], axis=0) #finding centroid of C[0] cluster

count = 0
for i in range(2,f.shape[0]):
    similarity = np.dot(f[i], E[count])/( (np.dot(f[i],f[i]) **.5) * (np.dot(E[count], E[count]) ** .5)) #cosine similarity
    #this metric is used to quantify how similar is one vector to other. The maximum value is 1 which indicates they are same
    #and if the value is 0 which indicates they are orthogonal nothing is common between them.
    #Here we want to find similarity between each projected frame and last cluster formed chronologically. 
     
    
    if similarity < 0.9: #if the projected frame and last cluster formed  are not similar upto 0.9 cosine value then 
                         #we assign this data point to newly created cluster and find centroid 
                         #We checked other thresholds also like 0.85, 0.875, 0.95, 0.98
                        #but 0.9 looks okay because as we go below then we get many key-frames for similar event and 
                        #as we go above we have lesser number of key-frames thus missed some events. So, 0.9 seems optimal.
                        
        count+=1         
        C[count] = np.vstack((C[count], f[i])) 
        E[count] = np.mean(C[count], axis=0)   
    else:  #if they are similar then assign this data point to last cluster formed and update the centroid of the cluster
        C[count] = np.vstack((C[count], f[i])) 
        E[count] = np.mean(C[count], axis=0)          

In [11]:
b = []  #find the number of data points in each cluster formed.

#We can assume that sparse clusters indicates 
#transition between shots so we will ignore these frames which lies in such clusters and wherever the clusters are densely populated indicates they form shots
#and we can take the last element of these shots to summarise that particular shot

for i in range(f.shape[0]):
    b.append(C[i].shape[0])

last = b.index(0)  #where we find 0 in b indicates that all required clusters have been formed , so we can delete these from C
b1=b[:last ] #The size of each cluster.

In [12]:
res = [idx for idx, val in enumerate(b1) if val >= 25] #so i am assuming any dense cluster with atleast 25 frames is eligible to 
#make shot.
print(len(res)) #so total 27 shots with 44 (71-27) cuts

27


In [13]:
GG = C #copying the elements of C to GG, the purpose of  the below code is to label each cluster so later 
#it would be easier to identify frames in each cluster
for i in range(last):
    p1= np.repeat(i, b1[i]).reshape(b1[i],1)
    GG[i] = np.hstack((GG[i],p1))

In [14]:
#the purpose of the below code is to append each cluster to get multidimensional array of dimension N*16, N is number of frames
F=  np.empty((0,16), int) 
for i in range(last):
    F = np.vstack((F,GG[i]))

In [15]:
#converting F (multidimensional array)  to dataframe
colnames = ['v1', 'v2', 'v3','v4', 'v5', 'v6', 'v7', 'v8','v9', 'v10', 'v11', 'v12', 'v13','v14', 'v15', 'v16' ]
df = pd.DataFrame(F, columns= colnames)

In [16]:
df['v16']= df['v16'].astype(int)  #converting the cluster level from float type to integer type

In [17]:
df1 =  df[df.v16.isin(res)]   #filter only those frames which are eligible to be a part of shot or filter those frames who are
#part of required clusters that have more than 25 frames in it

In [18]:
new = df1.groupby('v16').tail(1)['v16'] #For each cluster /group take its last element which summarize the shot i.e key-frame

In [19]:

new1 = new.index #finding key-frames (frame number so that we can go back get the original picture)
                                   

In [122]:
#output the frames in png format
for c in new1:
    frame_rgb1 = cv2.cvtColor(D[c], cv2.COLOR_RGB2BGR) #since cv consider image in BGR order
    frame_num_chr = str(c)
    file_name = 'frame'+ frame_num_chr +'.png'
    cv2.imwrite(file_name, frame_rgb1)
    