### A. TV Spots
  
#### 1. Anmeldung bei Google Cloud Storage & Preprocessing
+ Anmeldung am GC Storage (Sourcefiles)
+ Download der Sourcefiles auf temporäres lokales Laufwerk
+ Preprocessing: Thumbnails extrahieren
+ Preprocessing: Audiospur extrahieren
+ Thumbnails und Audiofiles in den GC Storage hochladen

#### 2. Mediafiles in die KI-Schnittstelle(n) schicken
+ Video-API: Labelrecognition auf Spotebene
+ Video-API: Labelrecognition auf Frameebene => # Labels pro 5" Intervall
+ Video-API: Logorecognition (Logo mit jeweiliger Sequenz (von bis))
+ Video-API: Shotrecognition (Szenenwechsel mit jeweiligem Frame)
+ Video-API: Textrecognition (Text mit jeweiliger Sequenz (von bis), vgl. Logo)
+ Speech2Text-API: Audiotranskript (kompletter erkannter Text, Confidence)

### Imports & Settings

In [1]:
import os, io, random
import pandas as pd
import numpy as np 
import datetime as dt

import moviepy.editor as mp
from moviepy.editor import VideoFileClip
from PIL import Image
from typing import Optional, Sequence

from google.cloud import vision, storage, speech #, videointelligence
from google.cloud import videointelligence as vi
from google.oauth2 import service_account
from GCBucketTools import *
from googleAPIfuncs import * #connectGoogleAPI, createCollectors, parseResultsLabelDetection

from IPython.core.interactiveshell import InteractiveShell
from andreasTools import *
from macos_speech import Synthesizer
# Text-to-speech helper; later cells call speaker.say(...) for audible progress messages.
speaker = Synthesizer(voice='Alex')

# Echo the value of every top-level expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Show up to 400 rows when a DataFrame is displayed.
pd.options.display.max_rows = 400
# Today's date formatted as DD_MM_YYYY; used as part of the checkpoint file names.
TODAY = dt.date.today().strftime("%d_%m_%Y")


### 0. Anmeldung am GC Storage (Sourcefiles)

In [2]:
# General settings: connect to Google Cloud, select the bucket and list its contents.
# connectGoogleAPI and list_blobs are not defined in this notebook (presumably they come
# from the star imports above — TODO confirm); their exact behavior is not visible here.
credentials, storage_client = connectGoogleAPI()
bucket_name = "september_tvc" 
bucketURI = f"gs://{bucket_name}"  # gs:// prefix used to build object URIs for the API calls
bucket = storage_client.bucket(bucket_name)
# presumably a list of blob names (str): later cells call str.startswith/endswith on its items
blobliste = list_blobs(bucket_name, credentials)
print(f"Anzahl Files im Bucket: {len(blobliste)}")
#batch = [f for f in blobliste if f.startswith("THUMBNAILS/") ] #and f.endswith("")
#print(f"Anzahl Files in der Batch: {len(batch)}")

Anzahl Files im Bucket: 14878


### 2. Call Video API

In [3]:
# Keep every blob under the PROCESSING/ prefix whose name ends in "mp4".
batch = [
    blob_name
    for blob_name in blobliste
    if blob_name.startswith("PROCESSING/") and blob_name.endswith("mp4")
]
len(batch)

# Trial run: restrict the work list to the first ten entries.
miniBatch = batch[:10]
miniBatch

275

['PROCESSING/19107333_Dis_1u1.mp4',
 'PROCESSING/19107333_Dis_Audi.mp4',
 'PROCESSING/19107333_Dis_Commerzbank.mp4',
 'PROCESSING/19107333_Dis_Generali.mp4',
 'PROCESSING/19107333_Dis_Haribo.mp4',
 'PROCESSING/19107333_Dis_Nivea.mp4',
 'PROCESSING/19107333_Dis_Opel.mp4',
 'PROCESSING/19107333_Dis_Samsung.mp4',
 'PROCESSING/19107333_Dis_Telekom.mp4',
 'PROCESSING/19107333_EON_kurz_1.mp4']

In [4]:
dfTVC, dfFrame, dfShotLabel, dfLogo, dfShots, dfText, dfAudio = createCollectors()

# Collector DataFrames keyed by the file-name stem under which each one is pickled.
# The insertion order matches the order in which the files are written.
collectors = {
    "dfTVC": dfTVC,
    "dfFrame": dfFrame,
    "dfShotLabel": dfShotLabel,
    "dfLogo": dfLogo,
    "dfShots": dfShots,
    "dfText": dfText,
    "dfAudio": dfAudio,
}


def _save_collectors(frames, suffix):
    """Pickle each DataFrame in *frames* to ``<key>_<suffix>.pkl`` in the working directory."""
    for stem, frame in frames.items():
        frame.to_pickle(f"{stem}_{suffix}.pkl")


# General settings for Video API
video_client = vi.VideoIntelligenceServiceClient(credentials=credentials)
features = [vi.Feature.LABEL_DETECTION]
video_mode = vi.LabelDetectionMode.SHOT_AND_FRAME_MODE  # alternative: SHOT_MODE
video_config = vi.LabelDetectionConfig(label_detection_mode=video_mode)
context = vi.VideoContext(label_detection_config=video_config)

# One feature list per request; these never change, so build them once outside the loop.
featuresLabel = [vi.Feature.LABEL_DETECTION]
featuresLogo = [vi.Feature.LOGO_RECOGNITION]
featuresShots = [vi.Feature.SHOT_CHANGE_DETECTION]
featuresText = [vi.Feature.TEXT_DETECTION]

# General settings for Speech-2-Text API (disabled; transcribe_speech is used instead)
# speech_client = speech.SpeechClient(credentials=credentials)
# audio_config = speech.RecognitionConfig(
#     # encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
#     sample_rate_hertz=16_000,
#     language_code="de-DE",
#     max_alternatives=1,  # number of alternatives to return
#     # model="phone_call" #"video"
#     )

for i, mediafile in enumerate(miniBatch, 1):

    # Checkpoint before videos 1, 6, 11, ...; the file suffix is the number of
    # videos already processed (i-1), so the very first checkpoint holds the
    # collectors as returned by createCollectors().
    if (i - 1) % 5 == 0:
        _save_collectors(collectors, f"{TODAY}_{i-1}")
        print("Temporary storing of all dataframes sucessful")
        speaker.say(f"Ok, next one. Number {i}")

    print(f"\nProcessing video # {i}, {mediafile}")
    # A gs:// URI always uses "/" as separator; os.path.join would insert the
    # OS-specific separator and produce an invalid URI on Windows.
    mediapath = f"{bucketURI}/{mediafile}"

    # ==========================
    # Call different methods from API with different responses stored in individual result variables.
    # Each annotate_video call returns an operation object; .result(timeout=...) is
    # called on it with a timeout in seconds to obtain the response.
    # >> Call Label Detection Method: Detects labels given a GCS path.
    operationLabel = video_client.annotate_video(
        request={"features": featuresLabel, "input_uri": mediapath, "video_context": context}
    )
    resultLabel = operationLabel.result(timeout=180)
    # >> Call Logo Detection Method
    operationLogo = video_client.annotate_video(
        request={"features": featuresLogo, "input_uri": mediapath}
    )
    resultLogo = operationLogo.result(timeout=180)
    # >> Call Shot Recognition Method
    operationShots = video_client.annotate_video(
        request={"features": featuresShots, "input_uri": mediapath}
    )
    resultShots = operationShots.result(timeout=90)
    # >> Call Text Detection Method
    operationText = video_client.annotate_video(
        request={"features": featuresText, "input_uri": mediapath}
    )
    resultText = operationText.result(timeout=180)
    # >> Call Speech-2-Text API (disabled direct call, kept for reference)
    # audiofile = mediafile.replace("mp4","mp3").replace("PROCESSING/", "AUDIOTRACKS/")
    # audiopath = os.path.join(bucketURI, audiofile)
    # audio = speech.RecognitionAudio(uri=audiopath)
    # operationAudio = speech_client.long_running_recognize(config=audio_config, audio=audio)
    # resultAudio = operationAudio.result(timeout=90)
    # transcribe_speech is defined outside this notebook; it receives the video
    # client, the video URI and the language code.
    resultSpeech = transcribe_speech(video_client, mediapath, "de-DE")

    # The parse helpers' return values are not used, so they presumably fill the
    # collector passed to them in place — TODO confirm against googleAPIfuncs.
    parseResults_LabelDetection(resultLabel, dfTVC, mediafile, i)
    parseResults_FrameLabelDetection(resultLabel, dfFrame, mediafile, i)
    parseResults_LogoDetection(resultLogo, dfLogo, mediafile, i)
    parseResults_ShotLabelDetection(resultLabel, dfShotLabel, mediafile, i)
    parseResults_ShotDetection(resultShots, dfShots, mediafile, i)
    parseResults_TextDetection(resultText, dfText, mediafile, i)
    parseResults_Speech2Text(i, mediafile, dfAudio, resultSpeech)

# Final save of all collectors after the whole batch has been processed.
_save_collectors(collectors, "TESTNEU")

speaker.say("Yippy ya ya yippy yippy yeah")
# # 



Temporary storing of all dataframes sucessful

Processing video # 1, PROCESSING/19107333_Dis_1u1.mp4


  arr_value = np.array(value)




Processing video # 2, PROCESSING/19107333_Dis_Audi.mp4

Processing video # 3, PROCESSING/19107333_Dis_Commerzbank.mp4

Processing video # 4, PROCESSING/19107333_Dis_Generali.mp4

Processing video # 5, PROCESSING/19107333_Dis_Haribo.mp4
Temporary storing of all dataframes sucessful

Processing video # 6, PROCESSING/19107333_Dis_Nivea.mp4


  arr_value = np.array(value)




Processing video # 7, PROCESSING/19107333_Dis_Opel.mp4

Processing video # 8, PROCESSING/19107333_Dis_Samsung.mp4

Processing video # 9, PROCESSING/19107333_Dis_Telekom.mp4

Processing video # 10, PROCESSING/19107333_EON_kurz_1.mp4


In [None]:
# Display the collector that the processing loop passes to parseResults_Speech2Text.
dfAudio


In [5]:
# Spot-check the collectors: show a random sample of rows and the shape of each one.
# Both expressions on a line are echoed because ast_node_interactivity is set to "all".
# dfTVC; dfTVC.shape

dfTVC.sample(5); dfTVC.shape
dfFrame.sample(10); dfFrame.shape
dfLogo.sample(10); dfLogo.shape
dfShotLabel.sample(10); dfShotLabel.shape
dfShots.sample(10); dfShots.shape
dfText.sample(10); dfText.shape



Unnamed: 0,MEDIAFILE,SEGMENT_LABEL,SEGM_CATEGORY_LABEL,LABEL_CONF,TIME_START,TIME_END
200101,PROCESSING/19107333_Dis_Audi.mp4,motor vehicle,vehicle,0.56485,0.0,30.0
800201,PROCESSING/19107333_Dis_Samsung.mp4,mobile phone,telephone,0.870158,0.0,60.026633
700301,PROCESSING/19107333_Dis_Opel.mp4,car,vehicle,0.730794,0.0,24.96
100100,PROCESSING/19107333_Dis_1u1.mp4,television advertisement,,0.302266,0.0,23.96
700901,PROCESSING/19107333_Dis_Opel.mp4,family car,car,0.483385,0.0,24.96


(32, 6)

Unnamed: 0,MEDIAFILE,FRAME_TIME,FRAME_LABEL,FRAME_CATEGORY_LABEL,FRAME_CONF
316000,PROCESSING/19107333_Dis_Commerzbank.mp4,6.0,jaw,,0.576813
702301,PROCESSING/19107333_Dis_Opel.mp4,9.0,landmark,geographical feature,0.90291
220300,PROCESSING/19107333_Dis_Audi.mp4,4.0,building,,0.542935
614400,PROCESSING/19107333_Dis_Nivea.mp4,42.0,lighting,,0.713085
215901,PROCESSING/19107333_Dis_Audi.mp4,4.0,display device,technology,0.420033
408501,PROCESSING/19107333_Dis_Generali.mp4,15.0,graphics,artwork,0.62271
318501,PROCESSING/19107333_Dis_Commerzbank.mp4,11.0,skyline,city,0.948368
719700,PROCESSING/19107333_Dis_Opel.mp4,19.0,trademark,,0.447151
107000,PROCESSING/19107333_Dis_1u1.mp4,6.0,automotive design,,0.826409
300500,PROCESSING/19107333_Dis_Commerzbank.mp4,1.0,magenta,,0.617018


(1820, 5)

Unnamed: 0,MEDIAFILE,LOGO,ENTITY_ID,TIME_START,TIME_END,LOGO_CONF
501002,PROCESSING/19107333_Dis_Haribo.mp4,Haribo,/m/01nwzm,21.72,24.96,0.929099
801003,PROCESSING/19107333_Dis_Samsung.mp4,Samsung Group,/m/07gv72,40.8408,41.3413,0.911923
803001,PROCESSING/19107333_Dis_Samsung.mp4,Woolmark,/m/0hzn8hp,21.4214,22.4224,0.89916
301027,PROCESSING/19107333_Dis_Commerzbank.mp4,Commerzbank,/m/06csb7,25.56,26.76,0.934312
301008,PROCESSING/19107333_Dis_Commerzbank.mp4,Commerzbank,/m/06csb7,13.2,13.32,0.872696
301020,PROCESSING/19107333_Dis_Commerzbank.mp4,Commerzbank,/m/06csb7,21.6,21.96,0.903293
203001,PROCESSING/19107333_Dis_Audi.mp4,Audi,/m/0k50,26.4,28.08,0.941044
301033,PROCESSING/19107333_Dis_Commerzbank.mp4,Commerzbank,/m/06csb7,28.2,29.16,0.951227
601002,PROCESSING/19107333_Dis_Nivea.mp4,Nivea,/m/03vzwk,61.08,62.16,0.867862
903001,PROCESSING/19107333_Dis_Telekom.mp4,"Mars, Incorporated",/m/01kh5q,14.76,16.44,0.895078


(103, 6)

Unnamed: 0,MEDIAFILE,SHOT_LABEL,SHOT_CATEGORY_LABEL,LABEL_CONF,TIME_START,TIME_END
902300,PROCESSING/19107333_Dis_Telekom.mp4,human,,0.350208,29.12,30.16
405901,PROCESSING/19107333_Dis_Generali.mp4,cycle sport,sports,0.404447,14.0,14.5
202400,PROCESSING/19107333_Dis_Audi.mp4,vehicle,,0.893798,23.24,24.96
602501,PROCESSING/19107333_Dis_Nivea.mp4,black and white,style,0.826691,0.0,0.44
601000,PROCESSING/19107333_Dis_Nivea.mp4,black,,0.449509,26.96,28.16
403601,PROCESSING/19107333_Dis_Generali.mp4,tennis player,person,0.835895,6.875,7.875
903000,PROCESSING/19107333_Dis_Telekom.mp4,hand,,0.581027,25.04,26.08
202001,PROCESSING/19107333_Dis_Audi.mp4,motor vehicle,vehicle,0.85706,23.24,24.96
201301,PROCESSING/19107333_Dis_Audi.mp4,skyscraper,building,0.676594,4.32,5.6
203100,PROCESSING/19107333_Dis_Audi.mp4,black,,0.449509,0.0,1.0


(431, 6)

Unnamed: 0,MEDIAFILE,SHOT_ID,TIME_START,TIME_END
900019,PROCESSING/19107333_Dis_Telekom.mp4,19,26.12,27.08
800015,PROCESSING/19107333_Dis_Samsung.mp4,15,40.673966,43.8438
1000012,PROCESSING/19107333_EON_kurz_1.mp4,12,17.2,19.16
1000002,PROCESSING/19107333_EON_kurz_1.mp4,2,2.72,4.04
500002,PROCESSING/19107333_Dis_Haribo.mp4,2,2.56,5.28
800014,PROCESSING/19107333_Dis_Samsung.mp4,14,38.6386,40.6406
700012,PROCESSING/19107333_Dis_Opel.mp4,12,13.76,14.48
300006,PROCESSING/19107333_Dis_Commerzbank.mp4,6,15.56,16.2
600022,PROCESSING/19107333_Dis_Nivea.mp4,22,25.48,26.92
900014,PROCESSING/19107333_Dis_Telekom.mp4,14,18.64,21.16


(206, 4)

Unnamed: 0,MEDIAFILE,TIME_START,TIME_END,TEXT,TXT_CONF
900017,PROCESSING/19107333_Dis_Telekom.mp4,10.92,11.04,weniger,0.95033
500002,PROCESSING/19107333_Dis_Haribo.mp4,1.92,2.52,HARI,0.99253
600006,PROCESSING/19107333_Dis_Nivea.mp4,61.92,62.16,me,0.818492
300014,PROCESSING/19107333_Dis_Commerzbank.mp4,25.56,26.76,BAUFINANZIERUNG,0.998617
900063,PROCESSING/19107333_Dis_Telekom.mp4,5.52,5.76,CAM4,0.93179
800045,PROCESSING/19107333_Dis_Samsung.mp4,33.2332,34.3343,5 Pro-grade,1.0
800024,PROCESSING/19107333_Dis_Samsung.mp4,42.1421,42.1421,58%,1.0
400003,PROCESSING/19107333_Dis_Generali.mp4,10.375,10.75,NAL,0.717291
200030,PROCESSING/19107333_Dis_Audi.mp4,3.6,4.2,ebens,0.99334
300034,PROCESSING/19107333_Dis_Commerzbank.mp4,26.16,26.28,hand,1.0


(406, 5)

In [None]:
# Show the shape and the first rows of each collector.
# Both expressions on a line are echoed because ast_node_interactivity is set to "all".
dfTVC.shape;dfTVC.head()
dfFrame.shape;dfFrame.head()
dfLogo.shape;dfLogo.head()
dfShots.shape;dfShots.head()
dfText.shape;dfText.head()
dfAudio.shape;dfAudio.head()

***
***
# Parkplatz

In [None]:
# Scratch cell: collapse the audio collector to one row per media file.
tmp = dfAudio.copy()  # copy of the collector; not used again in this cell
# Group by MEDIAFILE and collect the values of every other column into a list per group.
grp = dfAudio.groupby("MEDIAFILE").agg(list)
grp



In [None]:
# Reload the final result tables from pickle files in the working directory, cast
# their numeric columns to numeric dtypes and sort them for inspection.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
dfMetaData = pd.concat(
    [
        pd.read_pickle("dfMetaData_TVC_BATCH1_.pkl"),
        pd.read_pickle("dfMetaData_TVC_BATCH2_.pkl"),
    ]
)
# NOTE(review): the ".pkl" suffix was added to match the sibling file names — confirm the file on disk.
dfTVC = pd.read_pickle("dfTVC_FINAL.pkl")
dfFrame = pd.read_pickle("dfFrame_FINAL.pkl")
dfLogo = pd.read_pickle("dfLogo_FINAL.pkl")
dfShots = pd.read_pickle("dfShots_FINAL.pkl")
dfText = pd.read_pickle("dfText_FINAL.pkl")
dfAudio = pd.read_pickle("dfAudio_FINAL.pkl")

dfMetaData = dfMetaData.astype({"DURATION": float, "NBR_FRAMES": int, "FPS": float})
dfMetaData.shape; dfMetaData.head(10)

# Per media file: chronological order, highest confidence first within the same start time.
dfTVC = dfTVC.astype({"LABEL_CONF": float, "TIME_START": float, "TIME_END": float})
dfTVC.sort_values(by=["MEDIAFILE", "TIME_START", "LABEL_CONF"], ascending=[True, True, False], inplace=True)
dfTVC.shape; dfTVC.head(10)

dfFrame = dfFrame.astype({"FRAME_TIME": float, "FRAME_CONF": float})
dfFrame.sort_values(by=["MEDIAFILE", "FRAME_TIME", "FRAME_CONF"], ascending=[True, True, False], inplace=True)
dfFrame.shape; dfFrame.head(10)

# TIME_END is cast as well, consistent with dfTVC and dfText.
dfLogo = dfLogo.astype({"TIME_START": float, "TIME_END": float, "LOGO_CONF": float})
dfLogo.sort_values(by=["MEDIAFILE", "TIME_START", "LOGO_CONF"], ascending=[True, True, False], inplace=True)
dfLogo.shape; dfLogo.head(10)

dfShots.shape; dfShots.head(10)

dfText = dfText.astype({"TIME_START": float, "TIME_END": float, "TXT_CONF": float})
dfText.sort_values(by=["MEDIAFILE", "TIME_START"], ascending=[True, True], inplace=True)
dfText.shape; dfText.head(10)


dfAudio.shape; dfAudio.head(10)


In [None]:
# Folder prefixes whose blobs must be excluded from the batch.
# The previous version evaluated the startswith chain once, outside the
# comprehension, so it never tested the individual blob names.
filter_ = (
    "SOURCEFILES/",
    "AUDIOFILES/",
    "DATAFRAMES/",
    "THUMBNAILS/",
)

# str.startswith accepts a tuple and returns True if any of the prefixes matches.
batch = [f for f in blobliste if not f.startswith(filter_)]
len(batch)
    


In [None]:
# Render a thumbnail from its public URL inside the notebook.
# Imported under an alias so the name `Image`, which the imports cell binds to
# PIL's Image module, is not rebound by running this cell.
from IPython.display import Image as IPyImage
#IPyImage(filename='test.png')
# IPyImage("https://storage.googleapis.com/thumbnails_public/20112158_01_Spot_01_Kaufland_thumb0.jpg")
IPyImage("https://storage.googleapis.com/thumbnails_public/2021-44_Spot_07_REWE_thumb16.jpg")
#IPyImage(dfThumb.iloc[33,0])

In [None]:
# Embed a video from its public URL inside the notebook.
from IPython.display import Video
Video("https://storage.googleapis.com/sandbox_public/21225110_Spot_Whiskas.mp4")


In [None]:
# Render a thumbnail from its public URL inside the notebook.
# Imported under an alias so the name `Image`, which the imports cell binds to
# PIL's Image module, is not rebound by running this cell.
from IPython.display import Image as IPyImage
#IPyImage(filename='test.png')
# IPyImage("https://storage.googleapis.com/thumbnails_public/20112158_01_Spot_01_Kaufland_thumb0.jpg")
IPyImage("https://storage.googleapis.com/thumbnails_public/2021-44_Spot_07_REWE_thumb16.jpg")
#IPyImage(dfThumb.iloc[33,0])

In [None]:
# Driving the Video API & the Speech-2-Text API
# NOTE(review): this cell is a leftover fragment and does not run as written:
#   * the `for` statement below is indented at top level, which Python rejects
#     with an IndentationError;
#   * `resultAudio` is assigned only in commented-out code in the processing cell;
#   * `i` and `mediafile` are the loop variables of the processing cell, so the
#     fragment was presumably cut from inside that per-video loop — confirm
#     before reviving it.


# *************************************************
# Call APIs for each mediafile in batch
# *************************************************
    
  



# ==========================
    # -    -------- Fetch results from different Methods
    # --------- Start with Label Detection ---------

    
  
    

    

    # # --------- Fetch results from Audio Transcription ---------
    for u, result in enumerate(resultAudio.results,1):
        # The first alternative is the most likely one for this portion.
        audioTranscript = result.alternatives[0].transcript
        audioConfidence = result.alternatives[0].confidence
        # print(u"Transcript: {}".format(result.alternatives[0].transcript))
        # print("Confidence: {}".format(result.alternatives[0].confidence))
        # Row label combines the video number i and the result number u.
        dfAudio.loc[i*100_000+u,:] = [mediafile, audioTranscript, audioConfidence]
     
# Pickle all collectors under the "_FOURTH" suffix in the working directory.
dfTVC.to_pickle(f"dfTVC_FOURTH.pkl")
dfFrame.to_pickle(f"dfFrame_FOURTH.pkl")
dfShotLabel.to_pickle(f"dfShotLabel_FOURTH.pkl")
dfLogo.to_pickle(f"dfLogo_FOURTH.pkl")
dfShots.to_pickle(f"dfShots_FOURTH.pkl")
dfText.to_pickle(f"dfText_FOURTH.pkl")
dfAudio.to_pickle(f"dfAudio_FOURTH.pkl")

# Audible signal that the cell has finished.
speaker.say("Yippy ya ya yippy yippy yeah")