## Aggregating Transcripts from Sponsored Videos

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
# Root folder that is walked recursively (os.walk) for transcript files.
# Set the SPONSORED_TRANSCRIPTS_DIR environment variable to point the notebook
# at another location; without it the original path is used unchanged.
PATH = os.environ.get("SPONSORED_TRANSCRIPTS_DIR", "E:/Facebook/transcripts/sponsored/")

Functions to read transcript files.

In [3]:
# load the transcripts
def read_transcript(file, encoding = 'utf-8'):
    """Return the full text content of a single transcript file.

    Parameters
    ----------
    file : path of the file to read (anything ``open`` accepts).
    encoding : text encoding used to decode the file, UTF-8 by default.

    Returns
    -------
    str
        Everything in the file, read in one go.

    Decoding problems (e.g. ``UnicodeDecodeError``) are not handled here and
    propagate to the caller.
    """
    with open(file, mode='r', encoding=encoding) as handle:
        return handle.read()


In [9]:
# Build {video_id: transcript text} from every file found under PATH.
# The key is the file name up to its first "."; a later file with the same key
# overwrites an earlier one.
transcript_dict = {}
print("Creating transcript dictionary")
for folder, _subfolders, filenames in os.walk(PATH):
    for filename in filenames:
        full_path = os.path.join(folder, filename)
        key = filename.split(".")[0]
        try:
            text = read_transcript(full_path)
        except UnicodeDecodeError:
            # Not valid UTF-8: retry with ISO-8859-1.
            text = read_transcript(full_path, encoding='ISO-8859-1')
        transcript_dict[key] = text

Creating transcript dictionary


In [10]:
# One row per transcript: numeric video id, raw text, whitespace-token count.
df = pd.DataFrame(transcript_dict.items(), columns=["video_id", "transcript"])
df = df.astype({'video_id': np.int64})
df['length'] = df['transcript'].map(lambda text: len(text.split()))
# Transcripts shorter than 3 words are blanked out (set to None); the row and
# its length are kept.
too_short = df['length'] < 3
df.loc[too_short, 'transcript'] = None
print(df.shape)
print(df.head())

(15013, 3)
           video_id                                         transcript  length
0  1001647963223889                                               None       0
1  1001709553198849                                               None       0
2  1001753429945245                                               None       1
3  1001834046625140  it is now time for my favorite segment of the ...      71
4  1002158923207565  November 26th 1986 at 12:00 got out so fast it...      52


In [18]:
# Source data: read the pooled CSV and make video_id an int64 column.
source_csv = r"C:\Users\doosti\OneDrive - Chapman University\Research\Research Projects\Facebook\Tubular\revision_2024\pooled_us_jul2024.csv"
data = pd.read_csv(source_csv)
data = data.astype({'video_id': np.int64})
print(data.shape)

(220033, 153)


Note: For details on the mismatch between the video IDs in the source data and those in the transcript data frame, see `aggregate_transcripts_sponsored.py`.

In [90]:
# merge the two dataframes
# Attach each transcript to the source rows whose `new_id` equals the
# transcript's `video_id`.
# - A `transcript` column left over from an earlier run of this cell is dropped
#   first; otherwise the merge would produce transcript_x / transcript_y and
#   the lookup of `transcript` below would fail.
# - The right-hand key is renamed so it cannot clash with data's own `video_id`.
# - validate='many_to_one' raises if a transcript id occurs more than once,
#   which would otherwise duplicate source rows.
# - The result is assigned positionally (to_numpy) because the merged frame
#   has a fresh 0..n-1 index that need not line up with data's index.
transcript_lookup = df[['video_id', 'transcript']].rename(columns={'video_id': '_transcript_id'})
merged = data.drop(columns='transcript', errors='ignore').merge(
    transcript_lookup,
    left_on='new_id',
    right_on='_transcript_id',
    how='left',
    validate='many_to_one',
)
data['transcript'] = merged['transcript'].to_numpy()
# .copy() gives an independent frame, so columns added to `sponsored` later do
# not raise SettingWithCopyWarning.
sponsored = data.loc[
    data['transcript'].notnull(),
    ['video_id', 'new_id', 'creator_name', 'sponsor_name', 'transcript'],
].copy()

In [22]:
# functions to infer about the transcript
def is_in_text(terms, text):
    """Tell whether a sponsor name is mentioned in a transcript.

    Parameters
    ----------
    terms : list of str
        The words of the sponsor name.
    text : str
        The transcript.

    Returns
    -------
    bool
        True when either
        * the words joined by single spaces occur anywhere in ``text``
          (case-insensitive substring match), or
        * at least one word that is not a common function word equals one of
          the whitespace-separated tokens of ``text`` (case-insensitive).
        False otherwise, including when ``terms`` is empty.
    """
    # An empty name would join to '' and '' is a substring of every string,
    # so without this guard every transcript would "contain" it.
    if len(terms) == 0:
        return False
    lowered = text.lower()
    if ' '.join(terms).lower() in lowered:
        return True
    stopwords = {'the', 'a', 'an', 'of', 'and', 'or', 'in', 'on', 'at', 'to',
                 'for', 'with', 'by', 'from', 'as', 'is'}
    # Tokenise once instead of once per term.
    tokens = set(lowered.split())
    return any(
        word.lower() in tokens
        for word in terms
        if word.lower() not in stopwords
    )

def term_in_text(terms, text):
    """List the sponsor-name words that appear in a transcript.

    If the words joined by single spaces occur anywhere in ``text``
    (case-insensitive substring match), ``terms`` itself is returned.
    Otherwise the result is a new list with every word that is not a common
    function word and equals one of the whitespace-separated tokens of
    ``text`` (case-insensitive), in the order given.
    """
    lowered = text.lower()
    if ' '.join(terms).lower() in lowered:
        return terms
    stopwords = ('the', 'a', 'an', 'of', 'and', 'or', 'in', 'on', 'at', 'to',
                 'for', 'with', 'by', 'from', 'as', 'is')
    tokens = lowered.split()
    return [
        word
        for word in terms
        if word.lower() not in stopwords and word.lower() in tokens
    ]

def sponsored_by(text):
    """Return True if the transcript contains a sponsorship phrase.

    A transcript qualifies when, ignoring case, one of its whitespace-separated
    tokens is exactly ``sponsored`` or the phrase ``brought to you by`` occurs
    anywhere in it.
    """
    lowered = text.lower()
    return 'sponsored' in lowered.split() or 'brought to you by' in lowered

# Pager: print a data frame 20 rows at a time, waiting for Enter after each page.
# NOTE(review): inside a Jupyter/IPython kernel this name replaces the built-in
# `display` helper for the rest of the session - confirm that is intended.
def display(df):
    """Print ``df`` in slices of 20 rows, pausing for Enter after every slice."""
    total = len(df)
    start = 0
    while start < total:
        print(df[start:start+20])
        input("Press Enter to continue...")
        start += 20

In [91]:
# Derive the matching columns on the sponsored frame:
#   terms        - sponsor name split on whitespace
#   is_in_text   - whether the name (or one of its words) occurs in the transcript
#   found_terms  - which words were matched
#   sponsored_by - whether a sponsorship phrase occurs in the transcript
sponsored['terms'] = sponsored['sponsor_name'].apply(lambda name: name.split())
sponsored['is_in_text'] = sponsored.apply(
    lambda row: is_in_text(row['terms'], row['transcript']), axis=1
)
print(f"Number of sponsored videos including sponsor name: {sponsored['is_in_text'].sum()}")
sponsored['found_terms'] = sponsored.apply(
    lambda row: term_in_text(row['terms'], row['transcript']), axis=1
)
sponsored['sponsored_by'] = sponsored['transcript'].apply(sponsored_by)
print(f"Number of sponsored videos including sponsored by: {sponsored['sponsored_by'].sum()}")

Number of sponsored videos including sponsor name: 1631
Number of sponsored videos including sponsored by: 243


In [48]:
# Rows whose transcript matched the sponsor name, with the matched words joined
# into one string (`found`) so they can be counted and compared.
# .copy() makes `temp` an independent frame: columns are added to it below and
# in later cells, which on a plain slice raises SettingWithCopyWarning.
temp = sponsored.loc[
    sponsored['is_in_text'] == True,
    ['sponsor_name', 'found_terms', 'transcript'],
].copy()
temp['found'] = temp['found_terms'].apply(lambda x: ' '.join(x))

# Matched terms picked out by hand while reading the transcripts.
found = ["Movie","One","Me", "Washington", "Investments", "Be", "Golf", "Park","Coffee","New"]
# New is mixed



Justice League Movie ---- Movie
Black Adam is an awesome character he's one of my favorite characters in the DC Comics Yoruba to me for me I like characters that are really cool and they think highly of themselves you may have too much but I have the wisdom of God they don't care what anyone says they just do what they please stop what's really interesting oriented where you can throw lightning down he has a whole bunch of different moves that make you scared to approach him his best one is actually is super move I mean it looks like you're in a movie knock knock you into a pyramid and then he calls down lightning which is awesome



Before I Fall Movie ---- Movie
so in five words or less Tell me about the movie I'm just kidding OK Google 18



Capital One ---- One
a little Bakery in north Texas is beloved for its black forest cake but this isn't the chocolate cherry layer cake you're probably thinking of and angelically Light Affair of crisp Airy almond meringue layered with whipped c

In [54]:
# Count how often each term occurs among rows where exactly one term matched,
# and print the complete table without truncation.
temp['num'] = temp['found_terms'].map(len)
single_term_counts = temp.loc[temp['num'] == 1, 'found'].value_counts()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(single_term_counts)

JetBlue              26
Kia                  20
Genesis              15
Wisconsin            14
Delta                14
DraftKings           13
Gatorade             12
NFL                  12
IKEA                 12
GE                   11
Live                 10
Chevron               9
Coca-Cola             9
PANDORA               9
FanDuel               9
Family                9
Wahoo                 9
Runner                9
PlayStation           8
Emirates              8
Ford                  8
Fidelity              8
Optum                 7
Dell                  7
Prudential            7
Mastercard            7
CrossFit              7
MetroPCS              7
NBC                   6
Chick-fil-A           6
FootJoy               6
Chevrolet             6
Chobani               6
Anaheim               6
Country               6
Harley-Davidson       5
Lexus                 5
Hockey                5
Golf                  5
Humana                5
Muscle                5
Dymatize        

In [55]:
# Single matched terms whose transcript context is printed for manual review
# in the following cell.
check = [
    "Date", "News", "Fire", "Team", "Super", "Icon", "Run", "Dinner", "Kate", "Buy",
    "Company", "Get", "USA", "End", "Denver", "Let's", "My", "Theater", "Stage", "Food",
    "Her", "Go", "Purpose", "Arts", "Cotton", "Today", "Holloween", "Out", "Natural", "Are",
    "One", "Health", "Jack", "Happy", "Sports", "Spice", "baby", "Old", "Nutrition", "Center",
    "Muscle", "Show",
]

In [61]:
# For every term under review, print each occurrence in the matching
# transcripts together with up to 5 tokens before it and 4 after it.
for term in check:
    print(f"Term: ---{term}---")
    wanted = term.lower()
    for _, row in temp[temp.found==term].iterrows():
        print(f"Sponsor: {row.sponsor_name}")
        tokens = row.transcript.split()  # split once per transcript
        for j, token in enumerate(tokens):
            if token.lower() == wanted:
                # Clamp the window start at 0: a negative start would count from
                # the end of the list and yield an empty or unrelated context
                # for matches within the first five tokens.
                print('TEXT:',' '.join(tokens[max(j-5, 0):j+5]))
                # every occurrence is printed (no break)
    print("\n")

Term: ---Date---
Sponsor: Hot Date
TEXT: fine time on our regular date this is a two-person


Term: ---News---
Sponsor: FOX10 News
TEXT: 
TEXT: in another vehicle Fox 10 news reporter Rebecca hager's Live
TEXT: coming up on Fox 10 News at 9:00 tonight you'll
TEXT: live from I-65 Fox 10 News


Term: ---Fire---
Sponsor: Kidde Fire Safety
TEXT: to try and fight a fire or to try and
TEXT: more likely to start a fire especially while cooking part
TEXT: if your house catches on fire get out and stay


Term: ---Team---
Sponsor: Team Red, White & Blue
TEXT: into civilian life the mission team are to enrich the


Term: ---Super---
Sponsor: Live Super Deluxe
TEXT: favorite Nintendo game Mario Kart Super Mario Brothers California play


Term: ---Icon---
Sponsor: Icon
TEXT: help you out we made icon underwear so women can


Term: ---Run---
Sponsor: Nike+ Run Club
TEXT: that my flip during your run these perforations in the


Term: ---Dinner---
Sponsor: Beatriz at Dinner
TEXT: is not working Beatri

In [73]:
clear = ["Icon", "Run", "Dinner", "Kate", "Get", "USA", "End", "Stage", "Food", "Go", "Purpose", "Cotton", "Out", "Natural", "Happy", "Spice", "Center", "Muscle"]
not_sure = ['Health', "Jack", "baby"]
# remove_terms = the entries of found + check (in that order, duplicates kept)
# that are listed in `clear` or `not_sure`.
# NOTE(review): this keeps the terms that ARE in clear/not_sure rather than
# subtracting them, so an entry of `found` only contributes if it also appears
# in one of those two lists - confirm this is the intended selection.
flagged = set(clear).union(not_sure)
remove_terms = [candidate for candidate in found + check if candidate in flagged]
print(f"Number of terms to remove: {len(remove_terms)}")

Number of terms to remove: 21


In [93]:
def remove(text, terms):
    """Discard a list of matched terms when its first entry is flagged.

    ``text`` is a list of matched terms and ``terms`` the collection of flagged
    terms. Returns a new empty list if ``text`` is non-empty and ``text[0]`` is
    in ``terms``; otherwise returns ``text`` itself.

    NOTE(review): only the first element is inspected, so a multi-word match
    that starts with a flagged term is discarded as a whole - confirm intent.
    """
    if len(text) > 0 and text[0] in terms:
        return []
    return text
# Drop flagged matches from found_terms, showing the four most frequent values
# before and after, then clear is_in_text wherever no matched term is left.
cleaned_terms = sponsored['found_terms'].apply(remove, terms=remove_terms)
print(sponsored['found_terms'].value_counts().head(4))
print(cleaned_terms.value_counts().head(4))

sponsored['found_terms'] = cleaned_terms

mask = sponsored['found_terms'].map(len) == 0
sponsored.loc[mask, 'is_in_text'] = False
print(sponsored['is_in_text'].value_counts())

[]                 4593
[Dr, Pepper]         66
[Edward, Jones]      57
[84, Lumber]         32
Name: found_terms, dtype: int64
[]                 4593
[Dr, Pepper]         66
[Edward, Jones]      57
[84, Lumber]         32
Name: found_terms, dtype: int64
False    4593
True     1593
Name: is_in_text, dtype: int64


In [95]:
# Rows whose transcript contains "patreon" (case-insensitive); the cell shows
# how many there are.
mentions_patreon = sponsored['transcript'].str.contains('patreon', case=False)
patreon = sponsored[mentions_patreon]
patreon.shape

(0, 9)

In [96]:
# Write the sponsored frame to CSV without the index column.
# NOTE(review): list-valued columns (terms, found_terms) are presumably stored
# as their text representation - verify before relying on them after a reload.
sponsored_csv = r"C:\Users\doosti\OneDrive - Chapman University\Research\Research Projects\Facebook\Tubular\revision_2024\sponsored.csv"
sponsored.to_csv(sponsored_csv, index=False)

In [4]:
# Read the previously saved sponsored frame back from CSV.
sponsored_csv = r"C:\Users\doosti\OneDrive - Chapman University\Research\Research Projects\Facebook\Tubular\revision_2024\sponsored.csv"
sponsored = pd.read_csv(sponsored_csv)

In [40]:
def search_term(term, docs, window=10):
    """Print every occurrence of ``term`` in ``docs`` with surrounding words.

    Parameters
    ----------
    term : str
        Text to look for. Matching is case-insensitive and by substring, so a
        token such as ``iPhones`` matches ``iphone``.
    docs : iterable of str
        Transcripts to search. The number printed for a hit is the position of
        the transcript within ``docs`` (from ``enumerate``), not an index label.
    window : int, default 10
        Context size: up to ``window`` tokens before the hit and ``window - 1``
        after it are printed.

    Returns
    -------
    None. Each hit is printed, followed by blank lines.
    """
    # Tokens are lowercased for comparison, so the term must be as well;
    # otherwise a term containing capitals could never match.
    needle = term.lower()
    for i, trans in enumerate(docs):
        tokens = trans.split()
        for j, token in enumerate(tokens):
            if needle in token.lower():
                start = max(j - window, 0)
                end = min(j + window, len(tokens))
                print(f'Found on documnet {i}:', ' '.join(tokens[start:end]))
                print("\n")

# Show every transcript passage that mentions "iphone".
search_term("iphone", sponsored['transcript'])
            

Found on documnet 293: on the Android store or on the App Store on iPhones remember you can also go to chatsports.com tickets to


Found on documnet 317: matchup they haven't shown that they can dominate of my iPhone go to Facebook live and give that like button


Found on documnet 317: I'm giving it a heart baby I'm pulling out my iPhone going to Facebook live chat sports the leader in


Found on documnet 317: one and I'm giving the heart I'm pulling out my iPhone for Clemson I think they're going to win this


Found on documnet 317: one and I'm giving the heart I'm pulling on my iPhone for Clemson I think they're going to win this


Found on documnet 317: one and I'm giving the heart I'm pulling out my iPhone I'm hitting that heart button for Clemson I think


Found on documnet 317: the big one and I'm giving the heart on my iPhone for Clemson I think they're going to win this


Found on documnet 317: one and I'm giving the heart I'm pulling out my iPhone for Clemson I think they're 