In [None]:
# ! pip install isodate

In [40]:
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from isodate import parse_duration
from datetime import datetime, timedelta
import random
import time
import pickle


In [41]:
# Plain-text file holding one YouTube Data API key per line.
# NOTE(review): this is an absolute, user-specific path; consider reading the
# location from an environment variable so the notebook runs on other machines.
path_to_key = "/Users/sacayo/documents/Youtube_api_key"
with open(path_to_key, "r") as f:
    # Strip every line so stray spaces or Windows line endings ('\r') never end
    # up inside a key. Line positions are preserved, so the indexes below keep
    # referring to the same lines of the file as before.
    api_keys = [line.strip() for line in f.read().split('\n')]
api_key1 = api_keys[0]
api_key2 = api_keys[1]
api_key3 = api_keys[2]
api_key4 = api_keys[3]


In [42]:
# Constants for API query
# Page size for commentThreads.list; the API documents 1-100 as the accepted
# range, so 100 is the largest usable value.
COMMENT_MAX_RESULTS = 100
VIDEO_MAX_RESULTS = 50
EXAMPLES_PER_GROUP = 300
SAVE_FREQUENCY = 50
SEARCH_TERMS = ["news", "business", "markets", "economy"]
CACHE_FILE = 'video_cache.pkl'
# Budget handed to QuotaManager by main(); previously referenced but never defined.
DAILY_QUOTA_LIMIT = 10000


In [43]:
class QuotaManager:
    """Client-side counter that tracks request cost against a fixed budget.

    The counter is purely local: it only knows about costs passed to
    ``log_request`` and is never reset by this class.
    """

    def __init__(self, daily_limit):
        # `usage` accumulates the cost of every logged request.
        self.usage = 0
        self.daily_limit = daily_limit

    def can_make_request(self, cost=1):
        """Return True when spending ``cost`` more would not exceed the budget."""
        projected_usage = self.usage + cost
        return projected_usage <= self.daily_limit

    def log_request(self, cost=1):
        """Record that ``cost`` units have been spent."""
        self.usage = self.usage + cost

def create_youtube_client(api_key):
    """Build and return a 'youtube' v3 API client that uses ``api_key`` as its developer key."""
    client = build('youtube', 'v3', developerKey=api_key)
    return client

def load_cache(cache_file):
    """Load the pickled cache stored at ``cache_file``.

    Args:
        cache_file: Path of the pickle file to read.

    Returns:
        The unpickled object, or an empty dict when the file does not exist
        or is empty/truncated (e.g. a previous run was interrupted mid-write).
    """
    try:
        with open(cache_file, 'rb') as f:
            # NOTE(security): pickle.load can execute arbitrary code; only load
            # cache files this notebook wrote itself.
            return pickle.load(f)
    except FileNotFoundError:
        return {}
    except (EOFError, pickle.UnpicklingError):
        # An unreadable cache is treated like a missing one so a bad file
        # cannot block data collection.
        print(f"Cache file {cache_file} is empty or corrupt. Starting with an empty cache.")
        return {}

def save_cache(cache, cache_file):
    """Pickle ``cache`` into ``cache_file``, replacing any existing content."""
    with open(cache_file, 'wb') as handle:
        handle.write(pickle.dumps(cache))

def execute_with_backoff(request, quota_manager):
    """Execute an API request, retrying transient failures with exponential backoff.

    Args:
        request: Object exposing ``execute()``, as returned by the API client's
            ``.list(...)`` builders.
        quota_manager: Object exposing ``can_make_request()`` and
            ``log_request()``; one unit is logged per successful call.

    Returns:
        Whatever ``request.execute()`` returns.

    Raises:
        HttpError: Immediately, for statuses that are not retried and for 403
            responses whose body does not mention a quota or rate-limit reason
            (for example ``commentsDisabled``), so callers can handle them.
        Exception: When the local quota budget is exhausted, or when every
            attempt failed with a retryable error (chained to the last error).
    """
    max_attempts = 6  # one initial attempt plus up to 5 retries
    retryable_statuses = (403, 429, 500, 503)
    # 403 also covers permanent conditions such as disabled comments; only the
    # reasons below are worth waiting for.
    retryable_403_markers = (
        'quotaExceeded',
        'dailyLimitExceeded',
        'rateLimitExceeded',
        'userRateLimitExceeded',
    )
    last_error = None

    for attempt in range(max_attempts):
        if not quota_manager.can_make_request():
            raise Exception("Daily quota limit reached")
        try:
            response = request.execute()
        except HttpError as e:
            status = e.resp.status
            if status not in retryable_statuses:
                raise
            if status == 403 and not any(marker in str(e.content) for marker in retryable_403_markers):
                # Permanent 403: surface the original error to the caller.
                raise
            last_error = e
            if attempt == max_attempts - 1:
                break  # no further attempt follows, so do not sleep
            delay = 2 ** attempt  # Exponential backoff
            print(f"Quota exceeded or server error. Retrying in {delay} seconds.")
            time.sleep(delay)
        else:
            quota_manager.log_request()
            return response

    raise Exception("Failed after multiple retries") from last_error


In [44]:

def fetch_comments(youtube, video_id, quota_manager, max_results=COMMENT_MAX_RESULTS):
    """Page through every top-level comment thread of a video.

    The responses are currently discarded: nothing is stored or returned, so
    calling this only walks the pages (and spends quota).

    Args:
        youtube: API client exposing ``commentThreads().list(...)``.
        video_id: ID of the video whose comment threads are requested.
        quota_manager: Passed through to ``execute_with_backoff``.
        max_results: Requested page size. Values outside the 1-100 range
            documented for commentThreads.list are clamped into it; ``None``
            is passed through unchanged.

    Returns:
        None. HttpErrors are printed and end the paging loop.
    """
    if max_results is None:
        page_size = None
    else:
        # The API documents 1-100 as the accepted range for maxResults.
        page_size = max(1, min(max_results, 100))

    next_page_token = None
    while True:
        try:
            comments_request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=page_size,
                pageToken=next_page_token
            )
            comments_response = execute_with_backoff(comments_request, quota_manager)

            # Process comments here if needed

            if 'nextPageToken' not in comments_response:
                break
            next_page_token = comments_response['nextPageToken']
        except HttpError as e:
            if e.resp.status == 403 and 'commentsDisabled' in str(e.content):
                print(f"Comments disabled for video {video_id}")
            else:
                print(f"Error fetching comments for video {video_id}: {e}")
            break


In [45]:
def fetch_videos_details(youtube, video_ids, quota_manager):
    """Fetch details for multiple videos in a single request.

    Args:
        youtube: API client exposing ``videos().list(...)``.
        video_ids: Iterable of video ID strings; they are comma-joined into
            one ``id`` parameter.
        quota_manager: Passed through to ``execute_with_backoff``.

    Returns:
        Dict mapping each returned item's ``id`` to the item. Empty when
        ``video_ids`` is empty (no request is made) or when the request
        raises an HttpError (the error is printed).
    """
    video_ids = list(video_ids)
    if not video_ids:
        # Nothing to look up: avoid a request with an empty `id` filter.
        return {}

    try:
        video_request = youtube.videos().list(
            part="snippet,contentDetails,statistics,status",
            id=','.join(video_ids),
            # Restrict the payload to the fields main() actually reads.
            fields="items(id,snippet(title,channelTitle,categoryId,publishedAt),contentDetails(duration),statistics(viewCount,likeCount,dislikeCount,commentCount),status(madeForKids))"
        )
        video_response = execute_with_backoff(video_request, quota_manager)
        return {item['id']: item for item in video_response.get('items', [])}
    except HttpError as e:
        print(f"Error fetching details for videos: {e}")
        return {}


In [46]:
import os

def save_data(data, filename):
    """Write ``data`` to ``filename`` as CSV, appending when the file has content.

    Args:
        data: Anything ``pd.DataFrame`` accepts (main() passes a list of dicts).
        filename: Target CSV path.

    Behaviour:
        * Empty data: nothing is written, so no header-less file is created
          that later appends would never repair.
        * File missing or zero bytes: a new file with a header row is written.
        * Otherwise: rows are appended without a header.
    """
    df = pd.DataFrame(data)

    if df.empty:
        print(f"No data to save; {filename} left untouched")
        return

    file_has_content = os.path.exists(filename) and os.path.getsize(filename) > 0
    if file_has_content:
        # File exists, append without writing the header
        df.to_csv(filename, mode='a', header=False, index=False)
        print(f"Data appended to existing file {filename}")
    else:
        # File doesn't exist (or is empty), create a new file with header
        df.to_csv(filename, index=False)
        print(f"New file created: {filename}")



In [None]:

def main():
    """Collect recent YouTube videos for random search terms and save them to CSV.

    Repeatedly searches one of SEARCH_TERMS for videos published in the last
    7 days, fetches their details, skips made-for-kids videos and videos whose
    length is not in (60, 600] seconds, and records one row per remaining
    video until EXAMPLES_PER_GROUP rows are collected or an error stops the
    loop. A partial CSV and the detail cache are written every SAVE_FREQUENCY
    videos; the full result is saved at the end.

    Relies on notebook-level names: api_key2, DAILY_QUOTA_LIMIT, CACHE_FILE and
    the other query constants must be defined before this is called.
    """
    # Local import: the notebook's import cell only pulls in datetime/timedelta.
    from datetime import timezone

    f_name =  f'youtube_extract1_tv_{datetime.now().strftime("%d%b%Y")}.csv'
    youtube = create_youtube_client(api_key2)
    quota_manager = QuotaManager(DAILY_QUOTA_LIMIT)
    video_cache = load_cache(CACHE_FILE)

    video_data = []
    video_ids_seen = set()
    category_titles = {}  # categoryId -> category title, filled on first use
    videos_collected = 0
    # The trailing "Z" declares UTC, so the cutoff must be computed in UTC
    # rather than in local time.
    published_after = (datetime.now(timezone.utc) - timedelta(days=7)).strftime("%Y-%m-%dT%H:%M:%SZ")

    while videos_collected < EXAMPLES_PER_GROUP:
        search_query = random.choice(SEARCH_TERMS)
        print(f"Searching for: {search_query}")

        try:
            search_request = youtube.search().list(
                part="snippet",
                q=search_query,
                type="video",
                publishedAfter=published_after,
                maxResults=VIDEO_MAX_RESULTS,
                relevanceLanguage='en'
            )
            # NOTE(review): the API's quota table prices search.list far above
            # one unit, but execute_with_backoff logs every call as 1 --
            # confirm whether the local budget should account for that.
            search_response = execute_with_backoff(search_request, quota_manager)

            video_ids_batch = []
            for item in search_response.get('items', []):
                video_id = item['id']['videoId']
                if video_id not in video_ids_seen:
                    video_ids_batch.append(video_id)
                    video_ids_seen.add(video_id)

                if len(video_ids_batch) == VIDEO_MAX_RESULTS:
                    break

            videos_details = fetch_videos_details(youtube, video_ids_batch, quota_manager)

            for video_id, video_details in videos_details.items():
                # Prefer previously cached details; otherwise remember these.
                if video_id in video_cache:
                    video_details = video_cache[video_id]
                else:
                    video_cache[video_id] = video_details

                if video_details['status']['madeForKids']:
                    print(f"Skipping video {video_id} as it is made for kids")
                    continue

                duration = parse_duration(video_details['contentDetails']['duration']).total_seconds()
                if not (60 < duration <= 600):
                    continue

                fetch_comments(youtube, video_id, quota_manager)

                snippet = video_details['snippet']
                statistics = video_details['statistics']

                # Resolve the category title once per categoryId, going through
                # the same backoff/quota path as every other request.
                category_id = snippet['categoryId']
                if category_id not in category_titles:
                    category_request = youtube.videoCategories().list(part="snippet", id=category_id)
                    category_items = execute_with_backoff(category_request, quota_manager).get('items', [])
                    category_titles[category_id] = (
                        category_items[0]['snippet']['title'] if category_items else None
                    )

                video_data.append({
                    'video_id': video_id,
                    'video_title': snippet['title'],
                    'channel_name': snippet['channelTitle'],
                    'genre': category_titles[category_id],
                    'views': int(statistics.get('viewCount', 0)),
                    'likes': int(statistics.get('likeCount', 0)),
                    'dislikes': int(statistics.get('dislikeCount', 0)),
                    'comment_count': int(statistics.get('commentCount', 0)),
                    'video_length': duration,
                    'video_posting_date': datetime.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d"),
                    'made_for_kids': video_details['status']['madeForKids']
                })

                videos_collected += 1

                if videos_collected % SAVE_FREQUENCY == 0:
                    # Checkpoint: everything collected so far goes into its own file.
                    save_data(video_data, f'youtube_extract1_tv_{datetime.now().strftime("%d%b%Y")}_partial_{videos_collected}.csv')
                    save_cache(video_cache, CACHE_FILE)

                if videos_collected >= EXAMPLES_PER_GROUP:
                    break

            time.sleep(1)  # Basic rate limiting

        except HttpError as e:
            print(f"An error occurred: {e}")
            if e.resp.status in [403, 429]:
                print("Quota exceeded or rate limit hit. Waiting before retrying...")
                time.sleep(60)
            else:
                raise
        except Exception as e:
            # Includes the plain Exceptions raised by execute_with_backoff.
            print(f"An unexpected error occurred: {e}")
            break

    save_data(video_data, f_name)
    save_cache(video_cache, CACHE_FILE)

if __name__ == "__main__":
    # for loop to automate rerun of main
    # Every main() call builds its own result list and seen-ID set, and ends by
    # passing its results to save_data for the same dated file name, so videos
    # found by more than one attempt can be written more than once.
    for i in range(50):
        print("Attempt: ",i)
        main()

## Run after getting all the other videos

In [None]:
import pandas as pd
combined_file =  f'youtube_extract_tv_{datetime.now().strftime("%d%b%Y")}_combined.csv'

# List of specific CSV files to combine
csv_files = [
    "sacayo_youtube_extract_tv_15Jul2024_combined_deduplicated.csv",
    "sacayo_youtube_extract_tv_16Jul2024_combined_deduplicated.csv"

]

# List to hold each DataFrame
dfs = []

# Loop through all specified CSV files and read them into DataFrames
for csv_file in csv_files:
    try:
        df = pd.read_csv(csv_file)
        dfs.append(df)
    except FileNotFoundError:
        print(f"File {csv_file} not found. Skipping.")

# pd.concat rejects an empty list with a generic message; fail with one that
# names the actual problem instead.
if not dfs:
    raise ValueError(f"None of the {len(csv_files)} input CSV files could be read; nothing to combine.")

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv(combined_file, index=False)

print(f"Combined data saved to {combined_file}")


In [None]:
import pandas as pd
# Compute the date stamp once so the three file names below always agree,
# even if the cell happens to run across midnight.
date_stamp = datetime.now().strftime("%d%b%Y")
combined_deduped_file =  f'youtube_extract_tv_{date_stamp}_combined_deduplicated.csv'
combined_file =  f'youtube_extract_tv_{date_stamp}_combined.csv'
f_name =  f'youtube_extract1_tv_{date_stamp}.csv'
# Read the CSV file
df = pd.read_csv(combined_file)

# Remove duplicates
# NOTE(review): drop_duplicates() only removes rows identical in every column;
# the same video_id with different counts would be kept -- confirm that is intended.
df_cleaned = df.drop_duplicates()

# Save cleaned data to a new CSV file
df_cleaned.to_csv(combined_deduped_file, index=False)

print("Duplicates removed and cleaned CSV saved successfully.")


In [None]:
import pandas as pd

# Read the deduplicated CSV file
df = pd.read_csv(combined_deduped_file)

# Find titles not containing 'bitcoin' or 'cryptocurrency'
# na=False makes rows with a missing title count as "no match"; without it the
# mask contains NaN and cannot be negated with `~`.
title_mentions_crypto = df['video_title'].str.contains('bitcoin|cryptocurrency', case=False, na=False)
non_bitcoin_titles = df[~title_mentions_crypto]

# Count the number of videos
num_videos = non_bitcoin_titles.shape[0]

# Print the count and titles
print(f"Number of videos without 'bitcoin' or 'cryptocurrency': {num_videos}\n")
for idx, title in enumerate(non_bitcoin_titles['video_title'], 1):
    print(f"{idx}. {title}")



In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Function to clean and tokenize text
def clean_tokenize(text):
    """Return the lower-cased tokens of ``text`` with punctuation and English stopwords removed.

    Punctuation (``string.punctuation``) is deleted before tokenizing, so
    contractions lose their apostrophe prior to the stopword comparison.
    """
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    english_stopwords = set(stopwords.words('english'))
    return [
        word
        for word in word_tokenize(without_punctuation.lower())
        if word not in english_stopwords
    ]

# Read the deduplicated CSV file
df = pd.read_csv(combined_deduped_file)

# Find titles not containing 'bitcoin', 'cryptocurrency', 'crypto' or 'blockchain'
# na=False treats a missing title as "no match" so the mask stays boolean and
# can be negated.
title_mentions_crypto = df['video_title'].str.contains(
    'bitcoin|cryptocurrency|crypto|blockchain', case=False, na=False
)
non_bitcoin_titles = df[~title_mentions_crypto]

# Extract all titles as a single text (missing titles are skipped because
# str.join only accepts strings)
all_titles = ' '.join(non_bitcoin_titles['video_title'].dropna().astype(str))

# Clean and tokenize the text
tokens = clean_tokenize(all_titles)

# Count frequency of each word
word_freq = Counter(tokens)

# Get the most common words (adjust the number as needed)
common_words = word_freq.most_common(30)

# Print common words
print("Common words in video titles without 'bitcoin' 'crypto' 'blockchain' or 'cryptocurrency':")
for word, freq in common_words:
    print(f"{word}: {freq}")


In [None]:
# Preview the first rows of whatever frame is currently bound to `df`.
df.head()

In [None]:
# List the distinct values of the `genre` column.
df.genre.unique()

In [None]:
# Show only the rows whose `made_for_kids` value compares equal to False.
df[df['made_for_kids'] == False]

In [None]:
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import csv

# Load the video IDs from the combined deduplicated CSV file
# (`combined_deduped_file` is not assigned in this cell; it must already be
# defined in the kernel.)
df = pd.read_csv(combined_deduped_file)
video_ids = df['video_id'].tolist()

# Load the existing comments CSV file and extract the video IDs
# A missing comments file simply means no video has been processed yet.
try:
    existing_comments_df = pd.read_csv('youtube_comments_07jul2024.csv')
    existing_video_ids = set(existing_comments_df['video_id'].tolist())
except FileNotFoundError:
    existing_video_ids = set()

# Function to extract comments for a given video ID
def _comment_row(video_id, comment_id, snippet):
    """Flatten one comment snippet into the row layout used for the comments CSV."""
    return {
        'video_id': video_id,
        'comment_id': comment_id,
        'author': snippet['authorDisplayName'],
        'text': snippet['textDisplay'],
        'published_at': snippet['publishedAt'],
        'like_count': snippet['likeCount']
    }


def _iter_reply_rows(youtube, video_id, parent_id, max_results):
    """Yield one row per reply to ``parent_id``, following pagination to the end."""
    reply_page_token = None
    while True:
        replies_response = youtube.comments().list(
            part="snippet",
            parentId=parent_id,
            maxResults=max_results,
            pageToken=reply_page_token,
            textFormat="plainText"
        ).execute()

        for reply in replies_response.get('items', []):
            yield _comment_row(video_id, reply['id'], reply['snippet'])

        reply_page_token = replies_response.get('nextPageToken')
        if not reply_page_token:
            return


def extract_comments(youtube, video_id, max_results=100):
    """Collect every top-level comment and reply of a video as flat rows.

    Args:
        youtube: API client exposing ``commentThreads()`` and ``comments()``.
        video_id: ID of the video to read comments from.
        max_results: Page size used for both thread and reply requests.

    Returns:
        List of dicts with keys video_id, comment_id, author, text,
        published_at and like_count; each top-level comment is followed by its
        replies. On an HttpError the error is printed and the rows gathered so
        far are returned.
    """
    rows = []
    thread_page_token = None

    while True:
        try:
            threads_response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=max_results,
                pageToken=thread_page_token,
                textFormat="plainText"
            ).execute()

            for thread in threads_response.get('items', []):
                top_comment = thread['snippet']['topLevelComment']['snippet']
                rows.append(_comment_row(video_id, thread['id'], top_comment))

                # Replies are fetched separately, only when the thread has any.
                if thread['snippet']['totalReplyCount'] > 0:
                    rows.extend(_iter_reply_rows(youtube, video_id, thread['id'], max_results))

            thread_page_token = threads_response.get('nextPageToken')
            if not thread_page_token:
                break
        except HttpError as e:
            print(f"An HTTP error {e.resp.status} occurred:\n{e.content}")
            break

    return rows

# Function to save comments to a CSV file
def save_comments_to_csv(comments, filename='youtube_comments_07jul2024.csv'):
    """Append comment rows to a CSV file, creating it with a header row first if needed.

    Args:
        comments: Iterable of dicts whose keys are the six column names below.
        filename: Target CSV path.
    """
    columns = ['video_id', 'comment_id', 'author', 'text', 'published_at', 'like_count']

    # Exclusive-create mode succeeds only when the file does not exist yet,
    # which is exactly when the header row has to be written.
    try:
        new_file = open(filename, 'x', newline='', encoding='utf-8')
    except FileExistsError:
        new_file = None
    if new_file is not None:
        with new_file:
            csv.DictWriter(new_file, fieldnames=columns).writeheader()

    with open(filename, 'a', newline='', encoding='utf-8') as out:
        csv.DictWriter(out, fieldnames=columns).writerows(comments)

# Collect all comments for all video IDs and save them incrementally
# NOTE(review): `youtube` is not created in this cell; in this notebook it is
# first assigned in a later cell, so a fresh top-to-bottom run fails here.
for video_id in video_ids:
    # Videos already present in the comments CSV are skipped, which lets an
    # interrupted run be resumed.
    if video_id in existing_video_ids:
        print(f"Skipping video ID {video_id} as it's already processed.")
        continue
    print(f"Extracting comments for video ID: {video_id}")
    comments = extract_comments(youtube, video_id)
    save_comments_to_csv(comments)
    print(f"Comments for video ID {video_id} saved.")

print("All comments have been saved incrementally to youtube_comments_07jul2024.csv")


# Extracting comments


In [47]:
#Functions to get comments and replies

def get_all_comments_and_replies(youtube, video_id, quota_manager):
    """Collect every top-level comment of a video together with its replies.

    Args:
        youtube: API client exposing ``commentThreads()`` and ``comments()``.
        video_id: ID of the video to read comments from.
        quota_manager: Passed through to ``execute_with_backoff``.

    Returns:
        List of dicts ``{'video_id', 'comment', 'replies'}`` where ``replies``
        is a list of reply texts. On an HttpError a message is printed and the
        entries gathered so far are returned.
    """
    comments_data = []

    # Function to get replies for a given comment
    def get_replies(parent_id):
        replies = []
        request = youtube.comments().list(
            part='snippet',
            parentId=parent_id,
            maxResults=100
        )
        # list_next returns a falsy value once there are no more pages.
        while request:
            response = execute_with_backoff(request, quota_manager)
            for item in response['items']:
                replies.append(item['snippet']['textDisplay'])
            request = youtube.comments().list_next(request, response)
        return replies

    # Function to get top-level comments and their replies
    def get_comments(video_id):
        try:
            request = youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=100
            )
            while request:
                response = execute_with_backoff(request, quota_manager)
                for item in response['items']:
                    thread_snippet = item['snippet']
                    top_level_comment = thread_snippet['topLevelComment']['snippet']['textDisplay']
                    comment_replies = get_replies(item['id']) if thread_snippet['totalReplyCount'] > 0 else []
                    comments_data.append({
                        'video_id': thread_snippet['videoId'],
                        'comment': top_level_comment,
                        'replies': comment_replies,
                    })
                request = youtube.commentThreads().list_next(request, response)
        except HttpError as e:
            # Only a 403 that names commentsDisabled means comments are off;
            # other 403s (e.g. quota) are reported as generic HTTP errors,
            # matching the check used in fetch_comments.
            if e.resp.status == 403 and 'commentsDisabled' in str(e.content):
                print(f"Comments are disabled for video ID: {video_id}")
            else:
                print(f"An HTTP error {e.resp.status} occurred: {e.content}")

    get_comments(video_id)
    return comments_data

In [48]:
def get_video_stats(video_id, quota_manager, youtube=None):
    """Fetch the ``statistics`` part of a single video.

    Args:
        video_id: ID of the video to look up.
        quota_manager: Passed through to ``execute_with_backoff``.
        youtube: Optional existing API client. When omitted, a new client is
            built with ``api_key3`` (the previous behaviour); passing one
            avoids rebuilding a client for every video in a loop.

    Returns:
        The video's statistics dict, or an empty dict when the video has no
        items in the response or any error occurs (a message is printed).
    """
    if youtube is None:
        youtube = build('youtube', 'v3', developerKey=api_key3)

    try:
        request = youtube.videos().list(
            part='statistics',
            id=video_id
        )
        video_stats = execute_with_backoff(request, quota_manager)
        if 'items' in video_stats and video_stats['items']:
            return video_stats['items'][0]['statistics']
        else:
            print(f"No statistics available for video ID: {video_id}")
            return {}
    except HttpError as e:
        print(f"An HTTP error {e.resp.status} occurred: {e.content}")
        return {}
    except Exception as e:
        print(f"An unexpected error occurred while fetching stats for video {video_id}: {e}")
        return {}

In [49]:
#Functions to get video statistics

# Sentinel returned by the getters below when a statistic is absent.
_MISSING_STAT = -99999


def _stat_as_int(video_stats, key):
    """Return ``video_stats[key]`` as an int, or the missing-value sentinel."""
    return int(video_stats.get(key, _MISSING_STAT))


def get_views(video_stats):
    """View count from a statistics dict (-99999 when absent)."""
    return _stat_as_int(video_stats, 'viewCount')


def get_likes(video_stats):
    """Like count from a statistics dict (-99999 when absent)."""
    return _stat_as_int(video_stats, 'likeCount')


def get_dislikes(video_stats):
    """Dislike count from a statistics dict (-99999 when absent)."""
    return _stat_as_int(video_stats, 'dislikeCount')


def get_comment_count(video_stats):
    """Comment count from a statistics dict (-99999 when absent)."""
    return _stat_as_int(video_stats, 'commentCount')


def get_favorites(video_stats):
    """Favorite count from a statistics dict (-99999 when absent)."""
    return _stat_as_int(video_stats, 'favoriteCount')


In [53]:
# Constants
DAILY_LIMIT = 10000  #  daily quota limit
# NOTE: this rebinds CACHE_FILE, which the query-constants cell set to
# 'video_cache.pkl'; code run after this cell uses the new file name.
CACHE_FILE = "youtube_cache.pkl"

# Initialize YouTube client, QuotaManager, and cache
# These three names are notebook-level state shared by the cells below.
youtube = create_youtube_client(api_key2)
quota_manager = QuotaManager(DAILY_LIMIT)
video_cache = load_cache(CACHE_FILE)

# Hard-coded video IDs for the first group processed by the comment-collection
# cell that writes 'treat_set1_comments.csv'.
treat_set_1 = [
    'qRD4oFEzEk8', 'JJZM_E5dhlg', '79jPeK0-5hk', 'OzfCwTb8Bog', 'BOahopz5nJI',
    'eKqa06ThrlQ', 'sCGFARcB2mM', 'X6Mic41rb84', 'G2T6_2cR_as', 'ytscdDCVz8k',
    'sBI__lWleHI', 'lNxSOMUBxmc', 'ofJPJZpfhG0', '9vT_W6eeLy8', 'kkOOwdL4T_w',
    'VpIR1GUgrv4', '0SFLiZIJR2U', '9sr6nDBUy6Y', 'EN_3DfcDCJs', 'uPB6IiqRWAw',
    '6bVXrfr4D2g',
]
# Hard-coded video IDs for the second group (comments and statistics are
# pulled for it in the cells below).
treat_set_2 = [
    'Bp_gEFGZixI', '3tRTSk4hOk8', 'yajzaIWxAvQ', '6xDAhnt8Uio', 'bBmI7rC-h1s',
    '7NosnGRGSoQ', '94jgviuiHYI', 'cvR6kTkq3Yg', '4NHFZ5XUMpg', 'cV1rhDVqB6E'
]
# Hard-coded video IDs for the control group.
# NOTE(review): no visible cell iterates over control_set -- confirm whether
# its comments/statistics are collected elsewhere.
control_set = [
    '70Qhge-3W-k', 'SreJlZGd1c0', '6ClWPlrHsW0', 'UUuDXLsiy-U', 'I3vCi8LW1fw',
    'xiJyyOuHNOQ', 'AL2SeKBl7mA', 'ejFXc_4qYtQ', 'M5v8G3W2eH4', 'OkOgVtkEVBc',
    'jluhsLZAPlw', '2oCi_5NneN8', 'wAfvhq8Ay4U', 'lCUv9oGVB0E', 'lRegtVkgKFs',
    'fxNxSHzwFng', 'P6IVgKndOkY', '5n63YOl0UsI', '0FpA1NtmqNM', 'Ue8D5GiaTDE',
    'mXbjiTfWujI', 'eLteHvWLJig', 'qZyIAb1fZ4k', 'VPZHhoyWWS4', 'zL7Ke8RkZ6o',
    'r6AZVmzRwXM', 'EmGyGEyDOPc', 'OAXWq1b1l7g', 'FARTWvyGmo4', 'xy92iXCMstA',
    'X8NzgjGh4ro', 'sDSSMSiAVOU', 'HM52g6mHiYo', 'hAD6u_WCIKI', '1OtdzqdcZS4',
    '4Mnmef0n1aw', 'bXyZqd_U56Y', 'cMEsgJSaaJA', 'KN9INe2hXXw',
]

In [18]:
# Collect comments (and replies) for every video in treat_set_1 and save them
# to 'treat_set1_comments.csv'. Per-video frames are gathered in a list and
# concatenated once, instead of re-concatenating a growing frame on every
# iteration starting from an empty DataFrame.
comment_frames = []

for video in treat_set_1:
    try:
        print(f"Processing video: {video}")
        comments_data = get_all_comments_and_replies(youtube, video, quota_manager)

        if comments_data:
            df = pd.DataFrame(comments_data)
            df['video_id'] = video
            df['date'] = datetime.now()  # collection timestamp for this video
            comment_frames.append(df)
        else:
            print(f"No comments data available for video: {video}")

    except HttpError as e:
        print(f"An HTTP error {e.resp.status} occurred for video {video}: {e.content}")
    except Exception as e:
        print(f"An unexpected error occurred for video {video}: {e}")

    # Stop early once the local quota budget is spent.
    if not quota_manager.can_make_request():
        print("Daily quota limit reached. Stopping execution.")
        break

comments_dfs = pd.concat(comment_frames, ignore_index=True) if comment_frames else pd.DataFrame()

if not comments_dfs.empty:
    print(comments_dfs.head())
    comments_dfs.to_csv('treat_set1_comments.csv', index=False)
    print("Comments data saved to 'treat_set1_comments.csv'")
else:
    print("No comments data collected.")

print(f"Total quota used: {quota_manager.usage}")

Processing video: qRD4oFEzEk8
Processing video: JJZM_E5dhlg
Processing video: 79jPeK0-5hk
Processing video: OzfCwTb8Bog
Processing video: BOahopz5nJI
Processing video: eKqa06ThrlQ
Processing video: sCGFARcB2mM
Processing video: X6Mic41rb84
Processing video: G2T6_2cR_as
Processing video: ytscdDCVz8k
Processing video: sBI__lWleHI
Processing video: lNxSOMUBxmc
Processing video: ofJPJZpfhG0
Processing video: 9vT_W6eeLy8
Processing video: kkOOwdL4T_w
Processing video: VpIR1GUgrv4
Processing video: 0SFLiZIJR2U
Processing video: 9sr6nDBUy6Y
Processing video: EN_3DfcDCJs
Processing video: uPB6IiqRWAw
Processing video: 6bVXrfr4D2g
      video_id                                            comment replies  \
0  qRD4oFEzEk8                        Cannabis works for epilepsy      []   
1  qRD4oFEzEk8              THE NUMBERS MASON. WHAT DO THEY MEAN?      []   
2  qRD4oFEzEk8                  I hope it&#39;s not another scam.      []   
3  JJZM_E5dhlg  Shannon Bream...you are a sick individual and 

In [27]:
# Collect comments (and replies) for every video in treat_set_2 and save them
# to 'treat_set2_comments.csv'. Per-video frames are gathered in a list and
# concatenated once, instead of re-concatenating a growing frame on every
# iteration starting from an empty DataFrame.
comment_frames = []

for video in treat_set_2:
    try:
        print(f"Processing video: {video}")
        comments_data = get_all_comments_and_replies(youtube, video, quota_manager)

        if comments_data:
            df = pd.DataFrame(comments_data)
            df['video_id'] = video
            df['date'] = datetime.now()  # collection timestamp for this video
            comment_frames.append(df)
        else:
            print(f"No comments data available for video: {video}")

    except HttpError as e:
        print(f"An HTTP error {e.resp.status} occurred for video {video}: {e.content}")
    except Exception as e:
        print(f"An unexpected error occurred for video {video}: {e}")

    # Stop early once the local quota budget is spent.
    if not quota_manager.can_make_request():
        print("Daily quota limit reached. Stopping execution.")
        break

comments_dfs = pd.concat(comment_frames, ignore_index=True) if comment_frames else pd.DataFrame()

if not comments_dfs.empty:
    print(comments_dfs.head())
    comments_dfs.to_csv('treat_set2_comments.csv', index=False)
    print("Comments data saved to 'treat_set2_comments.csv'")
else:
    print("No comments data collected.")

print(f"Total quota used: {quota_manager.usage}")

Processing video: Bp_gEFGZixI
Processing video: 3tRTSk4hOk8
Processing video: yajzaIWxAvQ
Processing video: 6xDAhnt8Uio
Processing video: bBmI7rC-h1s
Processing video: 7NosnGRGSoQ
Processing video: 94jgviuiHYI
Processing video: cvR6kTkq3Yg
Processing video: 4NHFZ5XUMpg
Processing video: cV1rhDVqB6E
      video_id                                            comment replies  \
0  Bp_gEFGZixI  The blockchain technology behind Bitcoin is in...      []   
1  Bp_gEFGZixI  Same thing what they did in 2020 election. Rep...      []   
2  Bp_gEFGZixI  Biden and democrats will cheat in 2024 if they...      []   
3  Bp_gEFGZixI  Democrats will steal this election from the pe...      []   
4  Bp_gEFGZixI                                   FJB 💩 for brains      []   

                        date  
0 2024-07-28 22:14:26.855636  
1 2024-07-28 22:14:26.855636  
2 2024-07-28 22:14:26.855636  
3 2024-07-28 22:14:26.855636  
4 2024-07-28 22:14:26.855636  
Comments data saved to 'treat_set2_comments.csv'
To

In [26]:
#Pull outcomes from video statistics
# One single-row DataFrame is built per video in treat_set_2 and the rows are
# concatenated once at the end.
outcomes_dfs = []

for video in treat_set_2:
    vid_stats = get_video_stats(video, quota_manager)
    # The get_* helpers return -99999 for any statistic missing from vid_stats
    # (including when get_video_stats returned an empty dict).
    stat_dict = {
        "video_id": video,
        "date": datetime.now(),
        "comment_count": get_comment_count(vid_stats),
        "likes": get_likes(vid_stats),
        "views": get_views(vid_stats),
        "favorites": get_favorites(vid_stats)
    }
    # Note: this rebinds the notebook-level name `df` on every iteration.
    df = pd.DataFrame([stat_dict])
    outcomes_dfs.append(df)

final_outcomes_df = pd.concat(outcomes_dfs, ignore_index=True)
final_outcomes_df.to_csv('treat_set2_video_stats.csv', index=False)

final_outcomes_df

Unnamed: 0,video_id,date,comment_count,likes,views,favorites
0,Bp_gEFGZixI,2024-07-28 22:12:41.422163,491,1222,29040,0
1,3tRTSk4hOk8,2024-07-28 22:12:41.551705,1259,14102,280708,0
2,yajzaIWxAvQ,2024-07-28 22:12:41.674107,8,41,1819,0
3,6xDAhnt8Uio,2024-07-28 22:12:41.801256,106,591,8257,0
4,bBmI7rC-h1s,2024-07-28 22:12:41.892657,1,0,16,0
5,7NosnGRGSoQ,2024-07-28 22:12:42.005729,428,3152,82099,0
6,94jgviuiHYI,2024-07-28 22:12:42.132149,89,693,13064,0
7,cvR6kTkq3Yg,2024-07-28 22:12:42.273792,249,1701,58279,0
8,4NHFZ5XUMpg,2024-07-28 22:12:42.380209,437,1679,35981,0
9,cV1rhDVqB6E,2024-07-28 22:12:42.464169,3172,367714,3746871,0


In [None]:
#Pull outcomes from video statistics
# NOTE(review): this cell repeats the treat_set_2 statistics pull and rewrites
# 'treat_set2_video_stats.csv' -- confirm whether a different ID list and
# output file were intended here.
outcomes_dfs = []

for video in treat_set_2:
    vid_stats = get_video_stats(video, quota_manager)
    # The get_* helpers return -99999 for any statistic missing from vid_stats.
    stat_dict = {
        "video_id": video,
        "date": datetime.now(),
        "comment_count": get_comment_count(vid_stats),
        "likes": get_likes(vid_stats),
        "views": get_views(vid_stats),
        "favorites": get_favorites(vid_stats)
    }
    # Note: this rebinds the notebook-level name `df` on every iteration.
    df = pd.DataFrame([stat_dict])
    outcomes_dfs.append(df)

final_outcomes_df = pd.concat(outcomes_dfs, ignore_index=True)
final_outcomes_df.to_csv('treat_set2_video_stats.csv', index=False)

final_outcomes_df

In [20]:
# Load the assignment spreadsheet export from the working directory and
# preview its first rows.
drive_csv_path = './Copy of 241_Random_Assignment - 241_random_assignment.csv'

df = pd.read_csv(drive_csv_path, sep = ',')
df.head()

Unnamed: 0,video_id,video_title,channel_name,genre,views,likes,dislikes,comment_count,video_length,video_posting_date,treatment,comment,owner,completed,Date Posted,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,qRD4oFEzEk8,"Inside Neuroelectrics, the brain science start...",CNBC International,Science & Technology,3751,111,0,3,359,6/13/2024,treatment,The blockchain technology behind Bitcoin is in...,Sammy,True,7/13/2024 8:16 PM,,,
1,UVEBA98jtm8,Heavy rainfall to affect multiple regions acro...,Geo News,News & Politics,79345,966,0,22,494,2024-07-07,control,,Max,False,,,,
2,IHf2Gi3qHY0,Samaa News Headlines 07 PM | Horrible Incident...,SAMAA TV,News & Politics,31261,203,0,14,110,2024-07-07,control,,Max,False,,,,
3,70Qhge-3W-k,Gov. Doug Burgum: Biden is 'absolutely' a secu...,Fox Business,News & Politics,97091,2450,0,488,323,2024-07-07,control,,Sammy,False,,,,
4,_FwD-FETfuI,5 Crazy New WhatsApp Features You Must Try 😮 ...,Trakin Tech,Science & Technology,241972,16378,0,485,252,6/29/2024,treatment,Bitcoin seems to have its pros and cons.,Max,True,07/13/24,,,


In [26]:
# Keep only rows whose `treatment` column equals 'treatment'.
# Note: this overwrites `df`, so the other rows are no longer available
# until the CSV is read again.
df = df[df['treatment'] == 'treatment']
vid_ids = list(df['video_id'])
vid_ids

['qRD4oFEzEk8',
 '_FwD-FETfuI',
 'JJZM_E5dhlg',
 '79jPeK0-5hk',
 '4P7C5quW5AI',
 '6VNaXPIJpL8',
 'OzfCwTb8Bog',
 'BOahopz5nJI',
 'eKqa06ThrlQ',
 'a47XjAWQgM0',
 'cnEyTjlb6S0',
 'CQYuOE4mMao',
 'cXRNgI9o7Do',
 'sCGFARcB2mM',
 'X6Mic41rb84',
 'G2T6_2cR_as',
 'ytscdDCVz8k',
 'D9F4tS1ER-U',
 'EiqKcwIzhkU',
 '0ZW-icIM01U',
 '3aMvY39Z_sk',
 'sBI__lWleHI',
 'lNxSOMUBxmc',
 'ofJPJZpfhG0',
 '9vT_W6eeLy8',
 'kkOOwdL4T_w',
 'kcA5Yhx-22I',
 'KFPSP-ViZ8I',
 'VpIR1GUgrv4',
 'lUD3UOHWtZs',
 '0SFLiZIJR2U',
 '9sr6nDBUy6Y',
 'EN_3DfcDCJs',
 '78VLWJ49Owg',
 'oRxTN0BJTXI',
 'pDcZCHqOrlg',
 'cqTjnetcRDE',
 'WlsnZsEHU3I',
 'uPB6IiqRWAw',
 'XH4uUHcbVgk',
 'dP6bPOQTTz8',
 'Jyz2sDuz0ss',
 'GtmjLOHQ8i8',
 'WprINRRdB4s',
 'ST_-KqB_IY0',
 'Ep9CnRDKJvw',
 'Jo0t5PlntwM',
 'fSj4YiemGyg',
 '4Y7SuI75KkA',
 '0KcJ3oxHaxo',
 'YFXdaCykwq4',
 'u-0ekHLK9ZY',
 'E6SSxVbGIyA',
 'g7Txb2RMmDM',
 'bNeRb5qRKzs',
 'AF0ShxWuFBs',
 'XOkmKqUcCOY',
 'kw5Bxf1J37Y',
 'PL_Hdlcs3V4',
 'vbMtMgFVODk',
 '7qOVbIORgx8',
 '83xG_14RGxI',
 'NQsZxE

In [54]:
# Pull current statistics for every ID in vid_ids; the single-row frames
# collected here are concatenated in a later cell.
outcomes_dfs = []

for video in vid_ids:
    vid_stats = get_video_stats(video, quota_manager)
    # The get_* helpers return -99999 for any statistic missing from vid_stats
    # (including when get_video_stats returned an empty dict).
    stat_dict = {
        "video_id": video,
        "date": datetime.now(),
        "comment_count": get_comment_count(vid_stats),
        "likes": get_likes(vid_stats),
        "views": get_views(vid_stats),
        "favorites": get_favorites(vid_stats)
    }
    # Note: this rebinds the notebook-level name `df` on every iteration.
    df = pd.DataFrame([stat_dict])
    outcomes_dfs.append(df)

No statistics available for video ID: XihN6uHBihE
No statistics available for video ID: 39Ef4otlH7Y


In [61]:
# Combine the per-video rows, then display only videos with a positive comment
# count; this hides both zero-comment videos and rows carrying the -99999
# missing-value sentinel.
comment_outcomes_df = pd.concat(outcomes_dfs, ignore_index=True)
comment_outcomes_df[comment_outcomes_df['comment_count'] > 0 ]

Unnamed: 0,video_id,date,comment_count,likes,views,favorites
0,qRD4oFEzEk8,2024-07-31 17:49:41.711049,3,161,5648,0
1,_FwD-FETfuI,2024-07-31 17:49:41.852026,684,27997,604625,0
2,JJZM_E5dhlg,2024-07-31 17:49:42.021601,2446,4804,305733,0
3,79jPeK0-5hk,2024-07-31 17:49:42.240667,218,2106,74292,0
4,4P7C5quW5AI,2024-07-31 17:49:42.390067,276,1498,68317,0
...,...,...,...,...,...,...
178,jI7lG6YHpKY,2024-07-31 17:50:11.136191,105,204,71419,0
179,8_ZxXs7mlqs,2024-07-31 17:50:11.264391,1157,15048,3556576,0
182,ZkGvyfwBy8w,2024-07-31 17:50:11.742972,309,426,69064,0
183,l0_hrxBvZjI,2024-07-31 17:50:11.880271,194,1274,255547,0
