In [4]:
from googleapiclient.discovery import build
from dotenv import load_dotenv
import pandas as pd
import time
import os
import csv

load_dotenv()

# Set up YouTube API client
api_key =  os.getenv('API_KEY')  # Replace with your API Key
youtube = build('youtube', 'v3', developerKey=api_key)

# Function to fetch comments for a given video ID
def get_all_comments(video_id, outer_item):
    comments = []
    next_page_token = None

    while True:
        # Make API call to get comments
        request = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            textFormat='plainText',
            pageToken=next_page_token  # Use pagination token for next set of comments
        )
        
        # Execute the request
        response = request.execute()

        # Loop through the comments in the response
        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            author = item['snippet']['topLevelComment']['snippet']['authorDisplayName']
            timestamp = item['snippet']['topLevelComment']['snippet']['publishedAt']
            like_count = item['snippet']['topLevelComment']['snippet']['likeCount']
            tipe_produk = outer_item['tipe_produk']
            brand = outer_item['brand']
            segment = outer_item['segment']
            release_date = outer_item['release_date']
            comments.append({
                'author': author.strip(),
                'comment': comment.strip(),
                'timestamp': timestamp.strip(),
                'like_count': like_count,
                'tipe_produk': tipe_produk,
                'brand': brand,
                'segment': segment,
                'release_date': release_date
            })
        
        # Check if there's another page of comments (pagination)
        next_page_token = response.get('nextPageToken')

        if not next_page_token:  # If no more pages, break the loop
            break

        # Optional: Sleep to avoid hitting API rate limits
        time.sleep(1)

    return comments

In [5]:
# Sample CSV data (you can replace this with the actual CSV file if needed)
csv_data = """
,tipe_produk,brand,segment,release_date,price_range,youtube_id,
,Galaxy S24,Samsung,Flagship,"January 18, 2024","15,000,000 – 18,000,000",u70lvJabuAo,
,Galaxy S24+,Samsung,Flagship,"January 18, 2024","18,000,000 – 21,000,000",Eu4i73mUbbA,
,Galaxy S24 Ultra,Samsung,Flagship,"January 18, 2024","21,000,000 – 25,000,000",daFpfaCmrAs,
,Galaxy A06,Samsung,Entry-Level,November 2024,"1,500,000 – 2,000,000",6Xb2xOUW9QQ,
,Realme 12 5G,Realme,Mid-Range,February 2024,"3,000,000 – 4,000,000",BK5cIfThopY,
,Realme 12 Pro 5G,Realme,Mid-Range,February 2024,"4,000,000 – 5,500,000",pBW_sp7GdFo,
,Realme C61,Realme,Entry-Level,September 2024,"1,500,000 – 2,500,000","ONw_TR7fW9o;
PG5szlg-my8;
Hsq4MvYrH6U;
zEvvCNWgPjE",
,Redmi Note 13 5G,Xiaomi,Mid-Range,February 2024,"3,000,000 – 4,000,000",oaiG5_5Z2hc,
,Redmi Note 13 Pro 5G,Xiaomi,Mid-Range,February 2024,"4,000,000 – 5,500,000",zs6C_UipY70,
,Xiaomi 14T,Xiaomi,Flagship,September 2024,"10,000,000 – 12,000,000",8YiVhQpDfAM,
,Xiaomi 14T Pro,Xiaomi,Flagship,September 2024,"12,000,000 – 14,000,000",urZOhILUges,
,POCO M7 Pro 5G,POCO,Mid-Range,Expected Late 2024,"3,500,000 – 4,500,000","NmKZI9Re33o;
IDLJdbz_e3s;
slkZItJfUzs",
,OPPO A3x,OPPO,Entry-Level,2024,"1,800,000 – 2,500,000",P-khRbPXrU0,
,Vivo V40,Vivo,Mid-Range,August 2024,"4,000,000 – 5,500,000",xJEf5N9mj6I,
,iQOO Z8,iQOO,Mid-Range,August 2024,"3,500,000 – 4,500,000",YqaZN_fg32s,
,Infinix Zero 30,Infinix,Mid-Range,August 2024,"3,000,000 – 4,000,000",WqakBYplLhE,
,Tecno Spark 20C,Tecno,Entry-Level,January 2024,"1,200,000 – 1,800,000","8CcESpLB13Y;
IlsGDaR4yiQ",
,Sharp Aquos Sense 8,Sharp,Mid-Range,January 2024,"3,500,000 – 4,500,000","IUHG8o-RREk;
FVypZz3mm-k",
,ASUS Zenfone 11 Ultra,ASUS,Flagship,2024,"12,000,000 – 15,000,000",MFWwdNfaTxc,
,Vivo X100 Pro,Vivo,Flagship,2024,"10,000,000 – 12,000,000",vA5bPjV29qQ,
,Google Pixel 9,Google,Flagship,2024,"12,000,000 – 14,000,000",ZyhYwrMaI-s,
,iPhone 16,Apple,Flagship,October 2024,"20,000,000 – 25,000,000","09tQ_D3l8sc;
ZllBkqAUy7g",
,Huawei Pura 70,Huawei,Flagship,"April 18, 2024","12,000,000 – 14,000,000",UHHNkOmxXwE,
,Huawei Pura 70 Pro,Huawei,Flagship,"April 18, 2024","14,000,000 – 16,000,000",xJEf5N9mj6I,
,Huawei Pura 70 Ultra,Huawei,Flagship,"April 18, 2024","16,000,000 – 18,000,000",KlJXQIna7sE,
,Huawei Nova Flip,Huawei,Mid-Range,"August 5, 2024","8,000,000 – 10,000,000","89ucsaRRSy8&t=5s;    
vfmhtAqoJF8;                
qcXXUHSGmi4;           
3VMFr3CDZSU;          
WdaLzxOeTD4;          
YpMSZ70ml5w;          
6MKr2LMy-Js              
",
,Huawei Mate XT Ultimate,Huawei,Flagship,"September 10, 2024","20,000,000 – 22,000,000",M8_zKvwqmSM,
"""

# Read CSV data into a list of dictionaries
def read_csv_to_dict_array(csv_content):
    rows = csv_content.strip().split('\n')  # Split the CSV data into rows
    reader = csv.DictReader(rows)  # Create a DictReader to map rows to dictionaries

    # Convert rows to a list of dictionaries
    result = []
    for row in reader:
        # Split the 'youtube_id' field by semicolons, strip spaces from each ID, and convert it into a list
        youtube_ids = [youtube_id.strip() for youtube_id in row['youtube_id'].split(';')]
        row['youtube_id'] = youtube_ids  # Replace the youtube_id field with the list of stripped IDs
        result.append(row)
    
    return result

# Create an array of dictionaries from the CSV data
data = read_csv_to_dict_array(csv_data)

# Print the resulting list of dictionaries
for item in data:
    print(item)
print(data)


{'': '', 'tipe_produk': 'Galaxy S24', 'brand': 'Samsung', 'segment': 'Flagship', 'release_date': 'January 18, 2024', 'price_range': '15,000,000 – 18,000,000', 'youtube_id': ['u70lvJabuAo']}
{'': '', 'tipe_produk': 'Galaxy S24+', 'brand': 'Samsung', 'segment': 'Flagship', 'release_date': 'January 18, 2024', 'price_range': '18,000,000 – 21,000,000', 'youtube_id': ['Eu4i73mUbbA']}
{'': '', 'tipe_produk': 'Galaxy S24 Ultra', 'brand': 'Samsung', 'segment': 'Flagship', 'release_date': 'January 18, 2024', 'price_range': '21,000,000 – 25,000,000', 'youtube_id': ['daFpfaCmrAs']}
{'': '', 'tipe_produk': 'Galaxy A06', 'brand': 'Samsung', 'segment': 'Entry-Level', 'release_date': 'November 2024', 'price_range': '1,500,000 – 2,000,000', 'youtube_id': ['6Xb2xOUW9QQ']}
{'': '', 'tipe_produk': 'Realme 12 5G', 'brand': 'Realme', 'segment': 'Mid-Range', 'release_date': 'February 2024', 'price_range': '3,000,000 – 4,000,000', 'youtube_id': ['BK5cIfThopY']}
{'': '', 'tipe_produk': 'Realme 12 Pro 5G', 'bra

In [None]:
for item in data:
  for video_id in item['youtube_id']:
    try:
      comments = get_all_comments(video_id, item)
      # Convert to DataFrame for easy analysis
      df = pd.DataFrame(comments)
      print(f"{item['tipe_produk']} Total Comments Retrieved: {len(df)}")
      file_path = f"./datasets-2024/{item['tipe_produk']}.csv"
      file_exists = os.path.exists(file_path)
      df.to_csv(file_path, mode='a', header=not file_exists, index=False)
    except Exception as e:
      print(f"An error occurred: {str(e)}")

Galaxy S24 Total Comments Retrieved: 124
Galaxy S24+ Total Comments Retrieved: 714
Galaxy S24 Ultra Total Comments Retrieved: 2734
Galaxy A06 Total Comments Retrieved: 821
Realme 12 5G Total Comments Retrieved: 62
Realme 12 Pro 5G Total Comments Retrieved: 778
Realme C61 Total Comments Retrieved: 38
Realme C61 Total Comments Retrieved: 6
Realme C61 Total Comments Retrieved: 3
Realme C61 Total Comments Retrieved: 7
Redmi Note 13 5G Total Comments Retrieved: 2775
Redmi Note 13 Pro 5G Total Comments Retrieved: 4009
Xiaomi 14T Total Comments Retrieved: 2243
Xiaomi 14T Pro Total Comments Retrieved: 3197
POCO M7 Pro 5G Total Comments Retrieved: 11
POCO M7 Pro 5G Total Comments Retrieved: 8
POCO M7 Pro 5G Total Comments Retrieved: 5
OPPO A3x Total Comments Retrieved: 2585
Vivo V40 Total Comments Retrieved: 2272
iQOO Z8 Total Comments Retrieved: 28
Infinix Zero 30 Total Comments Retrieved: 3226
Tecno Spark 20C Total Comments Retrieved: 22
Tecno Spark 20C Total Comments Retrieved: 70
Sharp Aquo

HttpError: <HttpError 400 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=ZllBkqAUy7g&textFormat=plainText&pageToken=Z2V0X25ld2VzdF9maXJzdC0tQ2dnSWdBUVZGN2ZST0JJRkNJY2dHQUFTQlFpZElCZ0JFZ1VJaUNBWUFCSUZDS2dnR0FBU0JRaUpJQmdBSWc0S0RBampoS1MzQmhESS11MzZBZw%3D%3D&key=AIzaSyD9PXGCKz51wzEL1hLvx6pkZh1qvCMLfAU&alt=json returned "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the <code>commentThread</code> resource in the request body to ensure that it is valid.". Details: "[{'message': "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the request's input is invalid. Check the structure of the <code>commentThread</code> resource in the request body to ensure that it is valid.", 'domain': 'youtube.commentThread', 'reason': 'processingFailure', 'location': 'body', 'locationType': 'other'}]">

In [6]:
csv_data_2 = """
,tipe_produk,brand,segment,release_date,price_range,youtube_id,
,Galaxy S25,Samsung,Flagship,"January 5, 2025","16,000,000 – 19,000,000","MIG3CO-fj9o;nLzVLDjTSak;vAgkQWh2vrc",
,Galaxy S25+,Samsung,Flagship,"January 5, 2025","19,000,000 – 22,000,000",,
,Galaxy S25 Ultra,Samsung,Flagship,"January 5, 2025","22,000,000 – 26,000,000",XhKxDUkwCMY,
,Galaxy S25 Slim,Samsung,Flagship,Mid-2025,"15,000,000 – 18,000,000",Me5scfVMck8,
,Galaxy Tri-Fold,Samsung,Flagship,2025,"30,000,000 – 35,000,000",,
,iPhone 17,Apple,Flagship,"September 9, 2025","21,000,000 – 26,000,000",d5JQ6NJfolI,
,iPhone 17 Air,Apple,Mid-Range,"September 9, 2025","15,000,000 – 18,000,000",1HaZM0eBZ_0,
,Pixel 10,Google,Flagship,October 2025,"13,000,000 – 15,000,000",Hv5uZjQIFrA,
,Pixel 10 Pro,Google,Flagship,October 2025,"15,000,000 – 17,000,000","MlEt77R_IYI;gTGtL_hIsAM",
,Pixel 9a,Google,Mid-Range,2025,"5,000,000 – 6,500,000","
QJDsBxcw23Q;
5kelXQfwEK4
",
,Xiaomi 15,Xiaomi,Flagship,2025,"11,000,000 – 13,000,000",TdBhMJKnpEw,
,Xiaomi 15 Pro,Xiaomi,Flagship,2025,"13,000,000 – 15,000,000",wcqZFR7ZD24,
,Redmi Note 14 Series,Xiaomi,Mid-Range,2025,"3,500,000 – 5,000,000",DZAuipx_xJw,
,Realme 13 Series,Realme,Mid-Range,2025,"3,500,000 – 5,000,000",9X8TLWIRpXY,
,Realme C62,Realme,Entry-Level,2025,"1,500,000 – 2,500,000",,
,OPPO Find X7,OPPO,Flagship,2025,"12,000,000 – 15,000,000","SnIkgxQ4cxQ;ly7EPFToZdQ",
,Huawei Pura 80,Huawei,Flagship,April 2025,"13,000,000 – 15,000,000",,
,Huawei Pura 80 Pro,Huawei,Flagship,April 2025,"15,000,000 – 17,000,000",,
,Huawei Pura 80 Ultra,Huawei,Flagship,April 2025,"17,000,000 – 19,000,000",CLp5qCOTxMA,
,Huawei Nova 13,Huawei,Mid-Range,August 2025,"6,000,000 – 8,000,000",,
,Huawei Mate XT Pro,Huawei,Flagship,September 2025,"22,000,000 – 24,000,000",,
"""

# Create an array of dictionaries from the CSV data
data_2 = read_csv_to_dict_array(csv_data_2)

# Print the resulting list of dictionaries
for item in data_2:
    print(item)
print(data_2)

{'': '', 'tipe_produk': 'Galaxy S25', 'brand': 'Samsung', 'segment': 'Flagship', 'release_date': 'January 5, 2025', 'price_range': '16,000,000 – 19,000,000', 'youtube_id': ['MIG3CO-fj9o', 'nLzVLDjTSak', 'vAgkQWh2vrc']}
{'': '', 'tipe_produk': 'Galaxy S25+', 'brand': 'Samsung', 'segment': 'Flagship', 'release_date': 'January 5, 2025', 'price_range': '19,000,000 – 22,000,000', 'youtube_id': ['']}
{'': '', 'tipe_produk': 'Galaxy S25 Ultra', 'brand': 'Samsung', 'segment': 'Flagship', 'release_date': 'January 5, 2025', 'price_range': '22,000,000 – 26,000,000', 'youtube_id': ['XhKxDUkwCMY']}
{'': '', 'tipe_produk': 'Galaxy S25 Slim', 'brand': 'Samsung', 'segment': 'Flagship', 'release_date': 'Mid-2025', 'price_range': '15,000,000 – 18,000,000', 'youtube_id': ['Me5scfVMck8']}
{'': '', 'tipe_produk': 'Galaxy Tri-Fold', 'brand': 'Samsung', 'segment': 'Flagship', 'release_date': '2025', 'price_range': '30,000,000 – 35,000,000', 'youtube_id': ['']}
{'': '', 'tipe_produk': 'iPhone 17', 'brand': 'A

In [7]:
for item in data_2:
  for video_id in item['youtube_id']:
    try:
      comments = get_all_comments(video_id, item)
      # Convert to DataFrame for easy analysis
      df = pd.DataFrame(comments)
      print(f"{item['tipe_produk']} Total Comments Retrieved: {len(df)}")
      file_path = f"./datasets-2025/{item['tipe_produk']}.csv"
      file_exists = os.path.exists(file_path)
      df.to_csv(file_path, mode='a', header=not file_exists, index=False)
    except Exception as e:
      print(f"An error occurred: {str(e)}")

Galaxy S25 Total Comments Retrieved: 35
Galaxy S25 Total Comments Retrieved: 8
Galaxy S25 Total Comments Retrieved: 217
Galaxy S25 Ultra Total Comments Retrieved: 62
Galaxy S25 Slim Total Comments Retrieved: 367
iPhone 17 Total Comments Retrieved: 91
iPhone 17 Air Total Comments Retrieved: 105
Pixel 10 Total Comments Retrieved: 45
Pixel 10 Pro Total Comments Retrieved: 26
Pixel 10 Pro Total Comments Retrieved: 22
Pixel 9a Total Comments Retrieved: 20
Pixel 9a Total Comments Retrieved: 40
Xiaomi 15 Total Comments Retrieved: 912
Xiaomi 15 Pro Total Comments Retrieved: 174
Redmi Note 14 Series Total Comments Retrieved: 157
Realme 13 Series Total Comments Retrieved: 2673
OPPO Find X7 Total Comments Retrieved: 94
OPPO Find X7 Total Comments Retrieved: 288
Huawei Pura 80 Ultra Total Comments Retrieved: 19
