# Facebook Comments Scrapping

In [93]:
import pandas as pd
import re
import numpy as np

In [3]:

def parse_facebook_comments(file_path):
    """
    Processes the raw text from a file containing a copied Facebook post 
    to extract structured comments into a list of dictionaries.
    
    Args:
        file_path (str): The path to the text file containing the copied Facebook data.
        
    Returns:
        list: A list of dictionaries containing 'Author', 'Comment', and 'Timestamp'.
    """
    
    # Read file content inside the function
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_text = f.read()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return []

    # 1. First try to use the unique repeated "Facebook" lines as the start marker.
    FACEBOOK_SPAM_BLOCK = "Facebook\nFacebook\nFacebook\nFacebook\nFacebook\nFacebook\nFacebook\nFacebook\nFacebook\nFacebook\nFacebook"
    comment_start_index = raw_text.find(FACEBOOK_SPAM_BLOCK)
    
    if comment_start_index == -1:
        # Fallback 1: Try finding "B√¨nh lu·∫≠n" after the main post content marker ("v·∫•n ƒë·ªÅ b·∫£n quy·ªÅn")
        post_content_start_phrase = "v·∫•n ƒë·ªÅ b·∫£n quy·ªÅn"
        post_start_index = raw_text.find(post_content_start_phrase)
        
        if post_start_index != -1:
            search_area = raw_text[post_start_index:]
        else:
            search_area = raw_text
            
        comment_marker_index = search_area.find("B√¨nh lu·∫≠n")
        
        if comment_marker_index == -1:
            print("Error: Could not find a reliable start marker for the 'B√¨nh lu·∫≠n' section.")
            return []

        absolute_comment_start_index = post_start_index + comment_marker_index
        comment_text = raw_text[absolute_comment_start_index + len("B√¨nh lu·∫≠n"):].strip()
    else:
        # If the new block is found, start immediately after it.
        comment_text = raw_text[comment_start_index + len(FACEBOOK_SPAM_BLOCK):].strip()


    # 2. Split the text into blocks based on the action buttons or double newlines.
    blocks = re.split(r'\nTr·∫£ l·ªùi\nChia s·∫ª|\nTr·∫£ l·ªùi\n\n|\nƒê√£ ch·ªânh s·ª≠a\nTr·∫£ l·ªùi\nChia s·∫ª|\nTr·∫£ l·ªùi\n|\nChia s·∫ª', comment_text)
    
    parsed_comments = []

    # 3. Define the core regex pattern for extraction within each block.
    comment_pattern = re.compile(
        r'(.+?)\n\s*'  # 1. Capture Author Name
        r'(.+?)'        # 2. Capture Comment Content
        r'(\d+ (?:tu·∫ßn|ng√†y|ph√∫t)|\d+ (?:gi·ªù|ph√∫t)|\d+ [0-9]{1,2} [0-9]{4})' # 3. Capture Time/Date
        , re.DOTALL
    )

    for block in blocks:
        block = block.strip()
        if not block:
            continue
        
        match = comment_pattern.search(block)
        
        if match:
            author_raw = match.group(1).strip()
            time_stamp = match.group(3).strip()
            content = match.group(2).strip()

            # --- Clean-up Steps ---
            content = re.sub(r'\.\.\. Xem th√™m', '', content, flags=re.DOTALL).strip()
            
            author_lines = author_raw.split('\n')
            author = author_lines[0].strip()
            
            # Remove any special role tags from the author line
            author = re.sub(r'Ng∆∞·ªùi ƒë√≥ng g√≥p nhi·ªÅu nh·∫•t|T√°c gi·∫£', '', author).strip()

            # Re-join any content that got split into the Author's raw field
            if len(author_lines) > 1:
                content_start_lines = author_lines[1:]
                content = '\n'.join(content_start_lines) + '\n' + content
                
            if len(content) < 5:
                continue
            
            parsed_comments.append({
                'Author': author,
                'Comment': content,
                'Timestamp': time_stamp
            })

    return parsed_comments


In [4]:
post1 = parse_facebook_comments('fb_post1.txt')

if post1:
    df_comments = pd.DataFrame(post1)
    
    # Final Cleaning
    df_comments['Comment'] = df_comments['Comment'].str.replace(r'[\r\n\t]+', ' ', regex=True).str.strip()
    df_comments.drop_duplicates(subset=['Comment'], keep='first', inplace=True)
    df_comments = df_comments[df_comments['Comment'].str.len() > 10].reset_index(drop=True)
    
    print("\n--- SUCCESSFULLY PARSED COMMENTS ---")
    print(f"Total Comments Extracted: {len(df_comments)}")
    print("\nDataFrame Preview:")
    print(df_comments.head(10).to_markdown(index=False))
    
    # Example: Save to CSV
    # df_comments.to_csv('cleaned_facebook_comments.csv', index=False, encoding='utf-8')
else:
    print("\nParsing failed. Please check the content of 'comments.txt' to ensure the structure is consistent.")



--- SUCCESSFULLY PARSED COMMENTS ---
Total Comments Extracted: 60

DataFrame Preview:
| Author         | Comment                                                                                                                                                                                                                                                                                                               | Timestamp   |
|:---------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------|
| Th·∫±ng ƒê·∫ßu L√¨n  | Kh√¥ng ph·ªß nh·∫≠n nhi·ªÅu ƒë·ª©a trong n√†y t·ª´ng coi l·∫≠u nhi·ªÅu k·ªÉ c·∫£ tao h·ªìi ƒë√≥. Gi·ªù l√∫c coi b·ªô manga y√™u th√≠ch n√†o ƒë√≥ t cx mu·ªën mua h√†ng ·ªßng h·ªô t√°c gi·∫£ c∆°      

## Translate to English

an open-source pretrained <b>Large Vision Model</b> from <a href='https://ollama.com'>Ollama</a>

#### Using Ollama (https://ollama.com/library/llava)

Download the official executable file from https://ollama.com/download

In [5]:
# Install the LLM for Natural LAnguage Processing model locally
!ollama pull mistral

[?2026h[?25l[1Gpulling manifest ‚†ã [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ‚†ô [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ‚†π [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ‚†∏ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ‚†º [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ‚†¥ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ‚†¶ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ‚†ß [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ‚†á [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ‚†è [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling f5074b1221da:   0% ‚ñï                  ‚ñè 1.6 MB/4.4 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling f5074b1221da:   0% ‚ñï                  ‚ñè 7.8 MB/4.4 GB                  [K[?25h[?2026l[?2026h[?25l[A[1Gpulling manifest [K
pulling f5074b1221da:   0% ‚ñï                  ‚ñè  11 MB/4.4 GB                  [K[?25h[?2026l[?2026h[?25l[A

In [6]:
# Check if Ollama is running, should be error:  address already in use
!ollama serve

Error: listen tcp 127.0.0.1:11434: bind: address already in use


In [7]:
import re
import json
import pandas as pd
import requests
from tqdm import tqdm

# --- CONFIGURATION ---
INPUT_FILE_NAME = 'comments.txt'
OLLAMA_API_URL = 'http://localhost:11434/api/generate'
OLLAMA_MODEL = 'llama3' 

def translate_text_with_ollama(text):
    """
    Sends text to the local Ollama Llama 3 API for translation.
    
    Args:
        text (str): The Vietnamese text to translate.
        
    Returns:
        str: The English translation or an error message.
    """
    if not text or len(text.strip()) < 5:
        return ""

    # prompt = f"Translate the following Vietnamese social media comment to natural, modern English. Be concise and provide only the translated text: {text}"
    prompt = f"Translate the following Vietnamese text to English, provides only the translated text: {text}"
    
    payload = {
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.1 # Low temperature for accurate, literal translation
        }
    }
    
    try:
        response = requests.post(OLLAMA_API_URL, json=payload, timeout=30)
        response.raise_for_status()
        
        # Ollama returns a JSON response; the generated text is under 'response'
        data = response.json()
        translated_text = data.get('response', '').strip()
        
        # Clean up common LLM output formatting (like unnecessary quotes or labels)
        if translated_text.startswith(('"', "'")) and translated_text.endswith(('"', "'")):
            translated_text = translated_text[1:-1]
        
        return translated_text
        
    except requests.exceptions.Timeout:
        return "ERROR: Ollama API Timeout"
    except requests.exceptions.RequestException as e:
        return f"ERROR: Ollama connection failed or server error. Check if Ollama is running: {e}"
    except Exception as e:
        return f"ERROR: Unknown API issue: {e}"

In [8]:
df_comments.head()

Unnamed: 0,Author,Comment,Timestamp
0,Th·∫±ng ƒê·∫ßu L√¨n,Kh√¥ng ph·ªß nh·∫≠n nhi·ªÅu ƒë·ª©a trong n√†y t·ª´ng coi l·∫≠...,1 tu·∫ßn
1,Quang Ki·ªám,L·∫≠u l√† l·∫≠u. M√¨nh kh√¥ng c√≥ ti·ªÅn/ƒëi·ªÅu ki·ªán/c√°ch ...,1 tu·∫ßn
2,Van Anh Pham,T ch∆°i game l·∫≠u khi ch∆∞a c√≥ ti·ªÅn c√≤n khi c√≥ ƒëi...,1 tu·∫ßn
3,Uy√™n Nh√£,B·ªè ti·ªÅn ra mua truy·ªán ·ªßng h·ªô b·∫£n quy·ªÅn v√† t·ª´ng...,1 tu·∫ßn
4,Nguy·ªÖn Minh V≈©,Uy√™n Nh√£ ng∆∞·ªùi c√≥ ƒë·ªß c·∫£ ti·ªÅn v√† √Ω th·ª©c ƒë·ªÉ mua ...,6 ng√†y


In [9]:
translate_text_with_ollama(df_comments.loc[0, 'Comment'])

"I don't deny that many of them used to watch and read a lot, including me back then. Now, whenever I see my favorite manga series, I want to buy the merchandise to support the author."

In [18]:
# translate extracted comments and add to column of df_comments
translated_cmt = [translate_text_with_ollama(c) for c in df_comments.loc[:, 'Comment']]
df_comments.loc[:, 'translated_cmt'] = translated_cmt

In [19]:
df_comments.head()

Unnamed: 0,Author,Comment,Timestamp,translated_cmt
0,Th·∫±ng ƒê·∫ßu L√¨n,Kh√¥ng ph·ªß nh·∫≠n nhi·ªÅu ƒë·ª©a trong n√†y t·ª´ng coi l·∫≠...,1 tu·∫ßn,I don't deny that many of them used to watch p...
1,Quang Ki·ªám,L·∫≠u l√† l·∫≠u. M√¨nh kh√¥ng c√≥ ti·ªÅn/ƒëi·ªÅu ki·ªán/c√°ch ...,1 tu·∫ßn,I'm a pirate. I don't have money/language skil...
2,Van Anh Pham,T ch∆°i game l·∫≠u khi ch∆∞a c√≥ ti·ªÅn c√≤n khi c√≥ ƒëi...,1 tu·∫ßn,"Here is the translation:\n\n""I play pirated ga..."
3,Uy√™n Nh√£,B·ªè ti·ªÅn ra mua truy·ªán ·ªßng h·ªô b·∫£n quy·ªÅn v√† t·ª´ng...,1 tu·∫ßn,"Here is the translation:\n\n""I spent money to ..."
4,Nguy·ªÖn Minh V≈©,Uy√™n Nh√£ ng∆∞·ªùi c√≥ ƒë·ªß c·∫£ ti·ªÅn v√† √Ω th·ª©c ƒë·ªÉ mua ...,6 ng√†y,Uyen Nha has enough money and awareness to buy...


# Official Publisher Selling Data

<a href='https://en.wikipedia.org/wiki/Kim_ƒê·ªìng_Publishing_House'><b>Kim ƒê·ªìng Publishing House</b></a> has been a leading publisher in Vietnam since its establishment in 1957. The company recently acquired licenses for many popular Japanese comic books (manga) and sells them on <b>Shopee</b>, a major e-commerce marketplace in Southeast Asia similar to <b>Amazon</b>. This makes their sales data a reliable source for our market analysis.

Unfortunately, direct data crawling from their online store is restricted, which significantly complicated the extraction process. To circumvent this technical barrier and leverage a robust tool, I utilized the <b>Google AI Studio</b> freetier for API requests. This strategy allowed me to capture screenshots of the sales listings and then use the Gemini API to accurately extract the necessary information.

## Gemini Free-tier API

Google AI Studio gives a free tier API call for Gemini Flash model which is truly effective for images extraction tasks for about 100 images per day. That's enough for this project

In [None]:
import requests
import json
import base64
import os
import sys
import pandas as pd
from typing import Tuple, Optional, Any, Dict, List
import time
from gemini_api_key import GEMINI_KEY


apiKey = GEMINI_KEY 
MODEL_NAME = "gemini-2.5-flash-preview-05-20" 
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL_NAME}:generateContent?key={apiKey}"


# 1. Image Encoding Function (Required for API submission)
def encode_image_to_base64(image_path: str) -> Optional[str]:
    """Encodes the image file into a Base64 string for the API payload."""
    try:
        with open(image_path, "rb") as f:
            print("Image encoded successfully.")
            return base64.b64encode(f.read()).decode('utf-8')
    except FileNotFoundError:
        print(f"Error: Image file not found at path: {image_path}")
        return None
    except Exception as e:
        print(f"Error reading image file: {e}")
        return None
    
    
# 2. Gemini Extraction Function
def extract_structured_data_gemini(base64_image: str) -> Tuple[Optional[List[Dict[str, Any]]], Optional[str]]:
    """
    Sends the image and prompt to the Gemini API, requesting structured JSON output.
    Uses exponential backoff for robustness.
    """
    
    # UPDATED STRUCTURED EXTRACTION PROMPT: Focuses on extracting small numbers precisely.
    prompt = """
    You are an expert data extractor. Analyze the provided e-commerce grid image containing product listings.
    Your task is to extract the following fields for all products visible:
    1. name: The full title of the product (e.g., 'Th√°m T·ª≠ L·ª´ng Danh Conan').
    2. price_vnd: The numerical price in Vietnamese Dong (VND).
    3. sold_count: The numerical count of items sold. Only include the number. Look for the phrase 'ƒê√£ b√°n' or 'b√°n' followed by a number.

    Respond STRICTLY in a JSON array format. Do not include any text, notes, or explanations outside the JSON object.
    """

    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt},
                    {
                        "inlineData": {
                            "mimeType": "image/jpeg",
                            "data": base64_image
                        }
                    }
                ]
            }
        ],
        "generationConfig": {
            "responseMimeType": "application/json",
            "responseSchema": {
                "type": "ARRAY",
                "items": {
                    "type": "OBJECT",
                    "properties": {
                        "name": {"type": "STRING", "description": "The full product name or title."},
                        "price_vnd": {"type": "INTEGER", "description": "The numerical price in VND."},
                        "sold_count": {"type": "INTEGER", "description": "The numerical count of items sold."}
                    },
                    "required": ["name", "price_vnd", "sold_count"]
                }
            }
        }
    }

    print(f"Connecting to Gemini API with model {MODEL_NAME}...")
    
    try:
        # Exponential backoff for robust API calls
        for i in range(5):
            try:
                response = requests.post(API_URL, json=payload, timeout=60) 
                response.raise_for_status() 
                break
            except requests.exceptions.RequestException:
                if i < 4:
                    wait_time = 2 ** i
                    time.sleep(wait_time)
                else:
                    raise

        # Extract and parse the generated JSON text
        data = response.json()
        raw_json_string = data.get('candidates', [{}])[0].get('content', {}).get('parts', [{}])[0].get('text', '{}').strip()
        
        return json.loads(raw_json_string), None

    except requests.exceptions.RequestException as e:
        return None, f"Network/Connection Error: Ensure internet access and valid API key. Details: {e}"
    except json.JSONDecodeError:
        return None, f"JSON Decoding Error: Model output was not valid JSON."
    except Exception as e:
        return None, f"An unexpected error occurred: {e}"

In [15]:
def ocr(img_path):
    # 1. Encode the image
    base64_image = encode_image_to_base64(img_path)

    if base64_image:
        # 2. Extract data using Gemini
        extracted_data, error_message = extract_structured_data_gemini(base64_image)

        print("-" * 50)
        
        if error_message:
            print(f"Extraction Failed: {error_message}")
        elif extracted_data:
            return pd.DataFrame(extracted_data)
        else:
            print("Extraction failed. Check model output.")

In [17]:
SHP_DATA_PATH = 'data'
CURR_DIR = os.getcwd()

df_sells = pd.DataFrame(columns=['name', 'price_vnd', 'sold_count'])
for path in os.listdir(SHP_DATA_PATH):
    ipath = os.path.join(CURR_DIR, SHP_DATA_PATH, path)
    print(ipath)
    df_extracted = ocr(ipath)
    df_sells = pd.concat([df_sells, df_extracted], ignore_index=True)

/Users/tranquocan.truong/Desktop/DALAS/Project/data/kimdong1.png
Image encoded successfully.
Connecting to Gemini API with model gemini-2.5-flash-preview-05-20...
--------------------------------------------------
/Users/tranquocan.truong/Desktop/DALAS/Project/data/kimdong2.png
Image encoded successfully.
Connecting to Gemini API with model gemini-2.5-flash-preview-05-20...
--------------------------------------------------
/Users/tranquocan.truong/Desktop/DALAS/Project/data/kimdong3.png
Image encoded successfully.
Connecting to Gemini API with model gemini-2.5-flash-preview-05-20...
--------------------------------------------------
/Users/tranquocan.truong/Desktop/DALAS/Project/data/kimdong6.png
Image encoded successfully.
Connecting to Gemini API with model gemini-2.5-flash-preview-05-20...
--------------------------------------------------
/Users/tranquocan.truong/Desktop/DALAS/Project/data/kimdong4.png
Image encoded successfully.
Connecting to Gemini API with model gemini-2.5-flas

In [None]:
df_sells

Unnamed: 0,name,price_vnd,sold_count
0,Truy·ªán - Conan (B·∫£n N√¢ng C·∫•p),35000,20000
1,Truy·ªán - D·∫•u ·∫§n Ho√†ng Gia,30000,70000
2,Truy·ªán - Huy·∫øt Qu·ª∑ H·ªìn Chi·∫øn 2025,35000,2000
3,Truy·ªán - Th√°m T·ª≠ L·ª´ng Danh Conan - Ti√™n Truy·ªán...,65000,999
4,Truy·ªán Tranh - Iruma Gi√° ƒê√°o,25000,3000
...,...,...,...
85,S√°ch - One Piece ( T51 - T99 ),25000,3000
86,S√°ch _ Nh·ªØng C√¢u Chuy·ªán Truy·ªÅn C·∫£m H·ª©n...,50000,198
87,Truy·ªán - Ng∆∞·ªùi l√≠nh ƒêi·ªán Bi√™n k·ªÉ chuy·ªán ( TB...,35000,160
88,"Truy·ªán - Fullmetal, Alchemist - Cang gi·∫£ kim...",75000,5000


# Viet Nam Statistical Data

The <b>National Statistics Office</b> of Vietnam does actually have multiple <a href='https://www.nso.gov.vn/en/statistical-data/?utm_source=chatgpt.com'>public statistical dataset</a> of almost all concerns, so this is our data come from.

## Average expenditure by type of goods and by region

In [56]:
df_avg_expenditure = pd.read_csv('data/VN_monthly_expenditure.csv', header=1)
df_avg_expenditure

Unnamed: 0,Items,2014 Tota expenditure,2014 Living expenditure,2014 Eating drinking & smoking expenditure,2014 Non eating drinking & smoking expenditure,2014 Others,2016 Tota expenditure,2016 Living expenditure,2016 Eating drinking & smoking expenditure,2016 Non eating drinking & smoking expenditure,...,2022 Tota expenditure,2022 Living expenditure,2022 Eating drinking & smoking expenditure,2022 Non eating drinking & smoking expenditure,2022 Others,Prel. 2024 Tota expenditure,Prel. 2024 Living expenditure,Prel. 2024 Eating drinking & smoking expenditure,Prel. 2024 Non eating drinking & smoking expenditure,Prel. 2024 Others
0,WHOLE COUNTRY,1888.0,1763.0,927.0,836.0,125.0,2157.0,2016.0,1027.0,989.0,...,2794.93,2667.93,1287.14,1380.79,127.0,2976.59,2814.33,1381.1,1433.23,162.26
1,Urban,2613.0,2461.0,1239.0,1222.0,152.0,3059.0,2886.0,1404.0,1482.0,...,3263.85,3124.43,1468.64,1655.8,139.42,3767.74,3587.59,1709.9,1877.69,180.14
2,Rural,1557.0,1444.0,784.0,660.0,113.0,1735.0,1609.0,851.0,758.0,...,2496.18,2377.09,1171.51,1205.58,119.09,2488.09,2336.87,1178.07,1158.79,151.22
3,Red River Delta,2241.0,2082.0,1079.0,1003.0,159.0,2528.0,2364.0,1151.0,1213.0,...,3394.31,3230.02,1569.13,1660.88,164.29,3450.86,3235.25,1521.88,1713.36,215.61
4,Northern midlands and mountain areas,1538.0,1441.0,752.0,689.0,97.0,1655.0,1551.0,824.0,727.0,...,1969.98,1871.43,961.67,909.76,98.55,2337.68,2207.6,1084.81,1122.78,130.08
5,North Central area and Central coastal area,1647.0,1537.0,850.0,687.0,110.0,1809.0,1685.0,918.0,767.0,...,2547.26,2426.29,1200.82,1225.47,120.96,2715.28,2573.97,1315.82,1258.15,141.32
6,Central Highlands,1660.0,1537.0,777.0,760.0,123.0,1766.0,1620.0,828.0,792.0,...,2208.69,2105.73,979.92,1125.81,102.97,2274.42,2139.91,1080.11,1059.8,134.51
7,South East,2410.0,2282.0,1155.0,1127.0,128.0,3018.0,2846.0,1415.0,1431.0,...,3579.91,3455.93,1524.21,1931.72,123.99,3674.66,3523.39,1723.51,1799.88,151.28
8,Mekong River Delta,1602.0,1484.0,805.0,679.0,118.0,1872.0,1741.0,888.0,853.0,...,2257.83,2142.62,1101.67,1040.95,115.21,2610.96,2450.29,1223.75,1226.54,160.66
9,,,,,,,,,,,...,,,,,,,,,,


In [57]:
# 1. Replace Items by Region
df_avg_expenditure = df_avg_expenditure.rename(columns={'Items': 'Region'})

# Set Region column as index
df_avg_expenditure = df_avg_expenditure.set_index('Region')

# 2. Create the MultiIndex
# split each column name i.e "2014 Total expenditure" to ('2014', 'Total expenditure')
new_columns = [tuple(c.split(' ', 1)) for c in df_avg_expenditure.columns]

# Create the MultiIndex
df_avg_expenditure.columns = pd.MultiIndex.from_tuples(new_columns, names=['Year', 'Expenditure Type'])

df_avg_expenditure

Year,2014,2014,2014,2014,2014,2016,2016,2016,2016,2016,...,2022,2022,2022,2022,2022,Prel.,Prel.,Prel.,Prel.,Prel.
Expenditure Type,Tota expenditure,Living expenditure,Eating drinking & smoking expenditure,Non eating drinking & smoking expenditure,Others,Tota expenditure,Living expenditure,Eating drinking & smoking expenditure,Non eating drinking & smoking expenditure,Others,...,Tota expenditure,Living expenditure,Eating drinking & smoking expenditure,Non eating drinking & smoking expenditure,Others,2024 Tota expenditure,2024 Living expenditure,2024 Eating drinking & smoking expenditure,2024 Non eating drinking & smoking expenditure,2024 Others
Region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
WHOLE COUNTRY,1888.0,1763.0,927.0,836.0,125.0,2157.0,2016.0,1027.0,989.0,141.0,...,2794.93,2667.93,1287.14,1380.79,127.0,2976.59,2814.33,1381.1,1433.23,162.26
Urban,2613.0,2461.0,1239.0,1222.0,152.0,3059.0,2886.0,1404.0,1482.0,173.0,...,3263.85,3124.43,1468.64,1655.8,139.42,3767.74,3587.59,1709.9,1877.69,180.14
Rural,1557.0,1444.0,784.0,660.0,113.0,1735.0,1609.0,851.0,758.0,126.0,...,2496.18,2377.09,1171.51,1205.58,119.09,2488.09,2336.87,1178.07,1158.79,151.22
Red River Delta,2241.0,2082.0,1079.0,1003.0,159.0,2528.0,2364.0,1151.0,1213.0,164.0,...,3394.31,3230.02,1569.13,1660.88,164.29,3450.86,3235.25,1521.88,1713.36,215.61
Northern midlands and mountain areas,1538.0,1441.0,752.0,689.0,97.0,1655.0,1551.0,824.0,727.0,104.0,...,1969.98,1871.43,961.67,909.76,98.55,2337.68,2207.6,1084.81,1122.78,130.08
North Central area and Central coastal area,1647.0,1537.0,850.0,687.0,110.0,1809.0,1685.0,918.0,767.0,124.0,...,2547.26,2426.29,1200.82,1225.47,120.96,2715.28,2573.97,1315.82,1258.15,141.32
Central Highlands,1660.0,1537.0,777.0,760.0,123.0,1766.0,1620.0,828.0,792.0,146.0,...,2208.69,2105.73,979.92,1125.81,102.97,2274.42,2139.91,1080.11,1059.8,134.51
South East,2410.0,2282.0,1155.0,1127.0,128.0,3018.0,2846.0,1415.0,1431.0,172.0,...,3579.91,3455.93,1524.21,1931.72,123.99,3674.66,3523.39,1723.51,1799.88,151.28
Mekong River Delta,1602.0,1484.0,805.0,679.0,118.0,1872.0,1741.0,888.0,853.0,131.0,...,2257.83,2142.62,1101.67,1040.95,115.21,2610.96,2450.29,1223.75,1226.54,160.66
,,,,,,,,,,,...,,,,,,,,,,


In [58]:
df_avg_expenditure['2014'].loc['Urban', 'Eating drinking & smoking expenditure']

np.float64(1239.0)

## Average income by region (thousand VND)

In [97]:
df_avg_income = pd.read_csv('data/VN_avg_monthly_income_1424.csv', header=1)
# Normalize pivot column name
df_avg_income = df_avg_income.rename(columns={'Residence and region': 'Region'})

# Set Region column as index
df_avg_income = df_avg_income.set_index('Region')

# 1. Get a list of the numeric column names
numeric_cols = df_avg_income.select_dtypes(include=np.number).columns

# 2. convert from VND to USD with the rate 1 USD = 26200 VND
df_avg_income[numeric_cols] = df_avg_income[numeric_cols] * 1000

df_avg_income

Unnamed: 0_level_0,2014,2016,2018,2019,2020,2021,2022,2023,Prel. 2024
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
WHOLE COUNTRY,2637000.0,3098000.0,3874000.0,4294500.0,4250000.0,4204800.0,4672540.0,4961810.0,5414980.0
Urban,3964000.0,4551000.0,5624000.0,6022390.0,5500000.0,5388400.0,5944720.0,6258200.0,6889530.0
Rural,2038000.0,2423000.0,2986000.0,3399360.0,3482000.0,3485800.0,3863640.0,4169660.0,4504500.0
Red River Delta,3265000.0,3883000.0,4775000.0,5190680.0,5084000.0,5026000.0,5586470.0,5980630.0,6558140.0
Northern midlands and mountain areas,1613000.0,1963000.0,2452000.0,2640050.0,2745000.0,2837500.0,3169740.0,3438050.0,3759180.0
North Central area and Central coastal area,1982000.0,2358000.0,3014000.0,3331030.0,3405000.0,3493200.0,3966680.0,4274910.0,4648490.0
Central Highlands,2008000.0,2366000.0,2895000.0,3094640.0,2817000.0,2855600.0,3281790.0,3548190.0,3882340.0
South East,4125000.0,4662000.0,5792000.0,6279740.0,6024000.0,5794200.0,6334090.0,6514150.0,7074540.0
Mekong River Delta,2327000.0,2778000.0,3585000.0,3886420.0,3874000.0,3713200.0,4076640.0,4370970.0,4753170.0
,,,,,,,,,


## Employed persons in the economy by ages (unit: thousand person)

In [94]:
df_employed = pd.read_csv('data/Number of employed persons in the economy by age group.csv', header=1)
df_employed = df_employed.dropna()

# 1. Get a list of the numeric column names
numeric_cols = df_employed.select_dtypes(include=np.number).columns

# 2. Use that list to select and update only those columns
df_employed[numeric_cols] = df_employed[numeric_cols] * 1000

df_employed


Unnamed: 0,Age group,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,Prel.2024
0,TOTAL,53030600.0,53110500.0,53345500.0,53708600.0,54282500.0,54659200.0,53609580.0,49072000.0,50604710.0,51287000.0,51860300.0
1,15-19,2405300.0,2407500.0,2117000.0,2192600.0,2030500.0,2083700.0,1678120.0,1315200.0,1351140.0,1325500.0,1311800.0
2,20-24,4749400.0,5055400.0,4848400.0,4838800.0,4545200.0,4609400.0,3946460.0,3343700.0,3463230.0,3939610.0,3373360.0
3,25-29,6167300.0,6125300.0,6195700.0,6028800.0,6175500.0,6630000.0,6218560.0,5489000.0,5281890.0,4836970.0,4512900.0
4,30-34,6556700.0,6899400.0,6837900.0,6705600.0,6593600.0,7365500.0,7262620.0,6679600.0,6855910.0,6549990.0,6386320.0
5,35-39,6490500.0,6425300.0,6472100.0,6618100.0,6831700.0,7271200.0,7543010.0,7060000.0,7300520.0,7260830.0,7276440.0
6,40-44,6554300.0,6386400.0,6473400.0,6598700.0,6679900.0,6419000.0,6736470.0,6154000.0,6592070.0,6875320.0,7275500.0
7,45-49,6109100.0,5849600.0,5994200.0,6183800.0,6590700.0,6077300.0,6172570.0,5670300.0,6061660.0,6200120.0,6391810.0
8,50+,13998000.0,13961600.0,14406800.0,14542200.0,14835400.0,14203100.0,14051700.0,13360300.0,13698290.0,14298660.0,15332160.0


## All combine

In [64]:
# IMPORTANT: Give a name to this new data type
new_data_type = 'Avg Monthly Income' 

# Create a MultiIndex where the top level is the year and the bottom is our new name
income_cols = [(year, new_data_type) for year in df_avg_income.columns]
df_avg_income.columns = pd.MultiIndex.from_tuples(income_cols, names=['Year', 'Expenditure Type'])

df_avg_income

Year,2014,2016,2018,2019,2020,2021,2022,2023,Prel. 2024
Expenditure Type,Avg Monthly Income,Avg Monthly Income,Avg Monthly Income,Avg Monthly Income,Avg Monthly Income,Avg Monthly Income,Avg Monthly Income,Avg Monthly Income,Avg Monthly Income
Region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
WHOLE COUNTRY,2637.0,3098.0,3874.0,4294.5,4250.0,4204.8,4672.54,4961.81,5414.98
Urban,3964.0,4551.0,5624.0,6022.39,5500.0,5388.4,5944.72,6258.2,6889.53
Rural,2038.0,2423.0,2986.0,3399.36,3482.0,3485.8,3863.64,4169.66,4504.5
Red River Delta,3265.0,3883.0,4775.0,5190.68,5084.0,5026.0,5586.47,5980.63,6558.14
Northern midlands and mountain areas,1613.0,1963.0,2452.0,2640.05,2745.0,2837.5,3169.74,3438.05,3759.18
North Central area and Central coastal area,1982.0,2358.0,3014.0,3331.03,3405.0,3493.2,3966.68,4274.91,4648.49
Central Highlands,2008.0,2366.0,2895.0,3094.64,2817.0,2855.6,3281.79,3548.19,3882.34
South East,4125.0,4662.0,5792.0,6279.74,6024.0,5794.2,6334.09,6514.15,7074.54
Mekong River Delta,2327.0,2778.0,3585.0,3886.42,3874.0,3713.2,4076.64,4370.97,4753.17
,,,,,,,,,


In [67]:
# --- 4. Join the two DataFrames ---
# .join() combines the tables side-by-side, aligning them on the 'Region' index
df_stats = df_avg_expenditure.join(df_avg_income, how='outer')

In [66]:
df_stats['2014']

Expenditure Type,Tota expenditure,Living expenditure,Eating drinking & smoking expenditure,Non eating drinking & smoking expenditure,Others,Avg Monthly Income
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Highlands,1660.0,1537.0,777.0,760.0,123.0,2008.0
Mekong River Delta,1602.0,1484.0,805.0,679.0,118.0,2327.0
North Central area and Central coastal area,1647.0,1537.0,850.0,687.0,110.0,1982.0
Northern midlands and mountain areas,1538.0,1441.0,752.0,689.0,97.0,1613.0
Red River Delta,2241.0,2082.0,1079.0,1003.0,159.0,3265.0
Rural,1557.0,1444.0,784.0,660.0,113.0,2038.0
South East,2410.0,2282.0,1155.0,1127.0,128.0,4125.0
Urban,2613.0,2461.0,1239.0,1222.0,152.0,3964.0
WHOLE COUNTRY,1888.0,1763.0,927.0,836.0,125.0,2637.0
,,,,,,


In [69]:
# Drop NaNs
df_stats = df_stats.dropna()
df_stats['2014']

Expenditure Type,Tota expenditure,Living expenditure,Eating drinking & smoking expenditure,Non eating drinking & smoking expenditure,Others,Avg Monthly Income
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Highlands,1660.0,1537.0,777.0,760.0,123.0,2008.0
Mekong River Delta,1602.0,1484.0,805.0,679.0,118.0,2327.0
North Central area and Central coastal area,1647.0,1537.0,850.0,687.0,110.0,1982.0
Northern midlands and mountain areas,1538.0,1441.0,752.0,689.0,97.0,1613.0
Red River Delta,2241.0,2082.0,1079.0,1003.0,159.0,3265.0
Rural,1557.0,1444.0,784.0,660.0,113.0,2038.0
South East,2410.0,2282.0,1155.0,1127.0,128.0,4125.0
Urban,2613.0,2461.0,1239.0,1222.0,152.0,3964.0
WHOLE COUNTRY,1888.0,1763.0,927.0,836.0,125.0,2637.0


# Watching Anime Data

## From Unofficial Website

In [144]:
df_unoff_anime = pd.read_csv('data/anime_watching_count.csv', sep=';')
df_unoff_anime.head()

Unnamed: 0,Anime,watching_count
0,One Piece,229864062
1,Black Clover,88642980
2,Detective Conan,79460576
3,Jujutsu Kaisen 2nd SS,39453479
4,Bleach,37193674


In [146]:
# Check data types
df_unoff_anime.dtypes

Anime             object
watching_count    object
dtype: object

In [147]:
# convert count from str to int
df_unoff_anime['watching_count'] = df_unoff_anime['watching_count'].apply(lambda row: int(''.join(row.split(','))))
df_unoff_anime['watching_count'].dtype

dtype('int64')

In [148]:
df_unoff_anime.head()

Unnamed: 0,Anime,watching_count
0,One Piece,229864062
1,Black Clover,88642980
2,Detective Conan,79460576
3,Jujutsu Kaisen 2nd SS,39453479
4,Bleach,37193674


## From Netflix

The dataset was published on Kaggle (<a href='https://www.kaggle.com/datasets/rohitgrewal/netflix-data'>link</a>)

In [2]:
link2 = 'https://www.kaggle.com/datasets/shivamb/netflix-shows'

In [None]:
# Download data from Kaggle
!curl -L -o ./data/netflix-data.zip\
  https://www.kaggle.com/api/v1/datasets/download/rohitgrewal/netflix-data

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1195k  100 1195k    0     0  1433k      0 --:--:-- --:--:-- --:--:01  616k-- 6869k


In [None]:
# Unzip dataset
%cd data
!unzip netflix-data.zip
%cd ..

/Users/tranquocan.truong/Desktop/DALAS/Project/data
Archive:  netflix-data.zip
  inflating: Netflix Dataset.csv     
/Users/tranquocan.truong/Desktop/DALAS/Project


In [131]:
df_netflix = pd.read_csv('data/Netflix Dataset.csv')
df_netflix.head()

Unnamed: 0,Show_Id,Category,Title,Director,Cast,Country,Release_Date,Rating,Duration,Type,Description
0,s1,TV Show,3%,,"Jo√£o Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,07:19,Jorge Michel Grau,"Demi√°n Bichir, H√©ctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [132]:
# Normalize the country column
df_netflix.loc[:, 'Country'] = df_netflix['Country'].str.strip().str.lower()

In [133]:
# Check how many country exists in this dataset
df_netflix['Country'].unique()

array(['brazil', 'mexico', 'singapore', 'united states', 'turkey',
       'egypt', 'india', 'poland, united states', 'thailand', 'nigeria',
       nan, 'norway, iceland, united states', 'united kingdom', 'japan',
       'south korea', 'italy', 'canada', 'indonesia', 'romania', 'spain',
       'iceland', 'south africa, nigeria', 'france',
       'united states, south africa', 'portugal, spain',
       'hong kong, china, singapore', 'united states, germany',
       'south africa, china, united states', 'argentina',
       'united states, france, serbia', 'germany',
       'denmark, france, poland', 'poland', 'kenya',
       'new zealand, united kingdom', 'pakistan', 'australia',
       'australia, united states', 'mexico, united states',
       'united states, china', 'china, hong kong', 'taiwan',
       'united states, united kingdom', 'france, south korea, japan',
       'united states, canada', 'united kingdom, united states',
       'netherlands, denmark, south africa', 'canada, unit

In [134]:
# filter only from country Japan
df_netflix_japan = df_netflix[df_netflix['Country'] == 'japan']
df_netflix_japan

Unnamed: 0,Show_Id,Category,Title,Director,Cast,Country,Release_Date,Rating,Duration,Type,Description
24,s25,TV Show,‚ÄãSAINT SEIYA: Knights of the Zodiac,,"Bryson Baugus, Emily Neves, Blake Shepard, Pat...",japan,"January 23, 2020",TV-14,2 Seasons,"Anime Series, International TV Shows",Seiya and the Knights of the Zodiac rise again...
107,s108,Movie,37 Seconds,Hikari,"Mei Kayama, Misuzu Kanno, Shunsuke Daitoh, Mak...",japan,"January 31, 2020",TV-MA,116 min,"Dramas, International Movies","Trapped by society and familial obligations, a..."
228,s229,Movie,A Silent Voice,Naoko Yamada,"Miyu Irino, Saori Hayami, Aoi Yuki, Kensho Ono...",japan,"June 5, 2019",TV-14,130 min,"Anime Features, International Movies",A former class bully reaches out to the deaf g...
253,s254,Movie,A Whisker Away,"Junichi Sato, Tomotaka Shibayama","Mirai Shida, Natsuki Hanae, Hiroaki Ogi, Koich...",japan,"June 18, 2020",TV-PG,104 min,"Anime Features, Children & Family Movies, Inte...",A peculiar girl transforms into a cat to catch...
260,s261,TV Show,A.I.C.O.,Kazuya Murata,"Haruka Shiraishi, Yusuke Kobayashi, Makoto Fur...",japan,"March 9, 2018",TV-14,1 Season,"Anime Series, International TV Shows",Everything Aiko knew was a lie. Now she's join...
...,...,...,...,...,...,...,...,...,...,...,...
7425,s7424,TV Show,Violet Evergarden,,"Yui Ishikawa, Takehito Koyasu, Daisuke Namikaw...",japan,"February 5, 2019",TV-14,1 Season,"Anime Series, International TV Shows, Romantic...","The war is over, and Violet Evergarden needs a..."
7426,s7425,Movie,Violet Evergarden: Eternity and the Auto Memor...,Haruka Fujita,"Yui Ishikawa, Minako Kotobuki, Aoi Yuki",japan,"April 2, 2020",TV-PG,91 min,"Anime Features, International Movies, Sci-Fi &...",A lonely young woman feels trapped at her all ...
7742,s7741,TV Show,Your lie in April,,"Natsuki Hanae, Risa Taneda, Ayane Sakura, Ryot...",japan,"March 1, 2016",TV-14,1 Season,"Anime Series, International TV Shows, Romantic...","After his mother's death, a piano prodigy's wi..."
7751,s7750,TV Show,Yu-Gi-Oh!,,"Dan Green, Eric Stuart, Amy Birnbaum, Darren D...",japan,"July 8, 2020",TV-Y7,2 Seasons,"Anime Series, Kids' TV",The lives of young Yugi Moto and his friends J...


In [135]:
# Normalize the type column 
df_netflix_japan.loc[:, 'Type'] = df_netflix_japan['Type'].str.strip().str.lower()

# retain only which contains "anime" keyword
df_netflix_anime = df_netflix_japan[df_netflix_japan['Type'].str.contains('anime')]
df_netflix_anime

Unnamed: 0,Show_Id,Category,Title,Director,Cast,Country,Release_Date,Rating,Duration,Type,Description
24,s25,TV Show,‚ÄãSAINT SEIYA: Knights of the Zodiac,,"Bryson Baugus, Emily Neves, Blake Shepard, Pat...",japan,"January 23, 2020",TV-14,2 Seasons,"anime series, international tv shows",Seiya and the Knights of the Zodiac rise again...
228,s229,Movie,A Silent Voice,Naoko Yamada,"Miyu Irino, Saori Hayami, Aoi Yuki, Kensho Ono...",japan,"June 5, 2019",TV-14,130 min,"anime features, international movies",A former class bully reaches out to the deaf g...
253,s254,Movie,A Whisker Away,"Junichi Sato, Tomotaka Shibayama","Mirai Shida, Natsuki Hanae, Hiroaki Ogi, Koich...",japan,"June 18, 2020",TV-PG,104 min,"anime features, children & family movies, inte...",A peculiar girl transforms into a cat to catch...
260,s261,TV Show,A.I.C.O.,Kazuya Murata,"Haruka Shiraishi, Yusuke Kobayashi, Makoto Fur...",japan,"March 9, 2018",TV-14,1 Season,"anime series, international tv shows",Everything Aiko knew was a lie. Now she's join...
341,s342,TV Show,Aggretsuko,,"Kaolip, Komegumi Koiwasaki, Maki Tsuruta, Soht...",japan,"August 27, 2020",TV-14,3 Seasons,"anime series, international tv shows","Frustrated with her thankless office job, Rets..."
...,...,...,...,...,...,...,...,...,...,...,...
7425,s7424,TV Show,Violet Evergarden,,"Yui Ishikawa, Takehito Koyasu, Daisuke Namikaw...",japan,"February 5, 2019",TV-14,1 Season,"anime series, international tv shows, romantic...","The war is over, and Violet Evergarden needs a..."
7426,s7425,Movie,Violet Evergarden: Eternity and the Auto Memor...,Haruka Fujita,"Yui Ishikawa, Minako Kotobuki, Aoi Yuki",japan,"April 2, 2020",TV-PG,91 min,"anime features, international movies, sci-fi &...",A lonely young woman feels trapped at her all ...
7742,s7741,TV Show,Your lie in April,,"Natsuki Hanae, Risa Taneda, Ayane Sakura, Ryot...",japan,"March 1, 2016",TV-14,1 Season,"anime series, international tv shows, romantic...","After his mother's death, a piano prodigy's wi..."
7751,s7750,TV Show,Yu-Gi-Oh!,,"Dan Green, Eric Stuart, Amy Birnbaum, Darren D...",japan,"July 8, 2020",TV-Y7,2 Seasons,"anime series, kids' tv",The lives of young Yugi Moto and his friends J...


In [136]:
df_netflix_anime.columns

Index(['Show_Id', 'Category', 'Title', 'Director', 'Cast', 'Country',
       'Release_Date', 'Rating', 'Duration', 'Type', 'Description'],
      dtype='object')

Because this open source dataset does not contain any data of watching count so that only the Title is useful

In [142]:
df_netflix_anime.loc[:, 'Title'] = df_netflix_anime['Title'].str.strip()

In [152]:
netflix_anime_list = list(df_netflix_anime['Title'].values)
netflix_anime_list[:10]

['\u200bSAINT SEIYA: Knights of the Zodiac',
 'A Silent Voice',
 'A Whisker Away',
 'A.I.C.O.',
 'Aggretsuko',
 'AJIN: Demi-Human',
 'Akame ga Kill!',
 'Angel Beats!',
 'Anohana: The Flower We Saw That Day',
 'ÂøçËÄÖ„Éè„ÉÉ„Éà„É™„Åè„Çì']

## Compare most watched anime on Unofficial Website to Netflix

In [150]:
unof_anime_list = list(df_unoff_anime['Anime'].values)
unof_anime_list[:10]

['One Piece',
 'Black Clover',
 'Detective Conan',
 'Jujutsu Kaisen 2nd SS',
 'Bleach',
 'Demon Slayer - Swordsmith village Arc',
 'Demon Slayer - Hashira Training Arc',
 'The Eminence in Shadow',
 'Soul Land - Douluo Dalu',
 'Tsuchimiki: M√¥nlit Fantasy']

In [153]:
# Compare
set(unof_anime_list) & set(netflix_anime_list)

{'Bleach', 'Fairy Tail'}

Our analysis indicates that only two of the top trending anime series in 2024 (based on the collected view counts) are currently licensed for streaming on Netflix. This significant gap in content acquisition suggests that the majority of frequently watched anime titles are unavailable on the platform. Consequently, this <b>lack of in-demand, locally trending content drastically reduces the platform's overall attraction for avid anime consumers, posing a direct threat to subscriber retention and growth in the Vietnamese market</b>.