In [None]:
!pip install python-louvain



In [None]:
import os
import re
import unicodedata
from itertools import combinations

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import networkx as nx
import networkx.algorithms.community as nx_comm
import pandas as pd
from community.community_louvain import best_partition

In [None]:
def clean_caption(caption):
    """Normalize a raw caption and extract its hashtags and mentions.

    Hashtags (``#word``) and mentions (``@handle``) are collected from the
    whole caption. The cleaned text is built from the part of the caption that
    precedes the first ``@`` when at least one mention was found, otherwise
    from the whole caption: it is lowercased, a few shorthand spellings are
    expanded, hashtags and punctuation are removed and whitespace is collapsed.

    Args:
        caption: Raw caption text. Any non-``str`` value (for example a NaN
            from an empty cell) produces an empty result.

    Returns:
        dict with keys ``"cleaned_text"`` (str), ``"hashtags"`` (list of str,
        each with its leading ``#``) and ``"mentions"`` (list of str, each
        with its leading ``@``).
    """
    if not isinstance(caption, str):
        return {"cleaned_text": "", "hashtags": [], "mentions": []}

    # Compose base letters with their combining diacritics. Combining marks
    # are not matched by \w, so decomposed Vietnamese text would otherwise
    # lose its tone marks in the punctuation pass and have its hashtags cut
    # short at the first accented letter.
    caption = unicodedata.normalize('NFC', caption)

    hashtags = re.findall(r'#\w+', caption)
    # Handles may contain dots (e.g. "name.surname"); require a word character
    # at the end so sentence punctuation after a mention is not captured.
    mentions = re.findall(r'@[\w.]*\w', caption)

    if mentions:
        text_before_mention = caption.split('@')[0].strip()
    else:
        text_before_mention = caption.strip()

    text_before_mention = text_before_mention.lower()
    replacements = {
        r'\bko\b': 'không',
        r'\bc\b': 'chị',
        r'\balo\b': 'gọi',
        r'\bmún\b': 'muốn',
        r'\biu\b': 'yêu',
        r'\bxink\b': 'xinh',
        r'\bdág\b': 'dáng'
    }
    for pattern, replacement in replacements.items():
        text_before_mention = re.sub(pattern, replacement, text_before_mention, flags=re.IGNORECASE)

    # Undo "0 for o" spellings (e.g. "sh0p") only when the zeros sit between
    # two letters, so genuine numbers such as "10kg" or "100" stay intact.
    text_before_mention = re.sub(
        r'(?<=[^\W\d_])0+(?=[^\W\d_])',
        lambda zeros: 'o' * len(zeros.group()),
        text_before_mention,
    )
    text_before_mention = re.sub(r'#\w+', '', text_before_mention)
    text_before_mention = re.sub(r'[^\w\s]', '', text_before_mention)
    text_before_mention = ' '.join(text_before_mention.split())

    return {
        "cleaned_text": text_before_mention,
        "hashtags": hashtags,
        "mentions": mentions
    }

In [None]:
def detect_misleading_caption(cleaned_text):
    """Score a cleaned caption against a list of suspicious marketing phrases.

    Args:
        cleaned_text: Caption text as produced by ``clean_caption`` (lowercase,
            punctuation removed). Any non-``str`` value scores zero.

    Returns:
        dict with ``"fake_score"`` (total number of phrase occurrences found)
        and ``"is_suspect"`` (1 when ``fake_score`` is positive, else 0).
    """
    if not isinstance(cleaned_text, str):
        return {"is_suspect": 0, "fake_score": 0}

    suspicious_phrases = [
        r'\băn không lo béo\b',
        r'\băn không sợ béo\b',
        r'\băn không lo múp\b',
        r'\băn không sợ múp\b',
        r'\bgiảm cân nhanh\b',
        r'\bgiảm cân tức thì\b',
        r'\bdáng đẹp tức thì\b',
        r'\bgiảm cân dễ dàng\b',
        r'\băn hoài không béo\b',
        r'\bgiảm béo không lo\b',
        r'\bdáng xinh không lo\b',
        r'\băn thoải mái không béo\b',
        # Numeric claims. The caption cleaner may have rewritten zeros as the
        # letter "o" ("10kg" -> "1okg"), so "o" is accepted after the first
        # digit of a number.
        r'\bgiảm \d[\do]*kg\b',
        # The cleaner strips "%", so the percent sign has to be optional or
        # this phrase can never match cleaned text.
        r'\b1(?:00|oo)%? thiên nhiên\b',
        r'\btrắng da \d[\do]* ngày\b'
    ]

    # Case-insensitive so the detector also works on text that has not been
    # lowercased beforehand.
    fake_score = sum(
        len(re.findall(phrase, cleaned_text, flags=re.IGNORECASE))
        for phrase in suspicious_phrases
    )

    is_suspect = 1 if fake_score > 0 else 0

    return {
        "is_suspect": is_suspect,
        "fake_score": fake_score
    }

In [None]:
def build_network(df):
    """Build a directed creator graph from mentions and shared hashtags.

    Every distinct ``Creator ID`` becomes a node carrying ``is_suspect`` (the
    maximum over that creator's rows) and ``name`` (the first ``Creator name``
    seen). Edges are added in two ways:

    * creator -> mentioned creator, when the mentioned handle (without ``@``)
      is itself a node;
    * both directions between every pair of distinct creators that used the
      same hashtag.

    Self-loops are never created: a creator mentioning themselves, or reusing
    one of their own hashtags across several videos, adds no edge.

    Args:
        df: Video-level frame with columns ``Creator ID``, ``Creator name``,
            ``is_suspect``, ``Mentions`` and ``Hashtags``; the last two are
            ``', '``-joined strings (empty string when there are none).

    Returns:
        networkx.DiGraph: the creator graph.
    """
    G = nx.DiGraph()

    creator_suspect = df.groupby('Creator ID')['is_suspect'].max().to_dict()
    creator_name_map = df.groupby('Creator ID')['Creator name'].first().to_dict()
    unique_creators = df['Creator ID'].unique()
    print(f"Number of unique creators: {len(unique_creators)}")

    for creator in unique_creators:
        G.add_node(creator, is_suspect=creator_suspect.get(creator, 0), name=creator_name_map.get(creator, creator))

    # hashtag -> insertion-ordered set of creators (a dict used as an ordered
    # set). Deduplicating here keeps the pair loop proportional to creators
    # rather than videos, and keeps a creator from being paired with
    # themselves.
    hashtag_groups = {}
    for creator, mentions, hashtags in zip(df['Creator ID'], df['Mentions'], df['Hashtags']):
        # isinstance guards against NaN cells, which are truthy but not strings.
        if isinstance(mentions, str) and mentions:
            for mention in mentions.split(', '):
                target = mention.strip()[1:]  # drop the leading '@'
                if target and target != creator and target in G:
                    G.add_edge(creator, target)

        if isinstance(hashtags, str) and hashtags:
            for hashtag in hashtags.split(', '):
                if hashtag:
                    hashtag_groups.setdefault(hashtag, {})[creator] = None

    for creators in hashtag_groups.values():
        for creator1, creator2 in combinations(creators, 2):
            G.add_edge(creator1, creator2)
            G.add_edge(creator2, creator1)

    print(f"Number of nodes: {G.number_of_nodes()}")
    print(f"Number of edges: {G.number_of_edges()}")
    return G

In [None]:
def community_detection_and_analysis(df, G, random_state=42):
    """Attach community labels and centrality scores to every row of ``df``.

    Communities are computed on the undirected version of ``G`` with Louvain
    and with label propagation. PageRank, betweenness and degree centrality
    are computed on ``G`` itself. All values are mapped onto ``df`` through
    ``Creator ID``.

    Args:
        df: Frame with columns ``Creator ID``, ``Creator name``,
            ``is_suspect`` and ``fake_score``. It is modified in place (new
            columns are added) and also returned.
        G: Creator graph whose nodes are ``Creator ID`` values.
        random_state: Seed passed to the Louvain partitioning so community ids
            are reproducible between runs. Pass ``None`` for an unseeded run.

    Returns:
        tuple: ``(df, community_risk)``, where ``community_risk`` is indexed by
        ``Community_Louvain`` and holds ``Suspicious_Ratio`` (mean of
        ``is_suspect`` over the community's rows), ``Avg_Fake_Score`` (mean of
        ``fake_score``) and ``Creators`` (list of unique creator names).
    """
    G_undirected = G.to_undirected()

    # Louvain is randomized; seeding it keeps community ids stable across runs.
    louvain_communities = best_partition(G_undirected, random_state=random_state)
    df['Community_Louvain'] = df['Creator ID'].map(louvain_communities)

    label_prop_communities = nx_comm.label_propagation_communities(G_undirected)
    community_dict = {node: i for i, comm in enumerate(label_prop_communities) for node in comm}
    df['Community_LabelProp'] = df['Creator ID'].map(community_dict)

    centralities = {
        'PageRank': nx.pagerank(G),
        'Betweenness': nx.betweenness_centrality(G),
        'Degree_Centrality': nx.degree_centrality(G),
    }
    for column, scores in centralities.items():
        df[column] = df['Creator ID'].map(scores)

    # Means are taken over rows of df, so when df is video-level a creator
    # with many videos weighs more than one with few.
    community_risk = df.groupby('Community_Louvain').agg({
        'is_suspect': 'mean',
        'fake_score': 'mean',
        'Creator name': lambda x: list(x.unique())
    }).rename(columns={'is_suspect': 'Suspicious_Ratio', 'fake_score': 'Avg_Fake_Score', 'Creator name': 'Creators'})

    return df, community_risk

def visualize_network(G, df, seed=42, output_path="network_static.png"):
    """Draw the largest connected component of the creator graph to a PNG.

    Nodes are red when the creator has any suspicious row in ``df`` and green
    otherwise. Node size scales with PageRank, with a minimum size of 50. The
    20 creators with the highest PageRank are labelled with their name when
    they belong to the drawn component.

    Args:
        G: Directed creator graph; nodes carry a ``name`` attribute.
        df: Frame with columns ``Creator ID``, ``PageRank`` and
            ``is_suspect``. It may hold one row per video or one row per
            creator.
        seed: Seed for the spring layout so the figure is reproducible.
        output_path: Where the PNG is written.

    Returns:
        None. The figure is saved to ``output_path`` and then closed.
    """
    if G.number_of_nodes() == 0:
        # max() over zero components would raise; there is nothing to draw.
        print("Graph has no nodes; skipping network visualization")
        return

    # Convert to undirected graph to find connected components
    G_undirected = G.to_undirected()

    # Find the largest connected component
    largest_cc = max(nx.connected_components(G_undirected), key=len)
    G_largest = G.subgraph(largest_cc).copy()

    # Log the number of nodes and edges kept after dropping the smaller components
    print(f"Visualizing largest connected component: {len(G_largest.nodes)} nodes and {len(G_largest.edges)} edges")

    # One row per creator. Without this, a creator with many videos could take
    # several of the top-20 slots and fewer than 20 creators would be labelled.
    creators = df.drop_duplicates(subset='Creator ID')
    pagerank_map = creators.set_index('Creator ID')['PageRank'].to_dict()
    suspect_map = df.groupby('Creator ID')['is_suspect'].max().to_dict()
    top_influencers = creators.nlargest(20, 'PageRank')['Creator ID'].tolist()

    node_colors = []
    node_sizes = []
    for node in G_largest.nodes:
        pagerank = pagerank_map.get(node, 0) * 5000
        is_suspect = suspect_map.get(node, 0)
        node_colors.append('red' if is_suspect else 'green')
        node_sizes.append(max(50, pagerank))

    fig = plt.figure(figsize=(15, 10))
    try:
        pos = nx.spring_layout(G_largest, k=1.0, iterations=100, scale=2.0, seed=seed)

        nx.draw(G_largest, pos, node_color=node_colors, node_size=node_sizes, with_labels=False, arrows=True, edge_color='gray', alpha=0.5)

        for node in top_influencers:
            if node in G_largest.nodes:
                creator_name = G.nodes[node]['name']
                x, y = pos[node]
                plt.text(x, y, creator_name, fontsize=6, ha='center', va='center', bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

        legend_handles = [
            mpatches.Patch(color='red', label='Suspicious'),
            mpatches.Patch(color='green', label='Non-Suspicious')
        ]
        plt.legend(handles=legend_handles, title="Node Status", loc='best')

        plt.title("Creator Network (Static Visualization - Largest Connected Component)")
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when layout or saving fails.
        plt.close(fig)
    print(f"Static network visualization saved to '{output_path}'")

def identify_unsafe_hubs(df_aggregated):
    """Return the suspicious creators with the highest betweenness.

    Args:
        df_aggregated: Frame with columns ``Creator ID``, ``Creator name``,
            ``is_suspect``, ``Betweenness`` and ``Community_Louvain``.

    Returns:
        pandas.DataFrame: at most ten rows whose ``is_suspect`` equals 1,
        ordered by descending ``Betweenness`` and restricted to the id, name,
        betweenness and Louvain community columns.
    """
    report_columns = ['Creator ID', 'Creator name', 'Betweenness', 'Community_Louvain']
    flagged = df_aggregated.loc[df_aggregated['is_suspect'] == 1]
    ranked = flagged.nlargest(10, 'Betweenness')
    return ranked[report_columns]

In [None]:
def process_data(input_file):
    """Run the full pipeline on an Excel export of videos.

    Steps: load and de-duplicate the rows, clean the captions, flag misleading
    ones, build the creator graph, compute communities and centralities,
    aggregate to one row per creator, draw the network and write the
    per-creator metrics to ``creator_metrics.csv``.

    Args:
        input_file: Path to an Excel file with at least the columns
            ``Creator ID``, ``Creator name`` and ``Video Title``.

    Returns:
        tuple: ``(df_aggregated, community_risk, unsafe_hubs)`` — the
        per-creator metrics, the per-community risk summary, and the
        suspicious creators with the highest betweenness.

    Raises:
        FileNotFoundError: if ``input_file`` does not exist.
        KeyError: if a required column is missing from the file.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file '{input_file}' not found")

    df = pd.read_excel(input_file)

    # Fail early with a readable message rather than deep inside pandas.
    required_columns = ('Creator ID', 'Creator name', 'Video Title')
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Input file '{input_file}' is missing required column(s): {missing_columns}")

    df = df.drop_duplicates(subset=['Creator ID', 'Video Title'])
    print(f"Number of rows after deduplication: {len(df)}")

    cleaned_data = df['Video Title'].apply(clean_caption)
    df['Cleaned_Text'] = cleaned_data.apply(lambda x: x['cleaned_text'])
    df['Hashtags'] = cleaned_data.apply(lambda x: ', '.join(x['hashtags']))
    df['Mentions'] = cleaned_data.apply(lambda x: ', '.join(x['mentions']))

    detection_data = df['Cleaned_Text'].apply(detect_misleading_caption)
    df['is_suspect'] = detection_data.apply(lambda x: x['is_suspect'])
    df['fake_score'] = detection_data.apply(lambda x: x['fake_score'])

    G = build_network(df)
    df, community_risk = community_detection_and_analysis(df, G)

    df_aggregated = df.groupby('Creator ID').agg({
        'Creator name': 'first',
        'is_suspect': 'max',
        'fake_score': 'mean',
        'Community_Louvain': 'first',
        'Community_LabelProp': 'first',
        'PageRank': 'first',
        'Betweenness': 'first',
        'Degree_Centrality': 'first'
    }).reset_index()

    # Pass the per-creator frame: the visualizer ranks rows by PageRank to pick
    # which nodes to label, and video-level rows would let one creator with
    # many videos occupy several of those slots.
    visualize_network(G, df_aggregated)

    unsafe_hubs = identify_unsafe_hubs(df_aggregated)

    # Save df_aggregated to CSV
    df_aggregated.to_csv("creator_metrics.csv", index=False)
    print("Aggregated DataFrame saved to 'creator_metrics.csv'")

    return df_aggregated, community_risk, unsafe_hubs

# Entry point: run the pipeline on the exported spreadsheet and print the
# three summaries. The result names stay at module level so that later cells
# can inspect them.
if __name__ == "__main__":
    input_file = "Popular_Video_20250410_140539.xlsx"

    try:
        df_aggregated, community_risk, unsafe_hubs = process_data(input_file)
        print("SV3 processing completed.")
        print("\nCommunity Risk Analysis (with Creator Names):", community_risk, sep="\n")
        print("\nTop 20 Influencers (by PageRank):")
        print(
            df_aggregated.nlargest(20, 'PageRank')[
                ['Creator ID', 'Creator name', 'PageRank', 'is_suspect', 'Community_Louvain']
            ]
        )
        print("\nTop 10 Unsafe Hubs (by Betweenness Centrality):", unsafe_hubs, sep="\n")
    except FileNotFoundError as missing_input:
        # A missing input file is reported instead of raising.
        print(missing_input)

Number of rows after deduplication: 2686
Number of unique creators: 274
Number of nodes: 274
Number of edges: 26991
Visualizing largest connected component: 245 nodes and 26990 edges
Static network visualization saved to 'network_static.png'
Aggregated DataFrame saved to 'creator_metrics.csv'
SV3 processing completed.

Community Risk Analysis (with Creator Names):
                   Suspicious_Ratio  Avg_Fake_Score  \
Community_Louvain                                     
0                          0.032468        0.032468   
1                          0.000000        0.000000   
2                          0.013408        0.013408   
3                          0.250000        0.250000   
4                          0.000000        0.000000   
5                          1.000000        1.000000   
6                          0.000000        0.000000   
7                          0.000000        0.000000   
8                          1.000000        1.000000   
9                          0

In [None]:
community_risk.sort_values(by=['Avg_Fake_Score'], ascending=False)

Unnamed: 0_level_0,Suspicious_Ratio,Avg_Fake_Score,Creators
Community_Louvain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,1.0,1.0,[Thị Nhung🧶]
19,1.0,1.0,[Như nè]
5,1.0,1.0,[Thaolee]
10,1.0,1.0,[bebuong]
17,1.0,1.0,[Lại là Như nè hihi 🍀]
3,0.25,0.25,"[𝐍𝐠𝐚̂𝐧 𝐇𝐚̀ 🍌, Công chúa ❤️, Nana Liu, Nana Liu..."
0,0.032468,0.032468,"[Trần Hà Linh 🐰, 𝓚𝓲𝓶 𝓐𝓷𝓱 🇻🇳, Bùi Thu Trà🍵, Chu..."
2,0.013408,0.013408,"[Thảo Lê Cenly, Bình An🤍, Gia Kun, Út Nhị Mino..."
4,0.0,0.0,[Trang Phạm]
7,0.0,0.0,[Bếp của Solai]




In [None]:
# prompt: Using dataframe community_risk: sort by avg fake score

community_risk.sort_values(by=['Avg_Fake_Score'], ascending=False)


In [None]:
unsafe_hubs

Unnamed: 0,Creator ID,Creator name,Betweenness,Community_Louvain
15,_tradangditam,Bùi Thu Trà🍵,0.025343,0
20,anhhong.thaomoccenly,Ánh Hồng - Thảo Mộc Cenly,0.023602,0
21,anhhongcenly,Ánh Hồng Cenly,0.020515,0
91,hannah_onlyn2,HOA PHAN,0.005819,0
272,zananh.ne,Vân Anh Hay Quạo,0.005339,2
263,utnhi_mino_official,Út Nhị Mino,0.004261,2
265,villa_gaming94,🔥 Villa Gaming 🔥,0.002702,2
164,nama7991,Linh hướng nội,0.001518,2
165,nanaliiu.2107,nanaliiu.2107,0.00115,2
144,loangiamcan1,LOAN GIẢM CÂN,0.000886,2


In [None]:
df_aggregated[["Creator ID","Creator name","PageRank","Betweenness","Degree_Centrality"]]

Unnamed: 0,Creator ID,Creator name,PageRank,Betweenness,Degree_Centrality
0,.bepdiiday._,Bẹp bẹp,0.003718,0.002793,0.644689
1,.niii.2,Hoàng Oanh,0.001241,0.000000,0.146520
2,1101_ndt,Ng Doan Trang,0.001241,0.000000,0.146520
3,_baokin_,Bảo Kin,0.000600,0.000000,0.000000
4,_carotshop,Cà Rốt Shop 🥕,0.005011,0.000873,1.113553
...,...,...,...,...,...
269,xuantruong.cenly,XUÂN TRƯỜNG Giám Đốc CTY CENLY,0.005415,0.002852,1.179487
270,xuantruong_official,CENLY ORGANIC - XUÂN TRƯỜNG,0.005411,0.002346,1.186813
271,yen.linh_shop,YẾN LINH SH0P,0.004942,0.000404,1.106227
272,zananh.ne,Vân Anh Hay Quạo,0.005912,0.005339,1.267399
