In [1]:
import pandas as pd
import json
import os

In [2]:
def parse_instagram_json(file_path):
    """Parse one scraped post JSON file into an edge-list DataFrame.

    The first row describes the post owner (``type == "owner"``, ``target``
    is ``None``). Every entry of the post's ``tagged_users`` list adds one
    row (``type == "tagged"``) whose ``source`` is the owner id and whose
    ``target`` is the tagged user's id. ``post_id`` and ``likes`` of the
    post are repeated on every row.

    Parameters
    ----------
    file_path : str or path-like
        Path to a JSON file with the top-level keys ``owner_id``,
        ``username``, ``fullname``, ``post_id``, ``likes`` and, optionally,
        ``tagged_users`` (a list of objects with ``id``, ``username`` and
        ``full_name``).

    Returns
    -------
    pandas.DataFrame
        Columns: ``source``, ``target``, ``type``, ``username``,
        ``fullname``, ``post_id``, ``likes``.

    Raises
    ------
    KeyError
        If a required top-level key, or a required key of a tagged-user
        entry, is missing.
    """
    # JSON text is UTF-8; pass the encoding explicitly so non-ASCII names
    # decode the same way regardless of the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    owner_id = data["owner_id"]
    post_id = data["post_id"]
    likes = data["likes"]

    # Owner node: it has no target because it is not an edge.
    rows = [{
        "source": owner_id,
        "target": None,
        "type": "owner",
        "username": data["username"],
        "fullname": data["fullname"],
        "post_id": post_id,
        "likes": likes,
    }]

    # One owner -> tagged-user edge per tag. A missing or null
    # "tagged_users" is treated as "no tags" rather than an error.
    for user in data.get("tagged_users") or []:
        rows.append({
            "source": owner_id,
            "target": user["id"],
            "type": "tagged",
            "username": user["username"],
            "fullname": user["full_name"],
            "post_id": post_id,
            "likes": likes,
        })

    return pd.DataFrame(rows)

def process_all_json_files(base_path):
    """Parse every ``*.json`` file one directory level below ``base_path``.

    Each sub-directory of ``base_path`` is scanned (non-recursively) for
    files whose name ends in ``.json``; each such file is parsed with
    ``parse_instagram_json`` and the resulting frames are concatenated.

    Parameters
    ----------
    base_path : str or path-like
        Directory containing one sub-directory per scraped account.

    Returns
    -------
    pandas.DataFrame
        All parsed rows with a fresh 0..n-1 index. An empty DataFrame is
        returned when no ``.json`` file is found.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sort both levels so the
    # row order of the combined frame is reproducible across runs.
    for username in sorted(os.listdir(base_path)):
        user_path = os.path.join(base_path, username)
        if not os.path.isdir(user_path):
            continue
        for file in sorted(os.listdir(user_path)):
            if file.endswith('.json'):
                frames.append(parse_instagram_json(os.path.join(user_path, file)))

    # pd.concat raises ValueError on an empty list, so handle the
    # "nothing scraped yet" case explicitly.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [3]:
# Root folder of the scraped posts. The path is relative, so it is resolved
# against the kernel's current working directory.
base_path = 'scraped_data/instagram'
# Parse every .json file found in the sub-directories of base_path into a
# single DataFrame of owner rows and owner -> tagged-user rows.
combined_df = process_all_json_files(base_path)

In [6]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-19.0.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (3.3 kB)
Downloading pyarrow-19.0.1-cp312-cp312-macosx_12_0_arm64.whl (30.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: pyarrow
Successfully installed pyarrow-19.0.1


In [9]:
# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
os.makedirs('parsed_data', exist_ok=True)
combined_df.to_parquet('parsed_data/network_graph_data.parquet')