In [1]:
import pandas as pd
import json
import os

# Parse data for network graph

In [4]:
def parse_instagram_json(file_path):
    """Flatten one scraped Instagram post JSON into network-graph rows.

    The first row describes the post owner (``type == "owner"``, ``target`` is
    ``None``). Every entry of ``tagged_users`` adds one edge row
    (``type == "tagged"``) pointing from the owner to the tagged account.

    Args:
        file_path: Path to the JSON metadata file of a single post.

    Returns:
        pandas.DataFrame with the columns ``source``, ``target``, ``type``,
        ``username``, ``fullname``, ``post_id``, ``likes`` and
        ``count_followed``: one owner row followed by one row per tagged user.

    Raises:
        KeyError: If one of the owner/post fields read below is missing.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which breaks on non-ASCII names on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    owner_id = data["owner_id"]
    post_id = data["post_id"]
    likes = data["likes"]

    # Add the owner as a node
    rows = [{
        "source": owner_id,
        "target": None,
        "type": "owner",
        "username": data["username"],
        "fullname": data["fullname"],
        "post_id": post_id,
        "likes": likes,
        "count_followed": data["user"]["count_followed"]
    }]

    # Add tagged users as nodes and edges. A post without tags may omit the
    # key or carry null; treat both as "no tagged users" instead of crashing.
    for user in data.get("tagged_users") or []:
        rows.append({
            "source": owner_id,
            "target": user["id"],
            "type": "tagged",
            "username": user["username"],
            "fullname": user["full_name"],
            "post_id": post_id,
            "likes": likes,
            "count_followed": None,
        })

    return pd.DataFrame(rows)

def process_all_json_files(base_path, parser):
    """Run ``parser`` on every ``*.json`` file one directory level below ``base_path``.

    Only sub-directories of ``base_path`` are scanned; files directly inside
    ``base_path`` are ignored.

    Args:
        base_path: Directory containing one sub-directory per account.
        parser: Callable taking a file path and returning a pandas.DataFrame.

    Returns:
        pandas.DataFrame: all parser results concatenated with a fresh
        0..n-1 index, or an empty DataFrame when no JSON file was found.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the result is reproducible between runs and machines.
    for username in sorted(os.listdir(base_path)):
        user_path = os.path.join(base_path, username)
        if not os.path.isdir(user_path):
            continue
        for file in sorted(os.listdir(user_path)):
            if file.endswith('.json'):
                frames.append(parser(os.path.join(user_path, file)))

    # pd.concat raises ValueError on an empty list; return an empty frame instead.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [5]:
# Root folder of the scraped data; the loaders below scan one sub-directory per account.
base_path = 'scraped_data/instagram'

In [None]:
# Build the raw edge list: per post, one owner row plus one row per tagged account.
combined_df = process_all_json_files(base_path, parse_instagram_json)

In [7]:
# Keep unique account owners and their tagged accounts.
# Rows are de-duplicated on the (source, target) pair and the first occurrence wins,
# so the post_id / likes kept for a pair come from whichever post was read first.
cleaned_combined_df = combined_df.drop_duplicates(subset=['source', 'target'])

In [6]:
# !pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-19.0.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (3.3 kB)
Downloading pyarrow-19.0.1-cp312-cp312-macosx_12_0_arm64.whl (30.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: pyarrow
Successfully installed pyarrow-19.0.1


In [11]:
# Persist the de-duplicated edge list as Parquet (the parsed_data/ directory must already exist).
cleaned_combined_df.to_parquet('parsed_data/network_graph_data.parquet')

In [13]:
# Same data as CSV; with the default settings the DataFrame index is written as the first, unnamed column.
cleaned_combined_df.to_csv('parsed_data/network_graph_data.csv')

# Parse data for virality predictor

In [48]:
def parse_instagram_json_for_model_training(file_path):
    """Extract the per-post fields used for virality-model training.

    Args:
        file_path: Path to the JSON metadata file of a single post.

    Returns:
        pandas.DataFrame with exactly one row and the columns ``media_id``,
        ``caption``, ``width``, ``height``, ``likes``, ``count`` and
        ``filename``. ``filename`` is the JSON file's base name with its last
        extension removed.

    Raises:
        KeyError: If one of the fields read below is missing from the JSON.
    """
    # JSON is UTF-8 by specification; captions routinely contain emoji and
    # other non-ASCII text, so do not rely on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # One record per post
    record = {
        "media_id": data["media_id"],
        "caption": data["description"],
        "width": data["width"],
        "height": data["height"],
        "likes": data["likes"],
        "count": data["count"],
        "filename": os.path.splitext(os.path.basename(file_path))[0],
    }

    return pd.DataFrame([record])

In [49]:
# One row of post metadata per JSON file found under base_path.
model_training_meta_df = process_all_json_files(base_path, parse_instagram_json_for_model_training)

In [50]:
# Save raw data as parquet (a snapshot taken before the numeric columns are normalised below)
model_training_meta_df.to_parquet('model_training_data_raw.parquet')

In [51]:
def normalize(column):
    """Min-max scale ``column`` so its smallest value maps to 0 and its largest to 1.

    Args:
        column: A pandas Series (or similar) supporting ``min``, ``max`` and
            element-wise arithmetic.

    Returns:
        The rescaled values; when every value is identical the result is
        ``column * 0`` so that no division by zero takes place.
    """
    lowest = column.min()
    highest = column.max()
    if highest != lowest:
        return (column - lowest) / (highest - lowest)
    # Degenerate range: all values are equal
    return column * 0

# Min-max scale the likes into the virality score, then rescale the numeric features in place
model_training_meta_df['virality_score'] = normalize(model_training_meta_df['likes'])
for feature_name in ('width', 'height', 'count'):
    model_training_meta_df[feature_name] = normalize(model_training_meta_df[feature_name])

In [52]:
# !pip install scikit-learn

In [53]:
# !pip install transformers

In [54]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer.
# Both are module-level globals: get_bert_embedding below reads `tokenizer` and `model` directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(text):
    """Embed ``text`` by mean-pooling the BERT model's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``. Input longer than 512
    tokens is truncated.

    Args:
        text: The text to embed.

    Returns:
        numpy.ndarray: the token embeddings averaged over the sequence
        dimension, with singleton dimensions squeezed away.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: no gradients needed
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state

    # Collapse the sequence dimension into a single vector for the whole text
    pooled = token_states.mean(dim=1)
    return pooled.squeeze().numpy()

In [55]:
# One BERT vector per caption, in the same order as the 'caption' column
captions_embedding = [
    get_bert_embedding(caption_text)
    for caption_text in model_training_meta_df['caption']
]

In [56]:
# Attach each caption's embedding vector as one object per row (same order as the 'caption' column).
model_training_meta_df['embedded_caption'] = captions_embedding

In [57]:
# Inspect the column set after adding the score and the caption embeddings.
model_training_meta_df.columns

Index(['media_id', 'caption', 'width', 'height', 'likes', 'count', 'filename',
       'virality_score', 'embedded_caption'],
      dtype='object')

In [15]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import os
import numpy as np

def extract_embeddings(image_folder, model_name='resnet50', batch_size=32):
    """
    Extract embeddings from images using a pre-trained model

    Images that cannot be opened or preprocessed are reported on stdout and
    skipped. The returned paths list only contains the images that were
    actually embedded, so ``embeddings[i]`` always belongs to
    ``image_paths[i]``.

    Args:
        image_folder: Path to folder containing JPG images
            (scanned via ``get_all_img_paths``)
        model_name: 'resnet50' or 'vgg16'
        batch_size: Number of images to process at once

    Returns:
        embeddings: numpy array of shape (n_images, embedding_size);
            shape (0, embedding_size) when nothing could be embedded
        image_paths: list of image paths, one per embedding row, in row order

    Raises:
        ValueError: If ``model_name`` is neither 'resnet50' nor 'vgg16'.
    """
    # Choose model
    if model_name == 'resnet50':
        model = models.resnet50(pretrained=True)
        # Remove the final fully connected layer
        model = torch.nn.Sequential(*(list(model.children())[:-1]))
        embedding_size = 2048
    elif model_name == 'vgg16':
        model = models.vgg16(pretrained=True)
        # Use features part of VGG (before fully connected layers)
        model = model.features
        # Add global average pooling to get fixed size output
        model = torch.nn.Sequential(
            model,
            torch.nn.AdaptiveAvgPool2d((1, 1))
        )
        embedding_size = 512
    else:
        raise ValueError("Model must be 'resnet50' or 'vgg16'")

    # Set model to evaluation mode
    model.eval()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Define image preprocessing
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Get all jpg images
    candidate_paths = get_all_img_paths(image_folder)

    embeddings = []
    # Paths of the images that made it into `embeddings`, in row order
    embedded_paths = []

    # Process images in batches
    for i in range(0, len(candidate_paths), batch_size):
        batch_tensors = []

        for img_path in candidate_paths[i:i + batch_size]:
            try:
                # The context manager closes the underlying file handle
                with Image.open(img_path) as img:
                    img_tensor = preprocess(img.convert('RGB'))
            except Exception as e:
                print(f"Error processing {img_path}: {e}")
                continue
            # Record tensor and path together so rows and paths stay aligned
            batch_tensors.append(img_tensor)
            embedded_paths.append(img_path)

        if not batch_tensors:
            continue

        # Stack tensors into a batch
        batch = torch.stack(batch_tensors).to(device)

        # Extract features
        with torch.no_grad():
            batch_embeddings = model(batch)

        # Flatten everything after the batch dimension; unlike squeeze() this
        # keeps the batch dimension even when the batch holds a single image.
        embeddings.append(batch_embeddings.flatten(start_dim=1).cpu().numpy())

    # Concatenate all batches
    if embeddings:
        embeddings = np.vstack(embeddings)
    else:
        # Keep the result two-dimensional so `.shape[1]` is still meaningful
        embeddings = np.empty((0, embedding_size), dtype=np.float32)

    return embeddings, embedded_paths

In [20]:
def get_all_img_paths(base_path):
    """List every ``*.jpg`` file one directory level below ``base_path``.

    Only sub-directories of ``base_path`` are scanned; images lying directly
    in ``base_path`` are ignored.

    Args:
        base_path: Directory containing one sub-directory per account.

    Returns:
        list[str]: image paths sorted by sub-directory name, then file name.
    """
    img_paths = []
    # os.listdir returns entries in arbitrary order. Sort so that repeated
    # calls produce the same sequence, since callers match paths to embedding
    # rows purely by position.
    for username in sorted(os.listdir(base_path)):
        user_path = os.path.join(base_path, username)
        if not os.path.isdir(user_path):
            continue
        img_paths.extend(
            os.path.join(user_path, file)
            for file in sorted(os.listdir(user_path))
            if file.endswith('.jpg')
        )

    return img_paths

In [21]:
# Images live in the same per-account directory tree as the JSON metadata.
image_folder = "scraped_data/instagram/"

# Extract embeddings using ResNet50 (faster)
embeddings, image_paths = extract_embeddings(image_folder, model_name='resnet50')

# Sanity check: number of rows and embedding width
print(f"Generated {len(embeddings)} embeddings with shape {embeddings.shape}")

Generated 162 embeddings with shape (162, 2048)


In [22]:
# Save embeddings to file (raw array only; the matching paths are written separately to image_paths.txt)
np.save("image_embeddings.npy", embeddings)

In [16]:
# Reload the embeddings saved above.
# NOTE(review): the array carries no filenames; later cells label rows by position using a
# fresh get_all_img_paths listing, so this file must correspond to the current image set and order.
embeddings = np.load('image_embeddings.npy')

In [25]:
# Write the image paths, one per line, next to the embeddings for reference
with open("image_paths.txt", "w") as paths_file:
    paths_file.writelines(f"{path}\n" for path in image_paths)

In [59]:
# Re-list the images and keep only their base names (extension included);
# these are used below to label the embedding rows by position.
image_paths = get_all_img_paths(base_path)
file_name = [os.path.basename(path) for path in image_paths if path.endswith('.jpg')]

In [60]:
# One row per image; the feature columns get pandas' default integer labels (0, 1, 2, ...).
embeddings_df = pd.DataFrame(embeddings)

In [68]:
# Label each embedding row with its image file name (positional match: row i <-> file_name[i]);
# pandas raises if the two lengths differ.
embeddings_df['filename'] = file_name
embeddings_df['filename'] = embeddings_df['filename'].astype(str)

In [69]:
# Make the join key a string on the metadata side as well, so both 'filename' columns compare equal.
model_training_meta_df['filename'] = model_training_meta_df['filename'].astype(str)

In [73]:
# Inner join (pd.merge default): images without metadata, and metadata without an image, are dropped.
joined_df = pd.merge(embeddings_df, model_training_meta_df, on='filename')
# Identifier and raw-text columns are not used as model features.
joined_df = joined_df.drop(['filename', 'media_id', 'caption'], axis=1)

In [94]:
# Handle caption embedding: expand the per-row vectors into caption_<i> columns.
# The column is created as 'embedded_caption' when the BERT vectors are attached to
# model_training_meta_df, so it has to be read back under exactly that name.
caption_vectors = joined_df['embedded_caption'].tolist()
array_df = pd.DataFrame(
    caption_vectors,
    index=joined_df.index,  # share joined_df's index so the axis=1 concat aligns row by row
    columns=[f'caption_{i}' for i in range(len(caption_vectors[0]))],
)

# Combine with original dataframe
joined_df = pd.concat([joined_df.drop('embedded_caption', axis=1), array_df], axis=1)

In [97]:
# Binarise the score: rows at or above the 75th percentile get label 1, all others 0.
threshold = np.percentile(joined_df['virality_score'], 75)  # Top 25% considered viral
joined_df['virality_label'] = (joined_df['virality_score'] >= threshold).astype(int)

In [99]:
# virality_label was derived from virality_score, so remove the score from the training table.
joined_df = joined_df.drop('virality_score', axis=1)

In [100]:
# Parquet column names must be strings, but the image-embedding columns carry integer labels
# (pandas defaults from pd.DataFrame(embeddings)). Stringify them explicitly rather than relying
# on an implicit conversion, which is reported with a mixed-type column-name warning.
joined_df.rename(columns=str).to_parquet('model_training_data.parquet')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
