#### Desired data pairs and metadata

In [19]:
import jmespath
import json
import os
import sys

# Load the JSON file
def load_json_file(file_path):
    """Read a JSON file from disk and return its deserialized contents.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Exits:
        Prints a message and calls ``sys.exit(1)`` when the file does not
        exist or when its contents are not valid JSON.
    """
    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist.")
        sys.exit(1)

    # Pin the encoding instead of relying on the platform default, which is
    # not UTF-8 on every OS and would break on non-ASCII text in the file.
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            return json.load(file)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from file {file_path}: {e}")
            sys.exit(1)

# Parse a single Instagram post (items[0]) using jmespath
def parse_post_json(data: dict) -> dict:
    """Project one post object onto the handful of fields this notebook uses.

    Args:
        data: A single post object (the caller passes ``items[0]``).

    Returns:
        The result of the JMESPath projection below: a mapping with the keys
        ``shortcode``, ``caption``, ``head_image_url``, ``audio_type``,
        ``music_title`` and ``music_artist``.
    """
    # Left-hand names are the output keys; right-hand paths are looked up in `data`.
    post_fields_query = """{
            shortcode: code,
            caption: caption.text,
            head_image_url: image_versions2.candidates[0].url,
            audio_type: music_metadata.audio_type,
            music_title: music_metadata.music_info.music_asset_info.title,
            music_artist: music_metadata.music_info.music_asset_info.display_artist
        }"""
    return jmespath.search(post_fields_query, data)

# Extract all parsed posts from a combined user-post JSON
def extract_all_posts(json_data: dict):
    """Build a flat list of parsed posts from the combined user/post mapping.

    Args:
        json_data: Two-level mapping. Each outer key is stored on the parsed
            record as ``"username"``; each inner value is expected to expose
            an ``"items"`` entry via ``.get``.

    Returns:
        A list of dicts, one per post whose ``"items"`` entry is non-empty and
        whose first item parses to a truthy result. Only ``items[0]`` is
        parsed for each post.
    """
    collected = []

    for username, posts_dict in json_data.items():
        for _post_url, payload in posts_dict.items():
            items = payload.get("items", [])
            if not items:
                # Missing or empty "items": nothing to parse for this post.
                continue

            record = parse_post_json(items[0])
            if not record:
                continue

            # Remember which outer key this post was found under.
            record["username"] = username
            collected.append(record)

    return collected


In [20]:
# Load the combined scrape from disk. load_json_file prints a message and calls
# sys.exit(1) if the file is missing or is not valid JSON.
raw_json_data = load_json_file("json/all_instagram_data.json")

# Flatten the nested mapping into a list of per-post dicts. Only the first entry of
# each post's "items" list is parsed, and each record is tagged with its username key.
parsed_posts = extract_all_posts(raw_json_data)

In [21]:
import pandas as pd

# Fields emitted by parse_post_json, followed by the "username" tag added in
# extract_all_posts. Passing the schema explicitly means an empty parsed_posts
# still yields a frame with these columns, so the column lookups in later
# cells do not raise KeyError.
POST_COLUMNS = [
    "shortcode",
    "caption",
    "head_image_url",
    "audio_type",
    "music_title",
    "music_artist",
    "username",
]

df = pd.DataFrame(parsed_posts, columns=POST_COLUMNS)
len(df)

1919

In [22]:
# Keep only rows whose audio_type is exactly 'licensed_music' (rows with any other
# value, or with no value, are filtered out), then drop the columns that are not
# needed afterwards: username, caption and audio_type.
df = (
    df.loc[df['audio_type'] == 'licensed_music']
    .drop(columns=['username', 'caption', 'audio_type'])
)

df

Unnamed: 0,shortcode,head_image_url,music_title,music_artist
81,DHqyEKlx45m,https://scontent-lax3-1.cdninstagram.com/v/t51...,Call Me When You Break Up,"Selena Gomez, benny blanco, Gracie Abrams"
92,DHKpuJEtril,https://scontent-lax3-2.cdninstagram.com/v/t51...,Sunset Blvd,"Selena Gomez, benny blanco"
104,C6bGvvzvQma,https://scontent-lax3-1.cdninstagram.com/v/t51...,Perfect,Ed Sheeran
107,C0vRPnvoIAa,https://scontent-lax3-1.cdninstagram.com/v/t51...,Those Eyes,New West
165,C_gUrFoSjoi,https://scontent-lax3-1.cdninstagram.com/v/t51...,I've Been In Love (feat. Channel Tres),Jungle
...,...,...,...,...
1731,CwwST_HrON3,https://scontent-lax3-1.cdninstagram.com/v/t51...,Dark Red,Steve Lacy
1772,DIMr8d_hWk5,https://scontent-lax3-2.cdninstagram.com/v/t51...,Where Roses Bloom (Voice Memo Clip),Teesa
1789,DBy1OUOhHEi,https://scontent-lax3-2.cdninstagram.com/v/t51...,,
1824,C9ASY71hDTo,https://scontent-lax3-2.cdninstagram.com/v/t39...,Dreamer's Path,James Quinn


In [23]:
# Merge the freshly parsed rows with the rows saved by earlier runs.
old_csv_path = "csv/all_instagram_data.csv"

if os.path.exists(old_csv_path):
    df_old = pd.read_csv(old_csv_path)
    # Saved rows first, new rows after; ignore_index renumbers the result 0..n-1.
    df = pd.concat([df_old, df], ignore_index=True)
else:
    # First run: the CSV has not been written yet, so there is nothing to merge.
    # Keep df_old defined (empty, same columns) and renumber df the same way
    # ignore_index=True would have.
    df_old = pd.DataFrame(columns=df.columns)
    df = df.reset_index(drop=True)





In [24]:
# Show duplicates. The shortcode is treated as the identity of a post here, so
# repeats are detected on that column alone. Comparing whole rows would miss a post
# whose other columns differ between scrapes.
# NOTE(review): presumably head_image_url can change between fetches - confirm.
duplicates = df[df.duplicated(subset="shortcode", keep=False)]
if not duplicates.empty:
    print("Duplicates found:")
    print(duplicates)
else:
    print("No duplicates found.")

# Remove duplicates, keeping one row per shortcode. keep="last" retains the
# bottom-most occurrence; the freshly parsed rows are concatenated after the rows
# loaded from the CSV, so the newest version of a post is the one that survives.
df = df.drop_duplicates(subset="shortcode", keep="last")

Duplicates found:
        shortcode                                     head_image_url  \
0     DHqyEKlx45m  https://scontent-lax3-1.cdninstagram.com/v/t51...   
1     DHKpuJEtril  https://scontent-lax3-2.cdninstagram.com/v/t51...   
2     C6bGvvzvQma  https://scontent-lax3-1.cdninstagram.com/v/t51...   
3     C0vRPnvoIAa  https://scontent-lax3-1.cdninstagram.com/v/t51...   
4     C_gUrFoSjoi  https://scontent-lax3-1.cdninstagram.com/v/t51...   
...           ...                                                ...   
1687  C4PUrYLty_Q  https://scontent-lax3-2.cdninstagram.com/v/t39...   
1688  CwwST_HrON3  https://scontent-lax3-1.cdninstagram.com/v/t51...   
1689  DIMr8d_hWk5  https://scontent-lax3-2.cdninstagram.com/v/t51...   
1691  C9ASY71hDTo  https://scontent-lax3-2.cdninstagram.com/v/t39...   
1692  CzybwmqhZ-U  https://scontent-lax3-2.cdninstagram.com/v/t39...   

                                 music_title  \
0                  Call Me When You Break Up   
1                    

In [25]:
# Number of rows currently held in df.
len(df)

888

In [26]:

# save to csv
output_csv_path = "csv/all_instagram_data.csv"
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
df.to_csv(output_csv_path, index=False)