#### Desired data pairs and metadata

In [90]:
import jmespath
import json
import os
import sys

# Load the JSON file
def load_json_file(file_path):
    """Read a JSON file and return the decoded object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file contents.

    Raises:
        SystemExit: With exit code 1 (after printing a message) when the
            path does not exist or the contents are not valid JSON.
    """
    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist.")
        sys.exit(1)

    # Decode explicitly as UTF-8: JSON text is UTF-8 by specification, and the
    # platform default encoding (e.g. cp1252 on Windows) cannot decode
    # non-ASCII captions such as Korean text or emoji.
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            return json.load(file)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from file {file_path}: {e}")
            sys.exit(1)

# Parse a single Instagram post (items[0]) using jmespath
def parse_post_json(data: dict) -> dict:
    """Project one post payload onto the handful of fields used downstream.

    Args:
        data: A single post object (one element of an ``items`` list).

    Returns:
        The result of evaluating the JMESPath multiselect below against
        ``data``: a mapping with the keys ``shortcode``, ``caption``,
        ``head_image_url``, ``audio_type``, ``music_title`` and
        ``music_artist``.
    """
    # JMESPath multiselect hash: output key on the left, source path on the right.
    post_fields_query = """{
            shortcode: code,
            caption: caption.text,
            head_image_url: image_versions2.candidates[0].url,
            audio_type: music_metadata.audio_type,
            music_title: music_metadata.music_info.music_asset_info.title,
            music_artist: music_metadata.music_info.music_asset_info.display_artist
        }"""
    return jmespath.search(post_fields_query, data)

# Extract all parsed posts from a combined user-post JSON
def extract_all_posts(json_data: dict):
    """Parse the first item of every entry in a combined post JSON.

    Args:
        json_data: Mapping of shortcode -> raw response dict. Each response
            is expected to carry a non-empty ``"items"`` list whose first
            element is the post payload.

    Returns:
        A list with one parsed dict (see ``parse_post_json``) per usable
        entry. Entries that are not dicts, or whose ``"items"`` is missing
        or empty, are skipped instead of raising KeyError/IndexError.
    """
    parsed_posts = []

    for shortcode, json_dict in json_data.items():
        # A failed or partial fetch may leave an entry without any items;
        # there is nothing to parse for it, so move on.
        items = json_dict.get("items") if isinstance(json_dict, dict) else None
        if not items:
            continue

        parsed = parse_post_json(items[0])
        if parsed:
            # The mapping key is the authoritative shortcode; it replaces the
            # value extracted from the payload's `code` field.
            parsed["shortcode"] = shortcode
            parsed_posts.append(parsed)

    return parsed_posts


In [91]:
# Load the combined post JSON; load_json_file prints a message and calls
# sys.exit(1) if the file is missing or is not valid JSON.
raw_json_data = load_json_file("refreshed.json")


# Turn every entry of the loaded mapping into one parsed post record.
parsed_posts = extract_all_posts(raw_json_data)


In [98]:
import pandas as pd

# Tabulate the parsed post records (one row per post) and show the row count.
df = pd.DataFrame(parsed_posts)
len(df)

499

In [93]:
# Preview the first rows of the parsed posts.
df.head()

Unnamed: 0,shortcode,caption,head_image_url,audio_type,music_title,music_artist
0,DBqaqZFye1P,10월은 행복했어요 🍀,https://scontent-lax3-2.cdninstagram.com/v/t51...,licensed_music,chocolate milk,yawn.
1,DIfxeTESmAH,paulas ibiza @loewe 🍅,https://scontent-lax3-2.cdninstagram.com/v/t51...,licensed_music,,
2,DAghuC8O3K1,dont blink or you'll miss it,https://scontent-lax3-2.cdninstagram.com/v/t39...,licensed_music,Don’t Walk Away...Just Trust Me,Tory Lanez
3,DDO76ScyKdo,추울때도 로에베팅. \n#loewegifts,https://scontent-lax3-1.cdninstagram.com/v/t51...,licensed_music,,
4,DEnFjlmsa59,,https://scontent-lax3-2.cdninstagram.com/v/t51...,licensed_music,Dear Winter,"Larry June, Jay Worthy"


In [99]:
# Remove the caption column
df = df.drop("caption", axis="columns")

In [100]:
# Preview the frame again now that the caption column has been dropped.
df.head()

Unnamed: 0,shortcode,head_image_url,audio_type,music_title,music_artist
0,DBqaqZFye1P,https://scontent-lax3-2.cdninstagram.com/v/t51...,licensed_music,chocolate milk,yawn.
1,DIfxeTESmAH,https://scontent-lax3-2.cdninstagram.com/v/t51...,licensed_music,,
2,DAghuC8O3K1,https://scontent-lax3-2.cdninstagram.com/v/t39...,licensed_music,Don’t Walk Away...Just Trust Me,Tory Lanez
3,DDO76ScyKdo,https://scontent-lax3-1.cdninstagram.com/v/t51...,licensed_music,,
4,DEnFjlmsa59,https://scontent-lax3-2.cdninstagram.com/v/t51...,licensed_music,Dear Winter,"Larry June, Jay Worthy"


In [101]:
# Retain only the posts whose audio_type is "licensed_music"
df = df.loc[df["audio_type"].eq("licensed_music")]

In [102]:
# Number of posts left after the licensed_music filter.
len(df)

478

In [103]:
# audio_type holds a single value after the licensed_music filter, so drop it.
# Rebind the result instead of using inplace=True: df was produced by boolean
# filtering, and mutating such a frame in place is discouraged by pandas.
df = df.drop(columns=["audio_type"])

In [106]:
# Drop the music title/artist columns from df.
# Rebind the result instead of using inplace=True: df was produced by boolean
# filtering, and mutating such a frame in place is discouraged by pandas.
df = df.drop(columns=["music_title", "music_artist"])

In [107]:
# Display the frame, which now holds only shortcode and head_image_url.
df

Unnamed: 0,shortcode,head_image_url
0,DBqaqZFye1P,https://scontent-lax3-2.cdninstagram.com/v/t51...
1,DIfxeTESmAH,https://scontent-lax3-2.cdninstagram.com/v/t51...
2,DAghuC8O3K1,https://scontent-lax3-2.cdninstagram.com/v/t39...
3,DDO76ScyKdo,https://scontent-lax3-1.cdninstagram.com/v/t51...
4,DEnFjlmsa59,https://scontent-lax3-2.cdninstagram.com/v/t51...
...,...,...
494,DB1iHj_RmpT,https://scontent-lax3-2.cdninstagram.com/v/t51...
495,DIip2tlyfAB,https://scontent-lax3-2.cdninstagram.com/v/t51...
496,C_e-Za6TwRQ,https://scontent-lax3-2.cdninstagram.com/v/t51...
497,C1sZCxFyLyj,https://scontent-lax3-2.cdninstagram.com/v/t51...


In [117]:
# Load CHOCOMINT.csv, discard rows containing any missing value,
# and keep a single row per shortcode
df1 = (
    pd.read_csv("CHOCOMINT.csv")
    .dropna()
    .drop_duplicates(subset=["shortcode"])
)

In [118]:
# Row count of df1 after dropping missing values and duplicate shortcodes.
len(df1)

552

In [120]:
# Restrict df to the shortcodes that also occur in df1, then count what remains
df = df.loc[df["shortcode"].isin(df1["shortcode"])]
len(df)

472

In [121]:
# Display df after restricting it to shortcodes present in df1.
df

Unnamed: 0,shortcode,head_image_url
0,DBqaqZFye1P,https://scontent-lax3-2.cdninstagram.com/v/t51...
1,DIfxeTESmAH,https://scontent-lax3-2.cdninstagram.com/v/t51...
2,DAghuC8O3K1,https://scontent-lax3-2.cdninstagram.com/v/t39...
3,DDO76ScyKdo,https://scontent-lax3-1.cdninstagram.com/v/t51...
4,DEnFjlmsa59,https://scontent-lax3-2.cdninstagram.com/v/t51...
...,...,...
493,DHdTNv_pmJN,https://scontent-lax3-2.cdninstagram.com/v/t51...
494,DB1iHj_RmpT,https://scontent-lax3-2.cdninstagram.com/v/t51...
495,DIip2tlyfAB,https://scontent-lax3-2.cdninstagram.com/v/t51...
496,C_e-Za6TwRQ,https://scontent-lax3-2.cdninstagram.com/v/t51...


In [125]:
# Load pairs_songencoded.csv and keep a single row per shortcode
df2 = pd.read_csv("pairs_songencoded.csv").drop_duplicates(subset=["shortcode"])

In [None]:
# Keep only the df2 rows whose shortcode is still present in df.
# .copy() makes df2 an independent frame, so the column writes performed on it
# later do not trigger pandas' SettingWithCopy (chained-assignment) warning.
df2 = df2[df2['shortcode'].isin(df['shortcode'])].copy()
len(df2)

459

In [128]:
# Overwrite each df2 'link' with the 'head_image_url' of the df row that has
# the same shortcode; df2 rows with no matching shortcode in df keep their link.
# A single vectorised lookup replaces the row-by-row iterrows() scan of df.
image_url_by_shortcode = (
    df.drop_duplicates(subset="shortcode")  # first match wins, as .iloc[0] did
    .set_index("shortcode")["head_image_url"]
)
has_image = df2["shortcode"].isin(image_url_by_shortcode.index)
df2.loc[has_image, "link"] = df2.loc[has_image, "shortcode"].map(image_url_by_shortcode)

In [129]:
# Display df2 after its link column has been refreshed from df.
df2

Unnamed: 0,shortcode,link,embedding
0,DHqyEKlx45m,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.927291750908, 0.449945956469, 3.70222735405..."
1,DHKpuJEtril,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.89463031292, 0.459532767534, 4.2029671669, ..."
2,C6bGvvzvQma,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.850314557552, 0.443762511015, 3.41218352318..."
3,C0vRPnvoIAa,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.81948775053, 0.430989027023, 4.39311981201,..."
4,C_gUrFoSjoi,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.314242720604, 0.451382547617, 3.97746062279..."
...,...,...,...
708,DDqSwt_z8nJ,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.582497835159, 0.4395198524, 6.87156152725, ..."
709,DCyN-bJTPBy,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.221343681216, 0.409568578005, 7.05048418045..."
710,DCWjVQ5Tt71,https://scontent-lax3-2.cdninstagram.com/v/t51...,"[0.922227978706, 0.456249445677, 4.11033153534..."
711,DAgUVg_TFDp,https://scontent-lax3-2.cdninstagram.com/v/t51...,"[0.166219711304, 0.406690448523, 11.5702352524..."


In [131]:
# Display df1, the source of the music_title / music_artist values used next.
df1

Unnamed: 0,shortcode,head_image_url,music_title,music_artist
0,DHqyEKlx45m,https://scontent-lax3-1.cdninstagram.com/v/t51...,Call Me When You Break Up,"Selena Gomez, benny blanco, Gracie Abrams"
1,DHKpuJEtril,https://scontent-lax3-2.cdninstagram.com/v/t51...,Sunset Blvd,"Selena Gomez, benny blanco"
2,C6bGvvzvQma,https://scontent-lax3-1.cdninstagram.com/v/t51...,Perfect,Ed Sheeran
3,C0vRPnvoIAa,https://scontent-lax3-1.cdninstagram.com/v/t51...,Those Eyes,New West
4,C_gUrFoSjoi,https://scontent-lax3-1.cdninstagram.com/v/t51...,I've Been In Love (feat. Channel Tres),Jungle
...,...,...,...,...
738,DDqSwt_z8nJ,https://scontent-lax3-1.cdninstagram.com/v/t51...,Be Be Your Love,Rachael Yamagata
739,DCyN-bJTPBy,https://scontent-lax3-1.cdninstagram.com/v/t51...,Choose You,Elmiene
740,DCWjVQ5Tt71,https://scontent-lax3-2.cdninstagram.com/v/t51...,Wings,Mot
741,DAgUVg_TFDp,https://scontent-lax3-2.cdninstagram.com/v/t51...,hotline (edit),Billie Eilish


In [132]:
# Add 'music_title' and 'music_artist' columns to df2, filled from the df1 row
# that has the same shortcode; df2 rows with no match in df1 are left as NaN.
# A single vectorised lookup per column replaces the row-by-row iterrows()
# scan of df1.
song_by_shortcode = (
    df1.drop_duplicates(subset="shortcode")  # first match wins, as .iloc[0] did
    .set_index("shortcode")
)
has_song = df2["shortcode"].isin(song_by_shortcode.index)
for column in ("music_title", "music_artist"):
    df2.loc[has_song, column] = df2.loc[has_song, "shortcode"].map(song_by_shortcode[column])
df2

Unnamed: 0,shortcode,link,embedding,music_title,music_artist
0,DHqyEKlx45m,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.927291750908, 0.449945956469, 3.70222735405...",Call Me When You Break Up,"Selena Gomez, benny blanco, Gracie Abrams"
1,DHKpuJEtril,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.89463031292, 0.459532767534, 4.2029671669, ...",Sunset Blvd,"Selena Gomez, benny blanco"
2,C6bGvvzvQma,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.850314557552, 0.443762511015, 3.41218352318...",Perfect,Ed Sheeran
3,C0vRPnvoIAa,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.81948775053, 0.430989027023, 4.39311981201,...",Those Eyes,New West
4,C_gUrFoSjoi,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.314242720604, 0.451382547617, 3.97746062279...",I've Been In Love (feat. Channel Tres),Jungle
...,...,...,...,...,...
708,DDqSwt_z8nJ,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.582497835159, 0.4395198524, 6.87156152725, ...",Be Be Your Love,Rachael Yamagata
709,DCyN-bJTPBy,https://scontent-lax3-1.cdninstagram.com/v/t51...,"[0.221343681216, 0.409568578005, 7.05048418045...",Choose You,Elmiene
710,DCWjVQ5Tt71,https://scontent-lax3-2.cdninstagram.com/v/t51...,"[0.922227978706, 0.456249445677, 4.11033153534...",Wings,Mot
711,DAgUVg_TFDp,https://scontent-lax3-2.cdninstagram.com/v/t51...,"[0.166219711304, 0.406690448523, 11.5702352524...",hotline (edit),Billie Eilish


In [133]:
# Write df2 to final.csv; index=False leaves the row index out of the file.
df2.to_csv("final.csv", index=False)