In [26]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import requests as req
from rich import print as rp
import polars as pl
from random import choice
import logging
from timeit_decorator import timeit
from pandarallel import pandarallel

# Notebook-wide setup: INFO-level logging plus a pandarallel worker pool
# (with a progress bar) for the `parallel_apply` calls further down.
N_WORKERS = 32  # number of pandarallel worker processes
logging.basicConfig(level=logging.INFO)
pandarallel.initialize(nb_workers=N_WORKERS, progress_bar=True)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [34]:
# @timeit(log_level=logging.INFO)
from time import sleep


def download_image(
    post_id: str = "CoGzQVFLqDB",
    path_download: str = "../data/images/",
    proxy_servers: list = [
        "100.80.84.42:9091",
        "100.70.13.68:9091",
        "100.122.187.59:9091",
        "100.66.160.80:9091",
    ],
    timeout: float = 30.0,
) -> None:
    """Download the image of one Instagram post and save it as ``<post_id>.jpg``.

    The image is requested from
    ``https://www.instagram.com/p/<post_id>/media/?size=l``. If the target file
    already exists the download is skipped. Every outcome (downloaded, skipped,
    failed) is reported through a coloured ``rich`` message; nothing is raised
    for network or file-write errors, so a single bad post cannot abort a
    (parallel) bulk run.

    Args:
        post_id: Post identifier used in the URL and as the file name.
        path_download: Destination folder; created if it does not exist.
        proxy_servers: Kept for interface compatibility. Requests are currently
            sent directly, so this list is not used.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. The result is the file written to disk and the printed status.
    """
    url = f"https://www.instagram.com/p/{post_id}/media/?size=l"

    # exist_ok avoids a crash when several parallel workers try to create the
    # folder at the same moment.
    os.makedirs(path_download, exist_ok=True)

    # os.path.join works whether or not path_download ends with a separator.
    path_file = os.path.join(path_download, f"{post_id}.jpg")
    if os.path.exists(path_file):
        rp(f"[yellow bold]Skipped: {post_id}.[/yellow bold]")
        return

    try:
        # Without a timeout a stalled connection would block its worker forever.
        response = req.get(url, timeout=timeout)
    except req.RequestException as e:
        rp(f"[red bold]Failed due to an error: {post_id}, {e}.[/red bold]")
        return

    if response.status_code != 200:
        rp(f"[red bold]Failed: {post_id}.[/red bold]")
        return

    try:
        with open(path_file, "wb") as file:
            file.write(response.content)
    except OSError as e:
        rp(f"[red bold]Failed due to an error: {post_id}, {e}.[/red bold]")
        return

    rp(f"[green bold]Downloaded: {post_id}.[/green bold]")


# Smoke test: run the downloader once with its default arguments.
download_image()

In [3]:
import re

# Raw exports that each provide a "URL" column of Instagram post links.
raw_url_files = [
    "../data/raw/consolidated_press.csv",
    "../data/raw/consolidated_users.csv",
    "../data/raw/non_self_disclosure_filtered.csv",
]
# Read only the URL column from every file (via polars, then converted to
# pandas) and stack the results into one frame. Row labels restart at 0 for
# each file, so index values repeat in `df`.
df = pd.concat(
    [
        pl.read_csv(csv_path, ignore_errors=True).select(["URL"]).to_pandas()
        for csv_path in raw_url_files
    ]
)
# A post ID is the path segment that follows "/p/" or "/tv/" and is closed by
# "/". It may contain letters, digits, "_" and "-". Commas must not appear
# inside the character class (they would be matched literally), and the leading
# "/" stops paths such as ".../shop/abc/" from being read as a post.
pattern = r"\/(p|tv)\/([A-Za-z0-9_-]+)\/"


def _extract_post_id(url) -> str:
    """Return the post ID found in `url`, or the string "None" if there is none."""
    # Non-string cells (e.g. missing values) cannot be searched; treat as no match.
    match = re.search(pattern, url) if isinstance(url, str) else None
    return match.group(2) if match else "None"


df["POST_ID"] = df["URL"].apply(_extract_post_id)
df.head()

Unnamed: 0,URL,POST_ID
0,https://www.instagram.com/p/C9EZeNnRNLY/,C9EZeNnRNLY
1,https://www.instagram.com/p/C9Dr_tjAL6S/,C9Dr_tjAL6S
2,https://www.instagram.com/p/C9DPywVNJ2w/,C9DPywVNJ2w
3,https://www.instagram.com/p/C9DPSZmM_RO/,C9DPSZmM_RO
4,https://www.instagram.com/p/C9DAH0bSs0z/,C9DAH0bSs0z


In [4]:
df_sample = df[["POST_ID"]]
# Fetch every post only once: drop rows whose URL yielded no ID (the "None"
# sentinel) and repeated IDs, so workers neither request a non-existent post
# nor write the same file concurrently.
posts_to_fetch = df_sample[df_sample["POST_ID"] != "None"].drop_duplicates()
posts_to_fetch.parallel_apply(lambda row: download_image(row["POST_ID"]), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5986), Label(value='0 / 5986'))), …

0         None
1         None
2         None
3         None
4         None
          ... 
163984    None
163985    None
163986    None
163987    None
163988    None
Length: 191524, dtype: object

In [36]:
# IDs of the images already on disk: the file name without its ".jpg"
# extension. Other files in the folder are ignored instead of being blindly
# truncated by four characters.
downloaded = {
    os.path.splitext(file_name)[0]
    for file_name in os.listdir("../data/images/")
    if file_name.endswith(".jpg")
}
# Flag every row by whether its image is present, then summarise the counts.
df["DOWNLOADED"] = df["POST_ID"].isin(downloaded)
df["DOWNLOADED"].value_counts()

DOWNLOADED
True     189058
False      2466
Name: count, dtype: int64

In [38]:
# Second pass: try again for the rows whose image is still not on disk.
missing_posts = df.loc[~df["DOWNLOADED"]]
missing_posts.parallel_apply(lambda row: download_image(row["POST_ID"]), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=78), Label(value='0 / 78'))), HBox…

11        None
12        None
17        None
116       None
121       None
          ... 
163233    None
163259    None
163607    None
163692    None
163961    None
Length: 2466, dtype: object

In [39]:
# Show the rows that are still flagged as not downloaded.
df.loc[~df["DOWNLOADED"]]

Unnamed: 0,URL,POST_ID,DOWNLOADED
11,https://www.instagram.com/p/C9CL2rkNKOY/,C9CL2rkNKOY,False
12,https://www.instagram.com/p/C9CB9jYMZkT/,C9CB9jYMZkT,False
17,https://www.instagram.com/p/C9AtvoUtMOY/,C9AtvoUtMOY,False
116,https://www.instagram.com/p/C8ucKA0NvCS/,C8ucKA0NvCS,False
121,https://www.instagram.com/p/C8t6HX9sryx/,C8t6HX9sryx,False
...,...,...,...
163233,https://www.instagram.com/p/CvNDYl2MkHT/,CvNDYl2MkHT,False
163259,https://www.instagram.com/p/CvNCeH3upLN/,CvNCeH3upLN,False
163607,https://www.instagram.com/p/CvMxUw6svv_/,CvMxUw6svv_,False
163692,https://www.instagram.com/p/CvMos7Jr_uw/,CvMos7Jr_uw,False
