In [1]:
import logging
import os
from random import choice

import pandas as pd
import polars as pl
import requests as req
from pandarallel import pandarallel
from rich import print as rp

# Configure logging
logging.basicConfig(level=logging.INFO)
pandarallel.initialize(progress_bar=True, nb_workers=32)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# @timeit(log_level=logging.INFO)


def download_image(
    post_id: str = "C1SvpQRS99V",
    path_download: str = "../data/images/",
    proxy_servers: list = [
        "100.80.84.42:9091",
        "100.70.13.68:9091",
        "100.122.187.59:9091",
        "100.66.160.80:9091",
    ],
) -> None:
    try:
        url = f"https://www.instagram.com/p/{post_id}/media/?size=l"
        # Create folder if not exist
        if not os.path.exists(path_download):
            os.makedirs(path_download)
        # Skip if file exist

        path_file = f"{path_download}{post_id}.jpg"
        if not os.path.exists(path_file):
            # Define proxy
            proxy_servers = proxy_servers
            proxy_server = choice(proxy_servers)
            proxies = {
                "http": f"socks5://{proxy_server}",
                "https": f"socks5://{proxy_server}",
            }
            # Download image
            # response = req.get(url, proxies=proxies)
            response = req.get(url)
            if response.status_code == 200:
                try:
                    # Save image
                    with open(path_file, "wb") as file:
                        file.write(response.content)
                    rp(f"[green bold]Downloaded: {post_id}.[/green bold]")
                except Exception as e:
                    rp(f"[red bold]Failed due to an error: {post_id}, {e}.[/red bold]")
            else:
                rp(f"[red bold]Failed: {post_id}.[/red bold]")
        else:
            rp(f"[yellow bold]Skipped: {post_id}.[/yellow bold]")
    except Exception as e:
        rp(f"[red bold]Failed due to an error: {post_id}, {e}.[/red bold]")
        pass


download_image()

In [3]:
import re

df = pd.concat(
    [
        pl.read_csv("../data/raw/consolidated_press.csv", ignore_errors=True)
        .select(["URL"])
        .to_pandas(),
        pl.read_csv("../data/raw/consolidated_users.csv", ignore_errors=True)
        .select(["URL"])
        .to_pandas(),
        pl.read_csv("../data/raw/non_self_disclosure_filtered.csv", ignore_errors=True)
        .select(["URL"])
        .to_pandas(),
    ]
)
pattern = r"(p|tv)\/([A-Z,a-z,0-9,_,-]+)\/"  # Starting with p or tv; followed by a string of characters, numbers, _, or -; ending with /


images_downloaded = os.listdir("../data/images/")
images_downloaded = [i.split(".")[0] for i in images_downloaded]
df["POST_ID"] = df["URL"].apply(
    lambda x: re.findall(pattern, x)[0][1] if re.findall(pattern, x) else "None"
)
df["DOWNLOADED"] = df["POST_ID"].parallel_apply(
    lambda x: True if x in images_downloaded else False
)
df = df[~df["DOWNLOADED"]]
# print(df["URL"][0])
df.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5986), Label(value='0 / 5986'))), …

Unnamed: 0,URL,POST_ID,DOWNLOADED
11,https://www.instagram.com/p/C9CL2rkNKOY/,C9CL2rkNKOY,False
12,https://www.instagram.com/p/C9CB9jYMZkT/,C9CB9jYMZkT,False
17,https://www.instagram.com/p/C9AtvoUtMOY/,C9AtvoUtMOY,False
116,https://www.instagram.com/p/C8ucKA0NvCS/,C8ucKA0NvCS,False
121,https://www.instagram.com/p/C8t6HX9sryx/,C8t6HX9sryx,False


In [4]:
df_sample = df[["POST_ID"]]
# df_sample["POST_ID"].apply(download_image)
# df_sample.parallel_apply(lambda x: download_image(x["POST_ID"]), axis=1)

In [5]:
downloaded = set([x[:-4] for x in os.listdir("../data/images/")])
# check diff
df["DOWNLOADED"] = df["POST_ID"].apply(lambda x: True if x in downloaded else False)
df["DOWNLOADED"].value_counts()

DOWNLOADED
False    3015
Name: count, dtype: int64

In [6]:
df[df["DOWNLOADED"] == False].parallel_apply(
    lambda x: download_image(x["POST_ID"]), axis=1
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=95), Label(value='0 / 95'))), HBox…

11        None
12        None
17        None
116       None
121       None
          ... 
163471    None
163607    None
163692    None
163777    None
163961    None
Length: 3015, dtype: object

In [7]:
df[df["DOWNLOADED"] == False]

Unnamed: 0,URL,POST_ID,DOWNLOADED
11,https://www.instagram.com/p/C9CL2rkNKOY/,C9CL2rkNKOY,False
12,https://www.instagram.com/p/C9CB9jYMZkT/,C9CB9jYMZkT,False
17,https://www.instagram.com/p/C9AtvoUtMOY/,C9AtvoUtMOY,False
116,https://www.instagram.com/p/C8ucKA0NvCS/,C8ucKA0NvCS,False
121,https://www.instagram.com/p/C8t6HX9sryx/,C8t6HX9sryx,False
...,...,...,...
163471,https://www.instagram.com/p/CvM5my6MQ2L/,CvM5my6MQ2L,False
163607,https://www.instagram.com/p/CvMxUw6svv_/,CvMxUw6svv_,False
163692,https://www.instagram.com/p/CvMos7Jr_uw/,CvMos7Jr_uw,False
163777,https://www.instagram.com/p/CvMgLhzuLgQ/,CvMgLhzuLgQ,False
