In [None]:
!pip install instaloader beautifulsoup4 requests pandas

In [None]:
"""Configuration settings for Instagram scraper."""

# --- Instagram credentials -------------------------------------------------
# Both stay None unless a login is required; fill in both to enable login.
INSTAGRAM_USERNAME = INSTAGRAM_PASSWORD = None

# --- Scraping limits -------------------------------------------------------
MAX_LINKS_TO_CHECK = 20  # cap on the number of links taken from a link tree page
REQUEST_TIMEOUT = 10     # HTTP request timeout, in seconds
MAX_RETRIES = 3          # maximum number of retries for failed requests

# --- "Link in bio" services to detect --------------------------------------
LINK_TREE_SERVICES = [
    "linktr.ee", "linktree.com", "bio.link", "beacons.ai",
    "hoo.be", "solo.to", "allmylinks.com", "carrd.co",
    "taplink.cc", "linkpop.com", "shorby.com", "campsite.bio",
]

# --- Social media platforms to detect --------------------------------------
# Maps a platform name to the URL prefixes that identify it.
SOCIAL_PLATFORMS = dict(
    linkedin=["linkedin.com/in/", "linkedin.com/company/"],
    facebook=["facebook.com/", "fb.com/", "fb.me/"],
    twitter=["twitter.com/", "x.com/"],
    youtube=["youtube.com/", "youtu.be/"],
    tiktok=["tiktok.com/@"],
    pinterest=["pinterest.com/"],
    instagram=["instagram.com/"],
)

# --- Output files ----------------------------------------------------------
OUTPUT_CSV = "customer_ledger.csv"
OUTPUT_JSON = "customer_ledger.json"

# --- User agent sent with HTTP requests ------------------------------------
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)


In [None]:
"""Module for resolving Linktree and similar bio link pages."""

import logging
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


logger = logging.getLogger(__name__)

class LinkTreeResolver:
    """Resolves 'link in bio' pages to find actual business websites.

    Hosts are compared on whole domain labels (exact match or sub-domain),
    never by substring, so ``netflix.com`` is not mistaken for ``x.com`` and
    ``yahoo.be`` is not mistaken for ``hoo.be``.
    """

    def __init__(self):
        # Headers sent with every page fetch.
        self.headers = {'User-Agent': USER_AGENT}

    @staticmethod
    def _hostname(url) -> str:
        """
        Extract the bare host name from a URL.

        Args:
            url: The URL to inspect

        Returns:
            Lower-case host without port, credentials, trailing dot or a
            leading 'www.'; '' if ``url`` is not a string, has no network
            location, or cannot be parsed
        """
        if not isinstance(url, str):
            return ''
        try:
            # ``hostname`` is already lower-cased and excludes port/userinfo.
            host = urlparse(url).hostname or ''
        except ValueError:
            # Raised by urlparse for malformed netlocs (e.g. broken IPv6).
            return ''
        host = host.rstrip('.')
        if host.startswith('www.'):
            host = host[4:]
        return host

    @staticmethod
    def _host_matches(host: str, domain: str) -> bool:
        """
        Check whether ``host`` is ``domain`` itself or one of its sub-domains.

        Args:
            host: Host name as returned by ``_hostname``
            domain: Registered domain to compare against (e.g. 'x.com')

        Returns:
            True on an exact or sub-domain match, False otherwise
        """
        domain = domain.lower()
        if not host or not domain:
            return False
        return host == domain or host.endswith('.' + domain)

    @staticmethod
    def _normalize_href(base_url: str, href) -> Optional[str]:
        """
        Turn a raw ``href`` into an absolute http(s) URL.

        Args:
            base_url: URL of the page the link was found on
            href: Raw attribute value taken from the anchor tag

        Returns:
            The absolute URL, or None for empty values, in-page anchors and
            non-web schemes such as mailto:, tel: or javascript:
        """
        if not isinstance(href, str):
            return None
        href = href.strip()
        if not href or href.startswith('#'):
            return None
        try:
            absolute = urljoin(base_url, href)
            scheme = urlparse(absolute).scheme.lower()
        except ValueError:
            return None
        if scheme not in ('http', 'https'):
            return None
        return absolute

    def is_link_tree(self, url: str) -> bool:
        """
        Check if a URL is a known link tree service.

        Args:
            url: The URL to check

        Returns:
            True if the URL's host is one of LINK_TREE_SERVICES (or a
            sub-domain of one), False otherwise
        """
        host = self._hostname(url)
        if not host:
            return False
        return any(self._host_matches(host, service) for service in LINK_TREE_SERVICES)

    def resolve_url(self, url: str) -> Dict[str, List[str]]:
        """
        Resolve a link tree URL to find contained links.

        Relative links are made absolute against ``url``; links that are not
        http(s) are ignored. Only 'website_links' is capped at
        MAX_LINKS_TO_CHECK entries.

        Args:
            url: The link tree URL to resolve

        Returns:
            Dictionary with 'social_links' and 'website_links'. On any
            failure the error is logged and whatever was collected so far
            (possibly nothing) is returned.
        """
        results = {
            'social_links': [],
            'website_links': []
        }

        if not url:
            return results

        try:
            response = requests.get(url, headers=self.headers, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            page_url = url.rstrip('/')

            for anchor in soup.find_all('a', href=True):
                href = self._normalize_href(url, anchor['href'])
                if href is None:
                    continue

                if self._is_social_link(href):
                    bucket = results['social_links']
                elif href.rstrip('/') == page_url:
                    # A link back to the link tree page itself is not a website.
                    continue
                else:
                    bucket = results['website_links']

                if href not in bucket:
                    bucket.append(href)

            # Limit number of links to check/return
            results['website_links'] = results['website_links'][:MAX_LINKS_TO_CHECK]

            return results

        except Exception as e:
            # Deliberately broad: resolution is best-effort and must not
            # abort the surrounding profile scrape.
            logger.error("Error resolving link tree %s: %s", url, e)
            return results

    def _is_social_link(self, url: str) -> bool:
        """
        Check if a URL points at one of the SOCIAL_PLATFORMS domains.

        Only the domain part of each configured pattern is compared (the
        path after the first '/' is ignored).

        Args:
            url: The URL to check

        Returns:
            True if the host belongs to a configured platform, False otherwise
        """
        host = self._hostname(url)
        if not host:
            return False
        for patterns in SOCIAL_PLATFORMS.values():
            for pattern in patterns:
                # "linkedin.com/in/" -> "linkedin.com"
                if self._host_matches(host, pattern.split('/')[0]):
                    return True
        return False


In [None]:
"""Module for scraping Instagram profile bio links."""

import instaloader
import logging
from typing import Optional, Dict

logger = logging.getLogger(__name__)


class InstagramScraper:
    """Scrapes Instagram profiles to extract bio links."""

    def __init__(self, username: Optional[str] = None, password: Optional[str] = None):
        """
        Initialize Instagram scraper.

        Args:
            username: Instagram username for login (optional)
            password: Instagram password for login (optional)
        """
        self.loader = instaloader.Instaloader()
        self.username = username
        self.password = password
        self.logged_in = False

    def login(self) -> bool:
        """
        Login to Instagram if credentials are provided.

        Returns:
            True if login successful or not needed, False otherwise
        """
        if not self.username or not self.password:
            logger.info("No credentials provided, using public access")
            return True

        try:
            self.loader.login(self.username, self.password)
            self.logged_in = True
            logger.info("Successfully logged in to Instagram")
            return True
        except Exception as e:
            logger.error(f"Failed to login to Instagram: {e}")
            return False

    @staticmethod
    def _normalize_handle(instagram_handle) -> str:
        """
        Clean up a user-supplied handle.

        Args:
            instagram_handle: Handle as typed, possibly with whitespace or '@'

        Returns:
            The handle without surrounding whitespace and leading '@', or ''
            if the input is not a string
        """
        if not isinstance(instagram_handle, str):
            return ''
        return instagram_handle.strip().lstrip('@').strip()

    def get_profile_info(
        self,
        instagram_handle: str,
        *,
        resolve_links: bool = True,
    ) -> Optional[Dict[str, object]]:
        """
        Get profile information from Instagram.

        Args:
            instagram_handle: Instagram username (a leading '@' and
                surrounding whitespace are ignored)
            resolve_links: When True (default) and the bio link is a known
                link tree service, fetch that page and collect its links.
                Pass False to skip that extra HTTP request.

        Returns:
            Dictionary with profile info including bio link, or None if failed.
            'resolved_website_links' and 'resolved_social_links' are lists
            (empty when nothing was resolved).
        """
        handle = self._normalize_handle(instagram_handle)
        if not handle:
            # Nothing to look up; avoid a pointless network round trip.
            logger.error(f"Error fetching profile {instagram_handle}: empty or invalid handle")
            return None

        try:
            # Load profile
            profile = instaloader.Profile.from_username(self.loader.context, handle)

            external_url = profile.external_url
            resolved_links = {'social_links': [], 'website_links': []}

            if resolve_links and external_url:
                resolver = LinkTreeResolver()
                if resolver.is_link_tree(external_url):
                    logger.info(f"Detected link tree: {external_url}, resolving...")
                    resolved_links = resolver.resolve_url(external_url)
                    logger.info(f"Resolved {len(resolved_links['website_links'])} website links and {len(resolved_links['social_links'])} social links")

            info = {
                'username': profile.username,
                'full_name': profile.full_name,
                'bio': profile.biography,
                'external_url': external_url,
                'resolved_website_links': resolved_links['website_links'],
                'resolved_social_links': resolved_links['social_links'],
                'followers': profile.followers,
                'following': profile.followees,
                'is_business': profile.is_business_account,
                'is_verified': profile.is_verified,
            }

            logger.info(f"Successfully scraped profile: {handle}")
            logger.info(f"External URL: {info['external_url']}")

            return info

        except instaloader.exceptions.ProfileNotExistsException:
            logger.error(f"Profile not found: {instagram_handle}")
            return None
        except instaloader.exceptions.ConnectionException as e:
            logger.error(f"Connection error while fetching profile {instagram_handle}: {e}")
            return None
        except Exception as e:
            # Best-effort scrape: any other failure is logged, not raised.
            logger.error(f"Error fetching profile {instagram_handle}: {e}")
            return None

    def get_bio_link(self, instagram_handle: str) -> Optional[str]:
        """
        Get the bio link from an Instagram profile.

        Only the profile itself is fetched; the link tree page (if any) is
        not downloaded because its contents are not part of the result.

        Args:
            instagram_handle: Instagram username (without @)

        Returns:
            Bio link URL or None if not found
        """
        profile_info = self.get_profile_info(instagram_handle, resolve_links=False)
        if profile_info:
            return profile_info.get('external_url')
        return None


In [None]:

# Usage example

# Build the scraper and attempt a login.
# Public profiles can usually be read anonymously, but logging in tends to be
# more stable. Credentials can be set in the config variables above or passed
# to the constructor here.
scraper = InstagramScraper()
scraper.login()

# Profile to look up -- swap in any real handle.
handle = "linktr.ee"
print(f"Scraping profile: {handle}")
info = scraper.get_profile_info(handle)

if not info:
    print("Failed to scrape profile.")
else:
    print("Found profile info:")
    print(f"Username: {info['username']}")
    print(f"Bio Link: {info['external_url']}")
    # The resolved link lists are only shown when they are non-empty.
    for label, key in (
        ("Resolved Website Links:", 'resolved_website_links'),
        ("Resolved Social Links:", 'resolved_social_links'),
    ):
        links = info.get(key)
        if links:
            print(label, links)
