In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from IPython.display import display
import pandas as pd
import time, json, random, re, datetime
import configparser as cp 

class InstagramScraper:
    """Selenium-driven scraper for Instagram profiles, posts and followers.

    Credentials are read from the ``[core]`` section of ``config.cfg`` in the
    working directory. Results are written as CSV files named after the
    current date into ``account_metadata/``, ``account_posts/`` and
    ``account_followers/`` (the directories are not created here).
    """

    def __init__(self):
        """Load credentials, start a Chrome session and record today's date."""
        self.config = cp.ConfigParser()
        self.config.read('config.cfg')

        # Dummy account credentials
        self.username = self.config.get('core', 'username')
        self.password = self.config.get('core', 'password')

        # Change to whatever driver is preferred
        self.driver = webdriver.Chrome()

        self.today = datetime.datetime.now().strftime("%Y-%m-%d")

    @staticmethod
    def _scroll_count(total, initial, per_scroll, max_scrolls=None):
        """Return how many scroll passes to perform.

        The estimate is ``ceil((total - initial) / per_scroll)`` and never less
        than 1, so the items already on screen are always harvested. When
        *max_scrolls* is given, the result is capped at that value.
        """
        remaining = max(total - initial, 0)
        # Integer ceiling division; avoids float rounding and extra imports.
        estimate = max(1, -(-remaining // per_scroll))
        if max_scrolls is None:
            return estimate
        return max(0, min(estimate, max_scrolls))

    def _is_not_found(self):
        """Return True when the currently loaded page is a not-found page."""
        # The page title is checked in addition to the original 'dialog-404'
        # marker, because the marker alone was observed to be absent on a
        # page titled 'Page Not Found'.
        # NOTE(review): the title text is presumably locale dependent - confirm.
        return ('Page Not Found' in self.driver.title
                or 'dialog-404' in self.driver.page_source)

    def parse_json(self, page):
        """Extract and parse the ``window._sharedData`` JSON from page HTML.

        Args:
            page: HTML source of an Instagram page.

        Returns:
            The decoded JSON payload as a dictionary.

        Raises:
            ValueError: if no ``window._sharedData`` script block is present
                (``json.JSONDecodeError`` from a malformed payload is a
                ``ValueError`` subclass as well).
        """
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(page, 'html.parser')

        # Grab interesting JSON only
        script = soup.find('script', string=re.compile(r"^window\._sharedData"))
        if script is None:
            raise ValueError("window._sharedData script block not found in page")

        payload = script.string.replace('window._sharedData = ', '', 1).strip()
        # The assignment is a JavaScript statement; drop its terminator if any.
        if payload.endswith(';'):
            payload = payload[:-1]

        return json.loads(payload)

    def remove_emojis(self, string):
        """Drop every character that latin-1 cannot represent (emoji etc.).

        The bytes are decoded with the same codec used to encode them;
        decoding latin-1 bytes as UTF-8 fails on accented letters such as 'é'.
        """
        return string.encode('latin-1', 'ignore').decode('latin-1')

    def do_login(self):
        """Submit the stored credentials on the Instagram login page.

        Returns:
            True, unconditionally; a failed login is not detected here.
        """
        # Load page
        self.driver.get("https://www.instagram.com/accounts/login/")

        # Wait a bit for the page to load
        time.sleep(2)

        # Input the credentials
        self.driver.find_element_by_xpath("//input[@name='username']").send_keys(self.username)
        self.driver.find_element_by_xpath("//input[@name='password']").send_keys(self.password)

        # Send
        self.driver.find_element_by_xpath("//button[@type='submit']").click()

        # Wait a bit for the page to load after logging in
        time.sleep(2)

        return True

    # Scrape account metadata, image lists and followers
    def scrape_accounts(self, accounts=None, do_profiles=True, do_posts=False, do_followers=False):
        """Visit each account and collect the requested data into CSV files.

        Args:
            accounts: iterable of account names; ``None`` means no accounts.
            do_profiles: display and write the profile metadata table.
            do_posts: also collect post links of public accounts.
            do_followers: also collect follower links of public accounts.
        """
        # Prepare empty results lists
        profiles = []
        posts = []
        followers = []

        # Cycle through target list and get metadata
        for one in accounts or []:
            self.driver.get('https://www.instagram.com/{0}/'.format(one))

            # Skip accounts that do not exist
            if self._is_not_found():
                continue

            # wait a bit while the page loads
            time.sleep(2)

            # Get the profile metadata
            metadata = self.get_account_metadata(self.driver.page_source)
            profiles.append(metadata)

            # Posts and followers are only reachable on public accounts
            if not metadata['is_private']:
                if do_posts:
                    posts += self.get_posts(metadata)
                if do_followers:
                    followers.append(self.get_followers(metadata))

            # Randomized delay between requests
            time.sleep(random.randint(1, 3))

        stamp = self.today.replace('-', '')

        # Write the results to .csv files. The decision depends only on the
        # flags and on what was collected, never on the last visited account.
        if do_profiles:
            profiles_df = pd.DataFrame(profiles)
            display(profiles_df)
            profiles_df.to_csv('account_metadata/{0}.csv'.format(stamp), index=False)
        if do_posts and posts:
            posts_df = pd.DataFrame(posts)
            posts_df.drop_duplicates(inplace=True)
            display(posts_df)
            posts_df.to_csv('account_posts/{0}.csv'.format(stamp), index=False)
        if do_followers and followers:
            pd.DataFrame(followers).to_csv('account_followers/{0}.csv'.format(stamp), index=False)

    # Parse account metadata from html/json
    def get_account_metadata(self, page):
        """Build a flat metadata record from a profile page's HTML.

        Args:
            page: HTML source of a profile page.

        Returns:
            dict with the profile fields, stamped with today's date.
        """
        # Parse the page JSON into a dictionary
        parsed = self.parse_json(page)
        user = parsed['entry_data']['ProfilePage'][0]['graphql']['user']

        metadata = {
            'date'      : self.today,
            'username'  : user['username'],
            # `or ''` guards against a null text field.
            # NOTE(review): whether these can actually be null is unconfirmed.
            'full_name' : self.remove_emojis(user['full_name'] or ''),
            'is_private': 1 if user['is_private'] else 0,
            'is_business_account': 1 if user['is_business_account'] else 0,
            'external_url': user['external_url'],
            'biography' : self.remove_emojis((user['biography'] or '').replace('\n', ' ')),
            'business_category_name': user['business_category_name'],
            'business_email': user['business_email'],
            'following' : user['edge_follow']['count'],
            'followers' : user['edge_followed_by']['count'],
            'posts'     : user['edge_owner_to_timeline_media']['count']
        }

        return metadata

    # Load all images through infinite scroll and grab each image's link and basic information
    def get_posts(self, metadata, max_scrolls=2):
        """Scroll the currently loaded profile page and collect post links.

        Args:
            metadata: record from ``get_account_metadata``; ``'posts'`` and
                ``'username'`` are read.
            max_scrolls: upper bound on scroll passes (default 2, the value
                that was previously hard-coded); ``None`` scrolls as often as
                the post count suggests.

        Returns:
            list of dicts with ``date``, ``username`` and ``post_url``, one per
            distinct URL in order of first appearance.
        """
        # Since Instagram hides posts from the DOM tree as they disappear from view during scroll,
        # we have to iterate over screens: roughly 12 initial images, then about 10 per scroll.
        scrolls = self._scroll_count(metadata['posts'], 12, 10, max_scrolls)

        # Keyed by URL: consecutive screens overlap, so links repeat.
        found = {}
        for _ in range(scrolls):
            # Scroll to the bottom of the page
            self.driver.find_element_by_tag_name('body').send_keys(Keys.END)

            # Find all posts visible in the DOM tree
            for link in self.driver.find_elements_by_xpath("//a[starts-with(@href,'/p/')]"):
                href = link.get_attribute('href')
                if href not in found:
                    found[href] = {
                        'date'    : self.today,
                        'username': metadata['username'],
                        'post_url': href
                    }

            # Randomized delay between requests
            time.sleep(random.randint(1, 3))

        return list(found.values())

    def get_post_metadata(self, posts):
        """Visit each post, then display and save the collected metadata.

        Args:
            posts: iterable of dicts providing ``'username'`` and
                ``'post_url'``.

        The table is written to ``account_posts/<YYYYMMDD>.csv``, the same
        path ``scrape_accounts`` uses for its post list.
        """
        user_posts = []

        for one in posts:
            self.driver.get(one['post_url'])

            # Parse the page JSON into a dictionary
            parsed = self.parse_json(self.driver.page_source)
            media = parsed['entry_data']['PostPage'][0]['graphql']['shortcode_media']

            # A post without a caption has no caption edges, and a post
            # without a location has nothing to take a name from.
            # NOTE(review): the exact payload shape in those cases is assumed - confirm.
            caption_edges = media['edge_media_to_caption']['edges']
            caption = caption_edges[0]['node']['text'] if caption_edges else ''
            location = media.get('location') or {}

            metadata = {
                'date'      : self.today,
                'username'  : one['username'],
                'post_url'  : one['post_url'],
                'auto_generated_desc': media.get('accessibility_caption'),
                'is_video'  : 1 if media['is_video'] else 0,
                'caption'   : self.remove_emojis(caption),
                'likes'     : media['edge_media_preview_like']['count'],
                'comments'  : media['edge_media_to_comment']['count'],
                'location'  : location.get('name')
                # can add tagged users here, if needed
            }

            user_posts.append(metadata)

            # Randomized delay between requests
            time.sleep(random.randint(2, 3))

        df = pd.DataFrame(user_posts)
        display(df)
        df.to_csv('account_posts/{0}.csv'.format(self.today.replace('-', '')), index=False)

    # Load all followers through infinite scroll and grab their usernames
    def get_followers(self, metadata, max_scrolls=2):
        """Open the followers dialog of the loaded profile and collect links.

        Args:
            metadata: record from ``get_account_metadata``; ``'followers'``
                and ``'username'`` are read.
            max_scrolls: upper bound on scroll passes (default 2, the value
                that was previously hard-coded); ``None`` scrolls as often as
                the follower count suggests.

        Returns:
            dict with ``'username'`` and ``'followers'`` (distinct link URLs
            found in the dialog, in order of first appearance).
        """
        # Find the Followers link and click it
        self.driver.find_element_by_partial_link_text("follower").click()

        # Wait a bit for the modal to fully load
        time.sleep(3)

        # The first scroll only happens around this line, not at the bottom of the modal.
        # Reading the property is what scrolls the element into view.
        scroll = self.driver.find_element_by_xpath("//*[contains(text(), 'Suggestions For You')]")
        scroll.location_once_scrolled_into_view

        # Roughly 12 followers initially, then 12 more per scroll
        scrolls = self._scroll_count(metadata['followers'], 12, 12, max_scrolls)

        # Look inside the modal window only
        modal = self.driver.find_element_by_xpath("//div[@role='dialog']")
        for _ in range(scrolls):
            # Then just find the last Follow button.
            # NOTE(review): an XPath starting with '//' is evaluated against
            # the whole document even when called on `modal`; './/' would
            # restrict it to the dialog. Left unchanged pending a check that
            # the dialog itself contains such elements.
            last_element = modal.find_elements_by_xpath("//div[@role='button']")[-1]
            last_element.location_once_scrolled_into_view

            # Delay between requests
            time.sleep(1)

        # Get all the links
        hrefs = [link.get_attribute('href') for link in modal.find_elements_by_tag_name('a')]

        # Remove duplicates, as every item has two '<a>' tags (redirect link and follow button)
        followers = list(dict.fromkeys(hrefs))

        return {
            'username': metadata['username'],
            'followers': followers
        }

    # Placeholder for retrieving comments
    def get_comments(self, post):
        """Not implemented yet; returns None."""
        pass

In [3]:
# Start a browser session, log in and scrape one public account, including
# its post links. do_login() returns True unconditionally, so this guard does
# not detect a failed login.
s = InstagramScraper()
if s.do_login():
    s.scrape_accounts(['jorgeofficial.ro'], do_posts = True) # 'deepikapiku', 'streetmagazines'

Unnamed: 0,biography,business_category_name,business_email,date,external_url,followers,following,full_name,is_business_account,is_private,posts,username
0,SINGER TV Host Booking: +40757162275 viberec...,Creators & Celebrities,concertjorge@gmail.com,2018-12-14,https://youtu.be/ZWM0b6JUmeg,176900,24,JORGE,1,0,315,jorgeofficial.ro


Unnamed: 0,date,post_url,username
0,2018-12-14,https://www.instagram.com/p/BrXNfM0FgFK/,jorgeofficial.ro
1,2018-12-14,https://www.instagram.com/p/BrU5JZmFlZ5/,jorgeofficial.ro
2,2018-12-14,https://www.instagram.com/p/BrUmeluln8L/,jorgeofficial.ro
3,2018-12-14,https://www.instagram.com/p/BrSqtuGF3ah/,jorgeofficial.ro
4,2018-12-14,https://www.instagram.com/p/BrSbBLVFLtk/,jorgeofficial.ro
5,2018-12-14,https://www.instagram.com/p/BrR-NEJF3ej/,jorgeofficial.ro
6,2018-12-14,https://www.instagram.com/p/BrQPRRYlhSL/,jorgeofficial.ro
7,2018-12-14,https://www.instagram.com/p/BrPOyLrFYUG/,jorgeofficial.ro
8,2018-12-14,https://www.instagram.com/p/BrNpcGolH0D/,jorgeofficial.ro
9,2018-12-14,https://www.instagram.com/p/BrMzf4OFHw7/,jorgeofficial.ro


In [4]:
# Hand-picked sample input for get_post_metadata, which reads 'username' and
# 'post_url' from each entry.
posts = [{'username': 'jorgeofficial.ro', 'post_url': 'https://www.instagram.com/p/BrU5JZmFlZ5/'}, {'username': 'jorgeofficial.ro', 'post_url': 'https://www.instagram.com/p/BrUmeluln8L/'}]

In [5]:
# Visit each sample post, display the collected table and write it to
# account_posts/<YYYYMMDD>.csv.
s.get_post_metadata(posts)

Unnamed: 0,auto_generated_desc,caption,comments,date,is_video,likes,location,post_url,username
0,"Image may contain: 2 people, people sitting an...",Nu te speria! Asa arata poza de la Vlogul nou ...,14,2018-12-14,0,2083,"Bucharest, Romania",https://www.instagram.com/p/BrU5JZmFlZ5/,jorgeofficial.ro
1,"Image may contain: 1 person, standing",Luna cadourilor....cred ca cea mai frumoasa lu...,16,2018-12-14,0,1383,"Bucharest, Romania",https://www.instagram.com/p/BrUmeluln8L/,jorgeofficial.ro


In [None]:
# TODO:
# 1. Read the list of account handles from a text file
# 2. Make the scraper runnable from the command line, with arguments
# 3. Call the post metadata function from the main scrape routine
# 4. Comments scraper
# 5. Topic models
# 6. Sentiment analysis
# 7. Segmentation (approach still to be decided)

In [171]:
# Open a standalone browser session (separate from the scraper's own driver)
# and load a made-up handle to see what a missing profile looks like; the
# recorded title below is the not-found page's.
driver = webdriver.Chrome()
driver.get("https://www.instagram.com/qweazxfrq/")
#driver.get("https://www.instagram.com/p/BQuARCSgV4e/")
driver.title

'Page Not Found • Instagram'

In [173]:
# Probe whether the 'dialog-404' marker occurs in the loaded page source.
# The recorded output is the else-branch value, i.e. the marker was absent.
'fuck' if 'dialog-404' in driver.page_source else 'coool'

'coool'

In [52]:
# Send the END key to the page body; get_posts uses the same call to scroll.
driver.find_element_by_tag_name('body').send_keys(Keys.END)

In [23]:
# Collect every anchor whose href starts with '/p/' (the same XPath get_posts uses).
x = driver.find_elements_by_xpath("//a[starts-with(@href,'/p/')]")

In [None]:
# placeholder for random executions

In [25]:
# Inspect the first match; the recorded output is an absolute post URL.
x[0].get_attribute('href')

'https://www.instagram.com/p/BrSNYA0BVQv/'

In [50]:
# Parse the currently loaded page. No parser is named here, so the choice is
# left to bs4.
soup = BeautifulSoup(driver.page_source)

In [51]:
# Take the first script whose text starts with "window" followed by any one
# character (a looser pattern than parse_json's "^window._sharedData"), strip
# the assignment prefix and drop the final character.
json_main = soup.find('script', string = re.compile("^window.") ).string.replace('window._sharedData = ', '')[:-1]

In [52]:
# Decode the extracted payload; the commented lines below are earlier
# inspection snippets kept for reference.
parsed = json.loads(json_main)
# print(json.dumps(parsed, indent=4, sort_keys=True))

#metadata = data['entry_data']['ProfilePage'][0]['graphql']

# data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']

In [19]:
# Lookup paths for the profile fields that get_account_metadata reads; only
# the last (uncommented) expression is evaluated in this cell.
# parsed['entry_data']['ProfilePage'][0]['graphql']['user']

# parsed['entry_data']['ProfilePage'][0]['graphql']['user']['biography']
# parsed['entry_data']['ProfilePage'][0]['graphql']['user']['business_category_name']
# parsed['entry_data']['ProfilePage'][0]['graphql']['user']['business_email']
# parsed['entry_data']['ProfilePage'][0]['graphql']['user']['edge_follow']['count']
# parsed['entry_data']['ProfilePage'][0]['graphql']['user']['edge_followed_by']['count']
# parsed['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['count']

parsed['entry_data']['ProfilePage'][0]['graphql']['user']['full_name']

'STREET MAGAZINE'

In [89]:
# Click the link whose text contains "follower" (same step as get_followers).
driver.find_element_by_partial_link_text("follower").click()

In [55]:
# Locate the 'Suggestions For You' text node and read the property that
# scrolls it into view; the recorded output is the coordinate dict it returns.
inf_load = driver.find_element_by_xpath("//*[contains(text(), 'Suggestions For You')]")
inf_load.location_once_scrolled_into_view

{'x': 404, 'y': 496}

In [90]:
# Take the last div with role='button' in the whole document and scroll it
# into view by reading the property.
last_element = driver.find_elements_by_xpath("//div[@role='button']")[-1]
last_element.location_once_scrolled_into_view

{'x': 375, 'y': 624}

In [91]:
# Restrict the search to the dialog element and gather every anchor inside it.
modal = driver.find_element_by_xpath("//div[@role='dialog']")
links = modal.find_elements_by_tag_name('a')

In [135]:
# Sanity check: extending a list in place appends the other list's items.
a = ['q', '2', '3']
b = ['m', 'n']
a.extend(b)

In [136]:
a

['q', '2', '3', 'm', 'n']

In [140]:
# Same shape as the record get_followers returns: one scalar, one list.
c = dict(username='x', followers=a)

In [141]:
c

{'username': 'x', 'followers': ['q', '2', '3', 'm', 'n']}

In [142]:
# The scalar 'username' is repeated for every item of the 'followers' list,
# giving one row per follower (see the recorded output).
pd.DataFrame(c)

Unnamed: 0,username,followers
0,x,q
1,x,2
2,x,3
3,x,m
4,x,n


In [92]:
x = [{'date': '2018-12-13', 'username': 'deepikapiku', 'full_name': '#DeepVeerWale 💏❤💑', 'is_private': 0, 'is_business_account': 0, 'external_url': None, 'biography': 'Deepika Padukone Is Perfect.JUST PERFECT❤ Met Deepika On 11th August, 2017💃🏻 Liked By Shahid on 11th Sept 2017💕 @deepikapadukone ❤', 'business_category_name': None, 'business_email': None, 'following': 548, 'followers': 502863, 'posts': 5362}]

In [93]:
x

[{'date': '2018-12-13',
  'username': 'deepikapiku',
  'full_name': '#DeepVeerWale 💏❤💑',
  'is_private': 0,
  'is_business_account': 0,
  'external_url': None,
  'biography': 'Deepika Padukone Is Perfect.JUST PERFECT❤ Met Deepika On 11th August, 2017💃🏻 Liked By Shahid on 11th Sept 2017💕 @deepikapadukone ❤',
  'business_category_name': None,
  'business_email': None,
  'following': 548,
  'followers': 502863,
  'posts': 5362}]

In [94]:
# Sample text containing emoji, used by the stripping experiments below.
q = x[0]['biography']

In [130]:
# Strip every character latin-1 cannot represent (emoji etc.). Decode with the
# same codec used to encode: reading the latin-1 bytes back as UTF-8 raises
# UnicodeDecodeError as soon as the text holds an accented letter such as 'é'.
q.encode('latin-1', 'ignore').decode('latin-1')

'Deepika Padukone Is Perfect.JUST PERFECT Met Deepika On 11th August, 2017 Liked By Shahid on 11th Sept 2017 @deepikapadukone '

In [97]:
# Alternative approach: remove only characters outside the Basic Multilingual
# Plane. As the recorded output shows, BMP symbols such as the heart survive.
non_bmp_re = re.compile(u"[^\U00000000-\U0000d7ff\U0000e000-\U0000ffff]", flags=re.UNICODE)
non_bmp_re.sub(u'', q)

'Deepika Padukone Is Perfect.JUST PERFECT❤ Met Deepika On 11th August, 2017 Liked By Shahid on 11th Sept 2017 @deepikapadukone ❤'

In [124]:
# Write the emoji-bearing record to x.csv without the index column, to check
# that it survives a CSV round trip (read back further below).
pd.DataFrame(x).to_csv('x.csv', index=False)

In [114]:
# Sample profile record with an empty biography and several None fields.
xx = [{'date': '2018-12-13', 'username': 'hamza.isah.7777', 'full_name': 'Hamza don', 'is_private': 0, 'is_business_account': 0, 'external_url': None, 'biography': '', 'business_category_name': None, 'business_email': None, 'following': 1007, 'followers': 440, 'posts': 58}]

In [115]:
# Check how the empty string and None values are rendered in a DataFrame.
pd.DataFrame(xx)

Unnamed: 0,biography,business_category_name,business_email,date,external_url,followers,following,full_name,is_business_account,is_private,posts,username
0,,,,2018-12-13,,440,1007,Hamza don,0,0,58,hamza.isah.7777


In [125]:
# Read the file written earlier back in; the recorded output shows the emoji intact.
xr = pd.read_csv('x.csv')

In [126]:
xr

Unnamed: 0,biography,business_category_name,business_email,date,external_url,followers,following,full_name,is_business_account,is_private,posts,username
0,Deepika Padukone Is Perfect.JUST PERFECT❤ Met ...,,,2018-12-13,,502863,548,#DeepVeerWale 💏❤💑,0,0,5362,deepikapiku
