In [1]:
import json
import logging
import os
from datetime import datetime
from random import choice

import IPython.display
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy import exc
from sqlalchemy.dialects.postgresql import insert


In [2]:
# Fallback pool of User-Agent strings, used when a scraper has no list of its own.
USER_AGENTS = [
    (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.181 Safari/537.36'
    ),
]

In [3]:
class InstagramScraper:
    """Scrape the JSON that an Instagram page embeds in its HTML.

    The page is fetched with a randomly chosen User-Agent, the
    ``window._sharedData`` script is parsed, and hashtag posts can be stored
    in the ``raws`` table of the database named by ``SHARED_DB_URI``.

    Args:
        url (str): Page to scrape (profile page or hashtag explore page).
        user_agents (list, optional): User-Agent strings to pick from; the
            module-level ``USER_AGENTS`` is used when this is not a non-empty list.
        **kwargs: ``keyword_id`` and ``brand_id`` are stored with every
            persisted row; ``timeout`` (seconds, default 10) bounds the HTTP request.

    Raises:
        KeyError: If the ``SHARED_DB_URI`` environment variable is not set.
    """

    # NOTE(review): presumably the id of the Instagram row in a platforms table — confirm.
    PLATFORM_ID = 2

    _log = logging.getLogger(__name__)

    def __init__(self, url, user_agents=None, **kwargs):
        self.url = url
        self.user_agents = user_agents
        self.database = os.environ['SHARED_DB_URI']
        self.keyword_id = kwargs.get("keyword_id")
        self.brand_id = kwargs.get("brand_id")
        # Without a timeout, requests can block forever on a stalled connection.
        self.timeout = kwargs.get("timeout", 10)

    def __random_agent(self):
        """Return a random User-Agent from the instance list, else from ``USER_AGENTS``."""
        if self.user_agents and isinstance(self.user_agents, list):
            return choice(self.user_agents)
        return choice(USER_AGENTS)

    def __request_url(self):
        """Fetch ``self.url`` and return the response body as text.

        Raises:
            requests.HTTPError: If ``raise_for_status()`` reports an error status.
            requests.RequestException: For any other request failure; the
                original exception propagates unchanged so its details are kept.
        """
        try:
            response = requests.get(
                self.url,
                headers={'User-Agent': self.__random_agent()},
                timeout=self.timeout,
            )
            response.raise_for_status()
        except requests.HTTPError as err:
            # Same message as before, but chained to the real error and its response.
            raise requests.HTTPError(
                'Received non-200 status code.', response=err.response
            ) from err
        return response.text

    @staticmethod
    def _parse_shared_data(raw_string):
        """Parse the text of a ``window._sharedData = {...};`` script into Python data.

        Only the leading assignment and the trailing semicolon are removed, so
        semicolons inside JSON string values are left intact.
        """
        text = raw_string.strip()
        prefix = 'window._sharedData'
        if text.startswith(prefix):
            text = text[len(prefix):].lstrip()
            if text.startswith('='):
                text = text[1:]
        text = text.strip().rstrip(';')
        return json.loads(text)

    @staticmethod
    def extract_json(html):
        """Return the parsed ``window._sharedData`` payload found in ``html``.

        The first ``<script>`` whose text starts with ``window._sharedData`` is
        used; if none matches, the first ``<script>`` in the body is tried.

        Raises:
            ValueError: If the page has no ``<script>`` tag, or the script text
                is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        """
        soup = BeautifulSoup(html, 'html.parser')
        body = soup.find('body')
        scope = body if body is not None else soup
        scripts = scope.find_all('script')
        if not scripts:
            raise ValueError('No <script> tag found in the page.')
        marker = 'window._sharedData'
        script_tag = next(
            (tag for tag in scripts if tag.text.strip().startswith(marker)),
            scripts[0],
        )
        return InstagramScraper._parse_shared_data(script_tag.text)

    def raw_response(self):
        """Fetch the page and return its whole embedded JSON payload."""
        return self.extract_json(self.__request_url())

    def page_metrics(self):
        """Return ``{field: count}`` for the count-style fields of a profile page.

        Every dict-valued field of the profile's ``user`` object that has a
        ``count`` key is included, except ``edge_owner_to_timeline_media``.
        """
        json_data = self.raw_response()
        metrics = json_data['entry_data']['ProfilePage'][0]['graphql']['user']
        results = {}
        for key, value in metrics.items():
            if key == 'edge_owner_to_timeline_media':
                continue
            # Skip dicts without a count instead of aborting the whole call.
            if value and isinstance(value, dict) and 'count' in value:
                results[key] = value['count']
        return results

    def hashtag_posts(self):
        """Return the post nodes of a hashtag page, storing each one in the database.

        Each node is passed to ``populate_table`` with its ``id`` as the API id.
        """
        # Read from the freshly fetched payload, not from a notebook global.
        json_data = self.raw_response()
        hashtag = json_data['entry_data']['TagPage'][0]['graphql']['hashtag']
        edges = hashtag['edge_hashtag_to_media']['edges']
        results = []
        for edge in edges:
            node = edge.get('node')
            if node and isinstance(node, dict):
                results.append(node)
                self.populate_table(node, node['id'])
        return results

    def populate_table(self, raw_data, api_id):
        """Upsert one scraped Instagram post into the ``raws`` table.

        The row is inserted, or updated in place when it conflicts with the
        ``api_id`` constraint. A database error during the write is rolled
        back, printed and logged rather than raised; the success message is
        printed only when the write commits.

        Args:
            raw_data (json): Raw post data, stored for further usage.
            api_id: Identifier of the post on the platform.
        """
        engine = create_engine(self.database)
        try:
            # autoload_with alone triggers reflection; the separate autoload
            # flag is deprecated in newer SQLAlchemy releases.
            raws = Table('raws', MetaData(), autoload_with=engine)

            # One timestamp shared by the insert and the update branch.
            row = dict(
                brand_id=self.brand_id,
                keyword_id=self.keyword_id,
                platform_id=self.PLATFORM_ID,
                api_id=api_id,
                raw_data=raw_data,
                created_at=datetime.now(),
            )
            upsert = insert(raws).values(**row).on_conflict_do_update(
                constraint='api_id',
                set_=row,
            )

            try:
                # engine.begin() commits on success, rolls back on error and
                # always releases the connection.
                with engine.begin() as conn:
                    conn.execute(upsert)
            except exc.SQLAlchemyError as err:
                print(err)
                self._log.error(err)
                return
        finally:
            engine.dispose()
        print("Tweet colleted")
        return

In [4]:
# Build the URL of the hashtag explore page to scrape.
hashtag = 'minecraft'
url = f'https://www.instagram.com/explore/tags/{hashtag}/'

# Create the scraper, fetch the page's embedded JSON and render it.
instagram = InstagramScraper(url, keyword_id=1, brand_id=1)
posts = instagram.raw_response()
IPython.display.JSON(posts)

<IPython.core.display.JSON object>

In [5]:
# Drill down from the raw page JSON to the list of media edges for the hashtag.
infos = (
    posts['entry_data']['TagPage'][0]
    ['graphql']['hashtag']
    ['edge_hashtag_to_media']['edges']
)
IPython.display.JSON(infos)

<IPython.core.display.JSON object>

In [6]:
# Scrape the hashtag page again; hashtag_posts() also writes every returned
# post to the database via populate_table().
# NOTE: this rebinds `posts` from the raw page JSON to a list of post nodes.
posts = instagram.hashtag_posts()

Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet colleted
Tweet coll

In [7]:
# Render the list of post nodes returned by hashtag_posts().
IPython.display.JSON(posts)

<IPython.core.display.JSON object>