In [1]:
from random import choice
from datetime import datetime
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fallback pool of User-Agent header values: InstagramScraper picks one at
# random from here whenever it was not given a usable list of its own.
USER_AGENTS = [
    (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.181 Safari/537.36'
    ),
]

In [2]:
class InstagramScraper:
    """Fetch a public Instagram profile page and read metrics from its embedded JSON.

    The page is expected to contain a script of the form
    ``window._sharedData = {...};``. The user payload is read from
    ``entry_data -> ProfilePage[0] -> graphql -> user`` of that object.

    Args:
        url: Full URL of the profile page to request.
        user_agents: Optional list of User-Agent strings to choose from at
            random. When it is missing, empty, or not a list, the
            module-level ``USER_AGENTS`` pool is used instead.
        timeout: Seconds to wait for the HTTP request before giving up.
    """

    # Assignment prefix that precedes the JSON object in the page script.
    _SHARED_DATA_MARKER = 'window._sharedData ='

    def __init__(self, url, user_agents=None, timeout=10):
        self.url = url
        self.user_agents = user_agents
        self.timeout = timeout

    def __random_agent(self):
        """Return one User-Agent string, preferring the caller-supplied list."""
        if self.user_agents and isinstance(self.user_agents, list):
            return choice(self.user_agents)
        return choice(USER_AGENTS)

    def __request_url(self):
        """GET ``self.url`` and return the response body as text.

        Raises:
            requests.HTTPError: ``raise_for_status`` rejected the response.
            requests.RequestException: Any other request failure; the
                original exception propagates unchanged.
        """
        try:
            response = requests.get(
                self.url,
                headers={'User-Agent': self.__random_agent()},
                # Without a timeout a stalled connection blocks forever.
                timeout=self.timeout)
            response.raise_for_status()
        except requests.HTTPError as exc:
            # Keep the historical message, but carry the response and the
            # original exception along so the real status stays inspectable.
            raise requests.HTTPError(
                'Received non-200 status code.',
                response=exc.response) from exc
        return response.text

    @staticmethod
    def _parse_shared_data(script_text):
        """Decode the JSON object assigned to ``window._sharedData``.

        Only the leading assignment prefix and the statement's trailing
        semicolon(s) are removed. Semicolons inside JSON string values are
        left intact, so text fields are not corrupted.

        Args:
            script_text: Text content of the script tag.

        Returns:
            The decoded JSON value.

        Raises:
            json.JSONDecodeError: The remaining text is not valid JSON.
        """
        payload = script_text.strip()
        marker = InstagramScraper._SHARED_DATA_MARKER
        if payload.startswith(marker):
            payload = payload[len(marker):]
        payload = payload.strip().rstrip(';').rstrip()
        return json.loads(payload)

    @staticmethod
    def extract_json(html):
        """Locate the ``window._sharedData`` script in ``html`` and decode it.

        Args:
            html: Page markup as a string.

        Returns:
            The decoded shared-data object.

        Raises:
            ValueError: No script starting with the shared-data assignment
                exists, or its payload is not valid JSON.
        """
        soup = BeautifulSoup(html, 'html.parser')
        marker = InstagramScraper._SHARED_DATA_MARKER
        # Search every script rather than trusting the first one in <body>,
        # so an unrelated leading script does not break extraction.
        for script_tag in soup.find_all('script'):
            script_text = script_tag.text
            if script_text.strip().startswith(marker):
                return InstagramScraper._parse_shared_data(script_text)
        raise ValueError('No window._sharedData script found in the page.')

    def __profile_user(self):
        """Download the profile page and return its ``graphql.user`` mapping."""
        html = self.__request_url()
        shared_data = self.extract_json(html)
        return shared_data['entry_data']['ProfilePage'][0]['graphql']['user']

    def page_metrics(self):
        """Return the profile-level counters as ``{key: count}``.

        Every entry of the user payload whose value is a dict with a
        ``'count'`` key is included, except ``edge_owner_to_timeline_media``.
        Other values (scalars, dicts without ``'count'``) are skipped.

        Raises:
            Whatever the request, JSON extraction, or payload lookup raises.
        """
        results = {}
        for key, value in self.__profile_user().items():
            if key == 'edge_owner_to_timeline_media':
                continue
            # Dicts without a 'count' entry are ignored instead of raising.
            if isinstance(value, dict) and 'count' in value:
                results[key] = value['count']
        return results

    def post_metrics(self):
        """Return the non-empty ``node`` dicts of the profile's timeline edges.

        Nodes are read from ``edge_owner_to_timeline_media -> edges``; edges
        whose ``'node'`` is missing, empty, or not a dict are skipped.

        Raises:
            Whatever the request, JSON extraction, or payload lookup raises.
        """
        edges = self.__profile_user()['edge_owner_to_timeline_media']['edges']
        results = []
        for edge in edges:
            node = edge.get('node')
            if node and isinstance(node, dict):
                results.append(node)
        return results

In [7]:

    # --- Fetch the posts of one public profile --------------------------------
    ig_user = input("Enter a public Instagram handle: ")
    # Tolerate pasted whitespace as well as an optional '@' around the handle.
    url = 'https://www.instagram.com/' + ig_user.strip().strip('@')
    instagram = InstagramScraper(url)
    post_metrics = instagram.post_metrics()

    # --- One record per post ---------------------------------------------------
    # Rows are collected in a list and the frame is built once:
    # DataFrame.append was removed in pandas 2.0, and growing a frame
    # row by row is quadratic.
    post_rows = []
    for post in post_metrics:
        # fromtimestamp() converts to the local timezone of this machine.
        posted_at = datetime.fromtimestamp(post['taken_at_timestamp'])
        likes = post['edge_liked_by']['count']
        comments = post['edge_media_to_comment']['count']
        post_rows.append({
            'Day': posted_at.strftime('%A'),
            'Time': posted_at.strftime('%I %p'),
            'Likes': likes,
            'Comments': comments,
            'Total Engagements': likes + comments,
            # Decide per post: any type other than a carousel (GraphImage,
            # GraphVideo, ...) is 'No', so no value leaks from the previous
            # iteration and no name is left undefined.
            'Used Slide Feature': 'Yes' if post.get('__typename') == 'GraphSidecar' else 'No',
            'Location': 'No' if post.get('location') is None else 'Yes',
        })
    ig_df = pd.DataFrame(
        post_rows,
        columns=['Day', 'Time', 'Likes', 'Comments', 'Total Engagements',
                 'Used Slide Feature', 'Location'])
    if ig_df.empty:
        # Fail with a readable message instead of a KeyError/ValueError below.
        raise ValueError('No posts found at %s; nothing to analyze.' % url)

    #AVERAGE LIKES/COMMENTS
    avg_likes = ig_df['Likes'].mean()
    avg_comments = ig_df['Comments'].mean()

    # For every grouping below, numeric_only=True keeps the Yes/No and label
    # columns out of the mean (pandas >= 2.0 raises on them otherwise).
    # idxmax()/idxmin() return the first group label holding the extreme.

    #BY DAY
    avg_by_day_df = ig_df.groupby('Day').mean(numeric_only=True)
    best_day = avg_by_day_df['Total Engagements'].idxmax()
    worst_day = avg_by_day_df['Total Engagements'].idxmin()

    #BY TIME
    avg_by_time_df = ig_df.groupby('Time').mean(numeric_only=True)
    best_time = avg_by_time_df['Total Engagements'].idxmax()
    worst_time = avg_by_time_df['Total Engagements'].idxmin()

    #BY CAROUSEL
    avg_by_slide_df = ig_df.groupby('Used Slide Feature').mean(numeric_only=True)
    slide_advice = avg_by_slide_df['Total Engagements'].idxmax()

    #BY LOCATION
    avg_by_location_df = ig_df.groupby('Location').mean(numeric_only=True)
    location_advice = avg_by_location_df['Total Engagements'].idxmax()

    print('Average Number of Likes: %d\nAverage Number of Comments: %d\n'% (avg_likes, avg_comments))
    print('Suggested Instagram Advice:\nBest Day to Post: %s\nWorst Day to Post: %s\nBest Time to Post: %s\nWorst Time to Post: %s\nUse a Slide? %s\nInclude a Location? %s'
          % (best_day, worst_day, best_time, worst_time, slide_advice, location_advice))

    
ig_df

Enter a public Instagram handle: sarahgreen33
Average Number of Likes: 630
Average Number of Comments: 29

Suggested Instagram Advice:
Best Day to Post: Sunday
Worst Day to Post: Saturday
Best Time to Post: 08 PM
Worst Time to Post: 01 AM
Use a Slide? Yes
Include a Location? Yes
