# Image Retrieval From Instagram

**Goal:** collect image data from instagram and then preprocess it, extract information (image files) from a user's Instagram profile

**Constraints:** the user has no way of setting the image size (in KB), the resolution (1080x1080) of the images found on Instagram. The images are extracted from the Instagram page in raw form.  

#### Websites: 

This notebook's code is based on the following tutorials: 

https://medium.com/@srujana.rao2/scraping-instagram-with-python-using-selenium-and-beautiful-soup-8b72c186a058

https://edmundmartin.com/scraping-instagram-with-python/

https://michaeljsanders.com/2017/05/12/scrapin-and-scrollin.html

**Important Note:** *Remember to respect user’s rights when you download copyrighted content. Do not use images/videos from Instagram for commercial intent.*

# Instagram Scraper

### 1. Import dependencies

Install non-standard libraries: requests, BeautifulSoup 

In [1]:
import os
from random import choice
import json
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
from keras.preprocessing.image import load_img, img_to_array
import shutil
# to install
import requests
from bs4 import BeautifulSoup

Using TensorFlow backend.


### 2. Build InstagramScraper class
based on: https://edmundmartin.com/scraping-instagram-with-python/

Switching user agents is often a best practice when web scraping and can help you avoid detection. Should the caller of our class have provided their own list of user agents we take a random agent from the provided list.  Otherwise we will return our default user agent.

Define a class called InstagramScraper: 

In [2]:
# url header for requests.get()
headers={'User-Agent':  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
         ,  'content-type': 'application/json'
         , 'accept-encoding': 'gzip, deflate, br'
         , 'cache-control': 'no-cache'
         , 'accept' : '*/*'
         , 'accept-language' : 'de-DE, de; q=0.9,en-US; q=0.8,en;q=0.7'
         #, 'referer' : url
         , 'connection' : 'keep-alive'
         , 'cookie' : 'ig_cb=1; ig_did=DA66C494-9DFE-48F6-BA63-66F11DF8EC03; csrftoken=ukE8jYSjQxVs1YGPYddEkAXsN6WZ4Qmw; mid=XoChrAALAAG78Upva7Ld0TAzeTtm; rur=ASH; urlgen="{\"2a04:ee41:4:95:91f9:b9d4:8aab:41c\": 15796\054 \"213.55.241.7\": 15796\054 \"2a04:ee41:4:95:60ae:def3:2fd7:3633\": 15796}:1jIpww:PTjjrSzpjC6dWww8-AVOnfdQAFA"'
        }
_user_agents = [
   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
]

In [37]:
class InstagramScraper:

    def __init__(self, user_agents=None, proxy=None):
        self.user_agents = user_agents
        self.proxy = proxy

    def __random_agent(self):
        if self.user_agents and isinstance(self.user_agents, list):
            return choice(self.user_agents)
        return choice(_user_agents)

    def __request_url(self, url):
        """Our second helper method is simply a wrapper around requests. 
        We pass in a URL and try to make a request using the provided user agent and proxy. 
        If we are unable to make the request or Instagram responds with a non-200 status code we simply re-raise the error. 
        If everything goes fine, we return the page in questions HTML."""
        try:
            response = requests.get(url, headers={'User-Agent': self.__random_agent()}, proxies={'http': self.proxy, 'https': self.proxy})
            #response = requests.get(url, headers=headers, proxies={'http': self.proxy, 'https': self.proxy})
            response.raise_for_status()
        except requests.HTTPError:
            raise requests.HTTPError('Received non 200 status code from Instagram')
        except requests.RequestException:
            raise requests.RequestException('Internet connection failed.')
        else:
            return response.text


    @staticmethod
    def extract_json_data(html):
        """Instagram serve’s all the of information regarding a user in the form of JavaScript object. 
        This means that we can extract all of a users profile information and their recent posts by just 
        making a HTML request to their profile page. We simply need to turn this JavaScript object into 
        JSON, which is very easy to do."""
        soup = BeautifulSoup(html, 'html.parser')
        body = soup.find('body')
        script_tag = body.find('script')
        #pprint(script_tag)
        #print('\n')
        #pprint(type(script_tag))
        content = script_tag.contents#.strip().replace('window._sharedData =', '').replace(';', '')
        #######
        content_string = ''.join(content)
        raw_string = content_string.strip().replace('window._sharedData =', '').replace(';', '')
        #######
        #print('\n')
        #pprint(raw_string)
        #print('\n')
        #pprint(type(raw_string))
        return json.loads(raw_string)

    def profile_page_metrics(self, profile_url):
        results = {}
        try:
            response = self.__request_url(profile_url)
            json_data = self.extract_json_data(response)
            metrics = json_data['entry_data']['ProfilePage'][0]['graphql']['user']
        except Exception as e:
            raise e
        else:
            for key, value in metrics.items():
                #print('key:', key, '-value:', value)
                if key != 'edge_owner_to_timeline_media':
                    if value and isinstance(value, dict):
                        value = value['count']
                        results[key] = value
                    elif value:
                        results[key] = value
        return results

    
    def hash_page_metrics(self, profile_url):
        results = {}
        try:
            response = self.__request_url(profile_url)
            json_data = self.extract_json_data(response)
            metrics = json_data['entry_data']['TagPage'][0]['graphql']['hashtag']
         
        except Exception as e:
            raise e
        else:
            for key, value in metrics.items():
                #print('metrics:', metrics)
                if key != 'edge_hashtag_to_media' and key != 'edge_hashtag_to_top_posts' and key != 'profile_pic_url':
                    results[key] = value
                    if value and isinstance(value, dict):
                        try: 
                            value = value['count']            
                            results[key] = value
                        except: 
                            results[key] = value
                        try: 
                            sigma = []
                            for i in range(0,5): 
                                #print(i)
                                value = value['edges'][i]['node']['name']  
                                #print(i)
                            sigma.append(value)
                            print(len(value['edges']['node']))
                            
                            #results[key] = sigma
                        except: 
                            results[key] = value 
                    elif value:
                        results[key] = value
        return results
    
    def profile_page_posts(self, profile_url):
        results = []
        try:
            response = self.__request_url(profile_url)
            #pprint(response)
            json_data = self.extract_json_data(response)
            #pprint(json_data)
            metrics = json_data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']["edges"]
        except Exception as e:
            raise e
        else:
            for node in metrics:
                node = node.get('node')
                #if node and isinstance(node, dict): #this line only gets most recent post out
                results.append(node)
        return results
    
    def hashtag_page_posts(self, hashtag_url):
        results = []
        try:
            response = self.__request_url(hashtag_url)
            json_data = self.extract_json_data(response)
            #pprint(json_data)
            metrics = json_data['entry_data']['TagPage'][0]['graphql']['hashtag']['edge_hashtag_to_media']["edges"]
            #pprint(metrics)
        except Exception as e:
            raise e
        else:
            for node in metrics:
                node = node.get('node')
                #if node and isinstance(node, dict): #this line only gets most recent post out
                results.append(node)
        return results

### 3.1 Load URLS of Brand Names Data

This chapter is only applicable if you have a csv-file with a limited set of brands you want to scrape. If you do not want to limit yourself to a predefined list, you can skip this part. Specify instragram USERNAME profile whose page you want to scrape. Get a dictionary with all information (image, comments, etc.) from that Instagram profile.

In [4]:
# to specify
try: 
    directory= r'C:\Users\Anonym\Documents\GitHub\DLfM_BrandManagement\data\instagram_urls'
    os.chdir(directory)
except: 
    directory = r'C:\Users\lsamsi\Documents\GitHub\DLfM_BrandManagement\data\instagram_urls'
    os.chdir(directory)

Get out all apparel brands. 

In [5]:
#import csv

#data = pd.read_csv("firm_usernames.csv", header=None)

#firm_usernames = data[0].tolist()
#firm_usernames[:5]

In [6]:
#data = pd.read_csv("instagram_hashtags.csv", header=None)

#instagram_hashtags = data[0].tolist()
#instagram_hashtags[:5]

Perform set theory on both datasets. 

In [7]:
#def intersection(lst1, lst2): 
 #   lst3 = [value for value in lst1 if value in lst2] 
  #  return lst3 

In [8]:
#def non_intersection(lst1, lst2): 
 #   lst3 = list(set(lst1) ^ set(lst2))
  #  return lst3 

In [9]:
#def set_difference(lst1, lst2): 
 #   lst3 = list(set(lst1) - set(lst2))
  #  return lst3 

In [10]:
#print('Number of brands firm usernames: ', len(firm_usernames))
#print('Number of brands as instagram hashtags: ', len(instagram_hashtags))
#print('Number of same brands (firm usernames and hashtags): ', len(intersection(firm_usernames, instagram_hashtags)))
#print('Brands that are both firm usernames and hashtags: ', intersection(firm_usernames, instagram_hashtags)[:5], '...')
#print('Brands that are in neither firm usernames nor hashtags: ', non_intersection(firm_usernames, instagram_hashtags)[:5])
#print('Brands that are firm usernames only: ', set_difference(firm_usernames, instagram_hashtags)[:5])
#print('Brands that are hashtagged only: ', set_difference(instagram_hashtags, firm_usernames)[:5])

### 3.2 Specify Instagram page(s)

Specify instragram USERNAME profile whose page you want to scrape. Get a dictionary with all information (image, comments, etc.) from that Instagram profile. 

#### Convert unofficial hashtag to official user-profile name 

For 'cailler' the '#cailler' user-input will get results on Instagram. The official Instagram of cailler might differ, however. 
The official brandname on Instagram is 'cailler-suisse'. Thus, we need a dataframe to get out the corresponding official name given the unofficial name. 

This is the reason why **we can only have brands that are listed in this dataframe** and **no other brands**.

In [11]:
# set directory 
#import os
#directory= r"C:\Users\Anonym\Documents\GitHub\DLfM_BrandManagement\data\instagram_urls"
#os.chdir(directory)

In [12]:
# load dataframe 
import pandas as pd 

convert = pd.read_csv('hashToOfficialName.csv')

In [13]:
convert.head()

Unnamed: 0,instagram_hashtag,firm_account,brand_full_name
0,abercrombie,abercrombie,Abercrombie & Fitch
1,adidas,adidas,Adidas
2,anntaylor,anntaylor,Ann Taylor
3,bacardi,bacardiusa,Bacardi
4,bananarepublic,bananarepublic,Banana Republic


In [14]:
# pages that have access denial because of age limit
# are you 18/21 or over? 
#urls.remove('https://www.instagram.com/bacardiusa/?hl=en')

In [15]:
# items to be removed from list
#agelimited_brands = {'bacardiusa', 'budlight', 'budweiser', 'coorslight', 'corona', 'greygoose', 'jackdaniels_us', 'korbel_1882'} 
  
#firm_usernames = [ele for ele in firm_usernames if ele not in agelimited_brands] 

In [16]:
# items to be removed from conversion dataframe 
agelimited_brands = ['bacardiusa', 'budlight', 'budweiser', 'coorslight', 'corona', 'greygoose', 'jackdaniels_us', 'korbel_1882'] 

convert = convert[~(convert.firm_account.isin(agelimited_brands))]
convert.head()

Unnamed: 0,instagram_hashtag,firm_account,brand_full_name
0,abercrombie,abercrombie,Abercrombie & Fitch
1,adidas,adidas,Adidas
2,anntaylor,anntaylor,Ann Taylor
4,bananarepublic,bananarepublic,Banana Republic
5,bigelow,bigelowtea,Bigelow`s Tea


In [17]:
def hashToOfficial(hashing): 
    username = convert.loc[convert['instagram_hashtag'] == hashing, 'firm_account'].iloc[0]
    return username

#### Keyword input

In [19]:
brands_on_display = convert['instagram_hashtag'].tolist()
brands_on_display = ', '.join(brands_on_display)

In [20]:
print('Choose from these brandnames to get a brand management analysis:', brands_on_display)
keyword = input('Which brandname do you want to analyze?')
# 'sanpellegrino'

Choose from these brandnames to get a brand management analysis: abercrombie, adidas, anntaylor, bananarepublic, bigelow, carhartt, cocacola, converse, dockers, dolcegabbana, domperignon, drpepper, eddiebauer, fanta, gap, gatorade, gucci, guess, hanes, hollister, honesttea, jcrew, joeboxer, juicycouture, kennethcole, levis, lipton, llbean, luckybrand, moetchandon, monsterenergy, nesquik, oldnavy, oshkosh, prada, ralphlauren, sanpellegrino, snapple, tazo, tommyhilfiger, underarmour, urbanoutfitters, victoriassecret, vitaminwater, welchs, minutemaid, motts, swissmiss
Which brandname do you want to analyze?nestlé


#### Hashtag Page

If you want to open a hashtag page (instead of a user profile): 

In [21]:
# for multiple brands  
#hash_urls = []
#username = False

#for hashtag in instagram_hashtags: 
 #   url = 'https://www.instagram.com/explore/tags/'+hashtag
  #  hash_urls.append(url)

#hash_urls[:2]

In [22]:
# for one brand only 

# to specify user_input
hashtag = 'nestlé' 
hash_url = 'https://www.instagram.com/explore/tags/'+hashtag

#### User-profile Page

If you want to scrape a user-profile page, specify the username as:

In [23]:
# for multiple firms  
#urls = []
#hashtag = False

#for username in firm_usernames: 
 #   url = 'https://www.instagram.com/'+username+'/?hl=en'
  #  urls.append(url)

#urls[:2]

In [27]:
# for one firm only 

# to specify user_input
username= "nestle"
#username= 'swatch'
url = 'https://www.instagram.com/'+username+'/?hl=en'

### 3. Get information from Instagram page(s) [optional]

Now that the url of the Instagram page is defined, it will extract out all the posts or meta-information from the website usinge the InstagramScraper class. 

Get meta-information metrics by using a class method. 

In [None]:
# get profile page metrics
#from pprint import pprint

#k = InstagramScraper()
#results = k.profile_page_metrics(url) 
#pprint(results)

In [None]:
# get hashtag page metrics
#from pprint import pprint

#k = InstagramScraper()
#TODO
#results = k.hash_page_metrics(url) 
#pprint(results)

### 4. Get image posts from Instagram page(s)

Get all posts on an Instagram **profile page** that are visible on the landing page (more items only load as you scroll downwards). The page loads 12 items at a time, and I need to scroll to load all entries (for a total of 120).

#### User-profile Page

In [None]:
# get posts (images) from multiple profile pages 
#from pprint import pprint

#resultz = []
#for url in urls: 
 #   k = InstagramScraper()
  #  results = k.profile_page_posts(url)
   # resultz.append(results)
    #print('Instagram page: ', url)

#print('Total number of Instagram user-profile pages: ', len(resultz))
#print('Total number of images: ', len(resultz)*len(resultz[0]))
#print('Average number of images per Instagram user-profile page: ', len(resultz)*len(resultz[0])/len(resultz) )

In [38]:
# get posts (images) from single profile page 

from pprint import pprint

k = InstagramScraper()
results = k.profile_page_posts(url)

print('Instagram page: ', url)
print('Posts on Instagram profile page: ', len(results))
print('Second image url on instagram profile: ', results[1]['display_url'])

Instagram page:  https://www.instagram.com/nestle/?hl=en
Posts on Instagram profile page:  12
Second image url on instagram profile:  https://instagram.fzrh2-1.fna.fbcdn.net/v/t51.2885-15/e35/92550584_846774565843081_8318181282322828356_n.jpg?_nc_ht=instagram.fzrh2-1.fna.fbcdn.net&_nc_cat=101&_nc_ohc=os27ce6mZuwAX9_eRH-&oh=5973906d249c2599b4ce462cf1e28468&oe=5E982618


#### Hashtag Page

Get all posts on an Instagram **hashtag page** that are visible on the landing page. 

In [None]:
# get posts (images) from multiple hashtag pages 
#from pprint import pprint

#hash_result = []
#for url in hash_urls: 
 #   k = InstagramScraper()
  #  results = k.hashtag_page_posts(url)
   # hash_result.append(results)
    #print('Instagram page: ', url)

#print('Total number of Instagram hashtag pages: ', len(hash_result))
#print('Total number of hashed images: ', len(hash_result)*len(hash_result[0]))
#print('Average number of images per Instagram hashtag page: ', len(hash_result)*len(hash_result[0])/len(hash_result) )

In [None]:
# get posts (images) from a hashtag page 
from pprint import pprint

k = InstagramScraper()
hash_results = k.hashtag_page_posts(hash_url)

print('Instagram page: ', url)
print('Posts on Instagram hashtag page: ', len(hash_results))
print('Second image url on instagram hashtag: ', hash_results[1]['display_url'])

### 5. Save images into folders

Save images from list of dict: Use requests library to download images from the ‘display_url’ in pandas ‘result’ data frame and store them with respective shortcode as file name.

Specify the directory for storing the images. 

In [8]:
# load modules
import os
import requests
import shutil

Some functions...

In [9]:
def set_root_path_images(): 
    
    # to specify
    try: 
        directory= r"C:\Users\Anonym\Documents\GitHub\DLfM_BrandManagement\data"
        os.chdir(directory)
    except: 
        directory= r"C:\Users\lsamsi\Documents\GitHub\DLfM_BrandManagement\data"
        os.chdir(directory)
    folder = 'instagram_images' #image root folder, all subfolders' name are firmnames

    os.chdir(directory)

    try: 
        os.mkdir(folder)
    except: 
        pass

    path = os.path.join(directory, folder)
    os.chdir(path)
    return path 

In [10]:
def build_folders_images(account, folder, path): 
        try: 
            os.mkdir(os.path.join(path, account))
        except: 
            pass
        
        # set directory 
        directory = os.path.join(path, account)
        os.chdir(directory)   
        try: 
            os.mkdir(folder)
            print('new folder created for: ', account)
        except: 
            pass
        path = os.path.join(directory, folder)
        os.chdir(path)
        return path 


 #### User-profile page
 
 Save all images from user-profile Instagram pages to your folder. 

In [None]:
# download all visible images from multiple Instagram pages 

#for i, username in enumerate(firm_usernames): 
 #   path = set_root_path_images()
  #  build_folders_images(username, 'user_profile', path)

    # get image url 
   # for j in range(len(resultz[i])): 
    #    r = requests.get(resultz[i][j]['display_url'], stream=True)
     #   with open(resultz[i][j]['shortcode']+".jpg", 'wb') as f:
            # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
      #      r.raw.decode_content = True
            # Copy the response stream raw data to local image file.
       #     shutil.copyfileobj(r.raw, f)
            # Remove the image url response object.
        #    del r
            
   # print('processed: ', username, ' .', i, ' out of ', len(firm_usernames))

 #### Hashtag page
 
 Save all images from hashtag Instagram pages to your folder. 

In [None]:
# download all visible images from multiple Instagram pages 

#for i, hashtag in enumerate(instagram_hashtags):
 #   path = set_root_path_images()
  #  build_folders_images(hashtag, 'hashtag', path)

    # get image url 
    #for j in range(len(hash_result[i])): 
     #   r = requests.get(hash_result[i][j]['display_url'], stream=True)
      #  with open(hash_result[i][j]['shortcode']+".jpg", 'wb') as f:
            # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
       #     r.raw.decode_content = True
            # Copy the response stream raw data to local image file.
        #    shutil.copyfileobj(r.raw, f)
            # Remove the image url response object.
         #   del r
   # print('processed: ', hashtag, ' .', i, ' out of ', len(instagram_hashtags))

 #### Both pages
 
 Save all images from both user profile and hashtag Instagram pages to your folder. 

In [None]:
# download all visible images from multiple Instagram pages 

#if firm_usernames: 
 #   for i, username in enumerate(firm_usernames): 
  #      path = set_root_path_images()
   #     build_folders_images(username, 'user_profile', path)
       
        # get image url 
    #    for j in range(len(resultz[i])): 
     #       r = requests.get(resultz[i][j]['display_url'], stream=True)
      #      with open(resultz[i][j]['shortcode']+".jpg", 'wb') as f:
                # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
       #         r.raw.decode_content = True
                # Copy the response stream raw data to local image file.
        #        shutil.copyfileobj(r.raw, f)
                # Remove the image url response object.
         #       del r

#elif instagram_hashtags: 
 #   for i, hashtag in enumerate(instagram_hashtags):
  #      path = set_root_path_images()
   #     build_folders_images(hashtag, 'hashtag', path)

        # get image url 
    #    for j in range(len(hash_result[i])): 
     #       r = requests.get(hash_result[i][j]['display_url'], stream=True)
      #      with open(hash_result[i][j]['shortcode']+".jpg", 'wb') as f:
                # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
       #         r.raw.decode_content = True
                # Copy the response stream raw data to local image file.
        #        shutil.copyfileobj(r.raw, f)
                # Remove the image url response object.
         #       del r

In [None]:
# download all visible images from an Instagram page 

#path = set_root_path_images()

#if username: 
#    build_folders_images(username, 'user_profile', path)
#elif hashtag: 
 #   build_folders_images(hashtag, 'hashtag', path)

#for i in range(len(results)):
 #   r = requests.get(results[i]['display_url'], stream=True)
  #  with open(results[i]['shortcode']+".jpg", 'wb') as f:
        # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
   #     r.raw.decode_content = True
        # Copy the response stream raw data to local image file.
    #    shutil.copyfileobj(r.raw, f)
        # Remove the image url response object.
     #   del r

In [None]:
# download one image only

#path = set_root_path_images()

#r = requests.get(url, stream=True)

#with open(directory+"B-Tckr0AgrH"+".jpg", 'wb') as f:
    # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
 #   r.raw.decode_content = True
    # Copy the response stream raw data to local image file.
  #  shutil.copyfileobj(r.raw, f)
    # Remove the image url response object.
   # del r

### 5. Save images

Specify the directory for storing the images.

Save images onto your PC, then load the images into a numpy array (variable) - for official and unofficial images of a brand. 

#### User-profile page

In [16]:
# load modules 
import imageio
import json
import numpy as np

# set directory 
path = set_root_path_images()

In [17]:
# create folders
build_folders_images(hashtag, 'official', path)
print('Directory set to: ', os.getcwd())

new folder created for:  margotrobbie
Directory set to:  C:\Users\Anonym\Documents\GitHub\DLfM_BrandManagement\data\instagram_images\margotrobbie\official


In [18]:
#https://gist.github.com/abhaymise/b011f9d68456f1d87561d71af2f7fd6a

# save images to PC 
for i in range(len(results)):
    r = requests.get(results[i]['display_url'], stream=True)
    with open(f"{i}_"+results[i]['shortcode']+".png", 'wb') as f:
        # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
        r.raw.decode_content = True
        # Copy the response stream raw data to local image file.
        shutil.copyfileobj(r.raw, f)
        # Remove the image url response object.
        del r
        

In [None]:
# specify image dimension
#IMG_WIDTH=300
#IMG_HEIGHT=300
#IMG_DIM = (IMG_WIDTH, IMG_HEIGHT)

In [12]:
from PIL import Image
from io import BytesIO
from keras.preprocessing.image import load_img, img_to_array
import numpy as np

# specify the size of the image
target_size = (300,300)

images_list = []
for result in range(len(results)):
    # get the image based on the 'display_url'
    response = requests.get(results[result]['display_url'], stream=True)
    # convert it into a bytes object
    bytes = BytesIO(response.content)
    # convert it into an Image object
    image = Image.open(bytes)
    
    # resize the image if necessary
    if image.size != target_size:
        image = image.resize(target_size)
    
    # convert the image to a keras array and finally to a numpy array    
    train_image = img_to_array(image)
    train_image = np.array(train_image)
    print(train_image.shape)
    
    images_list.append(train_image)
    
# convert the images_list into one numpy array (used as X_test for the model)
images_np = np.stack(images_list, axis=0)
print(images_np.shape)

Using TensorFlow backend.


(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(300, 300, 3)
(12, 300, 300, 3)


In [None]:
# load modules 
#from io import BytesIO
#import base64
#from PIL import Image
#from keras.preprocessing.image import load_img, img_to_array

# list of images as np.arrays 
#images_lst = []
#np.array([0])
#for i in range(len(results)):
    # load image 
    #byteImg = Image.open(results[i]['shortcode']+".png")
    # image into numpy array 
    #img_np = np.array(byteImg)
    #images_np.append(img_np)
    
    # load image 
    #train_imgs = img_to_array(load_img(f"{i}_"+results[i]['shortcode']+".png", target_size=IMG_DIM))
    #train_imgs = np.array(train_imgs)
    #images_lst.append(train_imgs)

In [None]:
# all images as numpy array (for feeding as X_test)
#images_np = np.stack(images_lst, axis=0)
#print(images_np.shape)

In [19]:
# set directory 
path = set_root_path_images()

np.save(f'{hashtag}_official_npimgs.npy', images_np)

#### Hashtag page

In [None]:
# load modules 
import imageio
import json
import numpy as np

# set directory 
path = set_root_path_images()

In [None]:
# create folders
build_folders_images(hashtag, 'unofficial', path)
print('Directory set to: ', os.getcwd())

In [None]:
#https://gist.github.com/abhaymise/b011f9d68456f1d87561d71af2f7fd6a

# save images to PC
for i in range(len(hash_results)):
    r = requests.get(hash_results[i]['display_url'], stream=True)
    with open(f"{i}_"+hash_results[i]['shortcode']+".png", 'wb') as f:
        # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
        r.raw.decode_content = True
        # Copy the response stream raw data to local image file.
        shutil.copyfileobj(r.raw, f)
        # Remove the image url response object.
        del r

In [None]:
# specify image dimension
IMG_WIDTH=300
IMG_HEIGHT=300
IMG_DIM = (IMG_WIDTH, IMG_HEIGHT)

In [None]:
# load modules 
from io import BytesIO
import base64
from PIL import Image
from keras.preprocessing.image import load_img, img_to_array

# list of images as np.arrays 
hash_images_lst = []
for i in range(len(hash_results)):
    # load image 
    byteImg = Image.open(f"{i}_"+hash_results[i]['shortcode']+".png")
    # load image 
    train_imgs2 = img_to_array(load_img(f"{i}_"+hash_results[i]['shortcode']+".png", target_size=IMG_DIM))
    train_imgs2 = np.array(train_imgs2)
    hash_images_lst.append(train_imgs2)

In [None]:
# all images as numpy array (for feeding as X_test)
hash_images_np = np.stack(hash_images_lst, axis=0)
print(hash_images_np.shape)

In [None]:
# set directory 
path = set_root_path_images()

np.save(f'{hashtag}_unofficial_npimgs.npy', hash_images_np)

# Instagram Private API (infinite scroll)

### 1. Import dependencies

In [1]:
import os
from random import choice
import json
import pandas as pd
import numpy as np
import csv
# load modules 
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
from keras.preprocessing.image import load_img, img_to_array
import shutil 
# to install
import requests
from bs4 import BeautifulSoup

Using TensorFlow backend.


ModuleNotFoundError: No module named 'tensorflow.python.tools'; 'tensorflow.python' is not a package

In [None]:

# pip install git+https://git@github.com/ping/instagram_private_api.git@1.6.0
from instagram_private_api import Client, ClientCompatPatch

In [None]:
user_name = 'chenpeling@hotmail.com'
password = 'Instagram2020'

USERNAME = 'nestle' # official nestle page: 'nestle'
HASHTAG = 'longines'
LIMIT_IMAGE_COUNT = 72 # 1st: 0, 2nd: 36, 3rd: 72 stops, at the first step over 50 


### 2. Setup Instagram Private API

In [5]:
# initialize client 
api = Client(user_name, password)

### 3. Get URLs of Image Posts on Instagram

### Hashtag

In [6]:
# all images urls 
all_hash_image_posts_urls = []

next_max_id = None
while (api.feed_tag(HASHTAG, api.generate_uuid())["more_available"] == True) and (len([item for sublist in all_hash_image_posts_urls for item in sublist]) <= LIMIT_IMAGE_COUNT): 
    if next_max_id == None: 
        #Gets the first 12 posts
        posts = api.feed_tag(HASHTAG, api.generate_uuid())
        len(posts['items'])
        image_urls = []
        for i in range(len(posts['items'])): 
            try: 
                url = posts['items'][i]['image_versions2']['candidates'][0]['url'] # some posts do not have 'image_version2', they are overlooked in that case
                image_urls.append(url)
            except: 
                pass 
        # Extract the value *next_max_id* from the above response, this is needed to load the next 12 posts
        next_max_id = posts["next_max_id"] 
        all_hash_image_posts_urls.append(image_urls)
    else: 
        next_page_posts = api.feed_tag(HASHTAG, api.generate_uuid())
        len(next_page_posts['items'])
        # get image urls 
        next_image_urls = []
        for i in range(len(next_page_posts['items'])):
            try: 
                url = next_page_posts['items'][i]['image_versions2']['candidates'][0]['url']
                next_image_urls.append(url)
            except: 
                pass
        # Extract the value *next_max_id*
        next_max_id = next_page_posts["next_max_id"] 
        all_hash_image_posts_urls.append(next_image_urls)

else:        
    flat_hash_image_posts_urls = [item for sublist in all_hash_image_posts_urls for item in sublist]
    print(f"A total of {len(flat_hash_image_posts_urls)} image post urls were retrieved from the Instagram page.")
 

A total of 333 image post urls were retrieved from the Instagram page.


### Username

In [None]:
# all images urls 
all_image_posts_urls = []

next_max_id = None
while (api.username_feed(USERNAME, max_id = next_max_id)["more_available"] == True)  and (len([item for sublist in all_image_posts_urls for item in sublist]) <= LIMIT_IMAGE_COUNT): 
    if next_max_id == None: 
        #Gets the first 12 posts
        posts = api.username_feed(USERNAME)
        len(posts['items'])
        image_urls = []
        for i in range(len(posts['items'])): 
            url = posts['items'][i]['image_versions2']['candidates'][0]['url']
            image_urls.append(url)
        # Extract the value *next_max_id* from the above response, this is needed to load the next 12 posts
        next_max_id = posts["next_max_id"] 
        all_image_posts_urls.append(image_urls)
    else: 
        next_page_posts = api.username_feed(USERNAME, max_id = next_max_id)
        len(next_page_posts['items'])
        # get image urls 
        next_image_urls = []
        for i in range(len(next_page_posts['items'])):
            try: 
                url = next_page_posts['items'][i]['image_versions2']['candidates'][0]['url']
                next_image_urls.append(url)
            except: 
                pass
        # Extract the value *next_max_id*
        next_max_id = next_page_posts["next_max_id"] 
        all_image_posts_urls.append(next_image_urls)
        
else: 
    flat_image_posts_urls = [item for sublist in all_image_posts_urls for item in sublist]
    print(f"A total of {len(flat_image_posts_urls)} image post urls were retrieved from the Instagram page.")



### Request Errors: 
# ClientConnectionError: URLError <urlopen error timed out>
        
# if Timeout: spyder/plugins/ipythonconsole/comms/kernelcomm.py 
# timeout = 10


### 4. Save Images to PC

In [8]:
# set directory
directory= r"C:\Users\lsamsi\Documents\GitHub\DLfM_BrandManagement\data\instagram_images\football"
os.chdir(directory)

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'C:\\Users\\lsamsi\\Documents\\GitHub\\DLfM_BrandManagement\\data\\instagram_images\\football'

### Hashtag

In [None]:
# save HASHTAG images to PC 
for i in range(len(flat_hash_image_posts_urls)):
    r = requests.get(flat_hash_image_posts_urls[i], stream=True)
    with open(f"{i}_"+flat_hash_image_posts_urls[i][-34:]+".png", 'wb') as f:
        # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
        r.raw.decode_content = True
        # Copy the response stream raw data to local image file.
        shutil.copyfileobj(r.raw, f)
        # Remove the image url response object.
        del r

### Username

In [None]:
# save USERNAME images to PC 
for i in range(len(flat_image_posts_urls)):
    for j in range(len(flat_image_posts_urls[i])):
        r = requests.get(flat_image_posts_urls[i][j], stream=True)
        with open(f"{i}_"+flat_image_posts_urls[i][j][-34:]+".png", 'wb') as f:
            # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
            r.raw.decode_content = True
            # Copy the response stream raw data to local image file.
            shutil.copyfileobj(r.raw, f)
            # Remove the image url response object.
            del r

In [None]:
# FOR VINCE 

In [None]:
# pip install git+https://git@github.com/ping/instagram_private_api.git@1.6.0
from instagram_private_api import Client, ClientCompatPatch

USERNAME = 'nestle' # official nestle page: 'nestle'
HASHTAG = 'longines'
LIMIT_IMAGE_COUNT = 50 # 1st: 0, 2nd: 36, 3rd: 72 stops, at the first step over 50 


def official_images_unlimited(USERNAME, LIMIT_IMAGE_COUNT): 

    user_name = 'chenpeling@hotmail.com'
    password = 'Instagram2020'

    # initialize client 
    api = Client(user_name, password)

    ########### USERNAME ###################

    # all images urls 
    all_image_posts_urls = []

    next_max_id = None
    while (api.username_feed(USERNAME, max_id = next_max_id)["more_available"] == True)  and (len([item for sublist in all_image_posts_urls for item in sublist]) <= LIMIT_IMAGE_COUNT): 
        if next_max_id == None: 
            #Gets the first 12 posts
            posts = api.username_feed(USERNAME)
            len(posts['items'])
            image_urls = []
            for i in range(len(posts['items'])): 
                url = posts['items'][i]['image_versions2']['candidates'][0]['url']
                image_urls.append(url)
            # Extract the value *next_max_id* from the above response, this is needed to load the next 12 posts
            next_max_id = posts["next_max_id"] 
            all_image_posts_urls.append(image_urls)
        else: 
            next_page_posts = api.username_feed(USERNAME, max_id = next_max_id)
            len(next_page_posts['items'])
            # get image urls 
            next_image_urls = []
            for i in range(len(next_page_posts['items'])):
                try: 
                    url = next_page_posts['items'][i]['image_versions2']['candidates'][0]['url']
                    next_image_urls.append(url)
                except: 
                    pass
            # Extract the value *next_max_id*
            next_max_id = next_page_posts["next_max_id"] 
            all_image_posts_urls.append(next_image_urls)

    else: 
        flat_image_posts_urls = [item for sublist in all_image_posts_urls for item in sublist]
        #print(f"A total of {len(flat_image_posts_urls)} image post urls were retrieved from the Instagram page.")
    
    return flat_image_posts_urls


### Request Errors: 
# ClientConnectionError: URLError <urlopen error timed out>

# if Timeout: spyder/plugins/ipythonconsole/comms/kernelcomm.py 
# timeout = 10

In [None]:

def unofficial_images_unlimited(USERNAME, LIMIT_IMAGE_COUNT): 

    user_name = 'chenpeling@hotmail.com'
    password = 'Instagram2020'

    # initialize client 
    api = Client(user_name, password)
    
    #%%
########### HASHTAG ###################

    # all images urls 
    all_hash_image_posts_urls = []

    next_max_id = None
    while (api.feed_tag(HASHTAG, api.generate_uuid())["more_available"] == True) and (len([item for sublist in all_hash_image_posts_urls for item in sublist]) <= LIMIT_IMAGE_COUNT): 
        if next_max_id == None: 
            #Gets the first 12 posts
            posts = api.feed_tag(HASHTAG, api.generate_uuid())
            len(posts['items'])
            image_urls = []
            for i in range(len(posts['items'])): 
                try: 
                    url = posts['items'][i]['image_versions2']['candidates'][0]['url'] # some posts do not have 'image_version2', they are overlooked in that case
                    image_urls.append(url)
                except: 
                    pass 
            # Extract the value *next_max_id* from the above response, this is needed to load the next 12 posts
            next_max_id = posts["next_max_id"] 
            all_hash_image_posts_urls.append(image_urls)
        else: 
            next_page_posts = api.feed_tag(HASHTAG, api.generate_uuid())
            len(next_page_posts['items'])
            # get image urls 
            next_image_urls = []
            for i in range(len(next_page_posts['items'])):
                try: 
                    url = next_page_posts['items'][i]['image_versions2']['candidates'][0]['url']
                    next_image_urls.append(url)
                except: 
                    pass
            # Extract the value *next_max_id*
            next_max_id = next_page_posts["next_max_id"] 
            all_hash_image_posts_urls.append(next_image_urls)

    else:        
        flat_hash_image_posts_urls = [item for sublist in all_hash_image_posts_urls for item in sublist]
        print(f"A total of {len(flat_hash_image_posts_urls)} image post urls were retrieved from the Instagram page.")
    
    return flat_hash_image_posts_urls
