In [62]:
# import packages
import json
import os
import requests
import datetime as dt
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from skimage import io
from skimage.transform import resize
from skimage import img_as_ubyte
from skimage import color
from tqdm import tqdm
from shutil import copy

# Data Collection

First I collect the data from each subreddit gathering the post title, score, id, image URL, post URL, number of comments and creation timestamp. Reddit caches all of its content, so with both scraping and Reddit's API (PRAW) you're limited to 1,000 posts. Fortunately the /r/datasets community created their own API called [Pushshift](https://github.com/pushshift/api). Pushshift also has a limit of 100 posts per call, but what it has over the other methods is that you can specify a time period which allows us to bypass the limitations of caching.

In [78]:
def getPushshiftData(subreddit, start, end=time.time(), size=100):
    
    post_dict = { "title":[], "created":[], "id":[], "img_url":[], "full_link":[], "score":[], "num_comments":[]}
    
    time_list = []
    n = start 
    day = 86400 # epoch time
        
    while n < end:
        for period in range(int((end - start)/day)):
            time_list.append([start+day*period, start+day*(period+1)])
            n += day
    
    for date in time_list:
        url = 'https://api.pushshift.io/reddit/search/submission/?&size='+str(size)+'&after='+str(date[0])+'&before='+str(date[1])+'&subreddit='+str(subreddit)
        r = requests.get(url)
        js = json.loads(r.text)
        
        for post in range(len(js['data'])):
            post_dict['title'].append(js['data'][post]['title'])
            post_dict['score'].append(js['data'][post]['score'])
            post_dict['id'].append(js['data'][post]['id'])
            post_dict['img_url'].append(js['data'][post]['url'])
            post_dict['full_link'].append(js['data'][post]['full_link'])
            post_dict['num_comments'].append(js['data'][post]['num_comments'])
            post_dict['created'].append(dt.datetime.fromtimestamp(js['data'][post]['created_utc']))
            
        time.sleep(1)
            
    subreddit_df = pd.DataFrame(data=post_dict)
    
    return subreddit_df

In [89]:
subreddit = 'foodporn'
start = 1420070400 # 12:00 am jan 1, 2015

ps_fp = getPushshiftData(subreddit, start)

In [148]:
ps_fp.to_csv('Data/pushshift_fp.csv')

In [99]:
ps_fp.head()

Unnamed: 0,title,created,id,img_url,full_link,score,num_comments
0,My new years eve dinner,2014-12-31 18:24:30,2qyuqt,http://i.imgur.com/DgOnc4V.jpg,https://www.reddit.com/r/FoodPorn/comments/2qy...,2,1
1,Drambuie-Butterscotch-filled Banana Cupcakes w...,2014-12-31 18:32:40,2qyvif,http://i.imgur.com/RmP3y5e.jpg,https://www.reddit.com/r/FoodPorn/comments/2qy...,1,1
2,Made Coquilles St. Jacques as a farewell to 20...,2014-12-31 18:57:17,2qyxmm,http://i.imgur.com/ulGNzzQ.jpg,https://www.reddit.com/r/FoodPorn/comments/2qy...,1,1
3,"Some chocolate mint cake? Yes, please! [OC] [7...",2014-12-31 19:08:16,2qyyjq,http://i.imgur.com/ocbQx9B.jpg,https://www.reddit.com/r/FoodPorn/comments/2qy...,108,2
4,75$ Burger that my friend bought at an NBA gam...,2014-12-31 19:10:21,2qyyqf,http://imgur.com/gallery/2oeFCmm,https://www.reddit.com/r/FoodPorn/comments/2qy...,1,2


In [100]:
ps_fp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333866 entries, 0 to 333865
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   title         333866 non-null  object        
 1   created       333866 non-null  datetime64[ns]
 2   id            333866 non-null  object        
 3   img_url       333866 non-null  object        
 4   full_link     333866 non-null  object        
 5   score         333866 non-null  int64         
 6   num_comments  333866 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 17.8+ MB


In [93]:
subreddit = 'shittyfoodporn'
start = 1420070400 # 12:00 am jan 1, 2015

ps_sfp = getPushshiftData(subreddit, start)

In [149]:
ps_sfp.to_csv('Data/pushshift_sfp.csv')

In [101]:
ps_sfp.head()

Unnamed: 0,title,created,id,img_url,full_link,score,num_comments
0,was craving sushi,2014-12-31 18:01:49,2qysew,http://i.imgur.com/LS8qi14.jpg,https://www.reddit.com/r/shittyfoodporn/commen...,0,1
1,I had leftover Del Taco from yesterday. This i...,2014-12-31 18:07:04,2qyszb,http://i.imgur.com/Hz7yLZR.jpg,https://www.reddit.com/r/shittyfoodporn/commen...,8,3
2,A special New Years dinner of grilled cheese a...,2014-12-31 18:41:38,2qywbp,http://i.imgur.com/ik8MyrX.jpg,https://www.reddit.com/r/shittyfoodporn/commen...,6,2
3,Nailed it!,2014-12-31 18:52:59,2qyxad,http://i.imgur.com/4ULEOyC.jpg,https://www.reddit.com/r/shittyfoodporn/commen...,131,15
4,Some 'Snacks',2014-12-31 18:56:20,2qyxk1,http://i.imgur.com/UY3nodm.jpg,https://www.reddit.com/r/shittyfoodporn/commen...,1,0


In [102]:
ps_sfp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317486 entries, 0 to 317485
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   title         317486 non-null  object        
 1   created       317486 non-null  datetime64[ns]
 2   id            317486 non-null  object        
 3   img_url       317486 non-null  object        
 4   full_link     317486 non-null  object        
 5   score         317486 non-null  int64         
 6   num_comments  317486 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 17.0+ MB


Looking great so far!  I'll need to check for duplicates and also 600,000+ images is more than I can store, so I'll determine a cutoff based on score to get the "most representative" images.

In [109]:
fp_unique = ps_fp.drop_duplicates(keep='first')

In [110]:
fp_unique.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 166934 entries, 0 to 333382
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   title         166934 non-null  object        
 1   created       166934 non-null  datetime64[ns]
 2   id            166934 non-null  object        
 3   img_url       166934 non-null  object        
 4   full_link     166934 non-null  object        
 5   score         166934 non-null  int64         
 6   num_comments  166934 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 10.2+ MB


In [112]:
sfp_unique = ps_sfp.drop_duplicates(keep='first')

In [113]:
sfp_unique.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158743 entries, 0 to 158742
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   title         158743 non-null  object        
 1   created       158743 non-null  datetime64[ns]
 2   id            158743 non-null  object        
 3   img_url       158743 non-null  object        
 4   full_link     158743 non-null  object        
 5   score         158743 non-null  int64         
 6   num_comments  158743 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 9.7+ MB


In [144]:
for quant in np.arange(0.1,1,0.1):
    print(f'the {quant} quantile has {len(fp_unique.score[fp_unique.score > fp_unique.score.quantile(quant)])} fp images')

the 0.1 quantile has 71614 fp images
the 0.2 quantile has 71614 fp images
the 0.30000000000000004 quantile has 71614 fp images
the 0.4 quantile has 71614 fp images
the 0.5 quantile has 71614 fp images
the 0.6 quantile has 66695 fp images
the 0.7000000000000001 quantile has 49954 fp images
the 0.8 quantile has 33291 fp images
the 0.9 quantile has 16546 fp images


In [143]:
for quant in np.arange(0.1,1,0.1):
    print(f'the {quant} quantile has {len(sfp_unique.score[sfp_unique.score > sfp_unique.score.quantile(quant)])} sfp images')

the 0.1 quantile has 96341 sfp images
the 0.2 quantile has 96341 sfp images
the 0.30000000000000004 quantile has 96341 sfp images
the 0.4 quantile has 91149 sfp images
the 0.5 quantile has 77104 sfp images
the 0.6 quantile has 60393 sfp images
the 0.7000000000000001 quantile has 46420 sfp images
the 0.8 quantile has 31388 sfp images
the 0.9 quantile has 15775 sfp images


~30,000 images each is much more reasonable, so I'll use the 80th quantile. I also anticipate that this will be reduced more due to not every url being a png or jpg as videos and gifs are allowed. Additionally, since I'm looking back over 5 years, there will be images that have been removed or deleted.

In [145]:
fp.to_csv('Data/fp.csv')
sfp.to_csv('Data/sfp.csv')

# Getting The Images

Now that I have the urls, its time to get the images. The below function downloads the images from the subreddit dataframes created from the pushshift API and gives them unique names based on the id column.

In [189]:
def download_subreddit_images(subreddit_df, directory):
    """downloads all static images (png & jpg) from specified subreddit dataframe to desired directory"""
    
    headers = {'user-agent': 'image_downloader'}
    url_dict = {}
    skipped_urls = []
    
    # checks if directory exists, if not makes one
    current_path = os.getcwd()
    if not os.path.exists(directory):
        os.makedirs(directory)
    os.chdir(directory)
    
    # find urls and create file names from unique post id
    for row in subreddit_df.itertuples():
        filename = f"{row[3]}.{str.split(str(row[4]), sep='.')[-1]}"
        url =row[4]
        url_dict[filename] = url
    
    # iterates over static images in dict and saves to file
    for filename, url in tqdm(url_dict.items()):
        # check if url is png or jpg
        if (str.split(filename, sep='.')[-1] == 'jpg') or (str.split(filename, sep='.')[-1] =='png'):
            try:
                r = requests.get(url, headers=headers, stream=True)
       
                with open(filename, 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
            except:
                skipped_urls.append(url)
            time.sleep(3)
        pass
    
    os.chdir(current_path)
    return skipped_urls

In [174]:
fp_skipped = download_subreddit_images(fp,'Images/FP')

100%|██████████| 33291/33291 [28:07:49<00:00,  3.04s/it]    


In [190]:
sfp_skipped = download_subreddit_images(sfp,'Images/SFP')

100%|██████████| 31388/31388 [23:55:05<00:00,  2.74s/it]    


# Extracting Data From Images

Now that the images are downloaded, 24,746 from FP and 20,453 from SFP, I want to get some basic info on them such as the heigh and witdth, aspect ratio, and the mean/standard deviation of the color channels.

In [195]:
def get_img_dict(path):
    """ create dictionary of statistics for each image, heigh, width, aspect ratio,
    and mean/standard deviation of red, green, blue color channels.
    Images causing error are skipped and placed into a list for additional inspection"""
        
    image_dict = {'path':[], 'height':[], 'width':[], 'ratio_(w/h)':[],
                 'mean_red':[], 'mean_green': [], 'mean_blue':[],
                 'std_red':[], 'std_green':[], 'std_blue':[]}

    # some images are throwing an error when extracting data, place them here to see whats going on
    bad_images = []     

    img_names = os.listdir(path)
    for image in tqdm(img_names):
        try:
            img = io.imread(f'{path}/{image}')
            red = img[:, :, 0]
            green = img[:, :, 1]
            blue = img[:, :, 2]
            image_dict['height'].append(img.shape[1])
            image_dict['width'].append(img.shape[0])
            image_dict['ratio_(w/h)'].append(img.shape[0]/img.shape[1])
            image_dict['mean_red'].append(np.mean(red))
            image_dict['mean_green'].append(np.mean(green))
            image_dict['mean_blue'].append(np.mean(blue))
            image_dict['std_red'].append(np.std(red))
            image_dict['std_green'].append(np.std(green))
            image_dict['std_blue'].append(np.std(blue))
            image_dict['path'].append(image)
        except:
            bad_images.append(image)
            
    return image_dict, bad_images

In [None]:
# build fp image dict
fp_path = 'Images/FP'
fp_image_dict, fp_bad_images = get_img_dict(fp_path)

In [None]:
# convert fp image dict & bad image list to dataframes and save 
fp_image_df = pd.DataFrame(data=fp_image_dict)
fp_image_df.to_csv('Data/fp_image_data.csv')

fp_bad_image_df = pd.DataFrame(data=fp_bad_images)
fp_bad_image_df.to_csv('Data/fp_bad_images.csv')

In [198]:
# build sfp image dict
path = 'Images/SFP'
sfp_image_dict, sfp_bad_images = get_img_dict(path)

100%|██████████| 20453/20453 [1:44:57<00:00,  3.25it/s]  


In [199]:
# convert sfp image dict & bad image list to dataframes and save
sfp_image_df = pd.DataFrame(data=sfp_image_dict)
sfp_image_df.to_csv('Data/sfp_image_data.csv')

sfp_bad_image_df = pd.DataFrame(data=sfp_bad_images)
sfp_bad_image_df.to_csv('Data/sfp_bad_images.csv')

In [2]:
# import image data dfs
fp_image_df = pd.read_csv('Data/fp_image_data.csv', index_col=0)
sfp_image_df = pd.read_csv('Data/sfp_image_data.csv', index_col=0)

In [203]:
# import bad images
fp_bad_image = pd.read_csv('Data/fp_bad_images.csv', index_col=0)
sfp_bad_image = pd.read_csv('Data/sfp_bad_images.csv', index_col=0)

Here I move the "bad images" to a new folder so I can see what the issue is. There are 1,846 from FP and 1,771 from SFP.

In [204]:
# moving the "bad images" to new folders so I can look at them
fp_bad_image.columns= ['image']
fp_bad_image_list = list(fp_bad_image.image)

for image in fp_bad_image_list:
    os.rename(f'Images/FP/{image}', f'Images/FP_Bad_Images/{image}')
    
    
sfp_bad_image.columns= ['image']
sfp_bad_image_list = list(sfp_bad_image.image)
for image in sfp_bad_image_list:
    os.rename(f'Images/SFP/{image}', f'Images/SFP_Bad_Images/{image}')

Good new, the "bad images" are only the imgur notifications that the images has been removed.

![example](Images/FP_Bad_Images/2r4v19.jpg)

# Resizing 

Next, I want to resize my images. I plan to use transfer learning from VGG16, which requires images to be 224x224, so that's the size I'll set my images to.

In [60]:
def custom_resize(image_path, new_path, size=(224,224)):
    '''resize images to specified size and saves a copy in new path'''
    img_to_convert = os.listdir(image_path)
    
    img_in_new = os.listdir(new_path)
    
    # this will take a while to run - if interupted this will check for images already resized
    images = list(set(img_to_convert) - set(img_in_new))
    
    for image in tqdm(images):
        img = io.imread(f'{image_path}/{image}')
        img_resize = resize(img, (224,224))
        try:
            # resize and save new image
            io.imsave(f'{new_path}/{image}', img_as_ubyte(img_resize))
        
        except IOError: 
            # IOError happens while saving as jpeg if the image has an alpha channel
            img_resize = color.rgba2rgb(img_resize)
            io.imsave(f'{new_path}/{image}', img_as_ubyte(img_resize))
            
        except Exception:
            pass

In [None]:
current_path = 'Images/FP'
new_path = 'Images/FP_224x224'

custom_resize(current_path, new_path)

In [None]:
current_path = 'Images/SFP'
new_path = 'Images/SFP_224x224'

custom_resize(current_path, new_path)

Lastly, while i have a lot of images, I will cut it down to 500 per subreddit for EDA and initial training. More can be added as needed.

In [10]:
fp_df = pd.read_csv('Data/fp.csv', index_col=0)
sfp_df = pd.read_csv('Data/sfp.csv', index_col=0)

In [11]:
fp_image_df = pd.read_csv('Data/fp_image_data.csv', index_col=0)
sfp_image_df = pd.read_csv('Data/sfp_image_data.csv', index_col=0)

In [31]:
# split path column to merge with fp_df on id
fp_image_df['id'] = fp_image_df['path'].str.split(pat='.',n=1).str[0]
sfp_image_df['id'] = sfp_image_df['path'].str.split(pat='.',n=1).str[0]

In [32]:
# merge fp_image_df with fp_df
fp500 = fp_image_df.merge(fp_df,how='left', on='id')

# sort by score and select the top 500
fp500 = fp500.sort_values('score', ascending=False).head(500)

# same for sfp data
sfp500 = sfp_image_df.merge(sfp_df,how='left', on='id')
sfp500 = sfp500.sort_values('score', ascending=False).head(500)

In [40]:
sfp500.to_csv('Data/sfp500.csv')
fp500.to_csv('Data/fp500.csv')

In [38]:
# copying the top 500 to a new folder
for image in fp500.path:
    copy(f'Images/FP_224x224/{image}', f'Images/FP500_224x224/{image}')

In [39]:
for image in sfp500.path:
    copy(f'Images/SFP_224x224/{image}', f'Images/SFP500_224x224/{image}')

Now that the images are resized, I also was to get the intensity.

In [50]:
def get_resize_dict(path):
    """ create dictionary of statistics for each image, heigh, width, aspect ratio,
    and mean/standard deviation of red, green, blue color channels.
    Images causing error are skipped and placed into a list for additional inspection"""
        
    image_dict = {'path':[], 'height':[], 'width':[], 'ratio_(w/h)':[], 'intensity':[],
                 'mean_red':[], 'mean_green': [], 'mean_blue':[],
                 'std_red':[], 'std_green':[], 'std_blue':[]}
    
    img_names = os.listdir(path)
    for image in tqdm(img_names):
        try:
            img = io.imread(f'{path}/{image}')
            red = img[:, :, 0]
            green = img[:, :, 1]
            blue = img[:, :, 2]
            image_dict['height'].append(img.shape[1])
            image_dict['width'].append(img.shape[0])
            image_dict['ratio_(w/h)'].append(img.shape[0]/img.shape[1])
            image_dict['mean_red'].append(np.mean(red))
            image_dict['mean_green'].append(np.mean(green))
            image_dict['mean_blue'].append(np.mean(blue))
            image_dict['std_red'].append(np.std(red))
            image_dict['std_green'].append(np.std(green))
            image_dict['std_blue'].append(np.std(blue))
            image_dict['intensity'].append(np.sum(red)+np.sum(green)+np.sum(blue))
            image_dict['path'].append(image)
        except Exception:
            pass
            
    return image_dict

In [44]:
path = "Images/FP500_224x224"
fp500_resize_image_df = pd.DataFrame(get_resize_dict(path))

100%|██████████| 500/500 [00:01<00:00, 314.26it/s]


In [64]:
path = "Images/SFP500_224x224"
sfp500_resize_image_df = pd.DataFrame(get_resize_dict(path))

100%|██████████| 500/500 [00:01<00:00, 329.77it/s]


In [66]:
fp500_resize_image_df.to_csv('Data/fp500_resize_data.csv')
sfp500_resize_image_df.to_csv('Data/sfp500_resize_data.csv')