# Notebook for Scraping Reddit

### Imports

In [9]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import praw
import mimetypes
from time import sleep
import requests
import pathlib
import shutil

### Input Reddit App Info

In [16]:
# Text box holding the root folder that downloaded images are saved under;
# it is pre-filled with './scraped'.
basedir = widgets.Text(description='Base Directory:', value='./scraped',
                       placeholder='./scraped', disabled=False)
display(basedir)

Text(value='./scraped', description='Base Directory:', placeholder='./scraped')

In [10]:
# Text boxes for the Reddit app credentials, built from (default, label) pairs.
# NOTE: `id` shadows the built-in id(); the name is kept because the
# connection cell reads `id.value`.
id, secret, useragent = (
    widgets.Text(value=default, description=label)
    for default, label in (
        ("client id", "Client ID:"),
        ("secret", "Secret:"),
        ("useragent", "User Agent:"),
    )
)
display(id, secret, useragent)

Text(value='client id', description='Client ID:')

Text(value='secret', description='Secret:')

Text(value='useragent', description='User Agent:')

### Connect to Reddit

In [11]:
# Create the PRAW client from the values currently entered in the credential
# widgets; no username/password is supplied.
reddit = praw.Reddit(
    client_id=id.value,
    client_secret=secret.value,
    user_agent=useragent.value,
)

### Choose Subreddits to Scrape

In [28]:
# Scrape settings: which subreddits to read, the score threshold, the
# per-subreddit post limit and the time window of the "top" listing.
subreddits = widgets.Textarea(
    value='',
    placeholder='use space as delimitor, do not include r/',
    description='Subreddits:',
    disabled=False,
)

# IntText holds an integer, so the defaults are given as ints rather than
# strings that have to be coerced.
minvotes = widgets.IntText(value=4,
                           description="Min Upvotes:",
                           disabled=False)
maxposts = widgets.IntText(value=400,
                           description="Max Posts:",
                           disabled=False)

# The selected option is passed unchanged as `time_filter` when the top posts
# are fetched. The label used to read 'Subreddits:' (copy-paste slip), and the
# `placeholder` argument was dropped because it never showed up in the
# widget's repr, i.e. Select does not appear to use it.
time = widgets.Select(
    options=['all', 'year', 'month', 'week', 'day'],
    value='year',
    description='Time Filter:',
    disabled=False,
)

display(subreddits)
display(minvotes)
display(maxposts)
display(time)

Textarea(value='', description='Subreddits:', placeholder='use space as delimitor, do not include r/')

IntText(value=4, description='Min Upvotes:')

IntText(value=400, description='Max Posts:')

Select(description='Subreddits:', index=1, options=('all', 'year', 'month', 'week', 'day'), value='year')

### Gather Image Links

In [29]:
# Collect, per subreddit, the title and URL of every top post whose link looks
# like a PNG/JPEG file and whose score is above the "Min Upvotes" value.
# Result: all_images is a list of {'subreddit': name, 'images': [...]} dicts,
# each image being {'title': ..., 'url': ..., 'meta': ''}.
all_images = []

# split() without an argument splits on any run of whitespace and returns an
# empty list for blank input, so stray spaces or newlines typed into the
# Textarea never become empty subreddit names sent to the API.
for subreddit in subreddits.value.split():
    subreddit_images = {'subreddit': subreddit, 'images': []}
    top_posts = reddit.subreddit(subreddit).top(time_filter=time.value, limit=maxposts.value)
    for post in top_posts:
        # guess_type works from the URL's file extension; None means the
        # link does not point at a recognisable file type.
        mimetype = mimetypes.guess_type(post.url)[0]
        if mimetype is None:
            continue
        is_image = any(kind in mimetype for kind in ('png', 'jpg', 'jpeg'))
        # The comparison is strict: a post needs MORE than minvotes points.
        if is_image and post.score > minvotes.value:
            subreddit_images['images'].append(
                {'title': post.title, 'url': post.url, 'meta': ''}
            )
    all_images.append(subreddit_images)


### Save Images

In [30]:

# Download every gathered image into <basedir>/<subreddit>/<title><ext>,
# showing one progress bar per subreddit.
from urllib.parse import urlparse

for subreddit in all_images:
    progress = widgets.IntProgress(
        value=0,
        min=0,
        max=len(subreddit['images']),
        description=f'Scraping\n {subreddit["subreddit"]}',
        bar_style='info',  # 'success', 'info', 'warning', 'danger' or ''
        style={'bar_color': 'blue'},
        orientation='horizontal')
    display(progress)

    subdir = pathlib.Path(basedir.value) / subreddit["subreddit"]
    subdir.mkdir(parents=True, exist_ok=True)

    for count, img in enumerate(subreddit['images'], start=1):
        imgurl = img['url']

        # Turn the post title into a safe file name: spaces and slashes become
        # underscores, anything else that is not alphanumeric or one of
        # "._- " is dropped, and the result is capped at 200 characters.
        title = img['title'].replace(' ', '_').replace('/', '_')
        title = "".join(x for x in title if (x.isalnum() or x in "._- "))
        title = title[:200]

        # Take the extension from the URL's path component. Slicing the last
        # four characters produced "jpeg" (no dot) for .jpeg links and picked
        # up query-string text when the URL had one.
        extension = pathlib.PurePosixPath(urlparse(imgurl).path).suffix

        # A timeout plus a caught RequestException keeps one dead or hanging
        # link from aborting the whole run; it is counted like a non-200 reply.
        try:
            response = requests.get(imgurl, timeout=30)
        except requests.RequestException:
            response = None

        if response is not None and response.status_code == 200:
            (subdir / (title + extension)).write_bytes(response.content)
            progress.value = progress.value + 1
        else:
            # Shrink the bar so it can still fill up after a failed download.
            progress.max = progress.max - 1
        if progress.value == progress.max:
            print("Done! 😊")

        # Pause for a second after every 100 requests.
        if count % 100 == 0:
            sleep(1)
            


IntProgress(value=0, bar_style='info', description='Scraping\n dataisbeautiful', max=298, style=ProgressStyle(…

Done! 😊
