In [None]:
!pip install praw -q
import praw
import getpass
import pandas as pd
import json
import os
import requests
import shutil

from datetime import datetime

In [3]:
with open(os.path.join('/home/jovyan/reddit/', 'env.json')) as secrets_file:
    secrets = json.load(secrets_file)
    
def get_secret(setting, secrets=secrets):
    """Get secret setting or fail with ImproperlyConfigured"""
    try:
        return secrets[setting]
    except KeyError:
        raise ImproperlyConfigured("Set the {} setting".format(setting))

In [4]:
reddit_read_only = praw.Reddit(client_id=get_secret('client_id'),       # your client id
                               client_secret=get_secret('client_secret'),     # your client secret
                               user_agent=get_secret('user_agent'))     # your user agent

In [None]:
subreddit = reddit_read_only.subreddit("battlestations")

posts = subreddit.top("year")
 
posts_dict = {"Title": [],
              "ID": [], "Upvotes": [],
              "Total Comments": [], "Post URL": []
              }
 
for post in posts:
    # Title of each post
    posts_dict["Title"].append(post.title)
     
    # Unique ID of each post
    posts_dict["ID"].append(post.id)
     
    # The score of a post
    posts_dict["Upvotes"].append(post.score)
     
    # Total number of comments inside the post
    posts_dict["Total Comments"].append(post.num_comments)
     
    # URL of each post
    posts_dict["Post URL"].append(post.url)
 
# Saving the data in a pandas dataframe
top_posts = pd.DataFrame(posts_dict)
top_posts

In [None]:
current_directory = os.getcwd()
new_dir = "pics-" + datetime.now().strftime("%m%d%Y%H%M%S")
final_directory = os.path.join(current_directory, new_dir)
if not os.path.exists(final_directory):
   os.makedirs(final_directory)

for index, row in top_posts.iterrows():
    url =  row['Post URL']
    filename = row['ID']
    
    res = requests.get(url, stream = True)
    if res.status_code == 200:
        with open(final_directory + '/' + filename + '.jpg','wb') as f:
            shutil.copyfileobj(res.raw, f)
    else:
        print(filename + ' couldn\'t be retrieved')

In [18]:
import requests
from datetime import datetime
import traceback
import time
import json
import sys
from IPython.display import clear_output

username = ""  # put the username you want to download in the quotes
subreddit = "battlestations"  # put the subreddit you want to download in the quotes
# leave either one blank to download an entire user's or subreddit's history
# or fill in both to download a specific users history from a specific subreddit

filter_string = None
if username == "" and subreddit == "":
    print("Fill in either username or subreddit")
    sys.exit(0)
elif username == "" and subreddit != "":
    filter_string = f"subreddit={subreddit}"
elif username != "" and subreddit == "":
    filter_string = f"author={username}"
else:
    filter_string = f"author={username}&subreddit={subreddit}"

url = "https://api.pushshift.io/reddit/{}/search?limit=1000&sort=desc&{}&before="

start_time = datetime.utcnow()

posts_dict = {"Title": [],
              "ID": [], "Date": [], "Upvotes": [], "URL": []
              }
posts = pd.DataFrame(posts_dict)



filename = 'data-' + datetime.now().strftime("%m%d%Y%H%M%S") + '.csv'
object_type = "submission"
print(f"Saving {object_type}s to {filename}")

count = 0
previous_epoch = int(start_time.timestamp())
while True:
    new_url = url.format(object_type, filter_string)+str(previous_epoch)
    json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
    time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
    try:
        json_data = json_text.json()
    except json.decoder.JSONDecodeError:
        time.sleep(1)
        continue

    if 'data' not in json_data:
        break
    objects = json_data['data']
    if len(objects) == 0:
        break

    batch_dict = {"Title": [],
          "ID": [], "Date": [], "Upvotes": [], "URL": []
          }    

    for object in objects:
        previous_epoch = object['created_utc'] - 1
        count += 1
        if object_type == 'submission':
            if 'url' not in object:
                print("continuing")
                continue
            try:
                batch_dict["Title"].append(object['title'])
                batch_dict["ID"].append(object['id'])
                batch_dict["Date"].append(datetime.fromtimestamp(object['created_utc']).strftime("%Y-%m-%d %H:%M:%S"))
                batch_dict["Upvotes"].append(object['score'])
                batch_dict["URL"].append(object['url'])

                # handle.write("Title: " + str(object['title']))
                # handle.write("\nID: " + str(object['id']))
                # handle.write("\nScore: " + str(object['score']))
                # handle.write("\nAwards: " + str(object['total_awards_received']))
                # handle.write("\nTime: " + datetime.fromtimestamp(object['created_utc']).strftime("%Y-%m-%d %H:%M:%S"))
                # handle.write("\nURL: " + str(object['url']))
                # handle.write("\n-------------------------------\n")
            except Exception as err:
                print(f"Couldn't print post: {object['url']}")
                print(traceback.format_exc())

    batch = pd.DataFrame(batch_dict)
    posts = posts.append(batch)
    posts.to_csv(filename, index=False)

    clear_output(wait=True)
    print("Saved {} {}s through {}".format(count, object_type, datetime.fromtimestamp(previous_epoch).strftime("%Y-%m-%d")))

print(f"Saved {count} {object_type}s")

Saved 162792 submissions through 2009-12-09
Saved 162792 submissions


In [7]:
posts = pd.read_csv('data-01292022191230.csv')
posts = posts.sort_values(by=['Upvotes'], ascending=False)
header_list = ["Title", "ID", "Date", "Upvotes", "URL", "Status"]
posts = posts.reindex(columns = header_list)
posts

Unnamed: 0,Title,ID,Date,Upvotes,URL,Status
105428,The Gum Ball PC. Tried to be different and fun...,88mrjw,2018-03-31 23:53:50,34077.0,https://i.redd.it/qxw0cetfs6p01.jpg,
79310,I live in a van and this is my battle station!,cetna0,2019-07-18 15:05:17,32530.0,https://i.redd.it/5bh2vhihu2b31.jpg,
82196,I call it...Serenity,bx4csz,2019-06-05 16:10:59,31412.0,https://i.redd.it/oy36njp1bk231.jpg,
107394,Can I join your ranks?,7y1byy,2018-02-16 19:42:51,29146.0,https://i.imgur.com/WFXtOln.jpg,
105988,My Music / Gaming Room,85ikwj,2018-03-19 10:53:54,27462.0,https://i.redd.it/kj4rw6mqapm01.jpg,
...,...,...,...,...,...,...
149945,It's not too much. Just my chilled battlestati...,12mpmd,2012-11-04 21:35:37,0.0,http://imgur.com/a/uEQvy,
149948,Standing and Sitting Battlestation,12mns8,2012-11-04 21:05:32,0.0,http://imgur.com/a/1zCON,
149951,STOP in the name of my station,12mi6f,2012-11-04 19:36:50,0.0,http://i.imgur.com/ZdX7K.jpg,
149954,My own Battlestation (cat included)!,12mb4t,2012-11-04 17:41:19,0.0,http://i896.photobucket.com/albums/ac162/Mikei...,


In [None]:
from os.path import exists
current_directory = os.getcwd()
new_dir = "pics" # -" + datetime.now().strftime("%m%d%Y%H%M%S")
final_directory = os.path.join(current_directory, new_dir)
if not os.path.exists(final_directory):
   os.makedirs(final_directory)

df.sort_values(by=['col1'])

processed = 0
skipped = 0
failed = 0
success = 0
status = ""

for index, row in posts.iterrows():
    url =  row['URL']
    filename = row['ID']
    #print(index)
    
    if row['Status'] is "Failed" or row['Status'] is "BadURL":
        failed += 1
        processed += 1
        status = "skipped"
        print("ping!")
        continue
        
    if exists(final_directory + '/' + filename + '.jpg'):
        # clear_output(wait=True)
        # print("Skipped " + filename + ", URL = " + url)
        skipped += 1
        processed += 1
        status = "skipped"
        continue
        
    if "https://i.redd.it" not in url:
        skipped += 1
        processed += 1  
        posts.at[index,'Status'] = "BadURL"
        status = "bad url"
        continue
    
    res = requests.get(url, stream = True)
    if res.status_code == 200:
        with open(final_directory + '/' + filename + '.jpg','wb') as f:
            shutil.copyfileobj(res.raw, f)
        success += 1
        posts.at[index,'Status'] = "Downloaded"
        status = "downloaded"
        time.sleep(1)
    else:
        failed += 1
        posts.at[index,'Status'] = "Failed"
        status = "failed"
    processed += 1
    clear_output(wait=True)
    total = 162792
    percentage = (processed / total) * 100
    # print(str(processed) + " / 162792 processed, " + str(percentage) + "%")
    # print(str(success) + " successful" + " (" + str((success/total)*100) + "%)")
    # print(str(failed) + " failed" + " (" + str((failed/total)*100) + "%)")
    # print(str(skipped) + " skipped" + " (" + str((skipped/total)*100) + "%)")
    # print(status)

  if row['Status'] is "Failed" or "BadURL":


ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping!
ping

In [55]:
posts

Unnamed: 0,Title,ID,Date,Upvotes,URL,Status
0,My custom built Battlestation,sfod0s,2022-01-29 18:31:44,1.0,https://i.redd.it/14ex9j9l7oe81.jpg,Downloaded
1,My custom built battle station,sfocc4,2022-01-29 18:30:53,1.0,https://i.redd.it/ad6v0ibf7oe81.jpg,BadURL
2,My battlestation finally complete,sfo325,2022-01-29 18:19:12,1.0,https://i.redd.it/vpj9ptv85oe81.jpg,Downloaded
3,Had some additions since the last time I poste...,sfo1kx,2022-01-29 18:17:17,1.0,https://i.redd.it/rg02web05oe81.jpg,Failed
4,Finally got all the kids to pose together,sfo13g,2022-01-29 18:16:38,1.0,https://i.redd.it/17nbdv7o4oe81.jpg,Downloaded
...,...,...,...,...,...,...
37,"Music, Video and Time Killing Machine",acori,2009-12-09 07:41:33,41.0,http://imgur.com/MykTa.jpg,BadURL
38,Computing Power Spectrum (A MechE's home office),acmva,2009-12-09 04:27:35,19.0,http://i.imgur.com/hA9LA.png,Downloaded
39,Music Maker's / College Dude's Battlestation,acmt7,2009-12-09 04:20:21,13.0,http://imgur.com/15d4Z.jpg,BadURL
40,Battlestation Ready,acml6,2009-12-09 03:54:22,11.0,https://www.reddit.com/r/battlestations/commen...,Failed


array([nan])