In [11]:
!pip install praw -q
import praw
import getpass
import pandas as pd
import json
import os
import requests
import shutil

from datetime import datetime

In [3]:
class ImproperlyConfigured(Exception):
    """Raised when a required entry is missing from the secrets file."""


# Credentials live outside the notebook in a JSON file that is read once here.
with open(os.path.join('/home/jovyan/reddit/', 'env.json')) as secrets_file:
    secrets = json.load(secrets_file)


def get_secret(setting, secrets=secrets):
    """Return the value stored under ``setting`` in the secrets mapping.

    Parameters
    ----------
    setting : str
        Key to look up.
    secrets : dict, optional
        Mapping to read from; defaults to the contents of ``env.json``
        loaded when this cell ran.

    Raises
    ------
    ImproperlyConfigured
        If ``setting`` is not present in ``secrets``.
    """
    try:
        return secrets[setting]
    except KeyError:
        # The KeyError itself adds nothing beyond the message below.
        raise ImproperlyConfigured("Set the {} setting".format(setting)) from None

In [4]:
# Only the three application credentials are passed to PRAW; each one is
# looked up by name in the secrets file.
_credentials = {
    key: get_secret(key)
    for key in ('client_id', 'client_secret', 'user_agent')
}
reddit_read_only = praw.Reddit(**_credentials)

In [None]:
subreddit = reddit_read_only.subreddit("battlestations")

# The time filter is passed by keyword: PRAW 7 deprecates positional
# arguments for listing methods such as top().
posts = subreddit.top(time_filter="year")

# Output column name -> submission attribute it is read from.
POST_FIELDS = {
    "Title": "title",                  # title of the post
    "ID": "id",                        # unique ID of the post
    "Upvotes": "score",                # score of the post
    "Total Comments": "num_comments",  # number of comments on the post
    "Post URL": "url",                 # URL the post links to
}

posts_dict = {column: [] for column in POST_FIELDS}
for post in posts:
    for column, attribute in POST_FIELDS.items():
        posts_dict[column].append(getattr(post, attribute))

# Saving the data in a pandas dataframe
top_posts = pd.DataFrame(posts_dict)
top_posts

In [None]:
# Each run downloads into its own timestamped folder under the working directory.
current_directory = os.getcwd()
new_dir = "pics-" + datetime.now().strftime("%m%d%Y%H%M%S")
final_directory = os.path.join(current_directory, new_dir)
os.makedirs(final_directory, exist_ok=True)

for index, row in top_posts.iterrows():
    url = row['Post URL']
    filename = row['ID']
    target_path = os.path.join(final_directory, filename + '.jpg')

    try:
        # The context manager releases the connection when the block exits;
        # the timeout keeps one stalled server from hanging the whole loop.
        with requests.get(url, stream=True, timeout=20) as res:
            if res.status_code != 200:
                print(filename + ' couldn\'t be retrieved')
                continue
            # Without this, res.raw hands back the still-compressed bytes when
            # the server applies a Content-Encoding such as gzip.
            res.raw.decode_content = True
            with open(target_path, 'wb') as f:
                shutil.copyfileobj(res.raw, f)
    except requests.RequestException:
        # Connection errors and timeouts are reported like any other failed
        # download instead of aborting the remaining posts.
        print(filename + ' couldn\'t be retrieved')

In [18]:
import requests
from datetime import datetime
import traceback
import time
import json
import sys
from IPython.display import clear_output

from datetime import timezone  # needed for the timezone-aware "now" below

username = ""  # put the username you want to download in the quotes
subreddit = "battlestations"  # put the subreddit you want to download in the quotes
# leave either one blank to download an entire user's or subreddit's history
# or fill in both to download a specific users history from a specific subreddit

REQUEST_TIMEOUT = 20  # seconds to wait for a single Pushshift response
MAX_RETRIES = 5       # consecutive failed requests tolerated before giving up
COLUMNS = ["Title", "ID", "Date", "Upvotes", "URL"]

if username == "" and subreddit == "":
    print("Fill in either username or subreddit")
    sys.exit(0)

# Build "author=...", "subreddit=..." or both joined with "&".
filter_parts = []
if username != "":
    filter_parts.append(f"author={username}")
if subreddit != "":
    filter_parts.append(f"subreddit={subreddit}")
filter_string = "&".join(filter_parts)

url = "https://api.pushshift.io/reddit/{}/search?limit=1000&sort=desc&{}&before="

# Timezone-aware on purpose: .timestamp() on the naive value returned by
# datetime.utcnow() is interpreted as local time, which shifts the starting
# epoch by the machine's UTC offset.
start_time = datetime.now(timezone.utc)

posts_dict = {column: [] for column in COLUMNS}
posts = pd.DataFrame(posts_dict)

filename = 'data-' + datetime.now().strftime("%m%d%Y%H%M%S") + '.csv'
object_type = "submission"
print(f"Saving {object_type}s to {filename}")

count = 0
failed_requests = 0
batches = []  # one DataFrame per page of results that produced rows
previous_epoch = int(start_time.timestamp())
while True:
    # Page backwards in time: each request asks for items older than the
    # oldest one seen so far.
    new_url = url.format(object_type, filter_string) + str(previous_epoch)
    try:
        response = requests.get(
            new_url,
            headers={'User-Agent': "Post downloader by /u/Watchful1"},
            timeout=REQUEST_TIMEOUT,
        )
        time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
        # ValueError covers every JSON decode error class requests may raise.
        json_data = response.json()
    except (requests.RequestException, ValueError):
        failed_requests += 1
        if failed_requests >= MAX_RETRIES:
            # Previously this retried forever when the service kept
            # answering with something that is not JSON.
            print(f"Giving up after {failed_requests} consecutive failed requests")
            break
        time.sleep(1)
        continue
    failed_requests = 0

    if 'data' not in json_data:
        break
    objects = json_data['data']
    if len(objects) == 0:
        break

    batch_rows = []
    for submission in objects:
        previous_epoch = submission['created_utc'] - 1
        count += 1
        if object_type == 'submission':
            if 'url' not in submission:
                print("continuing")
                continue
            try:
                # Build the complete row before storing it, so a missing
                # field drops the whole row instead of leaving the columns
                # with different lengths.
                batch_rows.append({
                    "Title": submission['title'],
                    "ID": submission['id'],
                    "Date": datetime.fromtimestamp(submission['created_utc']).strftime("%Y-%m-%d %H:%M:%S"),
                    "Upvotes": submission['score'],
                    "URL": submission['url'],
                })
            except Exception:
                print(f"Couldn't print post: {submission['url']}")
                print(traceback.format_exc())

    if batch_rows:
        batch = pd.DataFrame(batch_rows, columns=COLUMNS)
        # Append only the new rows to the CSV; the header is written with the
        # first batch. This replaces rewriting the whole file on every page.
        batch.to_csv(filename, mode="a", header=not batches, index=False)
        batches.append(batch)
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        # Rebuilding here keeps `posts` current if the cell is interrupted.
        posts = pd.concat(batches, ignore_index=True)

    clear_output(wait=True)
    print("Saved {} {}s through {}".format(count, object_type, datetime.fromtimestamp(previous_epoch).strftime("%Y-%m-%d")))

print(f"Saved {count} {object_type}s")

Saved 162792 submissions through 2009-12-09
Saved 162792 submissions


In [64]:
# Reload the download checkpoint written by posts.to_csv("posts.csv").
posts = pd.read_csv('posts.csv', dtype={'Title': 'str',
                                        'ID': 'str',
                                        'Date': 'str',
                                        'Upvotes': 'int',
                                        'URL': 'str',
                                        'Status': 'str'
                                       })
# The first column is the old index that to_csv saved alongside the data.
posts = posts.iloc[:, 1:]

# Keep only images hosted on i.redd.it.
#   regex=False: match the text literally (otherwise "." matches any character)
#   na=False:    rows without a URL are excluded instead of making the mask fail
is_reddit_image = posts['URL'].str.contains("https://i.redd.it/", regex=False, na=False)
posts = posts[is_reddit_image].reset_index(drop=True)
posts

Unnamed: 0,Title,ID,Date,Upvotes,URL,Status
0,The Gum Ball PC. Tried to be different and fun...,88mrjw,2018-03-31 23:53:50,34077,https://i.redd.it/qxw0cetfs6p01.jpg,Downloaded
1,I live in a van and this is my battle station!,cetna0,2019-07-18 15:05:17,32530,https://i.redd.it/5bh2vhihu2b31.jpg,Downloaded
2,I call it...Serenity,bx4csz,2019-06-05 16:10:59,31412,https://i.redd.it/oy36njp1bk231.jpg,Downloaded
3,My Music / Gaming Room,85ikwj,2018-03-19 10:53:54,27462,https://i.redd.it/kj4rw6mqapm01.jpg,Downloaded
4,Confined to bed after surgury - built this in ...,8pkdt1,2018-06-08 14:37:20,23885,https://i.redd.it/lrp3ezjags211.jpg,Downloaded
...,...,...,...,...,...,...
95615,Different vibe of my set up,d9ugxm,2019-09-27 03:07:27,0,https://i.redd.it/lx1yr66bz1p31.jpg,
95616,Blendtec Total Classic Original Blender - Wild...,d9k0ll,2019-09-26 13:59:38,0,https://i.redd.it/6b5dn1jo2yo31.jpg,
95617,My battlestation with rgb and philips hue.,d9kudu,2019-09-26 15:03:29,0,https://i.redd.it/z8hvcic5eyo31.jpg,
95618,Living on campus in the year 2019...,d9lua5,2019-09-26 16:16:12,0,https://i.redd.it/f5pfhqf4ryo31.jpg,


In [73]:
import pandas as pd
# Load the raw Pushshift export and keep the direct imgur image links,
# highest-scoring posts first.
posts2 = pd.read_csv('data-01292022191230.csv', dtype={'Title': 'str',
                                        'ID': 'str',
                                        'Date': 'str',
                                        'Upvotes': 'int',
                                        'URL': 'str'
                                       })
posts2 = posts2.sort_values(by=['Upvotes'], ascending=False)

# reindex adds the "Status" column (all missing), since the export lacks it.
header_list = ["Title", "ID", "Date", "Upvotes", "URL", "Status"]
posts2 = posts2.reindex(columns=header_list)

#   regex=False: match the text literally (otherwise "." matches any character)
#   na=False:    rows without a URL are excluded instead of making the mask fail
is_imgur_image = posts2['URL'].str.contains("i.imgur.com", regex=False, na=False)
posts2 = posts2[is_imgur_image].reset_index(drop=True)
posts2

# http*://imgur.com domains = 39445
# http*://i.imgur.com domains = 16295
# total = 55740

Unnamed: 0,Title,ID,Date,Upvotes,URL,Status
0,Can I join your ranks?,7y1byy,2018-02-16 19:42:51,29146,https://i.imgur.com/WFXtOln.jpg,
1,Update: My Brooklyn Battlestation,b1s9fi,2019-03-16 13:09:06,23215,https://i.imgur.com/SekRb37.jpg,
2,Last LAN before I'm a Dad!,nyycak,2021-06-13 15:09:40,22052,https://i.imgur.com/wGoGyJe.jpg,
3,Pretty happy with how this turned out,6jieb8,2017-06-26 02:21:13,16249,http://i.imgur.com/pQz53qW.jpg,
4,Ultrawides in an Old Cotton Mill,6iwlqy,2017-06-22 21:18:59,12452,http://i.imgur.com/7N5NVE4.jpg,
...,...,...,...,...,...,...
16290,It gets the job done,53096b,2016-09-16 04:22:04,0,http://i.imgur.com/ZMOgDEu.jpg,
16291,"Server, Mac, BSD, other Mac, and Win gaming rig",52fu4t,2016-09-12 18:27:56,0,http://i.imgur.com/abIGMOy.jpg,
16292,Temp Command Center,52fvs3,2016-09-12 18:35:54,0,http://i.imgur.com/1p83ckh.jpg,
16293,My Battlestation!,12p91b,2012-11-06 01:07:54,0,http://i.imgur.com/tTDtV.jpg,


In [None]:
!pip install wget -q
import wget
import time
import traceback
import sys
from os.path import exists
from IPython.display import clear_output
from datetime import datetime

# Download every post's image with wget into ./pics, recording the outcome in
# the "Status" column of `posts`. The folder name is fixed so a rerun finds
# the files fetched earlier.
current_directory = os.getcwd()
new_dir = "pics"
final_directory = os.path.join(current_directory, new_dir)
os.makedirs(final_directory, exist_ok=True)

processed = 0
skipped = 0
failed = 0
success = 0
# Percentages are relative to the frame actually being processed.
total = len(posts)

for index, row in posts.iterrows():
    # Read this row's URL and ID; previously `url` and `filename` were never
    # assigned here and held whatever another cell had left behind.
    url = row['URL']
    filename = row['ID']
    target_path = os.path.join(final_directory, filename + '.jpg')

    # check if the file exists already
    if exists(target_path):
        success += 1
        processed += 1
        posts.at[index, 'Status'] = "Downloaded"
        clear_output(wait=True)
        print("Skipped " + row['ID'] + " (" + str(processed) + ")")
        time.sleep(0.05)
        continue

    # check if previously failed
    if row['Status'] == "Failed":
        failed += 1
        processed += 1
        continue

    try:
        # Save under the same path the exists-check above looks for.
        wget.download(url, target_path)
        success += 1
        posts.at[index, 'Status'] = "Downloaded"
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt
        # stops the loop instead of being recorded as a failed download.
        failed += 1
        posts.at[index, 'Status'] = "Failed"

    processed += 1
    percentage = (processed / total) * 100

    # Periodic checkpoint of the Status column.
    if (processed % 100) == 0:
        posts.to_csv("posts.csv")

    clear_output(wait=True)
    print(str(processed) + " / " + str(total) + " processed, " + str(percentage) + "%")
    print(str(success) + " successful" + " (" + str((success/total)*100) + "%)")
    print(str(failed) + " failed" + " (" + str((failed/total)*100) + "%)")
    print(str(skipped) + " skipped" + " (" + str((skipped/total)*100) + "%)")
    print("Index: " + str(index))
        
    

Skipped c822t9 (3108)


In [69]:
from os.path import exists
import time
from IPython.display import clear_output
import requests
from datetime import datetime
import traceback
import time
import json
import sys

def _fetch_image(url, target_path, timeout=20):
    """Stream ``url`` into ``target_path``.

    Returns True when the server answers 200 and the body was written,
    False for any other status code. Exceptions from ``requests.get`` and
    from the copy propagate to the caller.
    """
    with requests.get(url, stream=True, timeout=timeout) as res:
        if res.status_code != 200:
            return False
        # Without this, res.raw hands back the still-compressed bytes when
        # the server applies a Content-Encoding such as gzip.
        res.raw.decode_content = True
        try:
            with open(target_path, 'wb') as f:
                shutil.copyfileobj(res.raw, f)
        except BaseException:
            # Includes KeyboardInterrupt: a truncated file must not survive,
            # because the next run treats any existing file as downloaded.
            if exists(target_path):
                os.remove(target_path)
            raise
    return True


# Download images with requests into ./pics, recording each outcome in the
# "Status" column of `posts` so an interrupted run can be resumed.
current_directory = os.getcwd()
new_dir = "pics"
final_directory = os.path.join(current_directory, new_dir)
os.makedirs(final_directory, exist_ok=True)

# Only direct image hosts are fetched.
ALLOWED_HOSTS = ("://i.redd.it", "://i.imgur.com")

processed = 0
skipped = 0
failed = 0
success = 0
exceptions = 0
# Percentages are relative to the frame actually being processed.
total = len(posts)

for index, row in posts.iterrows():
    url = row['URL']
    filename = row['ID']
    target_path = os.path.join(final_directory, filename + '.jpg')

    # Rows settled by an earlier run are tallied but not requested again.
    if row['Status'] == "Failed":
        failed += 1
        processed += 1
        continue

    if row['Status'] == "BadURL":
        skipped += 1
        processed += 1
        continue

    if row['Status'] == "Downloaded":
        success += 1
        processed += 1
        continue

    # A file already on disk counts as downloaded.
    if exists(target_path):
        success += 1
        processed += 1
        continue

    if not any(host in url for host in ALLOWED_HOSTS):
        skipped += 1
        processed += 1
        posts.at[index, 'Status'] = "BadURL"
        continue

    try:
        downloaded = _fetch_image(url, target_path)
    except requests.RequestException:
        # Narrow on purpose: a bare except would also swallow Ctrl-C.
        # "Exception" rows are not in the skip list above, so a later run
        # retries them.
        skipped += 1
        processed += 1
        exceptions += 1
        posts.at[index, 'Status'] = "Exception"
        continue

    if downloaded:
        success += 1
        posts.at[index, 'Status'] = "Downloaded"
    else:
        failed += 1
        posts.at[index, 'Status'] = "Failed"

    processed += 1
    percentage = (processed / total) * 100

    clear_output(wait=True)
    print(str(processed) + " / " + str(total) + " processed, " + str(percentage) + "%")
    print(str(success) + " successful" + " (" + str((success/total)*100) + "%)")
    print(str(failed) + " failed" + " (" + str((failed/total)*100) + "%)")
    print(str(skipped) + " skipped" + " (" + str((skipped/total)*100) + "%)")
    print("Exceptions: " + str(exceptions))
    print("Index: " + str(index))

    time.sleep(1)  # pause between requests
    # Periodic checkpoint of the Status column.
    if (index % 100 == 0):
        posts.to_csv("posts.csv")

8850 / 95620 processed, 9.25538590253085%
7516 successful (7.860280276092868%)
1329 failed (1.3898765948546328%)
5 skipped (0.005229031583350764%)
Exceptions: 5
Index: 8849


KeyboardInterrupt: 

# Write the current `posts` frame (index included) to posts.csv, so the Status
# values set by the download loop are kept after the loop was interrupted.
posts.to_csv("posts.csv")