In [2]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import difflib


## 1/2. Scraping the hot topics to choose from

In [3]:
# Root URL of the site being scraped. The trailing slash matters: a later cell
# builds the topic feed URL by appending "t/<topic>" directly to this string.
base_url = "https://www.reddit.com/"


In [4]:
# Download the landing page and parse it into a BeautifulSoup tree.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error (e.g. a blocked or rate-limited
# request) right here instead of as an empty topic table further down.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
html = response.content
soup = bs(html, "html.parser")

In [5]:
# Maps each parent topic name to the list of its subtopic slugs.
topic_dict = {}

# Every parent topic is a <details> element inside the #TOPICS section.
topic_parents = soup.select('#TOPICS faceplate-expandable-section-helper > details')

for topic_parent in topic_parents:
    # The parent topic name is read from the aria-controls attribute of its <summary>.
    summary = topic_parent.find('summary')
    topic = summary.get('aria-controls') if summary is not None else None
    if not topic:
        # No usable name to key the entry on: skip this node instead of raising.
        continue

    subtopics = topic_parent.select('ul left-nav-topic-tracker')

    # Collect the 'topic' attribute of each tracker, ignoring trackers without one.
    topic_dict[topic] = [
        subtopic['topic'] for subtopic in subtopics if subtopic.has_attr('topic')
    ]
    

In [6]:
# One column per parent topic, one subtopic per row; parents with fewer
# subtopics are padded with '-' so the table stays rectangular.
topics_df = (
    pd.DataFrame.from_dict(topic_dict, orient='index')
    .transpose()
    .fillna('-')
)
print("Available Options \n")
topics_df

Available Options 



Unnamed: 0,Internet Culture (Viral),Games,Q&As,Technology,Pop Culture,Movies & TV
0,amazing,action_games,q_and_as,3d_printing,celebrities,action_movies_and_series
1,animals_and_pets,adventure_games,stories_and_confessions,artificial_intelligence_and_machine_learning,creators_and_influencers,animated_movies_and_series
2,cringe_and_facepalm,esports,-,computers_and_hardware,generations_and_nostalgia,comedy_movies_and_series
3,funny,gaming_consoles_and_gear,-,consumer_electronics,podcasts,crime_mystery_and_thriller_movies_and_series
4,interesting,gaming_news_and_discussion,-,diy_electronics,streamers,documentary_movies_and_series
5,memes,mobile_games,-,programming,tarot_and_astrology,drama_movies_and_series
6,oddly_satisfying,other_games,-,software_and_apps,-,fantasy_movies_and_series
7,reddit_meta,role_playing_games,-,streaming_services,-,horror_movies_and_series
8,wholesome_and_heartwarming,simulation_games,-,tech_news_and_discussion,-,movie_news_and_discussion
9,-,sports_and_racing_games,-,virtual_and_augmented_reality,-,reality_tv


In [7]:
# Every distinct cell of the topic table, leaving out the '-' padding placeholder.
possible_values = {
    cell for cell in topics_df.to_numpy().ravel().tolist() if cell != "-"
}
print(possible_values)

{'3d_printing', 'horror_movies_and_series', 'drama_movies_and_series', 'memes', 'programming', 'reddit_meta', 'tech_news_and_discussion', 'computers_and_hardware', 'q_and_as', 'documentary_movies_and_series', 'reality_tv', 'amazing', 'generations_and_nostalgia', 'streaming_services', 'action_movies_and_series', 'interesting', 'gaming_news_and_discussion', 'animals_and_pets', 'podcasts', 'tv_news_and_discussion', 'wholesome_and_heartwarming', 'role_playing_games', 'action_games', 'superhero_movies_and_series', 'tarot_and_astrology', 'cringe_and_facepalm', 'adventure_games', 'gaming_consoles_and_gear', 'funny', 'oddly_satisfying', 'scifi_movies_and_series', 'mobile_games', 'stories_and_confessions', 'esports', 'diy_electronics', 'consumer_electronics', 'software_and_apps', 'strategy_games', 'movie_news_and_discussion', 'comedy_movies_and_series', 'sports_and_racing_games', 'celebrities', 'fantasy_movies_and_series', 'streamers', 'creators_and_influencers', 'other_games', 'tabletop_games'

In [8]:
def find_closest_topic(input_topic, possibilities=None, n=3, cutoff=0.5):
    """Suggest known topic slugs that resemble ``input_topic``.

    Args:
        input_topic: Text typed by the user.
        possibilities: Iterable of candidate slugs. Defaults to the
            module-level ``possible_values``.
        n: Maximum number of suggestions to return.
        cutoff: Minimum similarity (0..1) a candidate must reach.

    Returns:
        A list of up to ``n`` candidates, best match first, or ``None`` when
        no candidate reaches ``cutoff``.
    """
    if possibilities is None:
        possibilities = possible_values

    # difflib compares characters exactly, so bring the query into the same
    # shape as the scraped slugs (lowercase, underscores instead of spaces)
    # before matching; otherwise "Action Games" scores poorly against
    # "action_games".
    query = input_topic.strip().lower().replace(" ", "_")

    # get_close_matches is documented to take a list of candidates.
    closest_match = difflib.get_close_matches(query, sorted(possibilities), n=n, cutoff=cutoff)

    return closest_match if closest_match else None

In [9]:
# With no scraped topics no answer could ever be valid and the loop below
# would prompt forever, so fail loudly instead.
if not possible_values:
    raise RuntimeError('No topics were scraped; re-run the scraping cells before choosing a topic.')

# Surrounding whitespace is stripped so that e.g. "amazing " is accepted.
input_topic = input('Enter the topic to scrape for').strip()

# Keep asking until the answer is exactly one of the scraped topic slugs,
# offering the closest known slugs as a hint when there are any.
while input_topic not in possible_values:
    closest_topics = find_closest_topic(input_topic)

    if closest_topics is not None:
        input_topic = input(f'Enter valid topic name. Did you mean? {closest_topics}').strip()
    else:
        input_topic = input('Enter valid topic name.').strip()


In [11]:
# Echo the topic that passed the validation loop in the previous cell.
input_topic

'amazing'

## 2/2. Scraping the content in the chosen topic:
TODO:
- [x] Obtain videos and images with their OP info
- [x] Store in a DataFrame and export to CSV
- [ ] Fix bad src links
- [ ] Fix untimely breaking of scroll
- [ ] Download media and include a path in the csv

In [24]:
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time



In [25]:
# Scrape settings.
# `items` starts out empty; none of the cells visible in this notebook read it.
items = []
# Number of posts to collect before the harvesting loops stop.
target_count = 90
# Feed URL of the chosen topic: <base_url>t/<topic>.
scrape_target_url = f"{base_url}t/{input_topic}"


In [26]:
# Start a Chrome WebDriver session; the cells below drive the page through this handle.
driver = webdriver.Chrome()

In [27]:
# Load the feed of the chosen topic in the browser session.
driver.get(scrape_target_url)

In [28]:
# Baseline page height in pixels; the scroll loop compares later readings
# against it to tell whether the page has grown.
previous_height = driver.execute_script("return document.body.scrollHeight;")
previous_height

14514

In [29]:
# Accumulator for the scraped posts, one dict per post; filled by get_available_content().
content_dict_list = []

In [30]:
def get_available_content():
    """Harvest the posts currently present in the feed into ``content_dict_list``.

    Every ``<article>`` of the topic feed is read through its ``shreddit-post``
    element. Posts that were already collected are skipped, new ones are
    appended, and collection stops as soon as ``target_count`` posts are stored.
    For image and video posts the ``src`` of the media element is stored under
    ``'source'``; it is ``None`` for other post types or when the media element
    cannot be found.

    Relies on the module-level ``driver``, ``content_dict_list`` and
    ``target_count``. Returns ``None`` and prints the running total after each
    newly stored post.
    """
    # CSS path, relative to the shreddit-post element, of the tag that carries
    # the media URL for each supported post type.
    media_selectors = {
        'image': 'div.relative > shreddit-aspect-ratio > shreddit-media-lightbox-listener > div > img',
        'video': 'div.relative > shreddit-aspect-ratio > shreddit-async-loader > media-telemetry-observer > shreddit-player',
    }

    def identity(post):
        # Score, comment count and media URL can differ between two reads of
        # the same post, so duplicates are detected on the remaining fields.
        # NOTE(review): assumes title + subreddit + author + type identify a post.
        return (post.get('title'), post.get('subreddit_id'), post.get('author'), post.get('type'))

    seen = {identity(post) for post in content_dict_list}

    elements = driver.find_elements(By.XPATH, '//*[@id="topic-tabs"]/section[1]/shreddit-feed/article')

    for element in elements:
        if len(content_dict_list) >= target_count:
            return

        # find_elements returns an empty list instead of raising, so an article
        # without a shreddit-post child is skipped rather than aborting the pass.
        info_tags = element.find_elements(By.TAG_NAME, 'shreddit-post')
        if not info_tags:
            continue
        info_tag = info_tags[0]

        content_type = info_tag.get_attribute('post-type')

        src = None
        selector = media_selectors.get(content_type)
        if selector is not None:
            media_tags = info_tag.find_elements(By.CSS_SELECTOR, selector)
            if media_tags:
                src = media_tags[0].get_attribute('src')

        content_dict = {
            'title': info_tag.get_attribute('post-title'),
            'subreddit_id': info_tag.get_attribute('subreddit-id'),
            'subreddit': info_tag.get_attribute('subreddit-prefixed-name'),
            'author': info_tag.get_attribute('author'),
            'upvotes': info_tag.get_attribute('score'),
            'comments': info_tag.get_attribute('comment-count'),
            'type': content_type,
            'source': src
        }

        key = identity(content_dict)
        if key in seen:
            # Already collected on an earlier pass: keep going, because newer
            # posts sit further down the same list.
            continue

        seen.add(key)
        content_dict_list.append(content_dict)

        print (len(content_dict_list))
            

# Walk down the feed one viewport at a time, harvesting posts after every step,
# until `target_count` posts are collected or the page stops yielding anything new.
max_stalled_rounds = 3   # consecutive rounds without any progress tolerated before giving up
stalled_rounds = 0
previous_offset = driver.execute_script("return window.pageYOffset;")

while target_count > len(content_dict_list):

    # wait for some time so lazily loaded posts and media can render
    time.sleep(5)

    # get all the available content
    collected_before = len(content_dict_list)
    get_available_content()

    # Scroll relative to the current position. A jump to a fixed fraction of
    # the page height would stop moving as soon as the height stops changing.
    driver.execute_script("window.scrollBy(0, window.innerHeight);")

    time.sleep(5)
    new_height = driver.execute_script("return document.body.scrollHeight")
    new_offset = driver.execute_script("return window.pageYOffset;")

    # A round counts as progress when posts were added, the page grew, or the
    # viewport actually moved; a single unchanged height is not enough to stop.
    progressed = (
        len(content_dict_list) > collected_before
        or new_height != previous_height
        or new_offset != previous_offset
    )
    previous_height = new_height
    previous_offset = new_offset

    if progressed:
        stalled_rounds = 0
        continue

    stalled_rounds += 1
    if stalled_rounds >= max_stalled_rounds:
        # Nothing new, no growth and no movement several times in a row:
        # the end of the feed has been reached.
        print('breaking')
        break

In [None]:
from selenium.webdriver.common.keys import Keys

# Keyboard-driven harvesting pass: press PAGE_DOWN and collect after each press
# until `target_count` posts are stored. The pass gives up after
# `max_idle_rounds` consecutive presses that add no post, so it cannot spin
# forever once the feed has nothing more to offer.
max_idle_rounds = 20
idle_rounds = 0

while target_count > len(content_dict_list) and idle_rounds < max_idle_rounds:
    time.sleep(3)

    collected_before = len(content_dict_list)
    get_available_content()
    idle_rounds = 0 if len(content_dict_list) > collected_before else idle_rounds + 1

    # Look <body> up on every round so a re-rendered page cannot leave a stale handle.
    body = driver.find_element(By.TAG_NAME, 'body')
    body.send_keys(Keys.PAGE_DOWN)



In [32]:
# One row per collected post; the columns are the keys of the post dicts.
content_df = pd.DataFrame(content_dict_list)
content_df

Unnamed: 0,title,subreddit_id,subreddit,author,upvotes,comments,type,source
0,Dude makes aquaman look like a regular mermaid,t5_m0bnr,r/nextfuckinglevel,Agitated_Ad_1095,9888,225,video,https://packaged-media.redd.it/fx2gtspphc6d1/p...
1,True definition of trust the process,t5_363r3,r/BeAmazed,Remote_Return1716,3336,71,video,https://v.redd.it/15huvf874d6d1/HLSPlaylist.m3...
2,This girl's flips are definitely top talent,t5_363r3,r/BeAmazed,reflective_map21,7594,185,video,https://packaged-media.redd.it/0ngywg4swb6d1/p...
3,This amazing converted campervan is bigger tha...,t5_363r3,r/BeAmazed,Majoodeh,2447,117,video,https://v.redd.it/cvvwhzbh8c6d1/HLSPlaylist.m3...
4,"Chaser: The Smartest Dog in the World, Who Cou...",t5_m0bnr,r/nextfuckinglevel,Mad_Bulls_007,1363,49,video,https://v.redd.it/9q8pct7trc6d1/HLSPlaylist.m3...
...,...,...,...,...,...,...,...,...
85,The art of Cinematography,t5_363r3,r/BeAmazed,Ultimate_Kurix,1371,12,video,https://packaged-media.redd.it/zhekp1j1x56d1/p...
86,Steam through a prism refracted sunbeam matchi...,t5_363r3,r/BeAmazed,Natashaalovelyy,156,11,video,https://packaged-media.redd.it/oag19gg7d96d1/p...
87,Japanese Firefighters training,t5_m0bnr,r/nextfuckinglevel,Parasyte-vn,30240,491,video,https://packaged-media.redd.it/oywy19clb26d1/p...
88,Absolute unit of a cow stands over 6ft tall,t5_363r3,r/BeAmazed,Sunnyudd,2229,89,video,https://packaged-media.redd.it/9j4y0ykkx46d1/p...


In [33]:
# Write the table to '<topic>.csv', relative to the current working directory.
# to_csv is called with its defaults, so the row index is written as the first column.
content_df.to_csv(f'{input_topic}.csv')