In [56]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import difflib


## 1/2. Scraping the hot topics to choose from

In [57]:
base_url = "https://www.reddit.com/"


In [58]:
response= requests.get(base_url)
html = response.content
soup = bs(html, "html.parser")

In [59]:
topic_dict={}

#get all parent topics
topic_parents = soup.select('#TOPICS faceplate-expandable-section-helper > details')

#for all parent topics, find the parent topic name and list of subtopics
for topic_parent in topic_parents:
    #parent topic name
    topic = topic_parent.find('summary')['aria-controls']

    subtopics = topic_parent.select(f'ul left-nav-topic-tracker') 

    topic_dict[topic]= [subtopic['topic'] for subtopic in subtopics] #get the 'topic' attribute from each selected subtopics
    

In [60]:
topics_df= pd.DataFrame.from_dict(topic_dict, orient='index')
topics_df= topics_df.T.fillna('-')
print("Available Options \n")
topics_df

Available Options 



Unnamed: 0,Internet Culture (Viral),Games,Q&As,Technology,Pop Culture,Movies & TV
0,amazing,action_games,q_and_as,3d_printing,celebrities,action_movies_and_series
1,animals_and_pets,adventure_games,stories_and_confessions,artificial_intelligence_and_machine_learning,creators_and_influencers,animated_movies_and_series
2,cringe_and_facepalm,esports,-,computers_and_hardware,generations_and_nostalgia,comedy_movies_and_series
3,funny,gaming_consoles_and_gear,-,consumer_electronics,podcasts,crime_mystery_and_thriller_movies_and_series
4,interesting,gaming_news_and_discussion,-,diy_electronics,streamers,documentary_movies_and_series
5,memes,mobile_games,-,programming,tarot_and_astrology,drama_movies_and_series
6,oddly_satisfying,other_games,-,software_and_apps,-,fantasy_movies_and_series
7,reddit_meta,role_playing_games,-,streaming_services,-,horror_movies_and_series
8,wholesome_and_heartwarming,simulation_games,-,tech_news_and_discussion,-,movie_news_and_discussion
9,-,sports_and_racing_games,-,virtual_and_augmented_reality,-,reality_tv


In [61]:
possible_values = set(topics_df.values.flatten().tolist())
possible_values.discard("-")
print(possible_values)

{'streaming_services', 'movie_news_and_discussion', 'consumer_electronics', 'role_playing_games', 'software_and_apps', 'programming', 'crime_mystery_and_thriller_movies_and_series', 'simulation_games', 'mobile_games', 'creators_and_influencers', 'documentary_movies_and_series', 'diy_electronics', 'comedy_movies_and_series', 'horror_movies_and_series', 'reality_tv', 'reddit_meta', 'streamers', 'animals_and_pets', 'drama_movies_and_series', 'esports', 'romance_movies_and_series', 'q_and_as', 'memes', 'funny', 'celebrities', 'oddly_satisfying', '3d_printing', 'tabletop_games', 'generations_and_nostalgia', 'scifi_movies_and_series', 'animated_movies_and_series', 'fantasy_movies_and_series', 'artificial_intelligence_and_machine_learning', 'podcasts', 'gaming_consoles_and_gear', 'amazing', 'wholesome_and_heartwarming', 'computers_and_hardware', 'action_movies_and_series', 'adventure_games', 'tarot_and_astrology', 'sports_and_racing_games', 'strategy_games', 'interesting', 'tv_news_and_discus

In [62]:
def find_closest_topic(input_topic):
    closest_match = difflib.get_close_matches(input_topic, possible_values, n=3, cutoff=0.5)

    return closest_match if closest_match else None

In [63]:
input_topic = input('Enter the topic to scrape for')

while input_topic not in possible_values:
    closest_topics = find_closest_topic(input_topic)

    if closest_topics is not None:
        input_topic = input(f'Enter valid topic name. Did you mean? {closest_topics}')
    else:
        input_topic = input('Enter valid topic name.')


In [64]:
input_topic

'amazing'

## 2/2. Scraping the content in chosen topic:
TODO:
- [x] Obtain videos and images with their OP info
- [x] Store in dataFrame and export to csv
- [ ] Fix bad src links
- [ ] Fix untimely breaking of scroll
- [ ] Download media and include a path in the csv

In [65]:
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time



In [66]:
scrape_target_url = base_url + "t/" + input_topic
target_count = 90
items = []


In [67]:
driver = webdriver.Chrome()

In [69]:
driver.get(scrape_target_url)

In [70]:
scrollable_height = driver.execute_script("return document.body.scrollHeight;")
scrollable_height

14483

In [71]:
content_dict_list = []

In [76]:
def get_available_content():
    global target_count
    elements = driver.find_elements(By.XPATH, '//*[@id="topic-tabs"]/section[1]/shreddit-feed/article')
    print(elements)

    for i in range(len(elements)):
        content_dict= {}
        info_tag = elements[i].find_element(By.TAG_NAME, 'shreddit-post')

        content_type = info_tag.get_attribute('post-type')

        if content_type == 'image':
                
            img_tag = info_tag.find_elements(By.CSS_SELECTOR, 'div.relative > shreddit-aspect-ratio > shreddit-media-lightbox-listener > div > img')
            img_src = img_tag[0].get_attribute('src')
            src = img_src

        elif content_type == 'video':
            video_tag = info_tag.find_element(By.CSS_SELECTOR, 'div.relative > shreddit-aspect-ratio > shreddit-async-loader > media-telemetry-observer > shreddit-player')
            video_src = video_tag.get_attribute('src')
            src = video_src

        else:
            src = None
            

        content_dict = {
            'title': info_tag.get_attribute('post-title'),
            'subreddit_id': info_tag.get_attribute('subreddit-id'),
            'subreddit': info_tag.get_attribute('subreddit-prefixed-name'),
            'author': info_tag.get_attribute('author'),
            'upvotes': info_tag.get_attribute('score'),
            'comments': info_tag.get_attribute('comment-count'),
            'type': info_tag.get_attribute('post-type'),
            'source': src
        }

        if (content_dict not in content_dict_list) and (len(content_dict_list) < target_count):
            content_dict_list.append(content_dict)
        else:
            return
            

In [77]:
while target_count > len(content_dict_list):
    
    # wait for some time or wait until loaded
    time.sleep(5)

    # get all the available content
    get_available_content()

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight /8);")

    time.sleep(5)
    new_height = driver.execute_script("return document.body.scrollHeight /8")
    
    if new_height == scrollable_height:
        print('breaking')
        break

    scrollable_height = new_height

[<selenium.webdriver.remote.webelement.WebElement (session="b058e7da5a2f951dd99fccd813767815", element="f.487ED674AAE73915CFB13B0E56805E03.d.19DE4147C770F8ED03221B1A26094A06.e.393")>, <selenium.webdriver.remote.webelement.WebElement (session="b058e7da5a2f951dd99fccd813767815", element="f.487ED674AAE73915CFB13B0E56805E03.d.19DE4147C770F8ED03221B1A26094A06.e.394")>, <selenium.webdriver.remote.webelement.WebElement (session="b058e7da5a2f951dd99fccd813767815", element="f.487ED674AAE73915CFB13B0E56805E03.d.19DE4147C770F8ED03221B1A26094A06.e.395")>, <selenium.webdriver.remote.webelement.WebElement (session="b058e7da5a2f951dd99fccd813767815", element="f.487ED674AAE73915CFB13B0E56805E03.d.19DE4147C770F8ED03221B1A26094A06.e.396")>, <selenium.webdriver.remote.webelement.WebElement (session="b058e7da5a2f951dd99fccd813767815", element="f.487ED674AAE73915CFB13B0E56805E03.d.19DE4147C770F8ED03221B1A26094A06.e.397")>, <selenium.webdriver.remote.webelement.WebElement (session="b058e7da5a2f951dd99fccd81

In [78]:
content_df = pd.DataFrame(content_dict_list)
content_df

Unnamed: 0,title,subreddit_id,subreddit,author,upvotes,comments,type,source
0,Dude makes aquaman look like a regular mermaid,t5_m0bnr,r/nextfuckinglevel,Agitated_Ad_1095,8451,193,video,https://packaged-media.redd.it/fx2gtspphc6d1/p...
1,This girl's flips are definitely top talent,t5_363r3,r/BeAmazed,reflective_map21,7113,170,video,https://packaged-media.redd.it/0ngywg4swb6d1/p...
2,True definition of trust the process,t5_363r3,r/BeAmazed,Remote_Return1716,2049,54,video,https://v.redd.it/15huvf874d6d1/HLSPlaylist.m3...
3,This amazing converted campervan is bigger tha...,t5_363r3,r/BeAmazed,Majoodeh,2280,111,video,https://v.redd.it/cvvwhzbh8c6d1/HLSPlaylist.m3...
4,"Chaser: The Smartest Dog in the World, Who Cou...",t5_m0bnr,r/nextfuckinglevel,Mad_Bulls_007,1222,50,video,https://v.redd.it/9q8pct7trc6d1/HLSPlaylist.m3...
5,Luxury sink shows how hydrophobic surfaces work,t5_363r3,r/BeAmazed,jmcarlos27,16480,730,video,https://packaged-media.redd.it/8wt7vpick96d1/p...
6,"Which floor is the ground floor in Chongqing,C...",t5_363r3,r/BeAmazed,Literally_black1984,1488,117,video,https://v.redd.it/pycsfu11cc6d1/HLSPlaylist.m3...
7,Unusual fish spotted on the ocean floor,t5_363r3,r/BeAmazed,Literally_black1984,2194,93,video,https://v.redd.it/eac7iz5slb6d1/HLSPlaylist.m3...
8,Better than me!,t5_363r3,r/BeAmazed,No-Forever8138,4323,69,video,https://packaged-media.redd.it/xc2jgplg8a6d1/p...
9,of a container ship,t5_a7wuv,r/AbsoluteUnits,Green____cat,1255,47,video,https://packaged-media.redd.it/o1wmjll4rb6d1/p...


In [79]:
content_df.to_csv(f'{input_topic}.csv')