In [8]:
from os.path import isfile
import praw
import pandas as pd
from time import sleep

# Shared PRAW client used by SubredditScraper below.
# The credential fields are blank in this notebook: supply real values before
# running, ideally read from environment variables rather than saved here.
reddit = praw.Reddit(
    client_id='',
    client_secret='',
    user_agent='',
    username='',
    password='',
)

class SubredditScraper:
    """Collect posts from one subreddit and accumulate them in a CSV file.

    The class talks to Reddit through the module-level ``reddit`` client.

    Parameters
    ----------
    sub : str
        Subreddit name without the ``r/`` prefix.
    time_ : str
        Value forwarded as ``time_filter`` to the 'top' and 'controversial'
        listings. It is not used by the other sort orders.
    sort : str
        One of 'new', 'top', 'hot', 'rising' or 'controversial'. Any other
        value falls back to 'hot' (see ``set_sort``).
    lim : int
        Forwarded as ``limit`` to the listing call.
    mode : str
        'w' saves the collected posts to the CSV file; any other value
        collects the posts but leaves the file untouched.
    """

    # Sort orders whose listing call also receives ``time_filter``.
    _TIME_FILTERED_SORTS = ('top', 'controversial')
    # Sort orders whose listing call only receives ``limit``.
    _PLAIN_SORTS = ('new', 'hot', 'rising')
    # Number of top-level comments concatenated into the 'comment' column.
    _MAX_TOP_LEVEL_COMMENTS = 11

    def __init__(self, sub, time_='all', sort='new', lim=900, mode='w'):
        self.sub = sub
        self.time_ = time_
        self.sort = sort
        self.lim = lim
        self.mode = mode

        print(
            f'SubredditScraper instance created with values '
            f'sub = {sub}, sort = {sort}, lim = {lim}, mode = {mode}')
        print(time_)
        print(sort)

    def set_sort(self):
        """Return ``(sort, listing)`` for the configured sort order.

        An unrecognized ``self.sort`` is overwritten with 'hot' (with a
        printed notice) before the listing is requested.
        """
        subreddit = reddit.subreddit(self.sub)

        if self.sort in self._TIME_FILTERED_SORTS:
            fetch = getattr(subreddit, self.sort)
            return self.sort, fetch(limit=self.lim, time_filter=self.time_)

        if self.sort not in self._PLAIN_SORTS:
            self.sort = 'hot'
            print('Sort method was not recognized, defaulting to hot.')

        return self.sort, getattr(subreddit, self.sort)(limit=self.lim)

    def get_submissions(self):
        """Collect posts (with leading top-level comments) not yet in the CSV.

        Reads ``{sub}_posts.csv`` if it exists, walks the listing chosen by
        ``set_sort`` and, when ``mode == 'w'``, writes the previously stored
        rows plus the newly collected ones back to the same file.
        Columns: id, body, title, flair, comment.
        """
        csv = f'{self.sub}_posts.csv'

        sort, listing = self.set_sort()
        existing = self._load_existing(csv)

        print(f'csv = {csv}')
        print(f'After set_sort(), sort = {sort} and sub = {self.sub}')
        print(f'csv_loaded = {int(existing is not None)}')

        print(f'Collecting information from r/{self.sub}.')
        new_df = self._collect_posts(listing, existing, with_comments=True)
        self._save(existing, new_df, csv)

    def get_top(self):
        """Collect all-time top posts that are not yet in the CSV.

        Works like ``get_submissions`` but always uses the 'top' listing with
        ``time_filter='all'``, stores to ``new_{sub}_posts.csv``, pauses 0.1 s
        after each submission and does not fetch comments.
        Columns: id, body, title, flair.
        """
        csv = f'new_{self.sub}_posts.csv'
        listing = reddit.subreddit(self.sub).top(
            limit=self.lim, time_filter='all')
        existing = self._load_existing(csv)

        print(f'csv = {csv}')
        print(f'csv_loaded = {int(existing is not None)}')

        print(f'Collecting information from r/{self.sub}.')
        # No comment column here: this method never reads comments, and an
        # unfilled column makes the DataFrame constructor fail on unequal
        # column lengths.
        new_df = self._collect_posts(
            listing, existing, with_comments=False, delay=0.1)
        self._save(existing, new_df, csv)

    def _load_existing(self, csv):
        """Return the DataFrame stored at ``csv``, or None if no such file."""
        return pd.read_csv(csv) if isfile(csv) else None

    def _collect_posts(self, listing, existing, with_comments, delay=0):
        """Build a DataFrame of submissions whose id has not been seen yet.

        ``existing`` is the previously stored DataFrame (or None); its ``id``
        column seeds the set of known ids. Ids collected during this run are
        added to that set too, so a listing that yields the same post twice
        produces a single row.
        """
        columns = ['id', 'body', 'title', 'flair']
        if with_comments:
            columns.append('comment')
        collected = {name: [] for name in columns}

        # Built once rather than once per submission.
        seen_ids = set(existing.id) if existing is not None else set()

        for submission in listing:
            if submission.id not in seen_ids:
                seen_ids.add(submission.id)
                collected['id'].append(submission.id)
                collected['body'].append(submission.selftext)
                collected['title'].append(submission.title)
                collected['flair'].append(submission.link_flair_text)
                if with_comments:
                    collected['comment'].append(
                        self._leading_comments(submission))
            if delay:
                sleep(delay)

        return pd.DataFrame(collected)

    def _leading_comments(self, submission):
        """Join the first top-level comment bodies, each preceded by a space."""
        # NOTE(review): limit=None asks PRAW to resolve every "more comments"
        # placeholder, although only the first few top-level comments are
        # kept. A smaller limit may be much cheaper - confirm before changing.
        submission.comments.replace_more(limit=None)

        bodies = []
        for top_level_comment in submission.comments:
            bodies.append(top_level_comment.body)
            if len(bodies) >= self._MAX_TOP_LEVEL_COMMENTS:
                break
        return ''.join(' ' + body for body in bodies)

    def _save(self, existing, new_df, csv):
        """Write the collected posts to ``csv`` when mode is 'w', else report."""
        if self.mode != 'w':
            print(
                f'{len(new_df)} posts were collected but they were not '
                f'added to {csv} because mode was set to "{self.mode}"')
            return

        if existing is None:
            new_df.to_csv(csv, index=False)
            print(f'{len(new_df)} posts collected and saved to {csv}')
            return

        # Skip the concat when nothing new was found; the stored rows are
        # simply written back.
        combined = (
            pd.concat([existing, new_df], axis=0, sort=False)
            if len(new_df) else existing)
        combined.to_csv(csv, index=False)
        print(f'{len(new_df)} new posts collected and added to {csv}')
        


# if __name__ == '__main__':
#     SubredditScraper('India', lim=997, mode='w', sort='top').get_submissions()

In [9]:
# NOTE(review): time_ is only forwarded to the 'top' and 'controversial'
# listings, so it has no effect with sort='new'. 'yearly' also does not look
# like a PRAW time filter value ('year' is used in a later cell) - confirm.
SubredditScraper('India',lim=2000,mode='w',sort='new',time_='yearly').get_submissions()

SubredditScraper instance created with values sub = India, sort = new, lim = 2000, mode = w
yearly
new
csv = India_posts.csv
After set_sort(), sort = new and sub = India
csv_loaded = 0
Collecting information from r/India.
408 posts collected and saved to India_posts.csv


In [11]:
# sort='top' forwards time_ as the listing's time_filter. Posts whose id is
# already stored in India_posts.csv are skipped and the rest are appended.
SubredditScraper('India',lim=2000,mode='w',sort='top',time_='year').get_submissions()

SubredditScraper instance created with values sub = India, sort = top, lim = 2000, mode = w
year
top
csv = India_posts.csv
After set_sort(), sort = top and sub = India
csv_loaded = 1
Collecting information from r/India.
998 new posts collected and added to India_posts.csv


In [13]:
# time_ is not forwarded for sort='hot', so 'month' has no effect here.
SubredditScraper('India',lim=2000,mode='w',sort='hot',time_='month').get_submissions()

SubredditScraper instance created with values sub = India, sort = hot, lim = 2000, mode = w
month
hot
csv = India_posts.csv
After set_sort(), sort = hot and sub = India
csv_loaded = 1
Collecting information from r/India.
4 new posts collected and added to India_posts.csv
