# Reading data from the Reddit API

This module runs a script to import reddit posts, stores them in a DataFrame and outputs that DataFrame to a csv file for usage later.  

To do this, it employs a class object and some supporting functions which have been designed to interact with the Reddit HTTP JSON Application Programming Interface (API).  The class object is called ReadRedditPosts and it is built to pull Reddit posts or listings.  It is built using the Requests Python [library](http://docs.python-requests.org/en/master/) for HTTP communication.

ReadRedditPosts has the following attributes

* url_base - the base URL for data pulls in this case 'http://www.reddit.com/'
* url_ - the actual URL used to retrieve data from subreddit
* no_posts_ - the number of posts returned after calling collect_posts
* status_code_ - the HTTP status code returned after calling collect_posts
* json_ - the json format of the web call content
* after_ - the after parameter returned from the Reddit API

and the following methods
    
* collect_posts(sub_grp = None, params = {}, headers = {}) - collecting posts data
* return_posts() - return the individual posts as a list 
* return_post_keys() - return the keys of posts records
* posts(features = []) - return a list of dictionaries containing posts data
    
Key functions are hit_reddit and write_data.  hit_reddit takes a list of subreddits and a list of features as inputs and repeatedly calls a ReadRedditPosts object to retrieve data.  The results are returned as a DataFrame and saved as a csv file.




### References

- https://docs.python.org/3/library/time.html
- Practice SQL with pandas, Pt. 1 by Sam Stack (DC), Mark Popovich (SF)
- https://stackoverflow.com/questions/775049/how-do-i-convert-seconds-to-hours-minutes-and-seconds


## Import libraries

In [1]:
# !pip install psycopg2
# !pip install sqlalchemy

In [2]:
## Imports

import requests
import pandas as pd
import os
import time
from time import gmtime, strftime, sleep, localtime
from sqlalchemy import create_engine
import psycopg2
from pandas.io import sql
import datetime

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

  """)


## Establish functions

In [3]:
# Function to streamline min, max, type and null
def print_summary(df):
    """Print a one-line summary (min, max, dtype, null count) per column.

    Every reported value is converted to a string and truncated to 15
    characters so the report stays aligned. A statistic that cannot be
    computed (for example ``min`` on a column of non-comparable values) is
    reported as ``'Unknown'`` instead of aborting the whole summary.

    Args:
        df: DataFrame whose columns are summarised.

    Returns:
        None. The summary is written to standard output.
    """
    for column in df.columns:
        series = df[column]
        # Each statistic has its own fallback so that one failure cannot leave
        # a value undefined or carry over the value of the previous column.
        try:
            col_type = series.dtype
        except Exception:
            col_type = 'Unknown'
        try:
            col_min = series.min()
        except Exception:
            col_min = 'Unknown'
        try:
            col_max = series.max()
        except Exception:
            col_max = 'Unknown'
        null_count = series.isnull().sum()

        # str() on the label keeps non-string column names (e.g. integers)
        # from failing on the slice.
        print("Column: %15s  min: %15s  max: %15s  type: %15s  null: %15s" % (
            str(column)[:15], str(col_min)[:15], str(col_max)[:15],
            str(col_type)[:15], str(null_count)[:15]))


# Function to hit the reddit API for specified subgroups and features to return
def _attach_comments(post, headers):
    """Fetch the comments behind ``post['permalink']`` and store them on *post*.

    Sets ``post['comments']`` to every comment body followed by ``';;;'`` and
    ``post['num_comments_cap']`` to the number of comments captured. When the
    request fails, ``comments`` is an empty string.
    """
    comm = RedditComments()
    comm.collect_comments(url=post['permalink'], headers=headers)
    # json_ stays None when the request did not succeed; treat that as
    # "no comments" instead of letting one bad response end the whole run.
    if comm.json_ is None:
        bodies = []
    else:
        bodies = comm.comments(features=['body'])
    post['num_comments_cap'] = comm.no_comments_
    post['comments'] = ''.join(item['body'] + ';;;' for item in bodies)


def hit_reddit(sub_groups=None, features=None, calls=15, inc_comm=False):
    """Collect posts (and optionally comments) from a list of subreddits.

    Each subreddit is requested ``calls`` times. From the second round on,
    the ``after`` value returned by the previous successful request for that
    subreddit is sent back so the next page of posts is retrieved. The
    function sleeps one second after every request.

    Args:
        sub_groups: Subreddit names to read. ``None`` means no subreddits.
        features: Post fields to keep for every post. ``None`` means none.
        calls: Number of request rounds made over all subreddits.
        inc_comm: When true and ``'permalink'`` is one of ``features``, the
            comments of every post are fetched as well and stored in the
            ``comments`` and ``num_comments_cap`` fields.

    Returns:
        A DataFrame with one row per post, exact duplicate rows removed and
        the index reset.
    """
    sub_groups = [] if sub_groups is None else sub_groups
    features = [] if features is None else features

    # parameters for the API call
    headers = {'user-agent': 'SteveG'}
    # seconds to pause between two requests
    slp_int = 1
    # last "after" token seen per subreddit position
    aft_lst = {}
    pst_lst = []
    want_comments = inc_comm and 'permalink' in features

    for i in range(calls):
        print("working on call: ", i)
        for j, sub in enumerate(sub_groups):
            # After the first round, continue from where the last page ended
            params = {'after': aft_lst.get(j)} if i != 0 else {}
            posts = ReadRedditPosts()
            posts.collect_posts(sub_grp=sub, params=params, headers=headers)
            if posts.json_ is None:
                # Request failed: keep the previous token so the same page is
                # tried again on the next round, and still respect the pause.
                time.sleep(slp_int)
                continue
            sub_post = posts.posts(features=features)

            if want_comments:
                for sub_item in sub_post:
                    if not sub_item['permalink']:
                        continue
                    _attach_comments(sub_item, headers)
                    # pause before hitting the API again
                    time.sleep(slp_int)

            pst_lst.extend(sub_post)
            # Set the after value for the next call to the API
            aft_lst[j] = posts.after_
            # pause before hitting the API again
            time.sleep(slp_int)

    # Convert the list to a DataFrame and drop dups
    df = pd.DataFrame(pst_lst)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df


def write_data(df, data_path):
    """Save *df* as a csv file inside *data_path*.

    The file is named ``posts_<timestamp>.csv`` where the timestamp is the
    current local time down to the minute, and the index is not written.
    """
    stamp = strftime("%d%b%Y_%H_%M", localtime())
    target = os.path.join(data_path, "".join(["posts_", stamp, ".csv"]))
    df.to_csv(target, index=False)

    
def write_to_database(df, engine=None, table=None):
    """Append the rows of *df* to a database table.

    Args:
        df: DataFrame holding the rows to store.
        engine: Database connection passed to ``DataFrame.to_sql`` as
            ``con``. When it is ``None`` nothing is written.
        table: Name of the target table; rows are appended if it exists.

    Returns:
        None.
    """
    # Identity check: an object with a custom __eq__ must not be mistaken
    # for a missing engine.
    if engine is None:
        return
    df.to_sql(table, con=engine, index=False, if_exists='append')


## Establish classes

In [4]:
class ReadRedditPosts:
    """Retrieve one page of posts from a subreddit through Reddit's JSON API.

    Call ``collect_posts`` first; the other methods read the response it
    stored. When no response is stored (nothing collected yet, or the last
    request failed) the accessors return empty results.
    """

    # Base URL for data pulls
    url_base = 'http://www.reddit.com/'
    # URL used by the most recent collect_posts call
    url_ = None
    # Number of posts returned by the most recent successful call
    no_posts_ = None
    # HTTP status code of the most recent call
    status_code_ = None
    # Decoded JSON body of the most recent successful call
    json_ = None
    # "after" value of the most recent successful call
    after_ = None

    # Initialization method
    def __init__(self):
        pass

    def collect_posts(self, sub_grp=None, params=None, headers=None):
        """Request the posts of one subreddit and store the response.

        Args:
            sub_grp: Subreddit name (required); it is inserted into the URL.
            params: Optional query parameters, e.g. ``{'after': ...}``.
            headers: Optional HTTP headers for the request.

        Returns:
            The decoded JSON response on HTTP 200, otherwise an error string
            that contains the status code.
        """
        url = self.url_base + 'r/' + sub_grp + '.json'
        self.url_ = url
        res = requests.get(url, params=params, headers=headers)
        # Store the code on the instance so callers can inspect it.
        self.status_code_ = res.status_code
        if res.status_code != 200:
            # Drop data from any earlier call so a reused instance does not
            # hand out stale posts after a failed request.
            self.json_ = None
            self.no_posts_ = None
            self.after_ = None
            return 'Data retrieval error: status code:' + str(res.status_code)
        self.json_ = res.json()
        self.no_posts_ = len(self.json_['data']['children'])
        self.after_ = self.json_['data']['after']
        return self.json_

    def _children(self):
        """Return the stored post entries, or an empty list without data."""
        if self.json_ is None:
            return []
        return self.json_['data']['children']

    def return_posts(self):
        """Return the individual post entries of the stored response."""
        return self._children()

    def return_post_keys(self):
        """Return the field names of the first stored post (empty if none)."""
        children = self._children()
        if not children:
            return {}.keys()
        return children[0]['data'].keys()

    def posts(self, features=None):
        """Return one dictionary per stored post limited to *features*.

        A feature that a post does not provide is stored as an empty string.
        """
        wanted = [] if features is None else features
        posts = []
        for entry in self._children():
            post = {}
            for item in wanted:
                try:
                    post[item] = entry['data'][item]
                except (KeyError, TypeError, IndexError):
                    post[item] = ''
            posts.append(post)
        return posts

    
class RedditComments:
    """Retrieve the comments of one Reddit post through the JSON API.

    Call ``collect_comments`` first; the other methods read the response it
    stored. When no response is stored (nothing collected yet, or the last
    request failed) the accessors return empty results.
    """

    # URL used by the most recent collect_comments call
    url_ = None
    # Number of comments returned by the most recent successful call
    no_comments_ = None
    # HTTP status code of the most recent call
    status_code_ = None
    # Decoded JSON body of the most recent successful call
    json_ = None

    # Initialization method
    def __init__(self):
        pass

    @staticmethod
    def _build_url(url):
        """Turn a permalink or an absolute URL into the ``.json`` request URL."""
        # An absolute URL already names its host; only relative permalinks
        # such as '/r/diy/comments/...' need the host prepended.
        if url.startswith(('http://', 'https://')):
            return url + '.json'
        return 'http://www.reddit.com' + url + '.json'

    def collect_comments(self, url=None, params=None, headers=None):
        """Request the comments behind *url* and store the response.

        Args:
            url: Permalink of the post (relative path) or an absolute URL.
                When it is ``None`` nothing is requested.
            params: Optional query parameters for the request.
            headers: Optional HTTP headers for the request.

        Returns:
            ``None`` when no URL was given, the decoded JSON response on
            HTTP 200, otherwise an error string that contains the status code.
        """
        if url is None:
            return
        url = self._build_url(url)
        self.url_ = url
        res = requests.get(url, params=params, headers=headers)
        # Store the code on the instance so callers can inspect it.
        self.status_code_ = res.status_code
        if res.status_code != 200:
            # Drop data from any earlier call so a reused instance does not
            # hand out stale comments after a failed request.
            self.json_ = None
            self.no_comments_ = None
            return 'Data retrieval error: status code:' + str(res.status_code)
        self.json_ = res.json()
        # The comments are read from the second element of the response.
        self.no_comments_ = len(self.json_[1]['data']['children'])
        return self.json_

    def _children(self):
        """Return the stored comment entries, or an empty list without data."""
        if self.json_ is None:
            return []
        return self.json_[1]['data']['children']

    def return_comments(self):
        """Return the individual comment entries of the stored response."""
        return self._children()

    def return_comment_keys(self):
        """Return the field names of the first stored comment (empty if none)."""
        children = self._children()
        if not children:
            return {}.keys()
        return children[0]['data'].keys()

    def comments(self, features=None):
        """Return one dictionary per stored comment limited to *features*.

        A feature that a comment does not provide is stored as an empty
        string.
        """
        wanted = [] if features is None else features
        comments = []
        for entry in self._children():
            comment = {}
            for item in wanted:
                try:
                    comment[item] = entry['data'][item]
                except (KeyError, TypeError, IndexError):
                    comment[item] = ''
            comments.append(comment)
        return comments

    
    

##   Establish parameters

In [5]:
# These are the parameters for retrieving reddit posts data
sub_groups = ['relationships', 'diy','politics', 'woodworking']
# Post fields kept for every retrieved post
inc_list = ['name','subreddit','selftext','created_utc','author_fullname',
           'title', 'num_comments','id', 'permalink']
# Set relative data path
data_path = "../data"

# Database engine
# The URL scheme must be "postgresql": SQLAlchemy 1.4+ no longer accepts the
# old "postgres" alias.
# NOTE(review): the fallback URL embeds a password and host in the notebook;
# set the REDDIT_DB_URL environment variable instead and remove the fallback.
engine = create_engine(os.environ.get(
    'REDDIT_DB_URL', 'postgresql://postgres:pass@54.69.8.168:5432'))


## Retrieve data from the reddit API and write to a file

In [6]:
# Return a dataframe of reddit posts and calculate time to run the function
start_time = time.time()
try:
    df =  hit_reddit(sub_groups = sub_groups, features = inc_list, calls = 50, inc_comm = True)
    # write_data(df, data_path)
    write_to_database(df, engine = engine, table = 'posts')
finally:
    # Record the end time even when the run fails or is interrupted, so the
    # elapsed time can still be reported afterwards.
    end_time = time.time()


working on call:  0
working on call:  1
working on call:  2
working on call:  3
working on call:  4
working on call:  5
working on call:  6
working on call:  7
working on call:  8
working on call:  9
working on call:  10
working on call:  11
working on call:  12
working on call:  13
working on call:  14
working on call:  15
working on call:  16
working on call:  17
working on call:  18
working on call:  19
working on call:  20
working on call:  21
working on call:  22
working on call:  23
working on call:  24
working on call:  25
working on call:  26
working on call:  27
working on call:  28
working on call:  29
working on call:  30
working on call:  31
working on call:  32
working on call:  33


KeyboardInterrupt: 

In [None]:
# Show the running time
# Uses start_time and end_time (seconds since the epoch from time.time())
# and prints the difference as a timedelta string.
run_time = str(datetime.timedelta(seconds = end_time - start_time))
print("run time: ", run_time)


In [None]:
# tposts = ReadReddit()
# tdata = tposts.collect_posts(sub_grp='politics',headers = {'user-agent': 'SteveG'})
# print(tposts.return_post_keys())
# tjson = tposts.json_
# tjson['data']

# print(tposts.posts(features=inc_list)[3]['permalink'])
# comm = RedditComments()
# comms = comm.collect_comments(url = tposts.posts(features=inc_list)[10]['permalink'], headers = {'user-agent': 'SteveG'})
# comm.comments(features=['body'])


In [None]:
# Show the dimensions (rows, columns) of the retrieved DataFrame
df.shape


## Examine the resulting DataFrame

In [None]:
# Look at the resulting DataFrame: print its dimensions, then show the first rows
print(df.shape)
df.head()


In [None]:
# Examine value counts of subreddit (number of rows per distinct subreddit value)
df['subreddit'].value_counts()


In [None]:
# Report how many rows are exact duplicates of an earlier row
print("Duplicated rows: %d \n" % df.duplicated().sum())

# Per-column overview of the DataFrame
print_summary(df)


In [None]:
# Look for duplicates in the selftext column: the number of rows minus the
# number of distinct selftext values
print("There might be duplicates in %d rows" % (len(df['selftext']) - len(set(df['selftext']))))
