In [1]:
pip install pmaw

Note: you may need to restart the kernel to use updated packages.


In [1]:
from pmaw import PushshiftAPI    #library Pushshift
import datetime as dt            #library for date management
import pandas as pd                         #library for data manipulation
import matplotlib.pyplot as plt  #library for plotting
import praw
from IPython.display import display, clear_output
import time

In [2]:
api = PushshiftAPI()

In [3]:
def give_me_intervals(start_at, end_at, number_of_days_per_interval = 3):
    
    # end_at = math.ceil(datetime.utcnow().timestamp())
        
    ## 1 day = 86400,
    period = (86400 * number_of_days_per_interval)
    end = start_at + period
    yield (int(start_at), int(end))
    padding = 1
    while end <= end_at:
        start_at = end + padding
        end = (start_at - padding) + period
        yield int(start_at), int(end)

In [4]:
"""FOR COMMENTS"""
def data_prep_comments(subreddit, start_time, end_time, filters, limit):
    if(len(filters) == 0):
        filters = ['id','title','author','created_utc',
                   'body', 'url','score','upvote_ratio','ups','downs','permalink'
                   ,'num_comments', 'link_id', 'comment_type', 'name', 'parent_id']                 
                   #We set by default some useful columns
    comments = []
    comments_global = pd.DataFrame()
    for interval in give_me_intervals(start_time,end_time,1):
        clear_output(wait=True)
        display('Retrieving Posts for ' + dt.datetime.utcfromtimestamp(interval[0]).strftime('%Y-%m-%d : %h : %M : %S') + " to " + dt.datetime.utcfromtimestamp(interval[1]).strftime('%Y-%m-%d : %h : %M : %S'))
        comments = list(api.search_comments(
            subreddit=subreddit,   #Subreddit we want to audit
            after=interval[0],      #Start date
            before=interval[1],       #End date
            filter=filters,        #Column names we want to retrieve
            limit=1000,
            q = "-body:[removed]"))          ##Max number of posts
        comments_local = pd.DataFrame(comments)
        comments_global = pd.concat([comments_global, comments_local],ignore_index=True)


    return comments_global #Return dataframe for analysis

In [5]:
from azure.storage.blob import BlobServiceClient, BlobClient
import json

subreddit = "wallstreetbets"     #Subreddit we are auditing
conn_str = '...'
container_name = '...'

filters = []                     
limit = 1000            
for i in range(1, 13):
    if i != 4:
        start_time = int(dt.datetime(2022, i, 1).timestamp())
        end_time = int(dt.datetime(2022, i+1, 1).timestamp())
        df_p = data_prep_comments(subreddit,start_time,end_time,filters,limit) 

        # Define the connection string and blob information
        blob_name = 'wallstreetbets_comments_' + str(i) +'.csv'

        # Convert dataframe to CSV string
        csv_string = df_p.to_csv(index=False, escapechar='\\')

        # Create a BlobServiceClient object using the connection string
        blob_service_client = BlobServiceClient.from_connection_string(conn_str)

        # Get a BlobClient object for the blob
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)

        # Upload the JSON string to the blob
        blob_client.upload_blob(csv_string, overwrite=True)



'Retrieving Posts for 2022-04-01 : Apr : 00 : 01 to 2022-04-02 : Apr : 00 : 00'

In [6]:
from azure.storage.blob import BlobServiceClient, BlobClient
import json

subreddit = "wallstreetbets"     #Subreddit we are auditing
conn_str = '...'
container_name = '...'

filters = []                     
limit = 1000            
for i in range(1, 4):
    if i != 4:
        start_time = int(dt.datetime(2023, i, 1).timestamp())
        end_time = int(dt.datetime(2023, i+1, 1).timestamp())
        df_p = data_prep_comments(subreddit,start_time,end_time,filters,limit) 

        # Define the connection string and blob information
        blob_name = 'wallstreetbets_comments_2023_' + str(i) +'.csv'

        # Convert dataframe to CSV string
        csv_string = df_p.to_csv(index=False, escapechar='\\')

        # Create a BlobServiceClient object using the connection string
        blob_service_client = BlobServiceClient.from_connection_string(conn_str)

        # Get a BlobClient object for the blob
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)

        # Upload the JSON string to the blob
        blob_client.upload_blob(csv_string, overwrite=True)



'Retrieving Posts for 2023-04-01 : Apr : 00 : 01 to 2023-04-02 : Apr : 00 : 00'

In [55]:
from azure.storage.blob import BlobServiceClient
import io

# Connection string for your storage account
conn_str = '...'

# Blob service client
blob_service_client = BlobServiceClient.from_connection_string(conn_str)

# List all blobs in the container/directory with .csv extension
blob_list = blob_service_client.get_container_client(container_name).list_blob_names()

csv_blobs = []
for i in range(1, 10):
    x = 'wallstreetbets_comments_' + str(i) + '.csv'
    if x in blob_list:
        csv_blobs.append(x)

blob_list = blob_service_client.get_container_client(container_name).list_blob_names()
for i in range(10, 14):
    x = 'wallstreetbets_comments_' + str(i) + '.csv'
    if x in blob_list:
        csv_blobs.append(x)

blob_list = blob_service_client.get_container_client(container_name).list_blob_names()
for i in range(1, 4):
    x = 'wallstreetbets_comments_2023_' + str(i) + '.csv'
    if x in blob_list:
        csv_blobs.append(x)

# Read CSV files into a list of dataframes
dfs = []
for blob_name in csv_blobs:
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
    stream_data = blob_client.download_blob().content_as_text()
    df = pd.read_csv(io.StringIO(stream_data))
    dfs.append(df)

# Concatenate all dataframes into one
df_combined = pd.concat(dfs, ignore_index=True)

# Write the combined dataframe to a CSV file in the blob storage
output_blob = blob_service_client.get_blob_client(container=container_name, blob='comments_combined.csv')
output_blob.upload_blob(df_combined.to_csv(index=False), overwrite=True)

{'etag': '"0x8DB1D6BEA1EF880"',
 'last_modified': datetime.datetime(2023, 3, 5, 11, 22, 32, tzinfo=datetime.timezone.utc),
 'content_md5': None,
 'content_crc64': bytearray(b'+7\xa8\xa7\x9fH\x1f<'),
 'client_request_id': '05ea3452-bb48-11ed-af25-6045bdbfeb1d',
 'request_id': '8cd2e7c7-601e-005c-6954-4fa713000000',
 'version': '2021-08-06',
 'version_id': None,
 'date': datetime.datetime(2023, 3, 5, 11, 22, 32, tzinfo=datetime.timezone.utc),
 'request_server_encrypted': True,
 'encryption_key_sha256': None,
 'encryption_scope': None}

In [58]:
"""FOR POSTS"""
def data_prep_posts(subreddit, start_time, end_time, filters, limit):
    if(len(filters) == 0):
        filters = ['id','title','author','created_utc',
                   'body', 'url','score','upvote_ratio','ups','downs','permalink'
                   ,'num_comments', 'link_id', 'comment_type', 'name']                 
                   #We set by default some useful columns
    comments = []
    comments_global = pd.DataFrame()
    for interval in give_me_intervals(start_time,end_time,1):
        clear_output(wait=True)
        display('Retrieving Posts for ' + dt.datetime.utcfromtimestamp(interval[0]).strftime('%Y-%m-%d : %h : %M : %S') + " to " + dt.datetime.utcfromtimestamp(interval[1]).strftime('%Y-%m-%d : %h : %M : %S'))
        comments = list(api.search_submissions(
            subreddit=subreddit,   #Subreddit we want to audit
            after=interval[0],      #Start date
            before=interval[1],       #End date
            filter=filters,        #Column names we want to retrieve
            limit=None,
            q = "-body:[removed]"))          ##Max number of posts
        comments_local = pd.DataFrame(comments)
        comments_global = pd.concat([comments_global, comments_local],ignore_index=True)


    return comments_global #Return dataframe for analysis

In [59]:
from azure.storage.blob import BlobServiceClient, BlobClient
import json

subreddit = "wallstreetbets"     #Subreddit we are auditing
conn_str = '...'
container_name = '...'

filters = []                     
limit = 1000            
start_time = int(dt.datetime(2023, 1, 1).timestamp())
end_time = int(dt.datetime(2023, 3, 1).timestamp())
df_p = data_prep_comments(subreddit,start_time,end_time,filters,limit) 

# Define the connection string and blob information
blob_name = 'wallstreetbets_posts_2023.csv'

# Convert dataframe to CSV string
csv_string = df_p.to_csv(index=False, escapechar='\\')

# Create a BlobServiceClient object using the connection string
blob_service_client = BlobServiceClient.from_connection_string(conn_str)

# Get a BlobClient object for the blob
blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)

# Upload the JSON string to the blob
blob_client.upload_blob(csv_string, overwrite=True)



'Retrieving Posts for 2023-03-01 : Mar : 00 : 01 to 2023-03-02 : Mar : 00 : 00'

{'etag': '"0x8DB1D71981AA86E"',
 'last_modified': datetime.datetime(2023, 3, 5, 12, 3, 12, tzinfo=datetime.timezone.utc),
 'content_md5': bytearray(b'\xca2\xd2<\x87\n\xb5O\xf2\xe4>E)!\xf7\xcb'),
 'client_request_id': 'b3c6f84e-bb4d-11ed-af25-6045bdbfeb1d',
 'request_id': '8f968b0c-701e-0048-5c5a-4f6477000000',
 'version': '2021-08-06',
 'version_id': None,
 'date': datetime.datetime(2023, 3, 5, 12, 3, 11, tzinfo=datetime.timezone.utc),
 'request_server_encrypted': True,
 'encryption_key_sha256': None,
 'encryption_scope': None}