## Imports, and Primary Set-Up

In [1]:
import requests
import json
import pandas as pd
from time import sleep

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

## Testing out the links:

In [2]:
crypto = 'https://api.pushshift.io/reddit/search/submission/?subreddit=CryptoCurrency&size=100'
stock = 'https://api.pushshift.io/reddit/search/submission/?subreddit=StockMarket&size=100'

res_crypto = requests.get(crypto)
sleep(1)
res_stock = requests.get(stock)

In [3]:
# making sure both urls are fine
print(res_crypto.status_code)
print(res_stock.status_code)

200
200


In [4]:
# getting the dictionaries
crypto_dict = res_crypto.json()
stock_dict = res_stock.json()

In [5]:
# examining the dictionaries:
print(crypto_dict.keys())
print(stock_dict.keys())

dict_keys(['data'])
dict_keys(['data'])


In [6]:
## Exploring the dictionaries: 
crypto_dict['data'][0]
crypto_dict['data'][0].keys()
len(crypto_dict['data'])

100

In [7]:
crypto_dict['data'][5]['selftext']


''

In [17]:
stock_dict['data'][5]['selftext']

''

## Features and Target Variables

Posts (data) are in crypto_dict['data'][i]['selftext], and target variable (class) is crypto_dict['data'][i]['subreddit']

In [8]:
crypto_data = []
crypto_target = []
for i,_ in enumerate(crypto_dict['data']):
    if crypto_dict['data'][i]['selftext'] !='':
        crypto_data.append(crypto_dict['data'][i]['selftext'])
        crypto_target.append(crypto_dict['data'][i]['subreddit'])
crypto = pd.DataFrame(zip(crypto_data, crypto_target), columns = ['post', 'topic'])

In [10]:
stock_data = []
stock_target = []
for i,_ in enumerate(stock_dict['data']):
    if stock_dict['data'][i]['selftext']!='':
        stock_data.append(stock_dict['data'][i]['selftext'])
        stock_target.append(stock_dict['data'][i]['subreddit'])
stock = pd.DataFrame(zip(stock_data, stock_target), columns = ['post','topic'])

In [11]:
#merging the two dataframes together: 
df = pd.concat([crypto, stock], axis = 0, sort = False)
df.shape

(112, 2)

In [12]:
# more cleaning, dropping rows with '[removed]' posts 
removed_posts_indices = df.loc[df['post']=='[removed]', :].index
df.drop(removed_posts_indices, inplace = True)

### Collecting a 100 posts from 60 days ago till now, in reverse order:

In [13]:
base_url_cook = 'https://api.pushshift.io/reddit/search/submission/?subreddit=CryptoCurrency&size=100&before={}d'
urls_cook = [base_url_cook.format(i) for i in range(60,-1,-1)] # generate the urls
# the first -1 is the stopping point, coz range is exclusive to the endpoint. 
# the second -1 is to go in reverse on the range.
base_url_nut = 'https://api.pushshift.io/reddit/search/submission/?subreddit=StockMarket&size=100&before={}d'
urls_nut = [base_url_nut.format(i) for i in range(60,-1,-1)]

In [14]:
pages_crypto = []
for u in urls_cook:
    sleep(1)
    pages_crypto.append(requests.get(u).json()['data'])

In [15]:
pages_stock = []
for u in urls_nut:
    sleep(1)
    pages_stock.append(requests.get(u).json()['data'])

In [18]:
crypto_data = []
stock_data = []
crypto_target = []
stock_target = []

for p in pages_crypto: 
    for post in p:
        if post['selftext']!='':
            crypto_data.append(post['selftext'])
            crypto_target.append(post['subreddit'])

KeyError: 'selftext'

In [19]:
for p in pages_stock:
    count2 = 0
    for post in p:
        try: #because one post doesn't have a 'selftext'. nut_data stops at 116. therefore I need try/except
            if post['selftext']!='':
                stock_data.append(post['selftext'])
                stock_target.append(post['subreddit'])
        except: 
            stock_data.append('[removed]') # i want to add what I want to drop later on. some posts have '[removed]' in them
            stock_target.append('nutrition')