In [1]:
import requests
from datetime import datetime as dt
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
import pandas as pd
import time as tm

In [2]:
api = "https://api.pushshift.io/reddit/search/submission?subreddit="

In [3]:
def get_model_and_features():
    with open('clf_v2.pkl', 'rb') as f:
        clf = pickle.load(f)
    with open('features_weights.pkl', 'rb') as f:
        feature_weight_list = pickle.load(f)
    return clf, feature_weight_list

In [4]:
def config_CV():
    clf, feature_weight_list = get_model_and_features()
    stop = set(stopwords.words('english'))
    cv = CountVectorizer(stop_words=stop, max_df=0.5, min_df=1, lowercase=True, ngram_range=(1,1), strip_accents='ascii', vocabulary=feature_weight_list)
    return clf, cv

In [5]:
# subreddit = input("Enter subreddit: ")
subreddit = "Conservative"
#1/2/2022 in unix
start = 1641099600
#5/26/2022 in unix time
end = 1653623999
#call limit
limit = 500

# #tester
# end = 1641154362

In [6]:
def get_posts_from_sub(subreddit, start, end, limit):
    submissions_list = []
    timestamp_list = []
    counter = start
    break_counter = 0
    try:
        while counter <= end and break_counter < 5:
            endpoint = f'{api}{subreddit}&after={counter}&before={end}&limit={limit}'
            req = requests.get(endpoint)
            submissions = req.json()
            submissions  = submissions['data']
            if len(submissions) == 0:
                break_counter +=1
                counter += 1
                print("broke from the break counter i guess")
            else:
                try:
                    for submission in submissions:
                        submissions_list.append(submission['title'])
                        timestamp_list.append(dt.utcfromtimestamp(submission['created_utc']).date())
                        counter = submission['created_utc']
                        break_counter = 0
                except Exception as e_mid:
                    print(f' Error at mid lvl, {e_mid}')
                    counter +=1
                    tm.sleep(1)
                    continue
            print("success")
    except Exception as e:
        print(endpoint)
        print(f'Error at top lvl, {e}')
        return submissions_list, timestamp_list
        # print(f'Top level error {e.with_traceback()}')
    return submissions_list, timestamp_list


In [7]:
def vectorize_posts(subreddit, start, end, limit):
    clf, cv = config_CV()
    submissions_list, timestamp_list = get_posts_from_sub(subreddit, start, end, limit)
    word_count_vector = cv.fit_transform(submissions_list)
    tfid_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfid_transformer.fit(word_count_vector)
    tfid_vector = tfid_transformer.transform(word_count_vector)
    feature_names = cv.get_feature_names_out()
    df = pd.DataFrame(tfid_vector.toarray(), columns=feature_names)
    predictions = clf.predict(df)
    return predictions, timestamp_list


In [8]:
preds, times = vectorize_posts(subreddit, start, end, limit)

success
success
success
success
success
success
success
success
success
https://api.pushshift.io/reddit/search/submission?subreddit=Conservative&after=1641313263&before=1653623999&limit=500
Error at top lvl, Expecting value: line 1 column 1 (char 0)




In [9]:
df = pd.DataFrame(preds, columns=['prediction'])
df['timestamp'] = times
df = df.groupby('timestamp').agg({'prediction':'mean'})
df.head()

Unnamed: 0_level_0,prediction
timestamp,Unnamed: 1_level_1
2022-01-02,0.511905
2022-01-03,0.496437
2022-01-04,0.497797
