In [10]:
import requests
from datetime import datetime as dt
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
import pandas as pd
import time as tm

In [11]:
# Base URL of the Pushshift submission-search endpoint. It ends with the
# "subreddit=" query key so that a subreddit name can be appended directly.
api = (
    "https://api.pushshift.io/reddit/search/submission"
    "?subreddit="
)

In [12]:
def get_model_and_features(model_path='clf_v2.pkl', features_path='features_weights.pkl'):
    """Load the pickled classifier and its feature list from disk.

    Args:
        model_path: Path of the pickle file holding the classifier.
            Defaults to 'clf_v2.pkl' in the current working directory.
        features_path: Path of the pickle file holding the feature list.
            Defaults to 'features_weights.pkl' in the current working directory.

    Returns:
        A tuple ``(clf, feature_weight_list)`` containing the two unpickled
        objects, in that order.

    Raises:
        OSError: If either file cannot be opened.
        pickle.UnpicklingError: If a file is not a valid pickle.
    """
    # SECURITY: unpickling runs arbitrary code embedded in the file, so these
    # paths must only ever point at trusted, locally produced artifacts.
    with open(model_path, 'rb') as f:
        clf = pickle.load(f)
    with open(features_path, 'rb') as f:
        feature_weight_list = pickle.load(f)
    return clf, feature_weight_list

In [13]:
def config_CV():
    """Build the count vectorizer that matches the saved classifier.

    Loads the classifier and feature list via ``get_model_and_features`` and
    configures a ``CountVectorizer`` whose vocabulary is that feature list.

    Returns:
        A tuple ``(clf, cv)``: the loaded classifier and the configured,
        not-yet-fitted ``CountVectorizer``.
    """
    clf, feature_weight_list = get_model_and_features()
    # scikit-learn documents `stop_words` as 'english', a list, or None, and
    # recent releases reject a set during parameter validation. A sorted list
    # holds the same words and is deterministic.
    stop = sorted(set(stopwords.words('english')))
    # NOTE(review): with a fixed `vocabulary`, max_df/min_df presumably have no
    # effect on the resulting features -- confirm before relying on them.
    cv = CountVectorizer(
        stop_words=stop,
        max_df=0.5,
        min_df=1,
        lowercase=False,
        ngram_range=(1, 1),
        strip_accents='unicode',
        vocabulary=feature_weight_list,
    )
    return clf, cv

In [14]:
# Name of the subreddit to classify, entered interactively.
# Surrounding whitespace is stripped so a stray space or newline does not end
# up in the request URL.
# For a non-interactive run (e.g. Restart & Run All), assign the name directly:
# subreddit = "Conservative"
subreddit = input("Enter subreddit: ").strip()

In [15]:
def get_posts(subreddit, limit=500, timeout=30):
    """Fetch submission titles for a subreddit from the search API.

    Args:
        subreddit: Name of the subreddit to query.
        limit: Value sent as the ``limit`` query parameter. Defaults to 500.
        timeout: Seconds to wait for the HTTP request before giving up.
            Defaults to 30.

    Returns:
        A list of submission titles, each converted to ``str``, in the order
        the API returned them.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the request exceeds ``timeout`` seconds.
        KeyError: If the response has no 'data' key or an item has no 'title'.
    """
    from urllib.parse import quote

    # Percent-encode the name so spaces or reserved characters cannot corrupt
    # the query string.
    url = f'{api}{quote(str(subreddit), safe="")}&limit={limit}'
    # A timeout keeps the notebook from hanging indefinitely on a dead endpoint.
    r = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of with a confusing JSON/KeyError later.
    r.raise_for_status()
    data = r.json()['data']
    return [str(post['title']) for post in data]

In [16]:
def vectorize_posts(subreddit):
    """Fetch a subreddit's post titles and classify each one.

    Titles come from ``get_posts``; they are turned into counts with the
    vectorizer from ``config_CV``, TF-IDF weighted, and passed to the loaded
    classifier's ``predict``.

    Args:
        subreddit: Name of the subreddit whose posts should be classified.

    Returns:
        A DataFrame with one row per post and two columns: 'submission'
        (the title) and 'prediction' (the classifier's output for it).

    Raises:
        ValueError: If no posts were returned for the subreddit.
    """
    clf, cv = config_CV()
    submissions_list = get_posts(subreddit)
    if not submissions_list:
        # Without this guard the TF-IDF step fails with an opaque
        # "0 samples" ValueError; raise the same type with a clear cause.
        raise ValueError(f'No submissions returned for subreddit {subreddit!r}')

    word_count_vector = cv.fit_transform(submissions_list)
    # NOTE(review): IDF weights are estimated from the fetched posts themselves.
    # If the classifier was trained on features from a differently fitted
    # transformer, inference features may not match -- confirm.
    tfid_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfid_vector = tfid_transformer.fit_transform(word_count_vector)

    features = pd.DataFrame(tfid_vector.toarray(), columns=cv.get_feature_names_out())
    predictions = clf.predict(features)
    return pd.DataFrame({'submission': submissions_list, 'prediction': predictions})


In [17]:
# Classify the chosen subreddit's posts, then order the rows so the highest
# prediction values come first.
df = (
    vectorize_posts(subreddit)
    .sort_values(by=['prediction'], ascending=False)
)

In [18]:
# Lift the column-width cap so long submission titles are shown in full.
pd.options.display.max_colwidth = None
# Average of the prediction column over every fetched post.
print(df.prediction.mean())
# First 25 rows of the sorted frame, rendered as the cell's output.
df.head(n=25)

0.16


Unnamed: 0,submission,prediction
99,Michael Heiser talk about where freedom comes from in the Bible,1
44,"Oh... so, what are they trying to keep buried now? (and just imagine if he'd ""respected"" the u.kraine as much as the cops by ""defunding"" them 🤔..... with the utmost respect 🙃🤡)",1
34,WCHS Eyewitness News - Charleston police discuss Wednesday night shooting,1
24,Really sad to see all the pathetic people on this forum(meaning Reddit) demonizing vets who put their lives on the line to defend American citizens like them to bitch and complain how they didn’t die in battle so they shouldn’t be recognized on Memorial Day.,1
38,"War is peace. Freedom is slavery. Ignorance is strength. Lies are Truth, Facts are Outlawed and... No thank you. Lets bring tar and feathering back instead.",1
68,Inside joke for those who know of Pastor Steven Anderson. Chided ever since like Tom Cruise but... this was an AWESOME sermon. 🤣,1
19,"PSA: 2.4 billion genetically-modified mosquitoes were released in California and Florida, created by Oxitech, a biotech firm funded by the Bill and Melinda Gates Foundation.",1
85,Just love this.,1
42,The TV said he had a job. The TV said he had the money. The TV said he had only one goal. Amazing how TV has knowledge on everything these days 🤔 ...🙃🤡,1
31,First responders say Uvalde victim may have lived if law enforcement acted faster | The Post Millennial,1
