In [1]:
import requests
from datetime import datetime as dt
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
import pandas as pd
import time as tm

In [2]:
# Base URL of the Pushshift submission-search endpoint. It deliberately ends
# with "subreddit=" so the subreddit name can be appended directly (see get_posts).
api = "https://api.pushshift.io/reddit/search/submission?subreddit="

In [3]:
def get_model_and_features():
    """Unpickle the classifier and the feature vocabulary from the working directory.

    Reads ``clf_v2.pkl`` first and ``features_weights.pkl`` second, closing each
    file before moving on.

    Returns:
        tuple: ``(clf, feature_weight_list)`` exactly as stored in the two files.

    Note:
        ``pickle.load`` can execute code embedded in the file, so both pickles
        should come from a trusted source.
    """
    loaded = []
    for filename in ('clf_v2.pkl', 'features_weights.pkl'):
        with open(filename, 'rb') as handle:
            loaded.append(pickle.load(handle))
    clf, feature_weight_list = loaded
    return clf, feature_weight_list

In [4]:
def config_CV():
    """Build the classifier / vectorizer pair used to score post titles.

    Loads the pickled classifier and feature vocabulary through
    ``get_model_and_features`` and creates a ``CountVectorizer`` that is
    restricted to that vocabulary and drops NLTK's English stop words.

    Returns:
        tuple: ``(clf, cv)`` - the unpickled classifier and a not-yet-fitted
        ``CountVectorizer``.
    """
    clf, feature_weight_list = get_model_and_features()
    # scikit-learn documents ``stop_words`` as 'english', a list, or None, and
    # releases with parameter validation reject a ``set`` at fit time. De-duplicate
    # through a set, then hand over a (sorted, hence deterministic) list.
    stop = sorted(set(stopwords.words('english')))
    cv = CountVectorizer(
        stop_words=stop,
        # NOTE(review): with a fixed ``vocabulary`` the max_df / min_df pruning is
        # presumably not applied - confirm against the scikit-learn docs.
        max_df=0.5,
        min_df=1,
        lowercase=False,
        ngram_range=(1, 1),
        strip_accents='unicode',
        vocabulary=feature_weight_list,
    )
    return clf, cv

In [5]:
# Ask the user which subreddit to analyse; the answer is inserted as-is into the request URL.
subreddit = input("Enter subreddit: ")
# Uncomment to hard-code a subreddit instead of prompting (e.g. for a non-interactive run):
# subreddit = "Conservative"

In [6]:
def get_posts(subreddit, limit=500, timeout=30):
    """Fetch submission titles for a subreddit from the Pushshift search API.

    Args:
        subreddit: Subreddit name; appended as-is to the module-level ``api`` URL.
        limit: Value sent as the ``limit`` query parameter. Defaults to 500,
            the value that used to be hard-coded.
        timeout: Seconds ``requests`` waits on the server before giving up, so
            a stalled connection cannot hang the notebook indefinitely.

    Returns:
        list[str]: ``str(item['title'])`` for every item in the response's
        ``data`` array, in response order.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        KeyError: If the payload has no ``data`` key or an item has no ``title``.
    """
    url = f'{api}{subreddit}&limit={limit}'
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error page as JSON.
    r.raise_for_status()
    data = r.json()['data']
    return [str(item['title']) for item in data]

In [7]:
def vectorize_posts(subreddit):
    """Fetch a subreddit's post titles and classify each one.

    Titles are counted with the vectorizer from ``config_CV``, re-weighted with
    a ``TfidfTransformer`` and passed to the loaded classifier.

    Args:
        subreddit: Subreddit name, forwarded to ``get_posts``.

    Returns:
        pandas.DataFrame: One row per title with columns ``submission`` (the
        title text) and ``prediction`` (the classifier output). The frame is
        empty, with the same two columns, when no titles were returned.
    """
    clf, cv = config_CV()
    submissions_list = get_posts(subreddit)

    if not submissions_list:
        # Nothing was fetched (e.g. a mistyped subreddit). Return an empty frame
        # with the usual columns rather than handing zero rows to the
        # transformers and classifier.
        return pd.DataFrame({
            'submission': pd.Series(dtype='object'),
            'prediction': pd.Series(dtype='float64'),
        })

    word_count_vector = cv.fit_transform(submissions_list)

    # NOTE(review): the IDF weights are fitted on the fetched titles themselves,
    # not loaded from training - confirm this matches how ``clf`` was trained.
    tfid_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfid_vector = tfid_transformer.fit_transform(word_count_vector)

    # Label the dense feature matrix with the vocabulary terms before predicting.
    feature_names = cv.get_feature_names_out()
    features = pd.DataFrame(tfid_vector.toarray(), columns=feature_names)
    predictions = clf.predict(features)

    return pd.DataFrame({'submission': submissions_list, 'prediction': predictions})


In [8]:
# Classify the subreddit's posts and order the rows by descending prediction value.
df = vectorize_posts(subreddit).sort_values(by='prediction', ascending=False)

In [9]:
# Lift the column-width display limit so long submission titles are not cut off in the table.
pd.set_option('display.max_colwidth', None)
# Mean of the prediction column over all fetched submissions.
print(df['prediction'].mean())
# Show the first 25 rows of the sorted frame.
df.head(25)

0.32


Unnamed: 0,submission,prediction
0,UFO crashed this year and something terrible happened. an alien hunts people in abandoned buildings and forests,1
28,Stanley Kubrick might have been killed for his Eyes Wide Shut movie. Because those secret societies are scary. The question is... will anything happen to me if I were to share a similar story? Here is my experience at a rich people's orgy...,1
81,"This sub cares more about ending crimes against black people, than the left wing MSM who pretend to be so caring, when they’re actually the racist ones, why won’t even talk about it help with this",1
23,"It's not just about the 28k reported mRNA vaxx deaths and the fact that the real death count is likely 280k+ considering 9 out of 10 deaths aren't reported. It's about the millions of lives, reported and unreported, which have been permanently altered by the mRNA gene therapy. Elon should apologize",1
24,I hate to be Machiavellian but American youth should have slow miserable deaths by fentanyl(China)or be kidnapped by Ghislaine(Israel). I noticed people here are aligned with white evangelic. well if you support Israel torturing Palestinians then why are you so hypocritical?,1
25,Operation “baby lift” seems SUS 🧐,1
46,52 people shot in Chicago.... 10 have died..,1
45,NPR reminds us that democrats stole the election in 2020,1
66,"On May 17th, 2018, Tim Kennedy tells Joe Rogan that Hitler escaped to Argentina after WW2. The VERY NEXT DAY, mainstream media mass distributes a scientific study contradicting his claims.",1
85,"Major glitch in the mainstream news narrative of the Uvalde school shooting, as CNN and NBC interview two completely different individuals posing as the grieving ""father"" of 10 year old victim Amerie Jo Garza. Neither is remotely convincing.",1
