In [1]:
import praw
import os
import pandas as pd
import re
from random import sample
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import spacy
nlp = spacy.load("en_core_web_lg")

from gensim.models import Doc2Vec
import gensim.models.doc2vec as doc2vec
from gensim.models.doc2vec import TaggedDocument

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# API credentials come from environment variables.
# The username/password pair is not needed for public subreddits.
reddit = praw.Reddit(
    user_agent='post-here-script',
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    username=os.getenv("REDDIT_USERNAME"),
    password=os.getenv("REDDIT_PASSWORD"),
)

In [70]:
%%time
'''
Given a list of subreddits sampled from the top 200,
get the all-time top-voted submissions of each one,
keep only the text (self) submissions,
and collect subreddit name, submission id, and title + body text
into a dataframe (which a later cell writes to a file)
'''

def build_corpus_df(subreddit_list, num_sub):
    '''
    Collect the text-only top submissions of several subreddits.

    Input: subreddit_list: a list of subreddit names to scrape
           num_sub: number of top submissions requested per subreddit
                    (link/image/video posts are dropped afterwards, so a
                    subreddit can contribute fewer rows than num_sub)
    Output: a dataframe with one row per self (text) submission
    Dataframe Structure: subreddit: subreddit name
                         id: submission id
                         text: submission title and body joined by a newline
    '''
    rows = []
    for subred in subreddit_list:
        # .top() yields submission objects; title, selftext and is_self are
        # read straight from them instead of requesting every submission a
        # second time through reddit.submission(), which cost one extra
        # network round trip per post.
        for submission in reddit.subreddit(subred).top(limit=num_sub):
            if not submission.is_self:  # only text posts, no pics or vids
                continue
            text = submission.title + "\n" + submission.selftext
            rows.append([subred, str(submission.id), text])
    return pd.DataFrame(rows, columns=["subreddit", "id", "text"])

    

CPU times: user 11 µs, sys: 1 µs, total: 12 µs
Wall time: 16.7 µs


In [None]:
#top 200 subs from https://redditmetrics.com/top
#grep r/ Top\ subreddits\ * | sed -e "s|^.*r/|r/|"|grep \>|sed -e "s/r\///"|sed -e "s/>.*$//"|sort|uniq|sed -e ':a;N;$!ba;s/\n/,/g'
top200 = ["AdviceAnimals","AmItheAsshole","Android","AnimalsBeingBros","AnimalsBeingDerps","AnimalsBeingJerks","Art","ArtisanVideos","AskMen","AskReddit","Awwducational","BeAmazed","BetterEveryLoop","BikiniBottomTwitter","BlackPeopleTwitter","ChildrenFallingOver","ChoosingBeggars","ContagiousLaughter","Cooking","CrappyDesign","DIY","Damnthatsinteresting","DnD","Documentaries","EarthPorn","EatCheapAndHealthy","Eyebleach","FiftyFifty","Fitness","FoodPorn","Futurology","Games","GetMotivated","GifRecipes","HighQualityGifs","HistoryMemes","HistoryPorn","HumansBeingBros","IAmA","IdiotsInCars","InternetIsBeautiful","Jokes","KidsAreFuckingStupid","LifeProTips","MadeMeSmile","MakeupAddiction","Minecraft","MovieDetails","MurderedByWords","Music","NSFW_GIF","NatureIsFuckingLit","NetflixBestOf","NintendoSwitch","NoStupidQuestions","OldSchoolCool","OutOfTheLoop","Outdoors","Overwatch","PS4","Parenting","PewdiepieSubmissions","PublicFreakout","RealGirls","Roadcam","RoastMe","Showerthoughts","StarWars","Tinder","TrendingReddits","TrollYChromosome","TwoXChromosomes","Unexpected","UpliftingNews","WTF","WatchPeopleDieInside","Wellthatsucks","Whatcouldgowrong","WhitePeopleTwitter","WritingPrompts","YouShouldKnow","anime","announcements","askscience","atheism","aww","backpacking","battlestations","bestof","biology","blackmagicfuckery","blog","boardgames","books","buildapc","cars","cats","confession","creepy","cursedcomments","dadjokes","dankmemes","dataisbeautiful","drawing","electronicmusic","europe","explainlikeimfive","facepalm","food","frugalmalefashion","funny","gadgets","gameofthrones","gaming","gardening","gifs","gonewild","hiphopheads","history","hmmm","horror","howto","humor","insanepeoplefacebook","instant_regret","interestingasfuck","iphone","itookapicture","keto","leagueoflegends","lifehacks","listentothis","loseit","mac","madlads","malefashionadvice","me_irl","memes","mildlyinfuriating","mildlyinteresting","movies","natureismetal","nba","nevertellmetheodds","news","nextfuc
kinglevel","nfl","nintendo","nonononoyes","nosleep","nottheonion","nsfw","oddlysatisfying","offmychest","pcgaming","pcmasterrace","personalfinance","philosophy","photography","photoshopbattles","pics","pokemon","pokemongo","politics","programming","rarepuppers","raspberry_pi","reactiongifs","reallifedoodles","recipes","relationship_advice","relationships","rickandmorty","science","scifi","sex","slowcooking","soccer","socialskills","space","sports","streetwear","tattoos","technology","teenagers","television","therewasanattempt","tifu","todayilearned","trashy","travel","trees","trippinthroughtime","videos","whatisthisthing","wholesomememes","woahdude","woodworking","worldnews","xboxone","youseeingthisshit"]

In [71]:
%%time
# Pick a random selection of 100 subreddits from the top 200 and
# look at the all-time top 50 posts from each subreddit.
# Since some subs have more self posts than others, this will
# probably lead to a class imbalance and affect prediction results.
# NOTE: sample() is not seeded here, so a different set of subreddits is
# drawn on every run; the recorded run took roughly 1h 37min of wall time.

df = build_corpus_df(sample(top200,100), 50)

CPU times: user 1min 26s, sys: 1.79 s, total: 1min 28s
Wall time: 1h 37min 24s


In [73]:
# Save the dataframe to csv for faster future development, so the corpus does
# not have to be scraped again; index=False leaves the row index out of the file.
df.to_csv('reddit.csv', index=False) 

In [41]:
%%time
# Run the spaCy pipeline loaded as `nlp` over every submission text and keep
# the object it returns for each row in a new `text_tokens` column.
# NOTE(review): no later cell visible here reads `text_tokens` — confirm it is still needed.
df['text_tokens'] = df['text'].apply(lambda x: nlp(x))

CPU times: user 37.2 s, sys: 1.96 s, total: 39.1 s
Wall time: 10.3 s


In [42]:
df

Unnamed: 0,subreddit,id,text,text_tokens
0,WritingPrompts,5uilpw,"[WP] The year is 1910. Adolf Hitler, a struggl...","([, WP, ], The, year, is, 1910, ., Adolf, Hitl..."
1,WritingPrompts,7en7vl,[WP] The year is 2038 and net neutrality has b...,"([, WP, ], The, year, is, 2038, and, net, neut..."
2,WritingPrompts,7vt58k,[WP] “I’ll tell you what I’m going to do Mr Bo...,"([, WP, ], “, I, ’ll, tell, you, what, I, ’m, ..."
3,WritingPrompts,7ezd5t,"[WP] When you die, you appear in a cinema with...","([, WP, ], When, you, die, ,, you, appear, in,..."
4,WritingPrompts,arwr7a,[OT] Seven months ago I responded to a prompt ...,"([, OT, ], Seven, months, ago, I, responded, t..."
5,WritingPrompts,6i00oy,"[WP] You hire a female prostitute, tell her to...","([, WP, ], You, hire, a, female, prostitute, ,..."
6,WritingPrompts,cvoaso,[WP] You lost your sight - along with everyone...,"([, WP, ], You, lost, your, sight, -, along, w..."
7,WritingPrompts,8aec6t,[WP] It's 3 AM. An official phone alert wakes ...,"([, WP, ], It, 's, 3, AM, ., An, official, pho..."
8,WritingPrompts,7dcwik,[WP] Every country has ninjas but the world on...,"([, WP, ], Every, country, has, ninjas, but, t..."
9,WritingPrompts,7i3bs6,[WP] Humans once wielded formidable magical po...,"([, WP, ], Humans, once, wielded, formidable, ..."


In [46]:
def doc2vec_labels(corpus, label_to_use):
    """
    Wrap every document of `corpus` in a TaggedDocument for Doc2Vec.

    Each text is split on whitespace and tagged with a single label of the
    form "<label_to_use>_<i>" (e.g. "train_0", "test_3"), where "i" is the
    position of the document in `corpus`.
    https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.TaggedDocument
    https://medium.com/@klintcho/doc2vec-tutorial-using-gensim-ab3ac03d3a1
    """
    return [
        doc2vec.TaggedDocument(text.split(), [label_to_use + '_' + str(position)])
        for position, text in enumerate(corpus)
    ]

In [74]:
# Features are the submission texts; targets are the subreddit names.
X, y = df["text"], df["subreddit"]

In [75]:
# Hold out 20% of the submissions for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Replace the raw texts by tagged documents ("train_<i>" / "test_<i>"),
# numbered in the order the split returned them.
X_train = doc2vec_labels(X_train, 'train')
X_test = doc2vec_labels(X_test, 'test')

# One combined list holding the train documents followed by the test documents.
all_data = [*X_train, *X_test]

In [52]:
all_data[:10]

[TaggedDocument(words=['I', 'survived', 'my', 'first', 'week', 'as', 'legal', 'guardian', 'to', 'my', 'two', 'little', 'brothers,', '7', '&', '11', 'years', 'of', 'age', 'I’m', 'a', '44', 'y/o', 'single', 'male,', 'and', 'our', 'dad', 'was', '67.', 'These', 'are', 'my', 'half-brothers,', 'and', 'our', 'dad', 'passed', 'away', 'unexpectedly', 'Dec', '8th.', 'Dad', 'and', 'the', 'boys', 'lived', 'off-grid', 'in', 'the', 'remote', 'Rocky', 'Mountains,', 'and', 'five', 'years', 'ago', 'their', 'mom', 'got', '“cabin', 'fever”', 'after', 'having', 'been', 'so', 'isolated', '(as', 'in', 'no', 'cell-phone', 'signal', 'even),', 'and', 'abandoned', 'them', 'and', 'devolved', 'into', 'alcoholism', 'and', 'addiction.', 'She', 'hasn’t', 'shown', 'up', 'for', 'any', 'of', 'her', '(supervised)', 'visitation', 'in', 'almost', '4', 'years,', 'so', 'DHS', 'called', 'me', 'the', 'morning', 'that', 'dad', 'died', 'and', 'let', 'me', 'know', 'that', 'if', 'I', 'didn’t', 'get', 'there', 'ASAP', 'to', 'take'

In [76]:
# https://medium.com/@amarbudhiraja/understanding-document-embeddings-of-doc2vec-bfe7237a26da
# https://radimrehurek.com/gensim/models/doc2vec.html
# dm=0 selects gensim's "distributed bag of words" (PV-DBOW) training mode.
bow = Doc2Vec(
    dm=0,
    vector_size=300,
    negative=5,
    min_count=1,
    workers=8,
    alpha=0.1,
    min_alpha=0.05,
)
# Register the words and document tags of every tagged document with the model.
bow.build_vocab(all_data)



In [77]:
# https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.Doc2Vec.trainables
# https://medium.com/@klintcho/doc2vec-tutorial-using-gensim-ab3ac03d3a1
# Train with ONE call and let gensim decay the learning rate internally.
# Calling train() 50 times while lowering bow.alpha by 0.002 per round (and
# copying it into bow.min_alpha) ends with both attributes at ~0; per the
# gensim docs infer_vector() reuses the model's learning rates when none are
# passed, so later inference would hardly move the inferred vector.
# 150 epochs (50 rounds x 3) and the 0.1 -> 0.002 range keep the same number
# of passes and the same span of learning rates as that manual schedule.
bow.train(
    all_data,
    total_examples=len(all_data),
    epochs=150,
    start_alpha=0.1,
    end_alpha=0.002,
)

In [None]:
def make_vectors(model, corpus_size, vectors_size, label_type):
    """
    Collect the document vectors of a trained doc2vec model into one matrix.

    Inputs: model: trained Doc2Vec model (read through model.docvecs)
            corpus_size: number of documents carrying this label type
            vectors_size: embedding vector size
            label_type: tag prefix, documents are looked up as
                        "<label_type>_<i>" for i in 0..corpus_size-1
    Outputs: array of shape (corpus_size, vectors_size); row i holds the
             vector stored under "<label_type>_<i>"
    """
    matrix = np.zeros((corpus_size, vectors_size))
    tags = (label_type + '_' + str(number) for number in range(corpus_size))
    for row, tag in enumerate(tags):
        matrix[row] = model.docvecs[tag]
    return matrix

In [78]:
# vectorize the gensim bag of words
# The embedding width is read from the model so it cannot drift away from the
# vector_size the Doc2Vec model was built with.
vectors_bow_train = make_vectors(bow, len(X_train), bow.vector_size, 'train')
vectors_bow_test = make_vectors(bow, len(X_test), bow.vector_size, 'test')

In [79]:
%%time

# Multinomial logistic regression on the document vectors.
# The estimator has to be bound to `logreg` before it is fitted (the name was
# previously undefined, which raises NameError on a fresh kernel). fit()
# returns the fitted estimator itself, so `lr` — the name used for
# prediction — refers to the same object.
logreg = LogisticRegression(n_jobs=30, multi_class='multinomial')
lr = logreg.fit(vectors_bow_train, y_train)

CPU times: user 49.3 ms, sys: 169 ms, total: 218 ms
Wall time: 1.22 s


In [87]:
user_title = "this is a title"
user_body = "some stuff i want to write"
# user_title = "no spoilers  how does rising the dead work"
# user_body = "some stuff i wonder about how fresh do the dead need to be  can the nk rise people ho died weeks months years ago and how close"
# user_title = "please help me with my relationship issue!"
# user_body = "my wife beats me and calls me little man. I want out! She takes my lunch money and kicks my nuts."
# user_title = "I am a new grad looking for a job and currently in the process with a company for a junior backend engineer role."
# user_body = """I was under the impression that the position was Javascript but instead it is actually Java. My general programming and "leet code" skills are pretty good, but my understanding of Java is pretty shallow. How can I use the next three days to best improve my general Java knowledge? Most resources on the web seem to be targeting complete beginners. Maybe a book I can skim through in the next few days?

# Edit:

# A lot of people are saying "the company is a sinking ship don't even go to the interview". I just want to add that the position was always for a "junior backend engineer". This company uses multiple languages and the recruiter just told me the incorrect language for the specific team I'm interviewing for. I'm sure they're mainly interested in seeing my understanding of good backend principles and software design, it's not a senior lead Java position."""

In [88]:
# infer_vector() expects a list of word tokens. The training documents were
# tokenized with str.split() in doc2vec_labels, so the user text is split the
# same way; a one-element list holding the whole text would be treated as a
# single word that the model has never seen.
all_user_text = (user_title + ' ' + user_body).split()
user_input = bow.infer_vector(all_user_text, steps=30)
# The classifier expects a 2-D array: one row per sample.
user_input = user_input.reshape(1, -1)
prediction = lr.predict(user_input)
print(prediction)

['Jokes']


In [89]:
import pickle

In [91]:
# Persist the fitted classifier. `lr` is the object the prediction cell uses;
# the previously referenced name `logreg` is not defined by any cell shown here.
with open('aaron_lr_pickle', 'wb') as f:
    pickle.dump(lr, f)

In [92]:
# Persist the trained Doc2Vec model next to the classifier pickle.
with open('aaron_bow_pickle', 'wb') as model_file:
    pickle.dump(bow, model_file)