In [55]:
import praw
import os
from random import sample 

import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import spacy
nlp = spacy.load("en_core_web_lg")

from dotenv import load_dotenv
load_dotenv()

In [4]:
#username and password are not needed for public subreddits
reddit = praw.Reddit(client_id = os.getenv("REDDIT_CLIENT_ID"),
                    client_secret = os.getenv("REDDIT_CLIENT_SECRET"),
                    username = os.getenv("REDDIT_USERNAME"),
                    password = os.getenv("REDDIT_PASSWORD"),
                    user_agent = 'post-here-script'
                    )

In [5]:
#top 200 subs from https://redditmetrics.com/top
#grep r/ Top\ subreddits\ * | sed -e "s|^.*r/|r/|"|grep \>|sed -e "s/r\///"|sed -e "s/>.*$//"|sort|uniq|sed -e ':a;N;$!ba;s/\n/,/g'
top200 = ["AdviceAnimals","AmItheAsshole","Android","AnimalsBeingBros","AnimalsBeingDerps","AnimalsBeingJerks","Art","ArtisanVideos","AskMen","AskReddit","Awwducational","BeAmazed","BetterEveryLoop","BikiniBottomTwitter","BlackPeopleTwitter","ChildrenFallingOver","ChoosingBeggars","ContagiousLaughter","Cooking","CrappyDesign","DIY","Damnthatsinteresting","DnD","Documentaries","EarthPorn","EatCheapAndHealthy","Eyebleach","FiftyFifty","Fitness","FoodPorn","Futurology","Games","GetMotivated","GifRecipes","HighQualityGifs","HistoryMemes","HistoryPorn","HumansBeingBros","IAmA","IdiotsInCars","InternetIsBeautiful","Jokes","KidsAreFuckingStupid","LifeProTips","MadeMeSmile","MakeupAddiction","Minecraft","MovieDetails","MurderedByWords","Music","NSFW_GIF","NatureIsFuckingLit","NetflixBestOf","NintendoSwitch","NoStupidQuestions","OldSchoolCool","OutOfTheLoop","Outdoors","Overwatch","PS4","Parenting","PewdiepieSubmissions","PublicFreakout","RealGirls","Roadcam","RoastMe","Showerthoughts","StarWars","Tinder","TrendingReddits","TrollYChromosome","TwoXChromosomes","Unexpected","UpliftingNews","WTF","WatchPeopleDieInside","Wellthatsucks","Whatcouldgowrong","WhitePeopleTwitter","WritingPrompts","YouShouldKnow","anime","announcements","askscience","atheism","aww","backpacking","battlestations","bestof","biology","blackmagicfuckery","blog","boardgames","books","buildapc","cars","cats","confession","creepy","cursedcomments","dadjokes","dankmemes","dataisbeautiful","drawing","electronicmusic","europe","explainlikeimfive","facepalm","food","frugalmalefashion","funny","gadgets","gameofthrones","gaming","gardening","gifs","gonewild","hiphopheads","history","hmmm","horror","howto","humor","insanepeoplefacebook","instant_regret","interestingasfuck","iphone","itookapicture","keto","leagueoflegends","lifehacks","listentothis","loseit","mac","madlads","malefashionadvice","me_irl","memes","mildlyinfuriating","mildlyinteresting","movies","natureismetal","nba","nevertellmetheodds","news","nextfuckinglevel","nfl","nintendo","nonononoyes","nosleep","nottheonion","nsfw","oddlysatisfying","offmychest","pcgaming","pcmasterrace","personalfinance","philosophy","photography","photoshopbattles","pics","pokemon","pokemongo","politics","programming","rarepuppers","raspberry_pi","reactiongifs","reallifedoodles","recipes","relationship_advice","relationships","rickandmorty","science","scifi","sex","slowcooking","soccer","socialskills","space","sports","streetwear","tattoos","technology","teenagers","television","therewasanattempt","tifu","todayilearned","trashy","travel","trees","trippinthroughtime","videos","whatisthisthing","wholesomememes","woahdude","woodworking","worldnews","xboxone","youseeingthisshit"]

In [182]:
%%time
'''
Given two sample subreddits out of the top 200
get the top 25 all time up voted submissions
then get the text of those submissions (only text)
and write subreddit name, title, and text to a file
'''

def build_corpus(subreddit_list):
    '''
    Input: a list of subreddits to scrape
    Output: creates a directory for each subreddit
            each of which contains self.text only 
            the text submissions.
    File Structure: subreddit name
                    submission title
                    submission text
    Improvement idea: accept output directory as
                        a parameter
    '''
    for subred in subreddit_list:
        for comment_id in reddit.subreddit(subred).top(limit=25): #top('all') returns top 100 comments from subreddit
            if comment_id.is_self == True: #only text posts, no pics or vids
                submission = reddit.submission(id=comment_id)
                directory = subred
                os.makedirs(directory, exist_ok=True)
                filename = comment_id
                with open(os.path.join(str(directory), str(filename)), "w") as file:
                    file.write(subred)
                    file.write("\n")
                    file.write(submission.title)
                    file.write("\n")
                    file.write(str(submission.selftext))
                    file.close()
    return

    

CPU times: user 11.5 s, sys: 275 ms, total: 11.8 s
Wall time: 10min 39s


In [103]:
def gather_data(filefolder):
    """ Creates a List of Documents from a Directory
    then reads the files and returns a list of strings
    """
    
    data = []
    listOfFiles = list()
    for (dirpath, dirnames, filenames) in os.walk("data"):
        listOfFiles += [os.path.join(dirpath, file) for file in filenames]
    for article in listOfFiles:
        with open(article, 'rb') as f:
            data.append(f.read())
   
    return data

In [186]:
data = gather_data('./data')

In [124]:
def tokenize(document):
    doc = nlp(document)
    return [token.lemma_ for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [187]:
%%time
# Apply tfidf vectorizer to our Data
# Use custom Spacy Vectorizer
# reddit articles in `data` variable

tfidf = TfidfVectorizer(tokenizer=tokenize, min_df=.05, max_df=.90, ngram_range=[1,2])

#Learn our Vocab
tfidf.fit_transform(data)

# Get sparse dtm
dtm = tfidf.transform(data)

dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

CPU times: user 57.5 s, sys: 3.09 s, total: 1min
Wall time: 15.6 s


In [188]:
dtm

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,1,2,edit,proof,start,thank,update,...,worth,wow,write,year,year ago,year old,yes,young,’,’s
0,0.371061,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.071200,0.185189,0.042717,0.000000,0.071200,0.000000,0.000000,0.000000,0.000000
1,0.151450,0.000000,0.000000,0.000000,0.000000,0.126305,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.296121,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.035799,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.096162,0.033542,0.032056,0.000000,0.000000,0.000000,0.000000
5,0.031031,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.050013,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.257515,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.096794,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.059290,0.103404,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.255482,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.247053,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.157939,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.041556,0.000000,0.000000,...,0.000000,0.000000,0.036785,0.076364,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.205689,0.000000,0.000000,0.000000,0.000000,0.057179,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.055251,0.000000,0.000000,0.192717,0.000000,0.000000,0.000000


In [190]:
%%time
# fit using knn
nn  = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(dtm)

CPU times: user 1.93 ms, sys: 0 ns, total: 1.93 ms
Wall time: 1.77 ms


In [131]:
nn.kneighbors([dtm.iloc[2]])

(array([[0.        , 1.35825554, 1.38717068, 1.41421356]]),
 array([[2, 3, 1, 0]]))

In [133]:
sample_user_post = ['my friend beats me! someone please help!']

In [191]:
#transform the sample post with TF-IDF
new = tfidf.transform(sample_user_post)

In [195]:
#covnert the sparse matrix to a dense matrix
#pull out the closest matching document number
nn.kneighbors(new.todense())[1][0][0]

3

In [196]:
#the documents have the subreddits embedded in them as the first line
#this regex pulls that subreddit out and converts it to a string
re.sub("[b\']", '',str(data[nn.kneighbors(new.todense())[1][0][0]].splitlines()[0]))

'space'

In [203]:
type(nn)

sklearn.neighbors.unsupervised.NearestNeighbors