In [1]:
import pandas as pd
import csv

In [2]:
reddit_data = pd.read_csv('top_subreddit_info.csv')

In [3]:
reddit_data.head()

Unnamed: 0.1,Unnamed: 0,subreddit_description,subreddit_id,subreddit_name,subreddit_nsfw,subreddit_subscribers
0,0,###### [ [ SERIOUS ] ](http://www.reddit.com/r...,2qh1i,AskReddit,False,24877036
1,1,>* **[/r/inthenews](/r/inthenews?hl)**\r\n\r\n...,2qh3l,news,False,19153359
2,2,>>> - **Other Subs:**\r\n\r\n>>> - [Related](h...,2qh13,worldnews,False,22340987
3,3,## **Welcome to /r/Politics! Please read [the ...,2cneq,politics,False,5488713
4,4,**Welcome to r/Funny:**\r\n\r\nYou may only po...,2qh33,funny,False,26820807


In [4]:
reddit_data = reddit_data.drop(columns='Unnamed: 0')
reddit_data.head()

Unnamed: 0,subreddit_description,subreddit_id,subreddit_name,subreddit_nsfw,subreddit_subscribers
0,###### [ [ SERIOUS ] ](http://www.reddit.com/r...,2qh1i,AskReddit,False,24877036
1,>* **[/r/inthenews](/r/inthenews?hl)**\r\n\r\n...,2qh3l,news,False,19153359
2,>>> - **Other Subs:**\r\n\r\n>>> - [Related](h...,2qh13,worldnews,False,22340987
3,## **Welcome to /r/Politics! Please read [the ...,2cneq,politics,False,5488713
4,**Welcome to r/Funny:**\r\n\r\nYou may only po...,2qh33,funny,False,26820807


In [5]:
len(reddit_data['subreddit_description'])

100

In [6]:
# print(reddit_data['subreddit_description'][0])

In [7]:
reddit_data.isna().sum()

subreddit_description    1
subreddit_id             0
subreddit_name           0
subreddit_nsfw           0
subreddit_subscribers    0
dtype: int64

In [8]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS



In [9]:
reddit_data['subreddit_description'] = reddit_data['subreddit_description'].fillna('').astype(str) # creates a list of words which can't be tokenized

In [10]:
def tokenize(doc):
    return [token for token in simple_preprocess(doc) if token not in STOPWORDS]

In [11]:
reddit_data['tokens'] = reddit_data['subreddit_description'].apply(tokenize)

In [12]:
reddit_data.isna().sum()

subreddit_description    0
subreddit_id             0
subreddit_name           0
subreddit_nsfw           0
subreddit_subscribers    0
tokens                   0
dtype: int64

In [13]:
reddit_data['tokens'][0:10]

0    [http, www, reddit, com, askreddit, submit, se...
1    [inthenews, inthenews, hl, worldnews, worldnew...
2    [subs, related, http, goo, gl, ztbbza, news, w...
3    [welcome, politics, read, wiki, politics, inde...
4    [welcome, funny, post, funny, new, reddit, cli...
5    [nsfw, sex, bodily, discharge, filter, yes, ht...
6    [welcome, amitheasshole, http, www, reddit, co...
7    [http, www, reddit, com, aww, newlink, new, re...
8    [join, discord, server, https, discord, gg, tc...
9    [res_sr_config, need, help, relationship, roma...
Name: tokens, dtype: object

In [14]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [15]:
tfidf = TfidfVectorizer(tokenizer=tokenize, min_df=0.1, max_df=0.9, ngram_range=(1, 2))
sparse = tfidf.fit_transform(reddit_data['subreddit_description'])
dtm = pd.DataFrame(sparse.todense(), columns=tfidf.get_feature_names())
dtm.head()

Unnamed: 0,acceptable,accounts,add,advice,allow,allowed,ama,animals,article,articles,...,wiki https,wiki index,wiki reddiquette,wiki rules,witch,word,work,www,www reddit,youtube
0,0.0,0.0,0.0,0.072558,0.0,0.044308,0.0,0.0,0.0,0.0,...,0.0,0.330153,0.036279,0.0,0.0,0.0,0.0,0.318133,0.324561,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188042,0.125339,...,0.0,0.0,0.0,0.07448,0.0,0.0,0.0,0.205051,0.209194,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.084714,0.0,0.0,0.079187,...,0.0,0.0,0.0,0.070583,0.0,0.0,0.0,0.259098,0.148687,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.19805,0.0,0.138873,0.154275,...,0.0,0.330027,0.030221,0.0,0.060442,0.069436,0.0,0.088337,0.064372,0.0
4,0.0,0.040954,0.04653,0.0,0.04653,0.208115,0.0,0.0,0.0,0.0,...,0.050301,0.0,0.0,0.077537,0.0,0.0,0.04653,0.17789,0.145188,0.038103


In [16]:
nn = NearestNeighbors(n_neighbors=10, algorithm='brute')
nn.fit(dtm)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=10, p=2, radius=1.0)

In [17]:
query = ["""PC's are the superior console."""]

In [18]:
looking = tfidf.transform(query)
nn.kneighbors(looking.todense())

(array([[0., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 array([[54,  8, 94, 17, 56, 20,  2, 39, 16, 59]], dtype=int64))

In [19]:
reddit_data['subreddit_name'][59]

'pcmasterrace'