In [1]:
import gzip
import bz2
import csv
import pandas as pd
from collections import defaultdict
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
import numpy as np

In [2]:
def readGz(path):
    """Yield one Python object per line of a compressed text file.

    Args:
        path: File to read. A path ending in ``'bz2'`` is opened with
            ``bz2``; any other path is opened with ``gzip``.

    Yields:
        The value obtained by evaluating each line as a Python expression.
    """
    unzipper = bz2 if path.endswith('bz2') else gzip
    # The context manager releases the file handle when the generator is
    # exhausted, closed, or a line fails to evaluate.
    with unzipper.open(path, 'rt') as f:
        for l in f:
            # SECURITY: eval() executes whatever code the file contains.
            # Only use on trusted files; if the lines are plain literals,
            # ast.literal_eval would be the safer choice.
            yield eval(l)

In [3]:
def readCSV(path):
    """Yield each data row of a gzip-compressed CSV file as a dict.

    The first row is taken as the header; every following row is paired
    with it positionally.

    Args:
        path: Path to a gzip-compressed CSV file.

    Yields:
        dict mapping header names to the string values of one row. An empty
        file yields nothing.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation recommends; `with` guarantees the handle is closed.
    with gzip.open(path, 'rt', newline='') as f:
        c = csv.reader(f)
        header = next(c, None)
        if header is None:
            # Empty file: finish cleanly instead of letting StopIteration
            # escape from the generator body.
            return
        for l in c:
            # zip() stops at the shorter sequence, so short rows produce
            # dicts with missing keys and extra trailing fields are dropped.
            # NOTE(review): rows whose field count differs from the header
            # may end up with values under the wrong key -- confirm whether
            # such rows should be skipped.
            yield dict(zip(header, l))

In [4]:
# Stream the submissions file and keep only the records that carry both a
# non-empty username and a non-empty subreddit.
data = [
    row
    for row in readCSV('redditSubmissions.csv.gz')
    if len(row.get('username', '')) > 0 and len(row.get('subreddit', '')) > 0
]
len(data)

112048

In [5]:
# Display the first kept record to see which fields a row carries.
data[0]

{'#image_id': '0',
 'unixtime': '1333172439',
 'rawtime': '2012-03-31T12:40:39.590113-07:00',
 'title': "And here's a downvote.",
 'total_votes': '63470',
 'reddit_id': 'rmqjs',
 'number_of_upvotes': '32657',
 'subreddit': 'funny',
 'number_of_downvotes': '30813',
 'localtime': '1333197639',
 'score': '1844',
 'number_of_comments': '622',
 'username': 'Animates_Everything'}

In [6]:
# Sum, for every (username, subreddit) pair, the upvote share
# (upvotes / total votes) of each of that user's posts in that subreddit.
# Posts without any votes contribute 0.
user_posts = defaultdict(int)
for d in data:
    user, subreddit = d['username'], d['subreddit']
    votes = int(d['total_votes'])
    # Author's note: number_of_posts, score, or number_of_upvotes could be
    # used as the engagement signal instead.
    engagement = int(d['number_of_upvotes']) / votes if votes > 0 else 0
    user_posts[user, subreddit] += engagement

In [7]:
# Flatten the {(username, subreddit): score} mapping into
# [username, subreddit, score] rows.
data_li = [[key[0], key[1], score] for key, score in user_posts.items()]

In [8]:
# One row per (username, subreddit) pair with its accumulated score.
df = pd.DataFrame(data_li, columns=['username', 'subreddit', 'score'])

In [9]:
# Store both label columns as pandas categoricals; their integer category
# codes are used later as matrix indices.
df = df.astype({'subreddit': 'category', 'username': 'category'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76197 entries, 0 to 76196
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   username   76197 non-null  category
 1   subreddit  76197 non-null  category
 2   score      76197 non-null  float64 
dtypes: category(2), float64(1)
memory usage: 3.6 MB


In [10]:
# Preview the first ten (username, subreddit, score) rows.
df.head(10)

Unnamed: 0,username,subreddit,score
0,Animates_Everything,funny,1.030085
1,Gangsta_Raper,GifSound,3692.46476
2,Hellothereawesome,gifs,0.425
3,HadManySons,pics,0.45
4,TraumaticASH,gifs,0.47619
5,MidgetDance1337,gifs,0.774908
6,Pazzaz,funny,0.542972
7,koolkows,funny,0.380952
8,Moncole,funny,0.583333
9,shortguy014,gaming,0.380952


In [11]:
# Sparse matrix with one row per subreddit category code and one column per
# username category code; each stored value is that pair's `score`.
comments = coo_matrix((df['score'].astype(float), 
                   (df['subreddit'].cat.codes, 
                    df['username'].cat.codes)))

In [12]:
# Convert the COO matrix to CSR format, which is what gets weighted and
# passed to the model.
csr_comments = comments.tocsr()

In [13]:
# Fit a 100-factor ALS model for 20 iterations on the BM25-weighted matrix.
# NOTE(review): no random seed is set in this notebook, so the learned
# factors may differ between runs -- confirm and seed if reproducibility
# matters.
# NOTE(review): the matrix rows are subreddits and the columns are users;
# confirm this is the orientation `fit` expects in the installed version of
# `implicit`.
model = AlternatingLeastSquares(factors=100, iterations=20)
model.fit(bm25_weight(csr_comments))

100%|██████████| 20/20 [00:45<00:00,  2.25s/it]


In [14]:
# Pull the two learned factor matrices out of the fitted model.
subreddit_factors = model.item_factors
user_factors = model.user_factors

In [15]:
class RecommenderSystem(object):
    """Cosine-similarity lookup over subreddit latent factors.

    Args:
        subreddit_factors: 2-D array with one factor vector per subreddit.
        subreddits: Optional sequence of subreddit names, one per row of
            ``subreddit_factors``. When omitted, the categories of the
            notebook-level ``df['subreddit']`` column are used.

    Raises:
        ValueError: If the number of names differs from the number of
            factor rows.
    """

    def __init__(self, subreddit_factors, subreddits=None):
        subreddit_factors = np.asarray(subreddit_factors)
        norms = np.linalg.norm(subreddit_factors, axis=-1)
        # An all-zero factor row has norm 0; divide it by 1 instead so it
        # stays all-zero rather than turning into NaN.
        norms[norms == 0] = 1
        self.factors = subreddit_factors / norms[:, np.newaxis]

        if subreddits is None:
            # Backward-compatible default: read names from the global frame.
            subreddits = df['subreddit'].cat.categories.array.to_numpy()
        self.subreddits = np.asarray(subreddits)

        if len(self.subreddits) != len(self.factors):
            raise ValueError(
                "got %d subreddit names for %d factor rows"
                % (len(self.subreddits), len(self.factors))
            )

    def get_related(self, subreddit, N=10):
        """Return the ``N`` subreddits most similar to ``subreddit``.

        Args:
            subreddit: Name of the subreddit to look up.
            N: Maximum number of results. Values larger than the number of
                known subreddits are clamped; values <= 0 give ``[]``.

        Returns:
            List of ``(name, cosine_similarity)`` tuples sorted by
            decreasing similarity. The queried subreddit itself is included.

        Raises:
            IndexError: If ``subreddit`` is not a known name.
        """
        matches = np.flatnonzero(self.subreddits == subreddit)
        if matches.size == 0:
            raise IndexError("unknown subreddit: %r" % (subreddit,))
        subredditid = matches[0]

        # Rows are unit-normalised, so the dot product is cosine similarity.
        scores = self.factors.dot(self.factors[subredditid])

        # Clamp N: argpartition rejects k larger than the array, and a
        # slice of [-0:] would return every row instead of none.
        N = min(N, len(scores))
        if N <= 0:
            return []

        best = np.argpartition(scores, -N)[-N:]
        best_ = [self.subreddits[i] for i in best]
        return sorted(zip(best_, scores[best]), key=lambda x: -x[1])

In [16]:
# Build the similarity lookup from the learned subreddit factors.
top_related_subreddits = RecommenderSystem(subreddit_factors)


In [22]:
# Show the 20 subreddits most similar to 'CrappyDesign'; the query itself is
# part of the result.
top_related_subreddits.get_related('CrappyDesign', 20)

[('CrappyDesign', 1.0),
 ('Art', 0.9258941),
 ('chicago', 0.8314631),
 ('nope', 0.6441259),
 ('gamegrumps', 0.4420331),
 ('linux_gaming', 0.39905274),
 ('soccer', 0.37757373),
 ('Over 9000', 0.37359935),
 ('shittyfoodporn', 0.37307596),
 ('weareallfriendshere', 0.3685921),
 ('FoodPorn', 0.34834355),
 ('FinalFantasy', 0.34477356),
 ('Watches', 0.3446886),
 ('lostgeneration', 0.34303454),
 ('Pets', 0.34283724),
 ('golf', 0.3383833),
 ('Modern technology at its pinnacle', 0.33748755),
 ('Rabbits', 0.3368399),
 ('Boss', 0.33577192),
 ('runescape', 0.3223161)]