In [None]:
import numpy as np 
import pandas as pd 
import os

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

Read the data and give a short overview

In [None]:
# Load the filtered Reddit user data CSV into a DataFrame.
df = pd.read_csv(
    "../input/subreddit-recommender/reddit_user_data_filtered.csv",
)
# Summary statistics for every column, numeric and non-numeric alike;
# as the last expression of the cell this is rendered as its output.
df.describe(include="all")

In [None]:
# The 20 subreddits that occur most often in the data set.
# value_counts() sorts in descending order, so head(20) is the top 20.
top_subreddits = df['subreddit'].value_counts().head(20)

ax = top_subreddits.plot(kind='barh')
# barh draws the first entry at the bottom; flip the axis so the most
# frequent subreddit is the top bar and the chart reads as a ranking.
ax.invert_yaxis()
# Trailing semicolon keeps the return value of ax.set() out of the cell output.
ax.set(title="Top 20 subreddits by number of rows", xlabel="Rows in data set", ylabel="Subreddit");

### Build a simple recommender based on KNN

In [None]:
# Optional: keep only the first rows of the data set to reduce memory use and runtime.
# df = df.iloc[:500000, :]

# Count matrix of shape (number of subreddits, number of users): one row per
# subreddit, one column per user, each cell holding how many rows of df pair
# that subreddit with that user (0 where the pair never occurs).
df_feature = df.pivot_table(
    index='subreddit',
    columns='user',
    aggfunc='size',
    fill_value=0,
)
# Compressed sparse row copy of the counts, used as the model input.
feature_matrix = csr_matrix(df_feature.to_numpy())

In [None]:
# Brute-force nearest-neighbour search over the subreddit rows using cosine
# distance. n_neighbors=5 is only the default for queries that do not pass
# their own value; n_jobs=-1 asks for all available processors.
model = NearestNeighbors(
    n_neighbors=5,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
model.fit(feature_matrix)

In [None]:
def make_recommendation(subreddit, k=3):
    """Print the ``k`` subreddits closest to ``subreddit``.

    The lookup uses the notebook-level objects ``df_feature`` (row labels),
    ``feature_matrix`` (one row per subreddit) and the fitted ``model``;
    distances are whatever ``model.kneighbors`` reports.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to look up in ``df_feature.index``.
    k : int, default 3
        Maximum number of recommendations to print. Fewer are printed when
        the fitted data holds fewer than ``k`` other subreddits.

    Returns
    -------
    None
        The recommendations are printed, not returned.

    Raises
    ------
    NameError
        If ``subreddit`` is not present in ``df_feature.index``.
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}.")

    try:
        query_pos = df_feature.index.get_loc(subreddit)
    except KeyError:
        # Keep the NameError type existing callers may catch; the KeyError
        # itself adds no information, so suppress the implicit chaining.
        raise NameError(f"Subreddit {subreddit} is unknown.") from None

    # Ask for one extra neighbour because the query is normally returned as
    # its own nearest neighbour, but never for more rows than were fitted
    # (kneighbors raises in that case).
    n_query = min(k + 1, feature_matrix.shape[0])
    neigh_dist, neigh_ind = model.kneighbors(X=feature_matrix[query_pos], n_neighbors=n_query)

    # Remove the query by row position rather than by rank: with tied
    # distances the query is not guaranteed to be the first result, and
    # dropping "the first one" could discard a real recommendation instead.
    # Indexing [0] (single query row) stays 1-D even when only one neighbour
    # is returned, unlike squeeze().
    candidates = [
        (df_feature.index[pos], dist)
        for pos, dist in zip(neigh_ind[0], neigh_dist[0])
        if pos != query_pos
    ]
    # Closest first; the slice also trims the extra neighbour when the query
    # was not among the returned rows.
    result = sorted(candidates, key=lambda pair: pair[1])[:k]

    print(f"Recommendations for Subreddit '{subreddit}':")
    for rank, (name, dist) in enumerate(result, start=1):
        print(f"{rank:>2}. {name:<20} (Distance: {dist:.4f})")

In [None]:
# Subreddit to base the recommendations on.
my_subreddit = "Python"
# Number of recommendations to print.
k = 10

make_recommendation(subreddit=my_subreddit, k=k)
