# Subreddit Recommender
### Exploratory Data Analysis

In [63]:
# DEPENDENCIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from fuzzywuzzy import fuzz


In [7]:
# GLOBAL VARIABLES


In [12]:
# IMPORT THE DATA

try:
    # import the data from csv
    data = pd.read_csv('reddit_user_subreddit.csv')
    
except:
    # import and run the sql query
    connection = sqlite3.connect('reddit_data.db')
    
    data = pd.read_sql_query(
        """
        SELECT O.username_id, O.subreddit_id, COUNT(O.utc) AS visits
            FROM
                Observation O
            GROUP BY O.username_id, O.subreddit_id
            ;
        """,
        connection
        )

    data.to_csv('reddit_user_subreddit.csv', index = False)
    
#             WHERE S.subreddit_id = O.subreddit_id
#                 Subreddit S,
    

In [14]:
# IMPORT OUR SUBREDDIT MAPPER
try: 
    # a second table to map the subreddit_id to the subreddit name, to be accessed after our recommender
    mapper_df = pd.read_csv('subreddit_mapper.csv')
    
except:
    connection = sqlite3.connect('reddit_data.db')
    mapper_df = pd.read_sql_query(
        """
        SELECT S.subreddit_id, S.subreddit FROM Subreddit S;
        """, connection)
    
    mapper_df.to_csv('subreddit_mapper.csv', index = False)

In [48]:
#Creating dictionary for subreddits
def create_Dict_Mapper(key_series, value_series):
    dict_subreddits = key_series
    dict_subreddits.index = value_series
    dict_subreddits = dict_subreddits.to_dict()
#     inv_dict_sub = {v: k for k, v in dict_subreddits.items()}
    return dict_subreddits

In [51]:
category_num_mapper = create_Dict_Mapper(mapper_df['subreddit_id'], mapper_df['subreddit'])

In [None]:
# CONSIDER CREATING A HOLDOUT DATASET FOR TESTING!!!

In [20]:
data2 = data.copy()

In [25]:
def create_Usage_Col(df, col_users = 'username_id', col_visits = 'visits'):
    df['usage'] = 0
    for user in df[col_users].unique():
        
        
        return df

In [26]:
data_test = create_Usage_Col(data2)
data_test.head()

Unnamed: 0,username_id,subreddit_id,visits,usage
0,0,412,2,0.006116
1,0,568,1,0.003058
2,0,608,7,0.021407
3,0,1402,14,0.042813
4,0,1460,9,0.027523


In [39]:
# look for users that only have 1 subreddit they've posted and drop?

data_test[data_test['usage'] > 0.45]['usage'] 

# so here, we see none of the subreddits have taken up over 45% of a user's engagement.

Series([], Name: usage, dtype: float64)

In [40]:
data_test.head()

Unnamed: 0,username_id,subreddit_id,visits,usage
0,0,412,2,0.006116
1,0,568,1,0.003058
2,0,608,7,0.021407
3,0,1402,14,0.042813
4,0,1460,9,0.027523


In [41]:
# function takes in our usage dataframe and transforms it to a pivot table
def to_matrix(df, col_as_index = 'subreddit_id', col_as_col = 'username_id', values_wanted='usage'):
    return df.pivot(
        index = col_as_index,
        columns = col_as_col,
        values = values_wanted
    ).fillna(0)


In [36]:
df_user_usage = to_matrix(data_test) # test my function

In [37]:
df_user_usage.head()

username_id,0,1,2,3,4,5,6,7,8,9,...,22600,22601,22602,22603,22604,22605,22606,22607,22608,22609
subreddit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
from scipy.sparse import csr_matrix

mat_user_usage = csr_matrix(df_user_usage.values) # compress our matrix, build this into a function as well

mat_user_usage

<34967x22610 sparse matrix of type '<class 'numpy.float64'>'
	with 48 stored elements in Compressed Sparse Row format>

In [42]:
# mat_df = pd.DataFrame(mat_user_usage)

print(mat_user_usage)

  (412, 0)	0.0061162079510703364
  (568, 0)	0.0030581039755351682
  (608, 0)	0.021406727828746176
  (1402, 0)	0.04281345565749235
  (1460, 0)	0.027522935779816515
  (3716, 0)	0.0030581039755351682
  (4837, 0)	0.0030581039755351682
  (5886, 0)	0.009174311926605505
  (5993, 0)	0.0030581039755351682
  (6105, 0)	0.021406727828746176
  (10905, 0)	0.07033639143730887
  (11086, 0)	0.0061162079510703364
  (12002, 0)	0.0030581039755351682
  (12225, 0)	0.009174311926605505
  (13881, 0)	0.012232415902140673
  (15204, 0)	0.0061162079510703364
  (15883, 0)	0.009174311926605505
  (16685, 0)	0.3577981651376147
  (17545, 0)	0.0061162079510703364
  (17705, 0)	0.0030581039755351682
  (18385, 0)	0.009174311926605505
  (18498, 0)	0.03363914373088685
  (18862, 0)	0.0030581039755351682
  (19569, 0)	0.012232415902140673
  (20409, 0)	0.024464831804281346
  (20872, 0)	0.012232415902140673
  (21135, 0)	0.0030581039755351682
  (21142, 0)	0.07033639143730887
  (22036, 0)	0.0061162079510703364
  (23050, 0)	0.01223

In [76]:
from sklearn.neighbors import NearestNeighbors

# Initialize the Model
model_knn = NearestNeighbors(n_neighbors=25) #metric='cosine', algorithm='brute', , n_jobs=-1

# fit the dataset
model_knn.fit(mat_user_usage)



NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=25, p=2,
                 radius=1.0)

In [74]:
def fuzzy_matching(mapper, subreddit):
    match_tuple = []
     # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), subreddit.lower())
            
        if ratio >= 70:
            match_tuple.append((title, idx, ratio))
            match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
            
    if len(match_tuple) == 0:
             print('Oops! No match is found')
             return
    else:
             print(f"Found possible matches in our database: {[x[0] for x in match_tuple]}\n")
             return match_tuple[0][1]
    
#      “””
#      return the closest match via fuzzy ratio. If no match found, return None

#      Parameters
#      — — — — — 
#      mapper: dict, map movie title name to index of the movie in data
#     fav_movie: str, name of user input movie

#      verbose: bool, print log if True
#     Return
#      — — — 
#      index of the closest match
#      “””
     

In [72]:
def make_recommendation(data, mapper, model, subreddit, n_recommendations = 5):
    
    # fit
    model.fit(data)
    
    # get input movie index
    print('You have input subreddit:', subreddit)
    
    idx = fuzzy_matching(
        mapper,
        subreddit
    )

    print('Recommendation system start to make inference')
    print('......\n')
    
    distances, indices = model.kneighbors(
        data[idx],
        n_neighbors=n_recommendations+1
    )
    
    # get list of raw idx of recommendations
    
    raw_recommends = sorted(
        list(
            zip(
                indices.squeeze().tolist(),
                distances.squeeze().tolist()
            )
        ),
        key=lambda x: x[1]
    )[:0:-1]
    
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    
    # print recommendations
    print(f"Recommendations for {subreddit}:")
    
    for i, (idx, dist) in enumerate(raw_recommends):
        
        print(f"{i+1}: {reverse_mapper[idx]}, with distance of {dist}")

In [77]:
make_recommendation(
    model=model_knn,
    data=mat_user_usage,
    subreddit="skateboarding",
    mapper=category_num_mapper,
    n_recommendations=5
)

You have input subreddit: skateboarding
Found possible matches in our database: ['skateboarding', 'Kiteboarding', 'skimboarding', 'ElectricSkateboarding', 'seasteading', 'snowboarding', 'KeyboardMashing']

Recommendation system start to make inference
......

Recommendations for skateboarding:
1: eighthdoctor, with distance of 0.0
2: effects, with distance of 0.0
3: ehlersdanlos, with distance of 0.0
4: eggs, with distance of 0.0
5: effzeh, with distance of 0.0
