In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from  data_preprocessing_class import TextProcessor
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import ast
import warnings

warnings.filterwarnings("ignore")

In [2]:
reddit_api_info_nw_insights = pd.read_csv("subreddit_data_info_nw_insights.csv")
reddit_api_subreddit_add_info = pd.read_csv("subreddit_data_add_info.csv")
reddit_network_data = pd.read_csv("subreddit_network_data.csv")

In [3]:
merged_data = pd.merge(reddit_api_info_nw_insights, reddit_api_subreddit_add_info, on="Subreddit")
merged_data = pd.merge(merged_data, reddit_network_data, on="Subreddit")
merged_data = merged_data.loc[:, ~merged_data.columns.str.contains('^Unnamed')]

In [4]:
subreddit_desc_df = merged_data[['Subreddit', 'Subreddit Description']].copy()

processor = TextProcessor()
subreddit_desc_df = processor.text_preprocessed(subreddit_desc_df, 'Subreddit Description')

tfidf = TfidfVectorizer(stop_words='english', binary=True)
tfidf_matrix = tfidf.fit_transform(subreddit_desc_df['text_stemmed_no_stopped'])

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [5]:
# Function to get recommendations based on similarity
def recommend_subreddits_by_topic(topic, tfidf_vectorizer, tfidf_matrix, df, top_n=5):
    topic_vector = tfidf_vectorizer.transform([topic])

    cosine_similarities = cosine_similarity(topic_vector, tfidf_matrix).flatten()

    top_indices = cosine_similarities.argsort()[-top_n:][::-1]

    recommended_subreddits = df.iloc[top_indices].copy()

    return recommended_subreddits['Subreddit']

In [6]:
def preprocess_topic(text):
    tokens = word_tokenize(text.lower())

    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    words = [word for word in stripped if word.isalpha()]

    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]

    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in words]
    return ' '.join(stemmed)

topic = "Foxes are lovable."
preprocessed_topic = preprocess_topic(topic)

topic_vector = tfidf.transform([preprocessed_topic]).toarray()

try:
    preprocessed_topic = preprocess_topic(topic)
    recommendations = recommend_subreddits_by_topic(preprocessed_topic, tfidf, tfidf_matrix, subreddit_desc_df, 5)
except ValueError as e:
    print(e)

In [7]:
posts_df = merged_data[["Subreddit", "Top 10 Posts"]]
posts_df["Top 10 Posts"] = posts_df["Top 10 Posts"].apply(ast.literal_eval)
expanded_posts_df = posts_df.explode("Top 10 Posts").reset_index(drop=True)

post_authors_df = merged_data[["Subreddit", "PostAuthors"]]
post_authors_df["PostAuthors"] = post_authors_df["PostAuthors"].apply(ast.literal_eval)
expanded_post_authors_df = post_authors_df.explode("PostAuthors").reset_index(drop=True)

post_interactions_df = pd.merge(expanded_posts_df, expanded_post_authors_df, left_index=True, right_index=True)
post_interactions_df.drop(columns=["Subreddit_y"], inplace=True)
post_interactions_df.rename(columns={"Subreddit_x": "Subreddit"}, inplace=True)

interaction_matrix = post_interactions_df.groupby(['PostAuthors', 'Subreddit']).size().unstack(fill_value=0)

interaction_sample = csr_matrix(interaction_matrix)

csr_sample = csr_matrix(interaction_matrix)

knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_sample)

random_users = np.random.choice(post_interactions_df['PostAuthors'].unique(), size=5, replace=False)
print("Selected random users:", random_users)

Selected random users: ['MattyIce6969' 'Exclamation_Marc' 'KiwiMurky736' 'LudovicoSpecs'
 'Daft-Vader']


In [8]:
def hybrid_recommendation(user_id, tfidf_matrix, interaction_matrix, subreddit_desc_df, knn_model, tfidf_vectorizer, top_n=5):
    user_interaction = csr_sample[interaction_matrix.index.get_loc(user_id)]
    distances, indices = knn_model.kneighbors(user_interaction, n_neighbors=20)
    indices = indices.flatten()[1:]
    distances = distances.flatten()[1:]

    collab_subreddits = set()
    for idx in indices:
        neighbor_subreddits = interaction_matrix.columns[interaction_matrix.iloc[idx] > 0]
        collab_subreddits.update(neighbor_subreddits)
    collab_subreddits -= set(interaction_matrix.columns[interaction_matrix.iloc[interaction_matrix.index.get_loc(user_id)] > 0])

    all_subreddit_descriptions = " ".join(subreddit_desc_df['Subreddit Description'].tolist())
    content_recs = recommend_subreddits_by_topic(all_subreddit_descriptions, tfidf_vectorizer, tfidf_matrix, subreddit_desc_df, top_n=top_n)

    combined_recommendations = list(collab_subreddits.union(set(content_recs)))
    return combined_recommendations[:top_n]

In [9]:
for user_id in random_users:
    hybrid_recs = hybrid_recommendation(user_id, tfidf_matrix, interaction_matrix, subreddit_desc_df, knn, tfidf, top_n=10)
    print(f"Hybrid recommendations for user {user_id}: {hybrid_recs}")

Hybrid recommendations for user MattyIce6969: ['WritingPrompts', 'cuteanimals', 'reddeadredemption', 'AnimalCollective', 'RoundAnimals', 'hitmanimals', 'mildlyinteresting', 'todayilearned', 'animalid', 'teenagers']
Hybrid recommendations for user Exclamation_Marc: ['WritingPrompts', 'cuteanimals', 'reddeadredemption', 'RoundAnimals', 'mildlyinteresting', 'todayilearned', 'animalid', 'teenagers', 'glassanimals', 'pinkfloyd']
Hybrid recommendations for user KiwiMurky736: ['WritingPrompts', 'cuteanimals', 'reddeadredemption', 'RoundAnimals', 'AnimalsWithoutNecks', 'mildlyinteresting', 'animalid', 'explainlikeimfive', 'teenagers', 'glassanimals']
Hybrid recommendations for user LudovicoSpecs: ['WritingPrompts', 'cuteanimals', 'reddeadredemption', 'RoundAnimals', 'mildlyinteresting', 'RimWorld', 'todayilearned', 'ImaginaryAnimals', 'animalid', 'teenagers']
Hybrid recommendations for user Daft-Vader: ['AnimalsFailing', 'WritingPrompts', 'cuteanimals', 'reddeadredemption', 'RoundAnimals', 'An