In [1]:
pip install praw pandas

Note: you may need to restart the kernel to use updated packages.


In [49]:
import praw
import pandas as pd
import numpy as np
reddit = praw.Reddit(
    client_id="B9WowhFEGBHWEUoF-uvj-w",
    client_secret="UhuEX9ckmGUOCLwSup2nclKEuHt7Dw",
    user_agent="python:subreddit_finder_app:v1.0 (by u/phuckoph8)"
)

In [96]:
import re


def search_subreddits(keywords,limit):
    subreddits = {}
    for keyword in keywords:
        for subreddit in reddit.subreddits.search(keyword, limit=limit):
            # if subreddit.display_name not in subreddits and any(term.lower() in (subreddit.title.lower()) and term.lower() in (subreddit.public_description.lower())
            #         for term in keywords) and (subreddit.subscribers > 10000):
            relevancy_score = calculate_relevancy(subreddit, keywords)
            if (subreddit.subscribers is not None) and subreddit.subscribers > 10000 and relevancy_score > 0: 
                subreddits[subreddit.display_name] = {
                    "name": subreddit.display_name,
                    "title": subreddit.title,
                    "description": subreddit.public_description,
                    "subscribers": subreddit.subscribers,
                    "active_users": subreddit.active_user_count,
                    "relevance": relevancy_score
                }
    sorted_subreddits = {
        k: v
        for k, v in sorted(subreddits.items(), key=lambda item: item[1]["relevance"], reverse=True)
    }
    return dict(list(sorted_subreddits.items())[:10])

def calculate_relevancy(subreddit, keywords):
    keyword_pattern = r'\b(?:' + '|'.join(re.escape(keyword.lower()) for keyword in keywords) + r')\b'
    
    title = subreddit.title.lower()
    description = subreddit.public_description.lower()

    # Count matches in title and description
    title_matches = len(set(re.findall(keyword_pattern, title)))
    description_matches = len(set(re.findall(keyword_pattern, description)))

    # Weighted score (title matches carry more weight)
    score = (2 * title_matches) + description_matches

    return score

In [23]:
from itertools import chain
def fetch_engagement(subreddit_name):
    subreddit = reddit.subreddit(subreddit_name)
    
    posts = chain(subreddit.hot(limit=10), subreddit.new(limit=10))

    
    total_comments = 0
    total_upvotes = 0
    post_count = 0

    for post in posts:
        total_comments += post.num_comments
        total_upvotes += post.score
        post_count += 1
    
    if post_count == 0: 
        return {"avg_comments": 0, "avg_upvotes": 0}

    return {
        "avg_comments": total_comments / post_count,
        "avg_upvotes": total_upvotes / post_count,
    }

In [58]:
from datetime import datetime, timedelta

def get_subreddit_activity_score(subreddit_name, lookback_days=90):
    """
    Calculate the activity score for a subreddit based on recent posts and comments.

    :param subreddit_name: Name of the subreddit.
    :param reddit: An authenticated PRAW Reddit instance.
    :param lookback_days: Time period to analyze (in days).
    :return: A dictionary containing the activity score and relevant stats.
    """
    # Define the lookback period
    end_time = datetime.utcnow()
    start_time = end_time - timedelta(days=lookback_days)
    
    subreddit = reddit.subreddit(subreddit_name)
    total_posts = 0
    total_comments = 0
    recent_post_time = None
    recent_comment_time = None

    # Fetch recent posts
    for submission in subreddit.new(limit=10):  
        post_time = datetime.utcfromtimestamp(submission.created_utc)
        if post_time >= start_time:
            total_posts += 1
            if not recent_post_time or post_time > recent_post_time:
                recent_post_time = post_time
    
    # Fetch recent comments
    for comment in subreddit.comments(limit=1000):
        comment_time = datetime.utcfromtimestamp(comment.created_utc)
        if comment_time >= start_time:
            total_comments += 1
            if not recent_comment_time or comment_time > recent_comment_time:
                recent_comment_time = comment_time

    # Calculate activity score
    recency_score = 100 if ((recent_comment_time is not None) and (recent_post_time is not None) and  (recent_comment_time and recent_post_time >= start_time)) else 0
    engagement_score = (total_posts + total_comments) / (subreddit.subscribers)
    activity_score = (total_posts * 0.4) + (total_comments * 0.4) + (engagement_score * 0.2) + recency_score

    return activity_score


In [89]:

def rank_subreddits(subreddits):
    ranked_list = []
    for name, data in subreddits.items():
        engagement = fetch_engagement(name)
        engagement_score=engagement["avg_comments"] + engagement["avg_upvotes"]
        size_score = data["subscribers"]
        activity_score = get_subreddit_activity_score(name)
            
        ranked_list.append({
            "subreddit": name,
            "engagement": engagement_score,
            "size": size_score,
            "activity": activity_score,
            "relevance": data.get("relevance", 0) 
        })
        
    df = pd.DataFrame(ranked_list)

    for col in ["engagement", "activity", "relevance"]:
        df[f"{col}_std"] = (df[col] - df[col].mean()) / df[col].std()

    # Calculate final weighted score
    weights = {"engagement_std": 0.3, "activity_std": 0.3, "relevance_std": 0.4}
    
    df["score"] = (
        weights["engagement_std"] * df["engagement_std"] +
        weights["activity_std"] * df["activity_std"] +
        weights["relevance_std"] * df["relevance_std"]
    )

    top_subreddits = df.sort_values("score", ascending=False).head(5)
    return top_subreddits

In [97]:
product_keywords = ["shoes", "football","nike","reebok","neymar","expensive","tournament", "india"]
subreddits = search_subreddits(product_keywords,50)
subreddits

{'IndianFootball': {'name': 'IndianFootball',
  'title': 'Football in India',
  'description': "This subreddit is dedicated to all things Indian Football, ISL, I-League, Indian men's and women's national team, youth teams, Santosh Trophy, Durand Cup, Super Cup, and everything in between",
  'subscribers': 213786,
  'active_users': None,
  'relevance': 5},
 'Shoes': {'name': 'Shoes',
  'title': 'Shoes? Shoes!!!',
  'description': "Shoes are so much more than a covering for a foot. Shoes are art and creativity. Shoes are distinction and class. Shoes can be an accessory or a lifestyle.\n\nLet's talk shoes!",
  'subscribers': 523575,
  'active_users': None,
  'relevance': 3},
 'RedWingShoes': {'name': 'RedWingShoes',
  'title': 'Red Wing Shoes',
  'description': 'Discuss Red Wing Boots and Shoes',
  'subscribers': 61616,
  'active_users': None,
  'relevance': 3},
 'Nike': {'name': 'Nike',
  'title': 'NIKE',
  'description': 'A community to post, appreciate, and discuss Nike. Please be resp

In [98]:
top_subreddits = rank_subreddits(subreddits)
top_subreddits

Unnamed: 0,subreddit,engagement,size,activity,relevance,engagement_std,activity_std,relevance_std,score
0,IndianFootball,145.95,213786,498.400932,5,0.388442,0.337735,2.84605,1.356273
7,footballmanagergames,496.5,444337,502.000452,3,2.730719,0.647899,-0.316228,0.887094
5,Fantasy_Football,72.8,165307,500.401211,3,-0.100326,0.510095,-0.316228,-0.00356
2,RedWingShoes,31.4,61616,503.603275,3,-0.376949,0.786011,-0.316228,-0.003773
8,footballcards,34.9,167288,503.201205,3,-0.353563,0.751365,-0.316228,-0.007151
