In [67]:
import praw
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from typing import List, Dict
from datetime import datetime
import os 
from dotenv import load_dotenv



load_dotenv()


reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT"),
)
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 3),  # Include up to 3-word phrases
    analyzer='word',
    token_pattern=r'(?u)\b\w+\b'  # Include single-character words
)
        
def fetch_reddit_posts(search_query: str, limit) -> List[Dict]:
    """
    Fetch posts from all of Reddit based on search query
    
    Args:
        search_query (str): Search query string
        limit (int): Maximum number of posts to fetch
        
    Returns:
        List[Dict]: List of posts with their details
    """
    posts = []
    
    # Search across all of Reddit
    for submission in reddit.subreddit("all").search(
        search_query, 
        sort='relevance', 
        time_filter='month',
        limit=limit
    ):
        posts.append({
            'title': submission.title,
            'body': submission.selftext,
            'url': f"https://reddit.com{submission.permalink}",
            'score': submission.score,
            'created_utc': datetime.fromtimestamp(submission.created_utc),
            'num_comments': submission.num_comments,
            'subreddit': submission.subreddit.display_name
        })
    
    return posts

def find_relevant_posts(keywords: List[str], 
                        limit, 
                        min_similarity) -> pd.DataFrame:
    """
    Find posts relevant to given keywords across all of Reddit
    
    Args:
        keywords (List[str]): List of keywords to match
        limit (int): Maximum posts to fetch
        min_similarity (float): Minimum cosine similarity score to consider
        
    Returns:
        pd.DataFrame: Sorted dataframe of relevant posts
    """
    # Create search query from keywords
    search_query = ' OR '.join(f'"{kw}"' for kw in keywords)
    
    # Fetch posts using Reddit's search
    all_posts = fetch_reddit_posts(search_query, limit=limit)
    
    if not all_posts:
        return pd.DataFrame()
    
    # Combine title and body for text analysis
    posts_text = [f"{post['title']} {post['body']}" for post in all_posts]
    
    # Create keyword query for TF-IDF
    keyword_query = ' '.join(keywords)
    
    # Vectorize posts and keywords
    tfidf_matrix = vectorizer.fit_transform(posts_text + [keyword_query])
    
    # Calculate similarity scores
    similarity_scores = cosine_similarity(
        tfidf_matrix[-1:], 
        tfidf_matrix[:-1]
    )[0]
    
    # Create DataFrame with results
    results_df = pd.DataFrame(all_posts)
    results_df['similarity_score'] = similarity_scores
    
    # Filter and sort results
    relevant_posts = results_df[results_df['similarity_score'] >= min_similarity]
    relevant_posts = relevant_posts.sort_values(
        by=['similarity_score', 'score'], 
        ascending=[False, False]
    )
    
    return relevant_posts



In [68]:
keywords = ["ayurveda", "skin care"]

results = find_relevant_posts(
    keywords=keywords,
    limit=10000,
    min_similarity=0.3
)

if not results.empty:
    print(f"\nFound {len(results)} relevant posts:")
    for idx, post in results.iterrows():
        print(f"\nTitle: {post['title']}")
        print(f"Subreddit: r/{post['subreddit']}")
        print(f"Relevance Score: {post['similarity_score']:.3f}")
        print(f"Reddit Score: {post['score']}")
        print(f"URL: {post['url']}")
        print("-" * 80)
else:
    print("No relevant posts found.")


Found 5 relevant posts:

Title: Kriti Sanon Skin Care...
Subreddit: r/KritiSanonn
Relevance Score: 0.503
Reddit Score: 88
URL: https://reddit.com/r/KritiSanonn/comments/1ho52s0/kriti_sanon_skin_care/
--------------------------------------------------------------------------------

Title: Which stream is better, homeopathy or ayurveda 
Subreddit: r/MEDICOreTARDS
Relevance Score: 0.398
Reddit Score: 1
URL: https://reddit.com/r/MEDICOreTARDS/comments/1hv0d3z/which_stream_is_better_homeopathy_or_ayurveda/
--------------------------------------------------------------------------------

Title: Has anyone tried Ayurveda ?
Subreddit: r/UlcerativeColitis
Relevance Score: 0.333
Reddit Score: 0
URL: https://reddit.com/r/UlcerativeColitis/comments/1haz610/has_anyone_tried_ayurveda/
--------------------------------------------------------------------------------

Title: What’s Ayurveda Like in the EU? Let’s Chat! 🌿✨
Subreddit: r/Ayurveda
Relevance Score: 0.310
Reddit Score: 8
URL: https://reddit.

In [43]:
import praw
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from datetime import datetime
import re

load_dotenv()


reddit_instance = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT"),
)

def preprocess_text(text):
    """Clean and standardize text for better matching"""
    # Convert to lowercase
    text = text.lower()
    # Remove special characters but keep spaces between words
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

def search_reddit(reddit_instance, keywords, limit, min_similarity):
    """
    Search Reddit for posts matching keywords
    Keywords can be single words or phrases
    """
    # Preprocess keywords
    processed_keywords = [preprocess_text(kw) for kw in keywords]
    
    # Create Reddit search query (keep original format for Reddit's search)
    search_query = ' OR '.join(f'"{kw}"' for kw in keywords)
    posts = []

    try:
        print("Fetching posts from Reddit...")
        for submission in reddit_instance.subreddit("all").search(
            search_query,
            sort='relevance',
            time_filter='month',
            limit=limit
        ):
            # Preprocess title and body
            processed_title = preprocess_text(submission.title)
            processed_body = preprocess_text(submission.selftext)
            
            # Calculate initial keyword matches
            title_matches = sum(1 for kw in processed_keywords if kw in processed_title)
            body_matches = sum(1 for kw in processed_keywords if kw in processed_body)
            
            posts.append({
                'title': submission.title,
                'body': submission.selftext,
                'processed_text': f"{processed_title} {processed_body}",
                'url': f"https://reddit.com{submission.permalink}",
                'score': submission.score,
                'created_utc': datetime.fromtimestamp(submission.created_utc),
                'num_comments': submission.num_comments,
                'subreddit': submission.subreddit.display_name,
                'keyword_matches': title_matches + body_matches
            })

        if not posts:
            return pd.DataFrame()

        # Prepare documents for TF-IDF
        # Include both original keywords and their individual words
        keyword_docs = []
        for kw in keywords:
            # Add the full phrase
            keyword_docs.append(kw)
        
        # Combine all keyword variations into one query document
        keyword_query = ' '.join(keyword_docs)
        
        # Get processed text from posts
        posts_text = [post['processed_text'] for post in posts]

        # Initialize TF-IDF vectorizer with specific settings for multi-word handling
        vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 3),  # Include up to 3-word phrases
            analyzer='word',
            token_pattern=r'(?u)\b\w+\b'  # Include single-character words
        )

        # Vectorize posts and keywords
        tfidf_matrix = vectorizer.fit_transform(posts_text + [keyword_query])
        
        # Calculate similarity scores
        similarity_scores = cosine_similarity(
            tfidf_matrix[-1:], 
            tfidf_matrix[:-1]
        )[0]

        # Create DataFrame with results
        results_df = pd.DataFrame(posts)
        results_df['similarity_score'] = similarity_scores
        
        # Adjust similarity scores based on exact keyword matches
        results_df['adjusted_score'] = results_df.apply(
            lambda x: x['similarity_score'] * (1 + 0.1 * x['keyword_matches']), 
            axis=1
        )

        # Filter and sort results
        min_similarity = min_similarity
        relevant_posts = results_df[results_df['adjusted_score'] >= min_similarity]
        return relevant_posts.sort_values(
            by=['adjusted_score', 'score'], 
            ascending=[False, False]
        )

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return pd.DataFrame()

In [47]:
    
keywords = ["Patient portal", "HIPAA compliance", "Virtual consultations", "Healthcare analytics"]
results = search_reddit(reddit, keywords,10000, 0.3)

if not results.empty:
    print("\nTop matching posts:")
    for idx, post in results.iterrows():
        print(f"\nTitle: {post['title']}")
        print(f"Relevance Score: {post['adjusted_score']:.3f}")
        print(f"Keyword Matches: {post['keyword_matches']}")
        print(f"URL: {post['url']}")

Fetching posts from Reddit...

Top matching posts:

Title: What are the key steps to effectively manage HIPAA compliance in a small healthcare practice?
Relevance Score: 0.365
Keyword Matches: 2
URL: https://reddit.com/r/hipaa/comments/1hkljvt/what_are_the_key_steps_to_effectively_manage/

Title: Texas A&M University-Mays Business School and Humana Announce Winners of Eighth Annual Healthcare Analytics Case Competition
Relevance Score: 0.338
Keyword Matches: 1
URL: https://reddit.com/r/Quantisnow/comments/1hhv4u3/texas_am_universitymays_business_school_and/

Title: Are you in Healthcare analytics role ?
Relevance Score: 0.334
Keyword Matches: 1
URL: https://reddit.com/r/ITCareerQuestions/comments/1hlkf07/are_you_in_healthcare_analytics_role/

Title: FORA | Forian Partners With Databricks to Expand Access to Advanced Healthcare Analytics
Relevance Score: 0.331
Keyword Matches: 1
URL: https://reddit.com/r/StockTitan/comments/1hfrkho/fora_forian_partners_with_databricks_to_expand/

Title: