In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from typing import List, Dict
import os

import praw
import re
from dotenv import load_dotenv
import numpy as np
from datetime import datetime, timedelta
from itertools import chain
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load settings from a .env file (python-dotenv) before any code reads the environment.
load_dotenv()
class RedditKeywordMatcher:
    """Rank Reddit posts by TF-IDF cosine similarity to a set of keywords.

    Posts are pulled from the "hot" listing of each requested subreddit,
    vectorized together with the keyword query, and scored against it.
    """

    def __init__(self, client_id: str, client_secret: str, user_agent: str):
        """
        Initialize Reddit client and TF-IDF vectorizer

        Args:
            client_id (str): Reddit API client ID
            client_secret (str): Reddit API client secret
            user_agent (str): Reddit API user agent
        """
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )

        # Unigrams and bigrams, English stop words removed, vocabulary capped
        # at the 5000 most frequent terms.
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2)
        )

    def fetch_subreddit_posts(self, subreddit_name: str, limit: int = 100) -> List[Dict]:
        """
        Fetch posts from a subreddit's "hot" listing

        Args:
            subreddit_name (str): Name of the subreddit
            limit (int): Maximum number of posts to fetch

        Returns:
            List[Dict]: One dict per post with the keys 'title', 'body',
                'url', 'score', 'created_utc' and 'num_comments'.
                'created_utc' is a timezone-aware datetime in UTC.
        """
        # Imported locally so this block does not depend on a file-level import.
        from datetime import timezone

        subreddit = self.reddit.subreddit(subreddit_name)

        return [
            {
                'title': post.title,
                'body': post.selftext,
                'url': f"https://reddit.com{post.permalink}",
                'score': post.score,
                # Without an explicit tz, fromtimestamp() converts to the
                # machine's local time, which contradicts the 'created_utc'
                # key on any host that is not running in UTC.
                'created_utc': datetime.fromtimestamp(post.created_utc, tz=timezone.utc),
                'num_comments': post.num_comments
            }
            for post in subreddit.hot(limit=limit)
        ]

    def find_relevant_posts(self,
                          keywords: List[str],
                          subreddits: List[str],
                          limit: int = 100,
                          min_similarity: float = 0.1) -> pd.DataFrame:
        """
        Find posts relevant to given keywords across specified subreddits

        A subreddit whose fetch fails is reported on stdout and skipped; the
        remaining subreddits are still searched.

        Args:
            keywords (List[str]): List of keywords to match
            subreddits (List[str]): List of subreddits to search
            limit (int): Maximum posts to fetch per subreddit
            min_similarity (float): Minimum cosine similarity score to consider

        Returns:
            pd.DataFrame: Posts whose similarity is at least min_similarity,
                sorted by 'similarity_score' and then 'score', both
                descending. An empty DataFrame if no posts could be fetched.
        """
        all_posts = []

        # Fetch posts from all specified subreddits; one failing subreddit
        # must not abort the whole search.
        for subreddit in subreddits:
            try:
                all_posts.extend(self.fetch_subreddit_posts(subreddit, limit))
            except Exception as e:
                print(f"Error fetching posts from r/{subreddit}: {str(e)}")

        if not all_posts:
            return pd.DataFrame()

        # Combine title and body for text analysis
        posts_text = [f"{post['title']} {post['body']}" for post in all_posts]

        # Create keyword query
        keyword_query = ' '.join(keywords)

        # Fit on posts and query together so they share one vocabulary;
        # the query is the last row of the resulting matrix.
        tfidf_matrix = self.vectorizer.fit_transform(posts_text + [keyword_query])

        # Similarity of the query row against every post row
        similarity_scores = cosine_similarity(
            tfidf_matrix[-1:],
            tfidf_matrix[:-1]
        )[0]

        # Create DataFrame with results
        results_df = pd.DataFrame(all_posts)
        results_df['similarity_score'] = similarity_scores

        # Filter and sort results
        relevant_posts = results_df[results_df['similarity_score'] >= min_similarity]
        return relevant_posts.sort_values(
            by=['similarity_score', 'score'],
            ascending=[False, False]
        )


In [None]:
from dotenv import load_dotenv

# Second load of the .env file; presumably repeated so this cell can run on its own.
load_dotenv()

def main():
    """Run an example keyword search and print the matching posts.

    Reddit API credentials are read from the CLIENT_ID, CLIENT_SECRET and
    USER_AGENT environment variables.
    """
    # No trailing commas here: `x = os.getenv(...),` would bind a one-element
    # tuple instead of the string the Reddit client expects.
    client_id = os.getenv("CLIENT_ID")
    client_secret = os.getenv("CLIENT_SECRET")
    user_agent = os.getenv("USER_AGENT")

    # Initialize the matcher
    matcher = RedditKeywordMatcher(client_id, client_secret, user_agent)

    # Example usage
    keywords = ["python", "machine learning", "data science"]
    subreddits = ["programming", "Python", "learnpython", "datascience"]

    results = matcher.find_relevant_posts(
        keywords=keywords,
        subreddits=subreddits,
        limit=100,
        min_similarity=0.1
    )

    # Display results
    if results.empty:
        print("No relevant posts found.")
        return

    print(f"\nFound {len(results)} relevant posts:")
    for _, post in results.iterrows():
        print(f"\nTitle: {post['title']}")
        print(f"Relevance Score: {post['similarity_score']:.3f}")
        print(f"Reddit Score: {post['score']}")
        print(f"URL: {post['url']}")
        print("-" * 80)

# Run the example search only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()