In [37]:
#Load the packages
import json #working with JSON data
import re #regular expressions and text processing
import torch #deep learning with PyTorch
import nltk  #natural language processing tasks

#importing stopwords and tokenizers from NLTK corpus
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize

#Classes for GPT-2 tokenizer and model from HuggingFace's transformers library
from transformers import GPT2Tokenizer, GPT2Model

#to perform keyword extraction and topic modeling.
from keybert import KeyBERT

In [None]:
# Download the NLTK resources this notebook relies on: the Punkt tokenizer
# models (used by word_tokenize) and the English stopword list.
# Recent NLTK releases look up 'punkt_tab' instead of the older pickled
# 'punkt' package, so both are requested to keep word_tokenize working
# regardless of the installed version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [39]:
# Locations of the JSON datasets.
# NOTE: these are placeholders -- replace them with the real file locations
# before running the cells that read the data.
train_data_path = 'path/to/train.json'  # TODO: training set location
test_data_path = 'path/to/test.json'    # TODO: test set location

In [40]:
# Read the train and test sets; each file is parsed as one JSON document.
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode UTF-8 JSON on some systems.
with open(train_data_path, 'r', encoding='utf-8') as train_file:
    train_data = json.load(train_file)

with open(test_data_path, 'r', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

In [41]:
# Load the pretrained GPT-2 network together with its tokenizer.
model = GPT2Model.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# English stopword set consulted by the text-cleaning step.
stop_words = set(stopwords.words('english'))

In [42]:
#function to strip digits, URLs, and special characters, then drop stopwords
def clean_text(text):
    """Return ``text`` with digits, URLs, punctuation and stopwords removed.

    Three substitutions run in a fixed order: runs of digits, then anything
    starting with ``http`` up to the next whitespace, then every character
    that is neither a word character nor whitespace. The remainder is split
    with ``word_tokenize``; tokens whose lowercase form is in the module-level
    ``stop_words`` set are dropped and the survivors (original casing kept)
    are joined with single spaces.
    """
    stripped = text
    for pattern in (r'\d+', r'http\S+', r'[^\w\s]'):
        stripped = re.sub(pattern, '', stripped)

    kept = []
    for token in word_tokenize(stripped):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [43]:
def preprocess_data(data):
    """Clean the ``'text'`` field of every record in ``data``.

    Args:
        data: iterable of mappings, each providing a ``'text'`` entry.

    Returns:
        A list with one cleaned string per record, in input order.
    """
    return [clean_text(record['text']) for record in data]

# Run the cleaning step over every record of the test set.
cleaned_test_data = preprocess_data(test_data)

In [44]:
# KeyBERT instances keyed by embedding-model name, so the (expensive) model
# load happens once per kernel session instead of once per post.
_KEYBERT_MODELS = {}


def _get_keybert_model(model_name='distilbert-base-nli-mean-tokens'):
    """Return the KeyBERT instance for ``model_name``, creating it on first use."""
    if model_name not in _KEYBERT_MODELS:
        _KEYBERT_MODELS[model_name] = KeyBERT(model_name)
    return _KEYBERT_MODELS[model_name]


def extract_topics_gpt2(text):
    """Extract topic keywords from ``text`` using KeyBERT.

    Despite the name (kept so existing callers keep working), no GPT-2
    computation is performed: a GPT-2 forward pass contributes nothing to the
    returned keywords, which come solely from KeyBERT, so running one per
    post would only cost time.

    Args:
        text: the post to analyse.

    Returns:
        A list of keyword strings in the order KeyBERT returns them (the
        scores KeyBERT attaches to each keyword are discarded). Empty or
        whitespace-only input yields an empty list without loading any model.
    """
    # Nothing to extract from blank input; skip the model entirely.
    if not text or not text.strip():
        return []

    keywords = _get_keybert_model().extract_keywords(text)
    return [keyword for keyword, _ in keywords]

In [45]:
def calculate_relevance(user_profile, post, topics):
    """Score how densely ``post`` mentions the profile and topic keywords.

    Every keyword from ``user_profile`` and ``topics`` is counted as a
    case-insensitive, non-overlapping *substring* of ``post``; the total is
    divided by the number of whitespace-separated words in ``post``.

    Notes:
        * Matching is by substring, so a keyword also matches inside longer
          words, and the score is not bounded by 1.
        * A keyword present in both ``user_profile`` and ``topics`` is
          counted once per occurrence in each list.
        * Blank keywords are ignored: ``str.count('')`` would otherwise
          return ``len(post) + 1`` and swamp the score.

    Args:
        user_profile: iterable of keyword strings describing the user.
        post: the post text.
        topics: iterable of keyword strings extracted for the post.

    Returns:
        ``keyword_count / word_count`` as a float, or ``0`` when the post
        contains no words.
    """
    post_length = len(post.split())  # total number of words in the post
    if post_length == 0:
        return 0

    # Lowercase the post once rather than once per keyword.
    post_lower = post.lower()

    keyword_count = 0
    # Unpacking accepts any iterables (lists, tuples, ...), not only lists.
    for keyword in [*user_profile, *topics]:
        needle = keyword.lower()
        if not needle.strip():
            continue
        keyword_count += post_lower.count(needle)

    return keyword_count / post_length

In [46]:
def recommend_posts(user_profile, posts):
    """Rank ``posts`` by their relevance score for ``user_profile``.

    For each post, topics are extracted with ``extract_topics_gpt2`` and the
    score is computed by ``calculate_relevance``.

    Returns:
        A list of ``(post, relevance_score)`` tuples ordered from highest to
        lowest score.
    """
    def score(post):
        # Topics come from the post itself; the score mixes them with the profile.
        return calculate_relevance(user_profile, post, extract_topics_gpt2(post))

    scored = [(post, score(post)) for post in posts]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

In [47]:
# Demo: rank a small slice of the cleaned test posts for one hand-written profile.
# Profile topics are lowercased up front.
sample_user_profile = [topic.lower() for topic in ('finance', 'retirement savings')]
sample_posts = cleaned_test_data[:10]  # only the first 10 posts, for demonstration

# Score every sample post and order them by relevance.
recommended_posts = recommend_posts(sample_user_profile, sample_posts)

# Print each post under its score, separated by a dashed rule.
for post, relevance_score in recommended_posts:
    print(f"Relevance Score: {relevance_score}\n{post}\n{'-' * 50}")


Relevance Score: 0.22857142857142856
Savings close subsecond account bunch articles Capital One redesigned site could open additional subsecond savings accounts close online done open one Christmas gifts close Christmas life cant see close subsecond savings account remove feature missing
--------------------------------------------------
Relevance Score: 0.21212121212121213
Backdoor ROTH conversion account transfer question Hi second year backdoor ROTH first year create dedicated ROTH account everything forms ampxB year customer rep TD Ameritrade matters telling cash traditional online internal transfer traditional ROTH claim form necessary im converting actual shares ampxB feels bit concerning fill ROTH conversion form case TDA ie paper record backdoor ampxB process every year transfer money traditional ROTH forms ampxB Thanks
--------------------------------------------------
Relevance Score: 0.20833333333333334
Looking open online savings account Hello first time Im posting subreddi