In [7]:
# Import necessary libraries
import pandas as pd

# Load the interaction data (user ratings and interactions)
interaction_df = pd.read_csv("data/processed/interaction_df.csv")
posts_df = pd.read_csv("data/processed/all_posts_with_features.csv")

# Check for missing values and get an overview of the data
display(interaction_df.head())
display(posts_df.head())

# Check for missing values
print(interaction_df.isnull().sum())
print(posts_df.isnull().sum())

Unnamed: 0,id,post_id,user_id,viewed_at,interaction_type,rating_percent,liked_at,inspired_at,rated_at
0,9447,1256,1,2024-09-24 13:33:57,viewed,,,,
1,9487,1253,1,2024-09-25 07:34:56,viewed,,,,
2,9488,1257,1,2024-09-25 07:36:46,viewed,,,,
3,9489,1258,1,2024-09-25 07:36:47,viewed,,,,
4,9502,1252,1,2024-09-26 15:09:11,viewed,,,,


Unnamed: 0,id,title,upvote_count,view_count,rating_count,average_rating,share_count,video_link,contract_address,chain_id,...,following,category_id,category_name,moods,post_id,total_views,total_likes,total_inspirations,total_ratings,average_rating_features
0,11,Recipe for a flow state,45,62,7,36,8,https://video-cdn.socialverseapp.com/michael_9...,,,...,True,2,Vible,"enthusiasm, engagement, curiosity, passion",11.0,10.0,2.0,0.0,7.0,39.571429
1,12,Why fit in..?,42,54,0,0,0,https://video-cdn.socialverseapp.com/michael_5...,,,...,True,2,Vible,"Anger, Determination, Pain, Empowerment, Isola...",12.0,1.0,2.0,0.0,0.0,0.0
2,13,Transcending Singularity,20,75,45,8,0,https://video-cdn.socialverseapp.com/michael_5...,,,...,True,2,Vible,"Wonder, Intensity, Compassion, Engagement, Con...",13.0,57.0,0.0,0.0,44.0,20.977273
3,14,Peak Performance?,71,95,6,28,0,https://video-cdn.socialverseapp.com/michael_4...,,,...,True,2,Vible,"excitement, passion, contemplation",14.0,4.0,2.0,1.0,6.0,33.333333
4,15,Our Existential Situation,64,70,9,33,0,https://video-cdn.socialverseapp.com/michael_b...,,,...,True,2,Vible,"Concern, Passion, Reflectiveness, Urgency, Hop...",15.0,8.0,1.0,0.0,9.0,37.888889


id                     0
post_id                0
user_id                0
viewed_at           4247
interaction_type       0
rating_percent      6978
liked_at            8547
inspired_at         9571
rated_at            6978
dtype: int64
id                            0
title                       326
upvote_count                  0
view_count                    0
rating_count                  0
average_rating                0
share_count                   0
video_link                    0
contract_address           1183
chain_id                   1183
chart_url                  1183
baseToken                     0
is_locked                     0
created_at                    0
first_name                    0
last_name                     0
username                      0
upvoted                       0
bookmarked                    0
following                     0
category_id                   0
category_name                 0
moods                         0
post_id                   

In [8]:
# Preprocess post data for content-based filtering
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess title and moods
def preprocess_content_data(posts_df):
    # Fill missing values with "Unknown"
    posts_df['title'] = posts_df['title'].fillna("Unknown")
    posts_df['moods'] = posts_df['moods'].fillna("Unknown")
    
    # Text preprocessing: lowercase and remove punctuation
    posts_df['processed_title'] = posts_df['title'].str.lower().str.replace('[^\w\s]','')
    posts_df['processed_moods'] = posts_df['moods'].str.lower().str.replace('[^\w\s]','')
    
    return posts_df

posts_df = preprocess_content_data(posts_df)

# Vectorize title and moods using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
title_tfidf = tfidf.fit_transform(posts_df['processed_title'])
moods_tfidf = tfidf.fit_transform(posts_df['processed_moods'])

# Combine the features (title and moods)
from scipy.sparse import hstack
combined_features = hstack([title_tfidf, moods_tfidf])

# Compute similarity matrix
from sklearn.metrics.pairwise import cosine_similarity
content_similarity_matrix = cosine_similarity(combined_features, combined_features)

# Print content-based similarity matrix
print(content_similarity_matrix.shape)

(1251, 1251)


In [9]:
from sklearn.decomposition import TruncatedSVD

# Create a user-item interaction matrix for collaborative filtering
user_item_matrix = interaction_df.pivot_table(index='user_id', columns='post_id', values='rating_percent', fill_value=0)

# Perform matrix factorization using SVD (Singular Value Decomposition)
svd = TruncatedSVD(n_components=50, random_state=42)
user_item_matrix_svd = svd.fit_transform(user_item_matrix)

# Reconstruct the user-item matrix
user_item_matrix_reconstructed = svd.inverse_transform(user_item_matrix_svd)

# Print the reconstructed matrix
print(user_item_matrix_reconstructed.shape)


(174, 438)


In [10]:
class HybridRecommender:
    def __init__(self, content_similarity_matrix, collaborative_matrix, weight_content=0.5, weight_collaborative=0.5):
        self.content_similarity_matrix = content_similarity_matrix
        self.collaborative_matrix = collaborative_matrix
        self.weight_content = weight_content
        self.weight_collaborative = weight_collaborative
    
    def recommend(self, user_id, top_n=10):
        # Get collaborative filtering recommendations for the user
        user_index = user_id - 1  # Assuming user_id starts from 1
        collaborative_scores = self.collaborative_matrix[user_index]

        # Get content-based recommendations (use similarity matrix for this)
        content_scores = self.content_similarity_matrix[user_index]

        # Combine the content-based and collaborative filtering scores (weighted sum)
        hybrid_scores = self.weight_content * content_scores + self.weight_collaborative * collaborative_scores

        # Get the top N recommendations
        top_recommendations = hybrid_scores.argsort()[-top_n:][::-1]  # Sort and get top N
        
        return top_recommendations

# Initialize the hybrid recommender
hybrid_recommender = HybridRecommender(content_similarity_matrix, user_item_matrix_reconstructed)

# Get recommendations for a specific user
user_id = 1  # Example user_id
top_n = 10
recommendations = hybrid_recommender.recommend(user_id, top_n)

print(f"Top-{top_n} recommendations for user {user_id}: {recommendations}")


ValueError: operands could not be broadcast together with shapes (1251,) (438,) 