In [None]:
!pip install nltk



In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk import corpus
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import wsd
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from sklearn.neighbors import NearestNeighbors

# Fetch the NLTK resources this notebook relies on.
# WordNet data backs the WordNetLemmatizer used in pre_process_text.
nltk.download('wordnet')
# Open Multilingual WordNet; presumably needed alongside 'wordnet' by the
# installed NLTK version — TODO confirm.
nltk.download('omw-1.4')
# Punkt tokenizer models. NOTE(review): the visible code tokenizes with
# str.split(), so this may be unused here — confirm before removing.
nltk.download('punkt')
# Stop-word lists; the English list is read via corpus.stopwords.words('english').
# As the last expression of the cell, this call's return value is what the cell displays.
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the three sampled workbooks (blogs, authors, ratings) in one pass;
# the generator yields one DataFrame per path, in the order listed.
blog_df, author_df, ratings_df = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        '/content/sampled_blogs.xlsx',
        '/content/sampled_authors.xlsx',
        '/content/sampled_ratings.xlsx',
    )
)

In [None]:
# Keep a single row per (title, content) pair. The surviving rows keep their
# original index labels, so the index may contain gaps afterwards.
blog_df = blog_df.drop_duplicates(subset=['blog_title', 'blog_content'])

# English stop-word list from the NLTK corpus, passed to pre_process_text below.
lst_stopwords = corpus.stopwords.words('english')

def pre_process_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalize a piece of text into a space-separated string of tokens.

    Steps: lower-case and strip, drop every character that is neither a word
    character nor whitespace, split on whitespace, optionally remove
    stop words, optionally lemmatize, optionally stem.

    Args:
        text: Value to clean. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.
        flg_stemm: If True, apply ``PorterStemmer`` to every token.
        flg_lemm: If True, apply ``WordNetLemmatizer`` to every token.
        lst_stopwords: Optional iterable of tokens to remove.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` for missing input.
    """
    # Missing cells would otherwise be stringified into the literal tokens
    # "none"/"nan", making all blogs with missing content look alike.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower().strip()
    text = re.sub(r'[^\w\s]', '', text)
    lst_text = text.split()

    if lst_stopwords is not None:
        # A set gives constant-time membership checks instead of scanning the list per token.
        stopword_set = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopword_set]

    if flg_lemm:
        lemmatizer = WordNetLemmatizer()
        lst_text = [lemmatizer.lemmatize(word) for word in lst_text]

    if flg_stemm:
        stemmer = PorterStemmer()
        lst_text = [stemmer.stem(word) for word in lst_text]

    return " ".join(lst_text)

# Clean every blog body: stop words removed, lemmatized, no stemming.
blog_df['clean_blog_content'] = [
    pre_process_text(raw_content, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords)
    for raw_content in blog_df['blog_content']
]

# Content-Based Recommendation
# TF-IDF vectors of the cleaned text; row i of the matrix belongs to the i-th row of blog_df.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(blog_df['clean_blog_content'])
# Pairwise cosine similarity between all blogs (square matrix).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def get_content_based_recommendations(user_id, min_rating=3.5, similarity_threshold=0.2):
    """Recommend blogs whose text is similar to blogs the user rated highly.

    Args:
        user_id: Value matched against ``ratings_df['userId']``.
        min_rating: A rated blog counts as "liked" when its rating is at
            least this value.
        similarity_threshold: A blog is recommended when its cosine
            similarity to a liked blog is strictly above this value.

    Returns:
        List of blog ids, without duplicates, in order of first discovery.
        Liked blogs themselves are never included. Empty when the user has
        no liked blogs.
    """
    user_ratings = ratings_df[ratings_df['userId'] == user_id]
    high_rated_blogs = user_ratings.loc[user_ratings['ratings'] >= min_rating, 'blog_id'].tolist()
    liked_ids = set(high_rated_blogs)

    # cosine_sim is indexed by row POSITION in blog_df. The DataFrame index
    # labels cannot be used for that: drop_duplicates leaves gaps in them.
    all_blog_ids = blog_df['blog_id'].values
    row_position = {}
    for position, known_id in enumerate(all_blog_ids.tolist()):
        row_position.setdefault(known_id, position)

    recommended_blogs = []
    already_added = set()
    for blog_id in high_rated_blogs:
        position = row_position.get(blog_id)
        if position is None:
            # The rated blog is not in blog_df (e.g. removed as a duplicate).
            continue
        similar_ids = all_blog_ids[cosine_sim[position] > similarity_threshold].tolist()
        for candidate_id in similar_ids:
            if candidate_id in liked_ids or candidate_id in already_added:
                continue
            already_added.add(candidate_id)
            recommended_blogs.append(candidate_id)
    return recommended_blogs

# Genre/Topic-Based Recommendation
def get_genre_recommendations(user_id, user_preferences):
    """Recommend blogs rated by the user whose favourite topics best match the given preferences.

    For every user, the (up to) four distinct topics of their highest-rated
    blogs are collected. These topic sets and ``user_preferences`` are turned
    into bag-of-words vectors (spaces inside a topic are replaced by ``_`` so
    a topic is a single token), and the user with the highest cosine
    similarity to the preferences is selected.

    Args:
        user_id: The requesting user. Their own row is never chosen as the
            "most similar user", so they are not handed back their own history.
        user_preferences: Iterable of topic names.

    Returns:
        Tuple ``(blog_ids, top_topics_df)``: the selected user's rated blog
        ids ordered by rating (highest first), and a DataFrame with columns
        ``userId``, ``top_topics`` and ``topic_str``. ``blog_ids`` is empty
        when no other user shares any topic token with the preferences.
    """
    merged_df = pd.merge(ratings_df, blog_df[['blog_id', 'topic']], on='blog_id')

    # Rows without a topic cannot contribute to a topic profile (and would
    # break the string handling below).
    ranked = merged_df.dropna(subset=['topic']).sort_values(by='ratings', ascending=False, kind='stable')
    if ranked.empty:
        return [], pd.DataFrame(columns=['userId', 'top_topics', 'topic_str'])

    # Highest-rated occurrence of each topic per user, then the first four per
    # user. Done without DataFrameGroupBy.apply to avoid its deprecation
    # warning about operating on the grouping columns.
    top_four = ranked.drop_duplicates(subset=['userId', 'topic']).groupby('userId').head(4)
    top_topics_df = top_four.groupby('userId')['topic'].agg(list).reset_index()
    top_topics_df.columns = ['userId', 'top_topics']
    top_topics_df['topic_str'] = top_topics_df['top_topics'].apply(
        lambda topics: " ".join(str(topic).replace(" ", "_") for topic in topics)
    )

    vectorizer = CountVectorizer()
    user_topic_matrix = vectorizer.fit_transform(top_topics_df['topic_str'])
    preference_str = " ".join(str(topic).replace(" ", "_") for topic in user_preferences)
    new_user_vector = vectorizer.transform([preference_str])
    user_similarity = cosine_similarity(new_user_vector, user_topic_matrix).flatten()

    # Never pick the requesting user as their own nearest match.
    user_similarity[(top_topics_df['userId'] == user_id).values] = -1.0

    # With no positive similarity, argmax would just pick an arbitrary user.
    if user_similarity.max() <= 0:
        return [], top_topics_df

    most_similar_user_id = top_topics_df.iloc[user_similarity.argmax()]['userId']
    recommended_blogs = merged_df[merged_df['userId'] == most_similar_user_id].sort_values(
        by='ratings', ascending=False
    )[['blog_id', 'topic']]
    return recommended_blogs['blog_id'].tolist(), top_topics_df

# Collaborative Filtering

def get_collaborative_recommendations(user_id, n_neighbors=5, top_n=5):
    """User-based collaborative filtering with cosine nearest neighbours.

    Builds a user x blog rating matrix (missing ratings filled with 0), finds
    the users closest to ``user_id`` and ranks the blogs the target user has
    not rated by the neighbours' mean rating.

    Args:
        user_id: Value matched against ``ratings_df['userId']``.
        n_neighbors: Number of other users to aggregate over (capped by the
            number of users available).
        top_n: Maximum number of blog ids to return; ``None`` for no cap.

    Returns:
        List of blog ids ordered by the neighbours' mean rating, highest
        first. Empty when the user has no ratings, there are no other users,
        or no neighbour rated anything the user has not rated.
    """
    # Create a pivot table with users as rows and blogs as columns
    user_blog_matrix = ratings_df.pivot(index='userId', columns='blog_id', values='ratings').fillna(0)

    # Unknown (new) users have no row to compare against.
    if user_id not in user_blog_matrix.index:
        return []

    # Ask for one extra neighbour because the target user is part of the fitted data;
    # kneighbors cannot return more neighbours than there are fitted rows.
    query_size = min(n_neighbors + 1, len(user_blog_matrix))
    if query_size < 2:
        return []

    rating_values = user_blog_matrix.values
    user_index = user_blog_matrix.index.tolist().index(user_id)

    # Fit the NearestNeighbors model and look up the target user's neighbours
    model_knn = NearestNeighbors(metric='cosine', algorithm='auto')
    model_knn.fit(rating_values)
    _, indices = model_knn.kneighbors(rating_values[[user_index]], n_neighbors=query_size)

    # Drop the target user by position rather than assuming it is returned first:
    # users with identical rating vectors tie at distance 0.
    neighbor_positions = [pos for pos in indices.flatten() if pos != user_index][:n_neighbors]
    if not neighbor_positions:
        return []
    mean_ratings = user_blog_matrix.iloc[neighbor_positions].mean(axis=0)

    # Keep only blogs the target user has not rated and at least one neighbour has.
    target_ratings = user_blog_matrix.loc[user_id]
    unrated_blogs = target_ratings[target_ratings == 0].index
    candidate_scores = mean_ratings.loc[unrated_blogs]
    candidate_scores = candidate_scores[candidate_scores > 0].sort_values(ascending=False, kind='stable')
    if top_n is not None:
        candidate_scores = candidate_scores.head(top_n)

    return candidate_scores.index.tolist()


def display_user_history(user_id):
    """Return the blogs a user has rated, with title, rating and topic.

    Returns a DataFrame with columns ``blog_title``, ``ratings`` and
    ``topic``, or a message string when ``ratings_df`` holds no rows for
    ``user_id``.
    """
    history = ratings_df.loc[ratings_df['userId'] == user_id]
    if not history.empty:
        detail_columns = ['blog_title', 'ratings', 'topic']
        return history.merge(blog_df, on='blog_id')[detail_columns]
    return f"No rating history found for user {user_id}."

def get_top_categories(user_id, user_preferences, top_topics_df):
    """Pick the topic list to use for a user.

    If ``user_id`` appears in ``top_topics_df['userId']``, the first matching
    row's ``top_topics`` value is returned; otherwise ``user_preferences`` is
    returned unchanged. The choice is reported on stdout either way.
    """
    known_user_ids = top_topics_df['userId'].values

    # Guard clause: unknown user -> fall back to the supplied preferences.
    if user_id not in known_user_ids:
        print(f"User {user_id} not found. Using provided categories: {user_preferences}")
        return user_preferences

    matching_rows = top_topics_df[top_topics_df['userId'] == user_id]
    user_topics = matching_rows['top_topics'].values[0]
    print(f"Using top topics for user {user_id}: {user_topics}")
    return user_topics

def sort_blogs_by_average_rating():
    """Return blog rows joined with their mean rating, best-rated first.

    The mean of ``ratings`` per ``blog_id`` is computed from ``ratings_df``,
    stored as ``avg_rating`` and inner-joined with ``blog_df`` on ``blog_id``.
    """
    mean_by_blog = ratings_df.groupby('blog_id')['ratings'].mean()
    blog_ratings_avg = mean_by_blog.rename('avg_rating').reset_index()
    blogs_with_avg = blog_ratings_avg.merge(blog_df, on='blog_id')
    return blogs_with_avg.sort_values(by='avg_rating', ascending=False)

# Unified Recommendation Function
def recommend_blogs(user_id, user_preferences, max_recommendations=20):
    """Unified recommendation function with capped recommendations.

    Combines the content-based, genre-based and collaborative recommenders.
    If they yield fewer than ``max_recommendations`` blogs, the result is
    topped up with the best average-rated blogs from the user's categories.

    Args:
        user_id: The user to recommend for; may be a user without ratings.
        user_preferences: Topic names used for genre matching and, for users
            without a topic profile, as the categories for the top-up.
        max_recommendations: Upper bound on the number of rows returned.

    Returns:
        DataFrame with columns ``blog_id``, ``blog_title`` and ``topic``,
        at most ``max_recommendations`` rows.
    """
    output_columns = ['blog_id', 'blog_title', 'topic']

    # Get recommendations and top_topics_df
    genre_recommendations, top_topics_df = get_genre_recommendations(user_id, user_preferences)

    # Get categories based on user_id or provided preferences
    user_categories = get_top_categories(user_id, user_preferences, top_topics_df)

    # Get recommendations from other methods
    content_recommendations = get_content_based_recommendations(user_id)

    # Collaborative filtering needs the user's own rating row; skip it for
    # users without ratings instead of failing on the lookup.
    rated_blog_ids = set(ratings_df.loc[ratings_df['userId'] == user_id, 'blog_id'])
    if rated_blog_ids:
        collaborative_recommendations = get_collaborative_recommendations(user_id)
    else:
        collaborative_recommendations = []

    # Combine all recommendations
    all_recommendations = (
        set(content_recommendations) | set(genre_recommendations) | set(collaborative_recommendations)
    )
    recommended_blogs = blog_df.loc[blog_df['blog_id'].isin(all_recommendations), output_columns]

    missing_count = max_recommendations - len(recommended_blogs)
    if missing_count > 0:
        # Fill the gap with top average-rated blogs from the user's categories,
        # skipping blogs that are already recommended or already rated by the user.
        sorted_blogs = sort_blogs_by_average_rating()
        excluded_ids = set(recommended_blogs['blog_id']) | rated_blog_ids
        fill_mask = sorted_blogs['topic'].isin(user_categories) & ~sorted_blogs['blog_id'].isin(excluded_ids)
        # Same three columns as above so the concatenated frame stays rectangular.
        additional_blogs = sorted_blogs.loc[fill_mask, output_columns].head(missing_count)
        recommended_blogs = pd.concat([recommended_blogs, additional_blogs])

    # Ensure the recommendations are capped
    return recommended_blogs.head(max_recommendations)

# Example usage
user_id = 16
user_preferences = ['Cryptocurrency', 'Artificial Intelligence', 'Flutter']

# Rating history of the example user.
print("User's Rated Blogs:")
user_history = display_user_history(user_id)
print(user_history)

# Combined recommendations; the header is printed first because
# recommend_blogs reports the chosen topics on stdout while it runs.
print("\nRecommended Blogs for User:")
final_recommendations = recommend_blogs(user_id=user_id, user_preferences=user_preferences)
print(final_recommendations)

User's Rated Blogs:
                                          blog_title  ratings       topic
0                          All about the $WIFI token      2.0  Blockchain
1                      Lamden Weekly Digest [Mar 17]      5.0  Blockchain
2    Leman Network Upgrade Activation on March 30th!      0.5  Blockchain
3  Product Security or Drama? A Four-Factor Trust...      5.0    Security
4                     Android Application Pentesting      5.0     Android

Recommended Blogs for User:
Using top topics for user 16: ['Blockchain', 'Security', 'Android']
      blog_id                                         blog_title  \
496      4130                      Lamden Weekly Digest [Mar 10]   
713      4578                                CopyFi Explained!!!   
722      4778  Lockdrops — Are they really a good token launc...   
726      4730                Tachyon Protocol Weekly Report #178   
777      4563                          Trending Cryptocurrencies   
798      4545  SpaceX to Launch

  top_topics_df = merged_df.groupby('userId').apply(


In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def get_collaborative_recommendations(user_id, n_neighbors=5, top_n=None):
    """
    Collaborative filtering using scikit-learn's NearestNeighbors.
    Recommends blogs based on similar users' ratings.

    NOTE: this cell redefines the function of the same name defined earlier
    in the notebook; once it has run, this version is the one in effect. By
    default it returns the full ranked list rather than a capped one.

    Args:
        user_id: Value matched against ``ratings_df['userId']``.
        n_neighbors: Number of other users to aggregate over (capped by the
            number of users available).
        top_n: Maximum number of blog ids to return; ``None`` for no cap.

    Returns:
        List of blog ids ordered by the neighbours' mean rating, highest
        first. Only blogs the target user has not rated and at least one
        neighbour has rated are included. Empty when the user has no ratings
        or there are no other users.
    """
    # Create a pivot table with users as rows and blogs as columns
    user_blog_matrix = ratings_df.pivot(index='userId', columns='blog_id', values='ratings').fillna(0)

    # Unknown (new) users have no row to compare against.
    if user_id not in user_blog_matrix.index:
        return []

    # Ask for one extra neighbour because the target user is part of the fitted data;
    # kneighbors cannot return more neighbours than there are fitted rows.
    query_size = min(n_neighbors + 1, len(user_blog_matrix))
    if query_size < 2:
        return []

    rating_values = user_blog_matrix.values
    user_index = user_blog_matrix.index.tolist().index(user_id)

    # Fit the NearestNeighbors model and look up the target user's neighbours
    model_knn = NearestNeighbors(metric='cosine', algorithm='auto')
    model_knn.fit(rating_values)
    _, indices = model_knn.kneighbors(rating_values[[user_index]], n_neighbors=query_size)

    # Drop the target user by position rather than assuming it is returned first:
    # users with identical rating vectors tie at distance 0.
    neighbor_positions = [pos for pos in indices.flatten() if pos != user_index][:n_neighbors]
    if not neighbor_positions:
        return []
    mean_ratings = user_blog_matrix.iloc[neighbor_positions].mean(axis=0)

    # Filter out blogs already rated by the target user, and blogs with a mean
    # of 0 (no neighbour rated them), which carry no signal.
    target_ratings = user_blog_matrix.loc[user_id]
    unrated_blogs = target_ratings[target_ratings == 0].index
    candidate_scores = mean_ratings.loc[unrated_blogs]
    candidate_scores = candidate_scores[candidate_scores > 0].sort_values(ascending=False, kind='stable')
    if top_n is not None:
        candidate_scores = candidate_scores.head(top_n)

    return candidate_scores.index.tolist()


In [None]:
# Collaborative-filtering demo for a single user.
user_id = 40
recommended_blogs = get_collaborative_recommendations(user_id=user_id)
print("Collaborative Recommendations for User:", recommended_blogs)

Collaborative Recommendations for User: [5651, 512, 2572, 2482, 9091, 2576, 5472, 607, 9404, 9402, 2600, 9333, 5614, 1474, 8586, 543, 2529, 5709, 2505, 7542, 9077, 7489, 9197, 2702, 9152, 4035, 8652, 9426, 5669, 5656, 1471, 9176, 1486, 6921, 6894, 6892, 6882, 6891, 6890, 6887, 6935, 6867, 6828, 6826, 6825, 6823, 6820, 6814, 6812, 6811, 6810, 6922, 13, 6936, 6939, 7080, 7073, 7061, 7055, 7052, 7048, 7047, 7043, 7040, 7022, 7020, 7015, 7014, 7013, 7012, 7003, 7002, 6999, 6988, 6984, 6981, 6969, 6802, 6960, 6958, 6964, 6748, 6793, 6545, 6629, 6623, 6615, 6604, 6602, 6595, 6590, 6583, 6579, 6578, 6570, 6560, 6548, 6535, 6778, 6526, 6521, 6519, 6518, 6516, 6513, 6508, 6501, 6500, 6498, 6494, 6488, 6482, 6631, 6632, 6634, 6636, 6776, 6770, 6757, 6756, 7085, 6742, 6738, 6736, 6735, 6723, 6715, 6707, 6693, 6692, 6676, 6668, 6667, 6662, 6656, 6655, 6653, 6652, 6648, 6645, 6644, 6643, 6639, 7083, 7118, 7089, 7572, 7622, 7621, 7620, 7611, 7610, 7607, 7598, 7596, 7595, 7590, 7578, 7576, 7570, 7631