In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
blogs_rating_df = pd.read_csv('data/Blog Ratings.csv')
blogs_df = pd.read_csv('data/Medium Blog Data.csv')
authors_df = pd.read_csv('data/Authors_Data_Cleaned.csv')

In [9]:
blogs_df[blogs_df['blog_id']==23]

Unnamed: 0,blog_id,author_id,blog_title,blog_content,blog_link,blog_img,topic,scrape_time
18,23,26,"Why AI art can be real art, opening new and ex...",Art has been an expression of creativity since...,https://medium.com/@themarco/why-ai-art-can-be...,https://miro.medium.com/fit/c/140/140/0*5tSk6n...,ai,2023-02-27 07:41:47


### Content-Based Recommender Using TF-IDF
#### Recommend blogs similar to those a user liked/favorited, based on textual content similarity.

In [None]:
# Step 1: Fit TF-IDF on blog content
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(blogs_df['blog_content'].fillna(''))

# Step 2: Function to recommend similar blogs to a given blog_id
def recommend_similar_blogs(blog_id, top_n=5):
    blog_idx = blogs_df[blogs_df['blog_id'] == blog_id].index[0]
    cosine_sim = cosine_similarity(tfidf_matrix[blog_idx], tfidf_matrix).flatten()
    similar_indices = cosine_sim.argsort()[::-1][1:top_n+1]
    return blogs_df.iloc[similar_indices][['blog_id', 'blog_title', 'topic']]

# Example usage
print(recommend_similar_blogs(blog_id=23))  # Replace with different blog_id

      blog_id                                         blog_title  \
5634     5661  🤖🎨 Is AI Killing Creativity? How to Safeguard ...   
132       141  Discover the Best AI Art with Imaginative Arts...   
3122     3149       Unique Traditional Art VS Soaring AI Drawing   
3478     3505  Unleashing Creativity with AI: How Artificial ...   
1388     1415                                  Art of Leadership   

                 topic  
5634  machine-learning  
132                 ai  
3122                ai  
3478                ai  
1388                ai  


### User-User Collaborative Filtering (kNN)
#### Recommend blogs liked/ rated by similar users, based on the ratings matrix.

In [15]:
blogs_rating_df.head(2)

Unnamed: 0,blog_id,userId,ratings
0,9025,11,3.5
1,9320,11,5.0


In [12]:
authors_df.head(2)

Unnamed: 0,author_id,author_name
0,1,Yaksh
1,2,Xit


In [19]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# Step 1: Create user-item rating matrix
user_blog_matrix = blogs_rating_df.pivot_table(
    index='userId',
    columns='blog_id',
    values='ratings'
).fillna(0)

print(user_blog_matrix)

blog_id  1     3     4     5     6     7     9     10    11    12    ...  \
userId                                                               ...   
10        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
11        0.0   0.0   5.0   0.0   0.0   0.0   0.0   5.0   0.0   0.0  ...   
12        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
13        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
14        0.0   0.0   0.0   0.0   2.0   0.5   0.0   5.0   3.5   0.0  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
5006      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5007      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5008      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5009      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5010      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

blog_id  97

In [26]:


# Step 2: Fit kNN model on user vectors
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(csr_matrix(user_blog_matrix.values))

# Step 3: Get top N users similar to a given user
def recommend_from_similar_users(userId, top_n=5):
    if userId not in user_blog_matrix.index:
        return f"User {userId} not found."

    user_vector = user_blog_matrix.loc[[userId]].values
    distances, indices = knn_model.kneighbors(user_vector, n_neighbors=top_n+1)

    similar_user_ids = user_blog_matrix.index[indices.flatten()[1:]]
    similar_users_data = blogs_rating_df[blogs_rating_df['userId'].isin(similar_user_ids)]

    # Recommend top blogs these users liked
    top_recs = similar_users_data[similar_users_data['ratings'] >= 3.5] \
        .groupby('blog_id').agg(avg_rating=('ratings', 'mean'), count=('ratings', 'count')) \
        .reset_index().sort_values(by='avg_rating', ascending=False).head(top_n)

    return blogs_df[blogs_df['blog_id'].isin(top_recs['blog_id'])][['blog_id', 'blog_title', 'topic']]

# Example usage
print(recommend_from_similar_users(userId=11))  # Replace with actual userId


      blog_id                                         blog_title  \
1036     1063  Here are 10 ideas that combine the strengths o...   
9305     9332  Hosting the Roost: 40 Jobs to Rule the Web on ...   
9308     9335                             Key Features of Kotlin   
9323     9350  How to Build a Responsive Accordion/Collapsibl...   
9389     9416  Top 6 Free JavaScript SEO Tools to Improve You...   

                topic  
1036             web3  
9305  web-development  
9308  web-development  
9323  web-development  
9389  web-development  


### Item-Item Collaborative Filtering
#### Instead of finding similar users, this model finds similar blogs based on how users rated them.

In [22]:
# Step 1: Create blog-user rating matrix
blog_user_matrix = blogs_rating_df.pivot_table(
    index='blog_id',
    columns='userId',
    values='ratings'
).fillna(0)

# Step 2: Fit kNN model on blog vectors
knn_item_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_item_model.fit(csr_matrix(blog_user_matrix.values))

# Step 3: Recommend similar blogs to a given blog
def recommend_similar_blogs_item_based(blog_id, top_n=5):
    if blog_id not in blog_user_matrix.index:
        return f"Blog {blog_id} not found."

    blog_vector = blog_user_matrix.loc[[blog_id]].values
    distances, indices = knn_item_model.kneighbors(blog_vector, n_neighbors=top_n + 1)

    similar_blog_ids = blog_user_matrix.index[indices.flatten()[1:]]  # skip itself

    return blogs_df[blogs_df['blog_id'].isin(similar_blog_ids)][['blog_id', 'blog_title', 'topic']]

print(recommend_similar_blogs_item_based(blog_id=1063)) 


      blog_id                                         blog_title topic
224       244  Aptos Labs brings Web3 to Gaming with its new ...  web3
953       980                                    LXDAO Weekly#42  web3
983      1010                  Uldor Monthly Update — March 2023  web3
986      1013                          Mission Dawn Newbie Guide  web3
4952     4979  Mobile Movement: Dialect Rugged By Apple, Sola...  web3


### Cold Start – User
#### Recommend blogs to new users based on popularity or recency.

In [None]:
def recommend_for_new_user(top_n=10):
    # Merge ratings with blog metadata
    merged_df = blogs_rating_df.merge(blogs_df, on='blog_id', how='inner')
    merged_df['scrape_time'] = pd.to_datetime(merged_df['scrape_time'])

    # Group by blog to get average rating and number of ratings
    stats_df = merged_df.groupby(['blog_id', 'blog_title']).agg(
        avg_rating=('ratings', 'mean'),/
        num_ratings=('ratings', 'count'),
        latest_time=('scrape_time', 'max')
    ).reset_index()

    # Weighted score = avg_rating * log(num_ratings + 1)
    stats_df['weighted_score'] = stats_df['avg_rating'] * np.log1p(stats_df['num_ratings'])

    # Sort by weighted score and recency
    return stats_df.sort_values(by=['weighted_score', 'latest_time'], ascending=[False, False]).head(top_n)

# Example usage
print(recommend_for_new_user(top_n=10))

      blog_id                                         blog_title  avg_rating  \
8535     8582  Top Mobile App Development Trends To Look Out ...    3.981132   
8636     8683       Deploying Docker Containers with Bind Mounts    3.811321   
9618     9668  10 Advanced Techniques Every Senior React Engi...    3.823529   
2653     2685  Backend with Spring & Kotlin: Spring Data with...    3.730769   
8328     8374           Ultimate CI/CD For Flutter Mobile Apps 🚀    3.987500   
8738     8785  Flutter Widget Lifecycle: Everything You Need ...    3.603448   
2697     2730                               ReactJS Notlarım — 2    3.636364   
2925     2959                                       Why FastAPI?    3.793478   
9470     9519  Redis vs. Other Databases: An In-Depth Compari...    3.793478   
8823     8870            Flutter Web — Runtime Docker variables.    3.577586   

      num_ratings         latest_time  weighted_score  
8535           53 2023-04-04 08:53:52       15.880672  
8636   

### Cold Start – Blog
#### Recommend new blogs to users based on content similarity with already liked blogs.

In [36]:
def recommend_new_blogs_for_user(user_id, top_n=5):
    # Step 1: Get blogs liked/favorited by user
    liked_blogs = blogs_rating_df[
        (blogs_rating_df['userId'] == user_id) & 
        (blogs_rating_df['ratings'] >= 3.5)
    ]['blog_id'].tolist()

    if not liked_blogs:
        return f"User {user_id} has no high-rated blog history."

    # Step 2: Prepare content
    blog_texts = blogs_df[['blog_id', 'blog_title', 'blog_content']].copy()
    blog_texts['text'] = blog_texts['blog_title'].fillna('') + ' ' + blog_texts['blog_content'].fillna('')

    # Step 3: TF-IDF
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = tfidf.fit_transform(blog_texts['text'])

    # Step 4: Find content vector for liked blogs
    # Step 4: Find content vector for liked blogs
    liked_indices = blogs_df[blogs_df['blog_id'].isin(liked_blogs)].index
    liked_vector = tfidf_matrix[liked_indices].mean(axis=0).A1  # FIXED


    # Step 5: Compute similarity with all blogs
    similarities = cosine_similarity([liked_vector], tfidf_matrix).flatten()
    blog_texts['similarity'] = similarities

    # Step 6: Filter out already rated blogs
    already_seen = blogs_rating_df[blogs_rating_df['userId'] == user_id]['blog_id']
    recs = blog_texts[~blog_texts['blog_id'].isin(already_seen)]

    return recs.sort_values(by='similarity', ascending=False).head(top_n)[['blog_id', 'blog_title', 'similarity']]

print(recommend_new_blogs_for_user(user_id=11))

       blog_id                                         blog_title  similarity
1401      1428  Top 10 AI Tools You’ve Never Heard of — But Sh...    0.227465
124        133                              Product design and AI    0.212324
10109    10136  Don’t Be Left Behind: Tips for Surviving and S...    0.210500
3163      3190  10 Best AI Tools for Data Extraction, Content ...    0.206791
3363      3390  Revolutionizing product design with AI and psy...    0.204004
