# Introduction to Blog Recommendation System

In [None]:
!pip install nltk



In [None]:
# import required packages

import pandas as pd
import numpy as np
import nltk
import re
from nltk import corpus
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import wsd
from nltk.corpus import wordnet as wn

# Download the NLTK resources this notebook relies on. 'stopwords' feeds the
# stop-word list and 'wordnet' (with 'omw-1.4') backs WordNetLemmatizer;
# 'punkt' is fetched too, although no visible cell references a tokenizer.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')
# Temporary fix for the lookup error raised when these resources are missing.

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Let us load the data now

In [None]:
# Read the three Excel workbooks (blogs, authors, ratings) in one pass; the
# generator is consumed left to right, so the files are loaded in this order.
blog_df, author_df, ratings_df = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        '/content/sampled_blogs.xlsx',
        '/content/sampled_authors.xlsx',
        '/content/sampled_ratings.xlsx',
    )
)

The first dataset has following features,

* blog_id : Unique ID given to the blog
* author_id : Unique ID given to the author of the blog
* blog_title : Title of the Blog
* blog_content : Brief summary of what the blog content is about
* blog_link : link to the specific blog
* blog_img : image related to that blog
* topic : domain the blog belongs to, e.g. AI, Data Science, etc.

The Second dataset has following features,

* author_id : Unique ID given to the author
* author_name : Name of the author

The Third dataset has following features,

* blog_id : ID of the blog
* userId : ID of the user
* ratings : ratings given by the user

# Content Based Filtering

Let us first see how many blogs we have for each domain

In [None]:
# Count how many blogs fall under each topic.
blog_df['topic'].value_counts()

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
Android,100
App Development,100
Web Development,100
Software Development,100
Security,100
Natural Language Processing,100
Machine Learning,100
Information Security,100
Image Processing,100
Flutter,100


### Remove the columns from blog data that are not needed
Let us remove __author_id__, __blog_link__, __blog_img__ and __scrape_time__ from blog_df

In [None]:
# Drop, in place, the columns the recommenders below do not use.
blog_df.drop(['author_id','blog_link','blog_img','scrape_time'],axis='columns',inplace=True)

We need to remove duplicate blog data

In [None]:
# Remove, in place, rows that repeat an earlier (blog_title, blog_content) pair.
# NOTE(review): the index is not reset here, so if any row is dropped the index
# labels no longer equal row positions — keep that in mind wherever labels are
# used to index position-based structures such as a similarity matrix.
blog_df.drop_duplicates(['blog_title','blog_content'],inplace=True)

### Preprocessing Text Data
It is necessary to remove the stopwords from the blog content and to apply lemmatization, which reduces every word to its root form. This is the basic preprocessing step we need to perform before moving forward.

In [None]:
# English stop-word list from the NLTK corpus, passed to pre_process_text below.
lst_stopwords=corpus.stopwords.words('english')
def pre_process_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise a piece of text before it is vectorised.

    The text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, split on whitespace, optionally filtered
    against a stop-word collection, optionally lemmatized and/or stemmed, and
    finally re-joined with single spaces.

    Args:
        text: value to clean; non-string values are converted with ``str``.
            ``None`` and float NaN (missing cells) yield an empty string.
        flg_stemm: apply ``PorterStemmer`` to every token when true.
        flg_lemm: apply ``WordNetLemmatizer`` to every token when true.
        lst_stopwords: iterable of words to drop, or ``None`` to keep all.

    Returns:
        The cleaned text as a single space-separated string.
    """
    # A missing value would otherwise be stringified into the spurious
    # tokens "none" / "nan" and end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower().strip()
    text = re.sub(r'[^\w\s]', '', text)
    lst_text = text.split()

    if lst_stopwords is not None:
        # Set membership is O(1); scanning the list once per token is not.
        stopword_set = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopword_set]
    if flg_lemm:
        lemmatizer = WordNetLemmatizer()
        lst_text = [lemmatizer.lemmatize(word) for word in lst_text]
    if flg_stemm:
        stemmer = PorterStemmer()
        lst_text = [stemmer.stem(word) for word in lst_text]
    return " ".join(lst_text)

In [None]:
# Clean every blog summary (stop-word removal + lemmatization, no stemming);
# Series.apply forwards the keyword arguments to pre_process_text.
blog_df['clean_blog_content'] = blog_df['blog_content'].apply(
    pre_process_text,
    flg_stemm=False,
    flg_lemm=True,
    lst_stopwords=lst_stopwords,
)

### Using TFIDF Vectorizer to vectorize the blog content

In [None]:
# Learn a TF-IDF vocabulary from the cleaned blog text and encode each blog as
# one row of tfidf_matrix; the printed shape is (rows, vocabulary terms).
tfidf_vecotorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vecotorizer.fit_transform(blog_df['clean_blog_content'])
print(tfidf_matrix.shape)

(2200, 10425)


Hence, there are 10425 unique words (features) used to describe the 2200 blogs we have in our dataset.

# Using Cosine Similarity for content based filtering

In [None]:
# Pairwise cosine similarity between all rows of tfidf_matrix. Entry [i, j]
# compares the i-th and j-th rows, i.e. it is addressed by row position in
# blog_df, not by blog_id.
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim)

[[1.         0.09840981 0.02885201 ... 0.         0.         0.        ]
 [0.09840981 1.         0.01466832 ... 0.         0.01738185 0.01183667]
 [0.02885201 0.01466832 1.         ... 0.         0.         0.02279981]
 ...
 [0.         0.         0.         ... 1.         0.01557258 0.08396031]
 [0.         0.01738185 0.         ... 0.01557258 1.         0.00700758]
 [0.         0.01183667 0.02279981 ... 0.08396031 0.00700758 1.        ]]


In [None]:
# Select the ratings given by the user with userId 40
user_rating = ratings_df[ratings_df['userId']==40]

# Consider only blogs rated 3.5 or higher, just for simplification
blogs_to_consider = user_rating[user_rating['ratings']>=3.5]['blog_id']

# Keep the ids of these blogs as a NumPy array
high_rated_blogs = blogs_to_consider.values

In [None]:
# Show the blog rows whose blog_id is among the user's highly rated blogs.
rated_blogs = blog_df[blog_df['blog_id'].isin(high_rated_blogs)]
rated_blogs

Unnamed: 0,blog_id,blog_title,blog_content,topic,clean_blog_content
1616,5625,How Continual Learning works part3(Machine Lea...,Addressing Catastrophic Forgetting in Federate...,Machine Learning,addressing catastrophic forgetting federated c...
1637,1526,“From Experimentation to Production: How MLflo...,Introduction to MLflow Welcome to another arti...,Machine Learning,introduction mlflow welcome another article le...
1641,1573,Exciting Developments with GPT-4: Language and...,"“In the next five years, computer programs tha...",Machine Learning,next five year computer program think read leg...
2003,9249,Export Data In CSV Using Rails and Angular.js,Hello folks! While looking for proper document...,Web Development,hello folk looking proper documentation export...
2023,9130,OSS-Open Source Software,Hello readers.. In this article I am going to ...,Web Development,hello reader article going tell open source si...
2086,9374,7 Practical Tips for Implementing BDD,Lots of software development teams are looking...,Web Development,lot software development team looking way impr...


Let us create a function that recommends blogs based on how similar they are to the blogs the user rated highly.

In [None]:
def get_similar_blog(high_rated_blogs, threshold=0.2, blogs=None, similarity=None):
    """
        Recommend blogs whose content is similar to the blogs a user rated highly.

        Args:
            high_rated_blogs : list/array of blog id's of the blogs rated highly by the user
            threshold : a blog is a candidate when its similarity to a rated blog is
                        strictly greater than this value
            blogs : DataFrame with a 'blog_id' column whose row order matches the rows
                    of `similarity`; defaults to the module-level `blog_df`
            similarity : square similarity matrix addressed by row position;
                         defaults to the module-level `cosine_sim`
        Returns:
            recommended_blogs : list of row positions (for use with `blogs.iloc[...]`)
                                of the blogs to recommend, in order of discovery,
                                without duplicates and without the rated blogs themselves
    """
    blogs = blog_df if blogs is None else blogs
    similarity = cosine_sim if similarity is None else similarity

    # Work with row positions throughout: the similarity matrix is positional,
    # while the DataFrame index labels stop matching positions as soon as rows
    # are dropped without resetting the index.
    blog_ids = blogs['blog_id'].to_numpy()
    already_rated = np.isin(blog_ids, high_rated_blogs)

    recommended_blogs = []
    seen_positions = set()

    for blog_id in high_rated_blogs:
        matches = np.flatnonzero(blog_ids == blog_id)
        if matches.size == 0:
            # The rated blog is not part of the catalogue (e.g. removed as a duplicate)
            continue
        row = matches[0]

        # Similar enough, and not something the user has already rated
        candidates = np.flatnonzero((similarity[row] > threshold) & ~already_rated)

        for position in candidates:
            position = int(position)
            if position not in seen_positions:
                seen_positions.add(position)
                recommended_blogs.append(position)

    return recommended_blogs

# Generating Recommendation

In [None]:
# Collect content-based recommendations from the user's highly rated blogs.
recommended_blogs=get_similar_blog(high_rated_blogs)

In [None]:
# Display the recommended rows of blog_df (looked up by row position).
blog_df.iloc[recommended_blogs]

Unnamed: 0,blog_id,blog_title,blog_content,topic,clean_blog_content
1641,1573,Exciting Developments with GPT-4: Language and...,"“In the next five years, computer programs tha...",Machine Learning,next five year computer program think read leg...
1637,1526,“From Experimentation to Production: How MLflo...,Introduction to MLflow Welcome to another arti...,Machine Learning,introduction mlflow welcome another article le...
1150,1673,Federated Learning with PyTorch: Training a CN...,Federated Learning is a distributed machine le...,Deep Learning,federated learning distributed machine learnin...
1616,5625,How Continual Learning works part3(Machine Lea...,Addressing Catastrophic Forgetting in Federate...,Machine Learning,addressing catastrophic forgetting federated c...
67,8119,App compilation process in Flutter for Android...,Each Flutter project goes through a compilatio...,Android,flutter project go compilation process generat...
529,2267,LICENSE to code: Why one text file can ruin yo...,"At Instawork, we try to utilize (and give back...",Cloud Computing,instawork try utilize give back open source co...
1840,1184,5 Internal Threats that can impact your data p...,It is common to have data pipelines that read ...,Security,common data pipeline read data company interna...
2023,9130,OSS-Open Source Software,Hello readers.. In this article I am going to ...,Web Development,hello reader article going tell open source si...
2003,9249,Export Data In CSV Using Rails and Angular.js,Hello folks! While looking for proper document...,Web Development,hello folk looking proper documentation export...
279,3214,Applying AI: A Business Perspective,Are you looking for ways to transform your bus...,Artificial Intelligence,looking way transform business ai might intere...


# Context Based by Category


In [None]:
# Attach each rated blog's topic to its rating rows by joining on blog_id.
merged_df = pd.merge(ratings_df, blog_df[['blog_id', 'topic']], on='blog_id')
merged_df

Unnamed: 0,blog_id,userId,ratings,topic
0,4157,16,2.0,Blockchain
1,4402,16,5.0,Blockchain
2,4315,16,0.5,Blockchain
3,7558,31,5.0,Cloud Services
4,211,31,2.0,Cloud Services
...,...,...,...,...
11226,2804,5008,5.0,Backend Development
11227,2789,5008,5.0,Backend Development
11228,2677,5008,0.5,Backend Development
11229,1259,16,5.0,Security


In [None]:
def get_top_unique_topics(group, top_n=5):
    """Return the distinct topics a user rated best.

    Args:
        group: DataFrame holding one user's ratings; must contain the
            'ratings' and 'topic' columns.
        top_n: maximum number of topics to return.

    Returns:
        List of at most ``top_n`` topic names, ordered from the topic with the
        highest rating downwards.
    """
    # Sort by rating (best first); dropping duplicate topics afterwards keeps
    # each topic's best-rated row only.
    unique_topics = group.sort_values(by='ratings', ascending=False).drop_duplicates(subset='topic')['topic']
    # Keep the first top_n distinct topics
    return unique_topics.head(top_n).tolist()

In [None]:
# Build one row per user with the list of that user's best-rated topics.
# Only the columns the helper reads are selected, so apply() no longer runs on
# the grouping column (which pandas has deprecated and warns about).
top_topics_df = (
    merged_df.groupby('userId')[['ratings', 'topic']]
    .apply(get_top_unique_topics)
    .reset_index()
)
top_topics_df.columns = ['userId', 'top_topics']

# Display the resulting DataFrame
print(top_topics_df)

# Persist the per-user topics for later use
top_topics_df.to_csv('Topics-by-user.csv')

      userId                           top_topics
0         16      [Blockchain, Security, Android]
1         31     [Cloud Services, Cryptocurrency]
2         39      [Security, Backend Development]
3         40  [Machine Learning, Web Development]
4         41                           [Security]
...      ...                                  ...
1160    4964                            [Flutter]
1161    4969    [Deep Learning, Machine Learning]
1162    4998   [Blockchain, Information Security]
1163    5006         [Blockchain, Cryptocurrency]
1164    5008                [Backend Development]

[1165 rows x 2 columns]


  top_topics_df = merged_df.groupby('userId').apply(get_top_unique_topics).reset_index()


In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Step 1: Merge ratings with each blog's topic (join on blog_id); the per-user
# top topics are derived from this frame below.
merged_df = pd.merge(ratings_df, blog_df[['blog_id', 'topic']], on='blog_id')

def get_top_unique_topics(group):
    """Return up to four distinct topics from ``group``, best-rated first."""
    ranked = group.sort_values(by='ratings', ascending=False)
    # After the sort, the first row seen for each topic is its best-rated one.
    distinct = ranked.drop_duplicates(subset='topic')
    return distinct['topic'].head(4).tolist()

# One row per user with the list of that user's best-rated topics. Selecting
# the needed columns keeps apply() off the grouping column, which pandas has
# deprecated and warns about.
top_topics_df = (
    merged_df.groupby('userId')[['ratings', 'topic']]
    .apply(get_top_unique_topics)
    .reset_index()
)
top_topics_df.columns = ['userId', 'top_topics']

top_topics_df

  top_topics_df = merged_df.groupby('userId').apply(get_top_unique_topics).reset_index()


Unnamed: 0,userId,top_topics
0,16,"[Blockchain, Security, Android]"
1,31,"[Cloud Services, Cryptocurrency]"
2,39,"[Security, Backend Development]"
3,40,"[Machine Learning, Web Development]"
4,41,[Security]
...,...,...
1160,4964,[Flutter]
1161,4969,"[Deep Learning, Machine Learning]"
1162,4998,"[Blockchain, Information Security]"
1163,5006,"[Blockchain, Cryptocurrency]"


In [None]:
def expand_topics(topics_df):
    """Return a copy of ``topics_df`` with an added 'topic_str' column.

    Each entry of 'top_topics' (a list of topic names) becomes one
    space-separated string in which the spaces inside a topic name are
    replaced by underscores, so that a multi-word topic stays a single token.

    Args:
        topics_df: DataFrame with a 'top_topics' column of lists of strings.

    Returns:
        A new DataFrame; the caller's frame is left unmodified.
    """
    # Work on a copy so the input frame is not changed as a side effect.
    expanded = topics_df.copy()
    expanded['topic_str'] = expanded['top_topics'].apply(
        lambda topics: " ".join(topic.replace(" ", "_") for topic in topics)
    )
    return expanded

# Add the 'topic_str' column (underscore-joined topic names) for vectorizing
expanded_topics_df = expand_topics(top_topics_df)

# Use CountVectorizer to create user-topic matrix (one row per user, in the
# row order of expanded_topics_df)
vectorizer = CountVectorizer()
user_topic_matrix = vectorizer.fit_transform(expanded_topics_df['topic_str'])

# Calculate cosine similarity between users
# NOTE(review): the example call further down passes user_topic_matrix, not
# user_similarity, to recommend_blogs — user_similarity looks unused here.
user_similarity = cosine_similarity(user_topic_matrix)

In [None]:
def recommend_blogs(new_user_topics, top_topics_df, blogs_df, similarity_matrix):
    """Recommend blogs to a new user via the most topic-similar existing user.

    The new user's topics are vectorized with the module-level ``vectorizer``
    and compared against every row of ``similarity_matrix``; the blogs rated
    by the best-matching user are then read from the module-level
    ``merged_df``.

    Args:
        new_user_topics: list of topic names the new user is interested in.
        top_topics_df: DataFrame with a 'userId' column, one row per existing
            user, in the same row order as ``similarity_matrix``.
        blogs_df: not used by this function; kept so existing calls keep working.
        similarity_matrix: despite its name, the user-by-topic matrix produced
            by ``vectorizer`` (one row per existing user) that the new user's
            vector is compared against.

    Returns:
        DataFrame with the 'blog_id' and 'topic' of every blog rated by the
        most similar user, best-rated first. Empty when the new user shares
        no topic with any existing user.
    """
    # Replace spaces with underscores in the new user's topics
    formatted_topics = [topic.replace(" ", "_") for topic in new_user_topics]

    # Convert new user topics to a vector
    new_user_vector = vectorizer.transform([" ".join(formatted_topics)])

    # Calculate similarity with existing users
    new_user_similarity = cosine_similarity(new_user_vector, similarity_matrix).flatten()

    # With no overlapping topic every similarity is 0 and argmax would silently
    # pick the first user, yielding unrelated recommendations.
    if new_user_similarity.size == 0 or new_user_similarity.max() <= 0:
        return merged_df.iloc[0:0][['blog_id', 'topic']]

    # Find the most similar user
    most_similar_user_index = new_user_similarity.argmax()
    similar_user_id = top_topics_df.iloc[most_similar_user_index]['userId']

    # Fetch all blogs rated by the most similar user, best-rated first
    # (no minimum rating is applied)
    recommended_blogs = merged_df[merged_df['userId'] == similar_user_id] \
        .sort_values(by='ratings', ascending=False)[['blog_id', 'topic']].drop_duplicates()

    return recommended_blogs

# Example usage: Recommend blogs for a new user with preferences
new_user_preferences = ['Cryptocurrency', 'Artificial Intelligence', 'Flutter']
# The last argument is the user-topic count matrix that the new user's topic
# vector is compared against.
recommendations = recommend_blogs(new_user_preferences, expanded_topics_df, blog_df, user_topic_matrix)

# Display recommendations
print("Recommended Blogs for the New User:")
print(recommendations)

Recommended Blogs for the New User:
     blog_id           topic
249     4578  Cryptocurrency
254     9018         Flutter
251     8792         Flutter
252     8946         Flutter
247     4778  Cryptocurrency
248     4545  Cryptocurrency
250     4730  Cryptocurrency
253     8804         Flutter
255     8830         Flutter


# Collaborative Filtering

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357273 sha256=84cfe304cbdb1340fd6b94b486e394a9922f234e93430e826e2db5c4a8bc7af0
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD, KNNWithMeans, KNNWithZScore
from surprise.model_selection import train_test_split
from collections import defaultdict

# Load datasets
ratings_data = pd.read_excel("sampled_ratings.xlsx")  # Replace with your file path
blogs_data = pd.read_excel("sampled_blogs.xlsx")  # Replace with your file path

# Define the reader format for the Surprise library; the rating scale is taken
# from the smallest and largest rating present in the data
reader = Reader(rating_scale=(ratings_data['ratings'].min(), ratings_data['ratings'].max()))

# Create the Surprise dataset from (user, item, rating) triples
data = Dataset.load_from_df(ratings_data[['userId', 'blog_id', 'ratings']], reader)

# Split data into training and testing sets (80% / 20%, fixed seed)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train the KNNWithMeans model (SVD is imported above but not used here)
model = KNNWithMeans()
model.fit(trainset)

# Function to get top-N recommendations
def get_top_n_recommendations(predictions, n=5):
    """Group predictions by user and keep each user's ``n`` best estimates.

    Args:
        predictions: iterable of 5-tuples
            ``(user id, item id, true rating, estimated rating, details)``.
        n: number of items to keep per user.

    Returns:
        ``defaultdict(list)`` mapping each user id to a list of
        ``(item id, estimated rating)`` pairs, highest estimate first.
    """
    by_user = defaultdict(list)
    for user, item, _actual, estimate, _details in predictions:
        by_user[user].append((item, estimate))

    # Rank every user's items by estimated rating and truncate to n entries.
    for user in by_user:
        ranked = sorted(by_user[user], key=lambda pair: pair[1], reverse=True)
        by_user[user] = ranked[:n]

    return by_user

# Predict ratings for the held-out (userId, blog_id) pairs of the test split
# only, not for every possible pair.
# NOTE(review): these are blogs the users have already rated; to recommend
# unseen blogs, predicting over trainset.build_anti_testset() is presumably
# what is wanted — confirm before changing.
predictions = model.test(testset)

# Keep, for every user present in the test split, up to 30 blogs with the
# highest predicted rating
top_n_recommendations = get_top_n_recommendations(predictions, n=30)

# Generate recommendations for a specific user
def recommend_blogs(user_id, top_n_recommendations, blogs_data):
    """Return the details of the blogs recommended to ``user_id``.

    Args:
        user_id: id of the user to recommend for.
        top_n_recommendations: mapping of user id to a list of
            ``(blog id, estimated rating)`` pairs, best estimate first.
        blogs_data: DataFrame with the columns 'blog_id', 'blog_title',
            'blog_link', 'author_id' and 'topic'.

    Returns:
        DataFrame with 'blog_title', 'blog_link', 'author_id' and 'topic' of
        the recommended blogs, in recommendation order (best first); or a
        message string when the user has no entry in
        ``top_n_recommendations``.
    """
    if user_id not in top_n_recommendations:
        return f"No recommendations available for user {user_id}."

    # Get the top-N blog IDs for the user, remembering each one's rank
    recommended_blog_ids = [iid for (iid, _) in top_n_recommendations[user_id]]
    rank_of = {}
    for position, blog_id in enumerate(recommended_blog_ids):
        rank_of.setdefault(blog_id, position)

    # Retrieve the blog details; a plain isin() filter returns rows in
    # catalogue order, so restore the recommendation ranking afterwards.
    recommended_blogs = blogs_data[blogs_data['blog_id'].isin(recommended_blog_ids)]
    recommended_blogs = recommended_blogs.sort_values(
        by='blog_id', key=lambda ids: ids.map(rank_of), kind='stable'
    )
    return recommended_blogs[['blog_title', 'blog_link', 'author_id', 'topic']]

# Example: Recommend blogs for a specific user
user_id = 39  # Replace with a user ID from your dataset
# Note: this rebinds recommended_blogs, which earlier held the result of
# get_similar_blog.
recommended_blogs = recommend_blogs(user_id, top_n_recommendations, blogs_data)
print(recommended_blogs)


Computing the msd similarity matrix...
Done computing similarity matrix.
                                             blog_title  \
1834              AWS EKS : External Secrets With Vault   
1837            What is Decentralized Identity Solution   
1891  A Brief History of Zero Knowledge in Five Semi...   
1898  Create Biometric Authentication in Android | a...   

                                              blog_link  author_id     topic  
1834  https://medium.com/@patelsaheb/aws-eks-externa...        893  Security  
1837  https://medium.com/@nfting/what-is-decentraliz...        926  Security  
1891  https://medium.com/intotheblock/a-brief-histor...        901  Security  
1898  https://medium.com/@anafthdev_/create-biometri...       3866  Security  
