In [2]:
!pip install scikit-surprise

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357267 sha256=58e41eccbdf7e8daee90b60469247a07dfcb953fad2628c10b3af22ac2449371
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

In [3]:
# Step 1: Prepare the Dataset
# Catalogue of IT subjects, written one record per subject:
# (subject_id, name, category, description)
subjects = pd.DataFrame(
    [
        (1, 'Python', 'Programming Language',
         'High-level programming language used for web and software development'),
        (2, 'React', 'Framework',
         'JavaScript library for building user interfaces'),
        (3, 'Agile', 'Methodology',
         'Iterative and incremental software development methodology'),
        (4, 'Singleton Pattern', 'Design Pattern',
         'Ensure a class has only one instance and provide a global access point'),
        (5, 'SOLID Principles', 'Principles',
         'Object-oriented principles: Single Responsibility, Open-Closed, Liskov Substitution, Interface Segregation, Dependency Inversion'),
    ],
    columns=['subject_id', 'name', 'category', 'description'],
)

# Explicit ratings, stored column-wise: the i-th entries of the three lists
# together describe one (user, subject, rating) observation.
ratings_dict = dict(
    user_id=[1, 1, 1, 2, 2, 3, 3, 4],
    subject_id=[1, 2, 3, 4, 5, 1, 3, 5],
    rating=[5, 4, 3, 5, 4, 5, 4, 2],
)
ratings = pd.DataFrame.from_dict(ratings_dict)

In [4]:
# Step 2: Content-Based Filtering - Compute Similarity Matrix
# Turn every subject description into a TF-IDF vector, dropping English stop words.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(subjects["description"])

# Pairwise cosine similarity of the description vectors; with a single
# argument the matrix is compared against itself, so entry [i, j] relates
# the subjects in rows i and j of `subjects`.
content_sim = cosine_similarity(tfidf_matrix)

In [5]:
# Step 3: Collaborative Filtering Using Surprise Library
# Seed for the SVD factor initialisation. Without it every kernel restart
# trains a different model, so the recommendations below are not reproducible.
SVD_RANDOM_STATE = 42

# Load the ratings into Surprise; the reader declares the 1-5 rating scale
# and the column order must be (user, item, rating).
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'subject_id', 'rating']], reader)

# Train SVD on every available rating (no hold-out split).
model = SVD(random_state=SVD_RANDOM_STATE)
trainset = data.build_full_trainset()
# Trailing semicolon suppresses the `<... SVD at 0x...>` repr, whose memory
# address changes on every run and adds noise to the saved notebook.
model.fit(trainset);

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7bd1d4be0730>

In [6]:
# Step 4: Hybrid Recommendation Function
def hybrid_recommendations(user_id, user_ratings, subjects, content_sim, model, alpha=0.5, top_n=3, exclude_rated=False):
    """
    Rank subjects for one user by blending content-based and collaborative scores.

    Parameters
    ----------
    user_id
        Raw user id, matched against ``user_ratings['user_id']`` and passed
        unchanged to ``model.predict``.
    user_ratings : pandas.DataFrame
        Must contain the columns ``'user_id'``, ``'subject_id'`` and ``'rating'``.
        Ratings whose ``subject_id`` does not occur in ``subjects`` are ignored.
    subjects : pandas.DataFrame
        Must contain the columns ``'subject_id'`` and ``'name'``. Rows are
        addressed by *position*: the subject in row ``i`` corresponds to
        ``content_sim[i]``. The frame's index labels are never used, so a
        filtered or re-indexed frame is handled correctly.
    content_sim
        Indexable by row position; ``content_sim[i]`` must yield one
        similarity value per row of ``subjects``.
    model
        Object whose ``predict(user_id, subject_id)`` returns something with
        an ``est`` attribute (the predicted rating).
    alpha : float, default 0.5
        Weight of the content-based score; the collaborative score gets
        ``1 - alpha``.
    top_n : int, default 3
        Maximum number of subjects to return.
    exclude_rated : bool, default False
        When True, subjects the user has already rated are removed from the
        ranking before the top ``top_n`` are taken. The default keeps the
        original behaviour of ranking every subject.

    Returns
    -------
    pandas.Series
        The ``'name'`` values of the selected subjects, best first. It has
        fewer than ``top_n`` entries when not enough candidates exist.
    """
    subject_ids = subjects['subject_id'].tolist()
    if not subject_ids:
        # Nothing to rank; also avoids calling np.max() on an empty array below.
        return subjects['name']

    # Collaborative filtering scores: one predicted rating per subject row.
    collaborative_scores = np.array(
        [model.predict(user_id, subject_id).est for subject_id in subject_ids],
        dtype=float,
    )

    # Map each subject_id to its row *position* (first occurrence wins), so the
    # lookup into content_sim agrees with the positional .iloc used at the end.
    first_position = {}
    for position, subject_id in enumerate(subject_ids):
        first_position.setdefault(subject_id, position)

    # Content-based filtering scores: rating-weighted sum of the similarity
    # rows of every subject this user has rated.
    user_subjects = user_ratings[user_ratings['user_id'] == user_id]
    content_scores = np.zeros(len(subject_ids))
    rated_pairs = zip(user_subjects['subject_id'].tolist(), user_subjects['rating'].tolist())
    for subject_id, rating in rated_pairs:
        position = first_position.get(subject_id)
        if position is None:
            # Rating refers to a subject missing from the catalogue; skip it.
            continue
        content_scores += rating * content_sim[position]

    # Normalize each score vector by its maximum; vectors whose maximum is not
    # positive (e.g. a user without ratings) are left unchanged.
    content_peak = np.max(content_scores)
    if content_peak > 0:
        content_scores = content_scores / content_peak
    collaborative_peak = np.max(collaborative_scores)
    if collaborative_peak > 0:
        collaborative_scores = collaborative_scores / collaborative_peak

    # Combine scores using alpha
    hybrid_scores = alpha * content_scores + (1 - alpha) * collaborative_scores

    # Row positions ordered from highest to lowest hybrid score.
    ranked_positions = np.argsort(hybrid_scores)[::-1]
    if exclude_rated:
        already_rated = subjects['subject_id'].isin(user_subjects['subject_id']).to_numpy()
        ranked_positions = ranked_positions[~already_rated[ranked_positions]]

    return subjects.iloc[ranked_positions[:top_n]]['name']

In [7]:
# Step 5: Test the Hybrid Recommendation System
# Ask for recommendations for a single user with the default blend
# (alpha) and list length (top_n), then print the subject names.
user_id = 1
recommended_subjects = hybrid_recommendations(
    user_id=user_id,
    user_ratings=ratings,
    subjects=subjects,
    content_sim=content_sim,
    model=model,
)
print(f"\nHybrid Recommendations for User {user_id}:")
print(recommended_subjects.to_list())


Hybrid Recommendations for User 1:
['Python', 'React', 'Agile']
