In [4]:
!pip install scikit-surprise



In [19]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import gc

In [3]:
user_interactions = pd.read_csv('/content/user_interaction.csv')
metadata = pd.read_csv('/content/metadata.csv')

In [4]:
user_interactions['updated_at'] = pd.to_datetime(user_interactions['updated_at'])
metadata['updated_at'] = pd.to_datetime(metadata['updated_at'])
metadata['published_at'] = pd.to_datetime(metadata['published_at'])

In [5]:
metadata['category_list'] = metadata['category_name'].str.split(',')

In [6]:
user_interactions = user_interactions.head(50000)
metadata = metadata.head(50000)


In [7]:
metadata

Unnamed: 0,author_id,pratilipi_id,category_name,reading_time,updated_at,published_at,category_list
0,-3418949279741297,1025741862639304,translation,0,2020-08-19 15:26:13,2016-09-30 10:37:04,[translation]
1,-2270332351871840,1377786215601277,translation,171,2021-01-21 16:27:07,2018-06-11 13:17:48,[translation]
2,-2270332352037261,1377786215601962,translation,92,2020-09-29 12:33:57,2018-06-12 04:19:12,[translation]
3,-2270332352521845,1377786215640994,translation,0,2019-10-17 09:03:37,2019-09-26 14:58:53,[translation]
4,-2270332349665658,1377786215931338,translation,47,2020-05-05 11:33:41,2018-11-25 12:28:23,[translation]
...,...,...,...,...,...,...,...
49995,-2270332352820241,1377786222866768,novels,647,2021-02-28 14:32:38,2021-02-28 14:32:38,[novels]
49996,-2270332336036519,1377786222867284,novels,2290,2021-02-28 15:32:07,2021-02-28 15:32:07,[novels]
49997,-2941348623349105,1377786222867535,novels,521,2021-03-03 06:31:09,2021-02-28 15:51:38,[novels]
49998,-2270332350017603,1377786222868047,novels,467,2021-03-01 15:37:48,2021-02-28 17:42:21,[novels]


In [8]:
reader = Reader(rating_scale=(0, 100))
data = Dataset.load_from_df(
    user_interactions[['user_id', 'pratilipi_id', 'read_percent']],
    reader
)


In [9]:
trainset, testset = train_test_split(data, test_size=0.25)

In [10]:
svd_model = SVD(n_factors=50, n_epochs=20, random_state=42)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x78e93af6f090>

In [11]:
mlb = MultiLabelBinarizer()
category_matrix = mlb.fit_transform(metadata['category_list'])
category_df = pd.DataFrame(
    category_matrix,
    columns=mlb.classes_,
    index=metadata['pratilipi_id']
)

In [20]:
author_ids = metadata['author_id'].astype(str)  # Convert to string to be safe
author_encoder = OneHotEncoder(sparse_output=False)
author_matrix = author_encoder.fit_transform(author_ids.values.reshape(-1, 1))
author_df = pd.DataFrame(
    author_matrix,
    columns=[f'author_{i}' for i in range(author_matrix.shape[1])],
    index=metadata['pratilipi_id']
)


In [13]:
reading_time_df = pd.DataFrame(
    metadata['reading_time'] / metadata['reading_time'].max(),
    columns=['reading_time_norm'],
    index=metadata['pratilipi_id']
)

In [21]:
 content_features = pd.concat([category_df, author_df, reading_time_df], axis=1)

In [22]:
content_features = content_features.astype(np.float64)

In [23]:
content_features_sparse = csr_matrix(content_features.values)

In [24]:
current_date = metadata['updated_at'].max()
metadata['days_since_published'] = (current_date - metadata['published_at']).dt.days
metadata['recency_score'] = 1 - (metadata['days_since_published'] / metadata['days_since_published'].max())

In [25]:
pratilipi_engagement = user_interactions.groupby('pratilipi_id')['read_percent'].mean()

In [27]:
popularity_df = pd.DataFrame(index=metadata['pratilipi_id'].unique())
# Use groupby and mean to consolidate 'recency_score' for duplicate 'pratilipi_id'
popularity_df['recency_score'] = metadata.groupby('pratilipi_id')['recency_score'].mean()
popularity_df['engagement_score'] = pratilipi_engagement
popularity_df = popularity_df.fillna(0)
popularity_df['popularity_score'] = (0.7 * popularity_df['engagement_score'] +
                                    0.3 * popularity_df['recency_score'])

In [28]:
# Pre-compute user history for faster lookups
user_history_dict = {}
for user_id in user_interactions['user_id'].unique():
    user_history_dict[user_id] = set(user_interactions[user_interactions['user_id'] == user_id]['pratilipi_id'].unique())

In [29]:
def recommend_pratilipis(user_id, n_recommendations=5, top_similar_items=50):
    """
    Memory-efficient recommendation function that avoids creating large matrices
    """
    # Get user history
    user_history = user_history_dict.get(user_id, set())

    # If no history, return popular items
    if not user_history:
        top_popular = popularity_df.sort_values('popularity_score', ascending=False)
        return top_popular.index[:n_recommendations].tolist()

    # Get candidate items (most popular items + we'll add more below)
    candidates = set(popularity_df.sort_values('popularity_score', ascending=False).index[:100])

    # Add content-based recommendations for each item in user history
    for hist_item in user_history:
        # Skip if item not in our features
        if hist_item not in content_features.index:
            continue

        # Get item features
        item_features = content_features.loc[hist_item].values.reshape(1, -1)

        # Calculate similarity with all items
        if isinstance(item_features, np.ndarray) and item_features.size > 0:
            item_sparse = csr_matrix(item_features)
            sims = cosine_similarity(item_sparse, content_features_sparse).flatten()

            # Get top similar items
            similar_indices = np.argsort(sims)[-top_similar_items:]
            similar_items = [content_features.index[i] for i in similar_indices
                            if content_features.index[i] != hist_item]

            # Add to candidates
            candidates.update(similar_items)

    # Remove items user has already read
    candidates = candidates - user_history

    # Score candidates using the three components
    scores = []
    for item_id in candidates:
        # 1. Collaborative filtering score
        try:
            cf_score = svd_model.predict(user_id, item_id).est / 100  # Normalize to 0-1
        except:
            cf_score = 0

        # 2. Content-based score (average similarity with user history)
        cb_scores = []
        for hist_item in user_history:
            if hist_item in content_features.index and item_id in content_features.index:
                hist_features = content_features.loc[hist_item].values.reshape(1, -1)
                item_features = content_features.loc[item_id].values.reshape(1, -1)

                if isinstance(hist_features, np.ndarray) and hist_features.size > 0:
                    similarity = cosine_similarity(hist_features, item_features)[0][0]
                    cb_scores.append(similarity)

        cb_score = np.mean(cb_scores) if cb_scores else 0

        # 3. Popularity score
        pop_score = popularity_df.loc[item_id, 'popularity_score'] if item_id in popularity_df.index else 0

        # Calculate final score as weighted sum
        final_score = (0.7 * cf_score) + (0.2 * cb_score) + (0.1 * pop_score)
        scores.append((item_id, final_score))

    # Sort by score and return top N
    scores.sort(key=lambda x: x[1], reverse=True)
    recommendations = [item_id for item_id, _ in scores[:n_recommendations]]

    return recommendations


In [37]:
def evaluate_recommendations(test_users, k=5):
    hit_count = 0
    total_count = 0

    for user_id in test_users:
        # Get test items for user
        # Modified to access tuple elements by index
        user_test_items = set([row[1] for row in testset if row[0] == user_id])
        print(user_test_items)
        if user_test_items:
            # Get recommendations
            recommendations = recommend_pratilipis(user_id, n_recommendations=k)

            # Count hits
            hits = set(recommendations) & user_test_items
            hit_count += len(hits)


    hit_rate = hit_count
    return hit_rate

In [50]:
sample_user = user_interactions['user_id'].iloc[0]
print(type(sample_user))
recommendations = recommend_pratilipis(sample_user)
print(f"Recommended pratilipis for user {sample_user}: {recommendations}")
hit_rate = evaluate_recommendations([sample_user])
print(f"Hit rate: {hit_rate:.4f}")

<class 'numpy.int64'>
Recommended pratilipis for user 5506791961876448: [1377786228240459, 1377786228186211, 1377786228172971, 1377786228184553, 1377786228263313]
{1377786228262109}
Hit rate: 0.0000
