In [2]:
pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl (11.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m1m23.4 MB/s[0m eta [36m0:00:01[0m
Using cached pytz-2025.1-py2.py3-none-any.whl (507 kB)
Using cached tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2025.1 tzdata-2025.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl (11.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m1m12.4 MB/s[0m eta [36m0:00:01[0m
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 threadpoolctl-3.5.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset

# Step 1: Load Data
interactions = pd.read_csv('user_interaction.csv')
meta = pd.read_csv('metadata.csv')





In [2]:
# Convert timestamp columns to datetime objects
interactions['updated_at'] = pd.to_datetime(interactions['updated_at'])
meta['updated_at'] = pd.to_datetime(meta['updated_at'])
meta['published_at'] = pd.to_datetime(meta['published_at'])

# Merge interaction data with meta to bring in reading_time
merged_data = interactions.merge(meta[['pratilipi_id', 'reading_time', 'category_name']], on='pratilipi_id', how='left')



In [3]:
merged_data

Unnamed: 0,user_id,pratilipi_id,read_percent,updated_at,reading_time,category_name
0,5506791961876448,1377786228262109,100.0,2022-03-22 10:29:57.291,376.0,novels
1,5506791961876448,1377786228262109,100.0,2022-03-22 10:29:57.291,376.0,family
2,5506791961876448,1377786228262109,100.0,2022-03-22 10:29:57.291,376.0,romance
3,5506791971543560,1377786223038206,40.0,2022-03-19 13:49:25.660,361.0,romance
4,5506791971543560,1377786223038206,40.0,2022-03-19 13:49:25.660,361.0,suspense
...,...,...,...,...,...,...
4966616,5506791968781083,1377786226056467,100.0,2022-03-21 06:41:54.083,560.0,novels
4966617,5506791968781083,1377786226056467,100.0,2022-03-21 06:41:54.083,560.0,romance
4966618,5506791968781083,1377786226056467,100.0,2022-03-21 06:41:54.083,560.0,suspense
4966619,5506791956021363,1377786226666757,100.0,2022-03-20 08:59:49.346,727.0,novels


In [4]:
# Compute effective reading time as the interaction weight
merged_data['effective_read_time'] = merged_data['reading_time'] * (merged_data['read_percent'] / 100)

# Optional: Aggregate interactions for the same user and pratilipi (if multiple interactions exist)
aggregated = merged_data.groupby(['user_id', 'pratilipi_id']).agg({'effective_read_time': 'sum'}).reset_index()



In [5]:
aggregated

Unnamed: 0,user_id,pratilipi_id,effective_read_time
0,3257552805995172,1377786216957646,967.000000
1,3257552805995172,1377786220826675,0.000000
2,3257552805995172,1377786226782638,0.000000
3,3257552805995172,1377786227056508,1695.000000
4,3257552805995172,1377786227250750,0.000000
...,...,...,...
2499995,5506791996685224,1377786215645840,488.655572
2499996,5506791996685251,1377786216362064,6225.000000
2499997,5506791996685282,1377786222782765,1236.000000
2499998,5506791996685286,1377786216009820,1287.000000


In [6]:
# Step 2: Split the Data based on time (75% training, 25% testing)
merged_data = merged_data.sort_values('updated_at')
split_index = int(0.75 * len(merged_data))
train_data = merged_data.iloc[:split_index]
# Re-aggregate training data
train_aggregated = train_data.groupby(['user_id', 'pratilipi_id']).agg({'effective_read_time': 'sum'}).reset_index()

# Step 3: Build the Interaction Matrix and Item Features for LightFM
dataset = Dataset()

# Fit the dataset with all unique users and pratilipi items
dataset.fit(users=aggregated['user_id'].unique(), items=aggregated['pratilipi_id'].unique())

# Build the interactions matrix using effective_read_time as weights
train_interactions, _ = dataset.build_interactions(
    [(row['user_id'], row['pratilipi_id'], row['effective_read_time']) for _, row in train_aggregated.iterrows()]
)




In [7]:
# Prepare item features using the pratilipi category
# Fill missing category values if any
meta['category_name'] = meta['category_name'].fillna('Unknown')
dataset.fit_partial(items=meta['pratilipi_id'].unique(), item_features=meta['category_name'].unique())
item_features = dataset.build_item_features(
    [(row['pratilipi_id'], [row['category_name']]) for _, row in meta.iterrows()]
)



In [8]:
# Step 4: Train the LightFM Model using a hybrid approach
model = LightFM(loss='warp')
model.fit(train_interactions, item_features=item_features, epochs=30, num_threads=4)



<lightfm.lightfm.LightFM at 0x311bd51b0>

In [9]:
# Step 5: Recommendation Function
def recommend_pratilipis(model, dataset, user_id, item_features, num_recommendations=5):
    # Retrieve the internal mappings
    (user_map, item_map, _,_) = dataset.mapping()
    if user_id not in user_map:
        return []
    user_x = user_map[user_id]
    n_items = len(item_map)
    # Predict scores for all items for the given user
    scores = model.predict(user_x, list(range(n_items)), item_features=item_features)
    # Get the indices of the top scoring items
    top_items = sorted(range(n_items), key=lambda i: scores[i], reverse=True)[:num_recommendations]
    # Map back to the original pratilipi_ids
    inv_item_map = {v: k for k, v in item_map.items()}
    return [inv_item_map[i] for i in top_items]

# Example: Recommend pratilipis for user with id 1
recommendations = recommend_pratilipis(model, dataset, user_id=3257552805995172, item_features=item_features)
print("Recommended pratilipis for user 1:", recommendations)

Recommended pratilipis for user 1: [3873168935527076, 5506791947855408, 5506791947270270, 5255525210956452, 5506791947028960]


In [10]:
test_data = merged_data.iloc[split_index:]
test_aggregated = test_data.groupby(['user_id', 'pratilipi_id']).agg({'effective_read_time': 'sum'}).reset_index()

In [17]:
test_data

Unnamed: 0,user_id,pratilipi_id,read_percent,updated_at,reading_time,category_name,effective_read_time
1620122,5506791970576411,1377786217507154,100.0,2022-03-22 02:31:42.402,454.0,family,454.0
1620121,5506791970576411,1377786217507154,100.0,2022-03-22 02:31:42.402,454.0,drama,454.0
1424006,5506791959984045,1377786223202978,100.0,2022-03-22 02:31:42.434,640.0,social,640.0
1424004,5506791959984045,1377786223202978,100.0,2022-03-22 02:31:42.434,640.0,novels,640.0
1424005,5506791959984045,1377786223202978,100.0,2022-03-22 02:31:42.434,640.0,romance,640.0
...,...,...,...,...,...,...,...
886514,5506791959279525,1377786225901639,100.0,2022-03-23 00:08:16.603,640.0,horror,640.0
886512,5506791959279525,1377786225901639,100.0,2022-03-23 00:08:16.603,640.0,romance,640.0
1114834,5506791996088677,1377786223947072,84.0,2022-03-23 00:08:22.177,,,
847435,5506791980825783,1377786227076616,100.0,2022-03-23 00:08:24.364,,,


In [18]:
test_aggregated

Unnamed: 0,user_id,pratilipi_id,effective_read_time
0,3260243929637540,1377786221978573,0.000000
1,3260275089121956,1377786225146500,0.000000
2,3260275089121956,1377786225224696,562.000000
3,3260275089121956,1377786225289397,492.000000
4,3260275089121956,1377786225314175,0.000000
...,...,...,...
624301,5506791996685224,1377786215645840,488.655572
624302,5506791996685251,1377786216362064,6225.000000
624303,5506791996685282,1377786222782765,1236.000000
624304,5506791996685286,1377786216009820,1287.000000


In [27]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendations(user_id, recommendations, test_data):
    """
    Evaluates the recommendation accuracy for a particular user by calculating precision, recall, and F1-score.
    
    Parameters:
    - user_id (int): The ID of the user to evaluate.
    - recommendations (list): List of recommended pratilipi IDs for the user.
    - test_data (DataFrame): The test dataset containing actual interactions.
    
    Returns:
    - A dictionary with precision, recall, and F1-score.
    """

    # Get the actual pratilipis the user interacted with in the test set
    actual_pratilipis = set(test_data[test_data['user_id'] == user_id]['pratilipi_id'])
    print(actual_pratilipis)

    # Convert recommendations to a set for comparison
    recommended_pratilipis = set(recommendations)

    # Calculate true positives, false positives, and false negatives
    true_positives = len(actual_pratilipis & recommended_pratilipis)  # Intersection of sets
    false_positives = len(recommended_pratilipis - actual_pratilipis)  # Recommended but not in actual
    false_negatives = len(actual_pratilipis - recommended_pratilipis)  # Actual but not recommended

    # Precision: How many of the recommended pratilipis were correct?
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0

    # Recall: How many of the actual pratilipis were recommended?
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0

    # F1-score: Harmonic mean of precision and recall
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return {
        "User ID": user_id,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1
    }

# Example Usage:
user_id = 5506791996685385  # Replace with the user ID you want to evaluate
actual_test_data = test_data  # Use the test dataset

# Generate recommendations for the user
recommendations = recommend_pratilipis(model, dataset, user_id, item_features)

# Evaluate recommendations
evaluation_results = evaluate_recommendations(user_id, recommendations, actual_test_data)

# Print results
print("Evaluation Results:", evaluation_results)


{1377786215676162}
Evaluation Results: {'User ID': 5506791996685385, 'Precision': 0.0, 'Recall': 0.0, 'F1-score': 0.0}
