In [2]:

!pip install implicit scikit-learn pandas numpy

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-win_amd64.whl.metadata (6.3 kB)
Downloading implicit-0.7.2-cp310-cp310-win_amd64.whl (748 kB)
   ---------------------------------------- 0.0/748.6 kB ? eta -:--:--
   -------------- ------------------------- 262.1/748.6 kB ? eta -:--:--
   ---------------------------------------- 748.6/748.6 kB 2.1 MB/s eta 0:00:00
Installing collected packages: implicit
Successfully installed implicit-0.7.2



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [7]:
def load_and_preprocess_data(user_interactions_path, meta_data_path):
    interactions_df = pd.read_csv(user_interactions_path)
    meta_df = pd.read_csv(meta_data_path)
    
    # Convert timestamps
    interactions_df['updated_at'] = pd.to_datetime(interactions_df['updated_at'])
    meta_df['updated_at'] = pd.to_datetime(meta_df['updated_at'])
    meta_df['published_at'] = pd.to_datetime(meta_df['published_at'])
    
    interactions_df = interactions_df.sort_values('updated_at')
    
    return interactions_df, meta_df

interactions_df, meta_df = load_and_preprocess_data('user_interaction.csv', 'metadata.csv')


In [8]:

print("User Interactions:")
print(interactions_df.head())
print("\nMeta Data:")
print(meta_df.head())

User Interactions:
                  user_id      pratilipi_id  read_percent  \
1033131  5506791954036110  1377786225804654         100.0   
1415300  5506791980439899  1377786228150074         100.0   
2318259  5506791979182708  1377786218415632         100.0   
952322   5506791996330389  1377786219497547         100.0   
2114134  5506791961370166  1377786224952303         100.0   

                     updated_at  
1033131 2022-03-18 15:14:41.827  
1415300 2022-03-18 15:14:42.120  
2318259 2022-03-18 15:14:42.134  
952322  2022-03-18 15:14:42.170  
2114134 2022-03-18 15:14:42.282  

Meta Data:
          author_id      pratilipi_id category_name  reading_time  \
0 -3418949279741297  1025741862639304   translation             0   
1 -2270332351871840  1377786215601277   translation           171   
2 -2270332352037261  1377786215601962   translation            92   
3 -2270332352521845  1377786215640994   translation             0   
4 -2270332349665658  1377786215931338   translation  

In [24]:
print("Checking data structure:")
print("\nColumns:", interactions_df.columns.tolist())
print("\nSample of data:")
print(interactions_df.head())
print("\nData info:")
print(interactions_df.info())

Checking data structure:

Columns: ['user_id', 'pratilipi_id', 'read_percent', 'updated_at']

Sample of data:
                  user_id      pratilipi_id  read_percent  \
1033131  5506791954036110  1377786225804654         100.0   
1415300  5506791980439899  1377786228150074         100.0   
2318259  5506791979182708  1377786218415632         100.0   
952322   5506791996330389  1377786219497547         100.0   
2114134  5506791961370166  1377786224952303         100.0   

                     updated_at  
1033131 2022-03-18 15:14:41.827  
1415300 2022-03-18 15:14:42.120  
2318259 2022-03-18 15:14:42.134  
952322  2022-03-18 15:14:42.170  
2114134 2022-03-18 15:14:42.282  

Data info:
<class 'pandas.core.frame.DataFrame'>
Index: 2500000 entries, 1033131 to 2186369
Data columns (total 4 columns):
 #   Column        Dtype         
---  ------        -----         
 0   user_id       int64         
 1   pratilipi_id  int64         
 2   read_percent  float64       
 3   updated_at    datet

In [26]:
def create_user_item_matrix(interactions_df):
    # Convert IDs to categorical codes to save memory
    users = pd.Categorical(interactions_df['user_id'])
    items = pd.Categorical(interactions_df['pratilipi_id'])
    
    # Create mappings
    user_to_idx = dict(zip(users.categories, range(len(users.categories))))
    item_to_idx = dict(zip(items.categories, range(len(items.categories))))
    
    # Create sparse matrix
    rows = interactions_df['user_id'].map(user_to_idx)
    cols = interactions_df['pratilipi_id'].map(item_to_idx)
    data = interactions_df['read_percent'].astype('float32')  # Convert to float32 to save memory
    
    sparse_matrix = csr_matrix(
        (data, (rows, cols)),
        shape=(len(user_to_idx), len(item_to_idx))
    )
    
    return sparse_matrix, user_to_idx, item_to_idx

# Split and create matrix
print("Splitting data...")
train_data, test_data = train_test_split_by_time(interactions_df)

print("Creating sparse matrix...")
sparse_matrix, user_to_idx, item_to_idx = create_user_item_matrix(train_data)

print("Matrix shape:", sparse_matrix.shape)
print("Matrix density:", (sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1])) * 100, "%")

Splitting data...
Creating sparse matrix...
Matrix shape: (213331, 219088)
Matrix density: 0.004011702435287433 %


In [27]:
def train_model(sparse_matrix, factors=100):
    print("Training model...")
    model = AlternatingLeastSquares(
        factors=factors,
        regularization=0.1,
        iterations=20,
        calculate_training_loss=True,
        random_state=42
    )
    model.fit(sparse_matrix.T)
    return model

# Train the model
model = train_model(sparse_matrix)

Training model...


  0%|          | 0/20 [00:00<?, ?it/s]

In [31]:
def get_recommendations(model, sparse_matrix, user_idx, n_items=5):
    """
    Get recommendations for a user with dimension alignment
    """
    try:
        # Get user's interaction vector
        user_items = sparse_matrix[user_idx]
        
        # Ensure we only look at items within the model's known range
        n_items = min(n_items, model.item_factors.shape[0])
        
        recommendations, scores = model.recommend(
            userid=user_idx,
            user_items=user_items,
            N=n_items,
            filter_already_liked_items=True,
            items=range(model.item_factors.shape[0])  # Explicitly specify valid item range
        )
        
        return recommendations
        
    except Exception as e:
        print(f"Error getting recommendations for user {user_idx}")
        print(f"Error details: {str(e)}")
        return []


In [32]:
# Enhanced function to analyze recommendations
def analyze_recommendations(recommendations, meta_df, user_id):
    print(f"\nRecommendations for user {user_id}:")
    recommended_stories = meta_df[meta_df['pratilipi_id'].isin(recommendations)]
    
    if len(recommended_stories) > 0:
        # Group by story to show all categories for each
        for story_id in recommendations:
            story_data = recommended_stories[recommended_stories['pratilipi_id'] == story_id]
            if len(story_data) > 0:
                print(f"\nStory ID: {story_id}")
                print(f"Categories: {', '.join(story_data['category_name'].unique())}")
                print(f"Reading time: {story_data['reading_time'].iloc[0]} seconds")
    else:
        print("No story details found for recommendations")


In [33]:
print("\nTesting recommendations with fixed dimensions...")

for i in range(5):  # Test first 5 users
    try:
        user_id = list(user_to_idx.keys())[i]
        user_idx = user_to_idx[user_id]
        
        print(f"\nProcessing user {user_id} (index: {user_idx})")
        recommendations = get_recommendations(model, sparse_matrix, user_idx)
        
        if len(recommendations) > 0:
            original_ids = convert_to_original_ids(recommendations, item_to_idx)
            analyze_recommendations(original_ids, meta_df, user_id)
        else:
            print(f"No recommendations generated for user {user_id}")
            
    except Exception as e:
        print(f"Error processing user at index {i}")
        print(f"Error details: {str(e)}")
        continue
    


Testing recommendations with fixed dimensions...

Processing user 3257552805995172 (index: 0)

Recommendations for user 3257552805995172:

Story ID: 1377786224881573
Categories: novels, life, romance
Reading time: 683 seconds

Processing user 3257621147984548 (index: 1)

Recommendations for user 3257621147984548:

Story ID: 1377786225349769
Categories: relegion-and-spiritual, mythology, shortstories
Reading time: 150 seconds

Story ID: 1377786216799473
Categories: suspense, horror
Reading time: 182 seconds

Processing user 3260275089121956 (index: 2)

Recommendations for user 3260275089121956:

Story ID: 1377786222974562
Categories: crime, Indiawale, suspense
Reading time: 415 seconds

Story ID: 1377786224927001
Categories: novels, webseries
Reading time: 441 seconds

Story ID: 1377786222916063
Categories: family, romance
Reading time: 406 seconds

Processing user 3260433621754532 (index: 3)

Recommendations for user 3260433621754532:

Story ID: 1377786225234630
Categories: romance, s