In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Multiply, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, ndcg_score
import joblib

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
# 1. Enhanced Data Processing
class DataProcessor:
    """Turns raw content, user and interaction tables into model features.

    Content rows become TF-IDF vectors built from title + description, users
    become a standardized ``age`` feature, and interactions get a parsed
    ``timestamp`` plus a ``recency`` (age in days) column.

    NOTE(review): ``content_id`` values are used directly as row positions
    into ``content_embeddings`` (see ``get_user_profile``), so ids are assumed
    to be 0-based and aligned with the row order of ``content_data`` --
    confirm against callers.
    """

    def __init__(self, content_data, user_data, interaction_data):
        """Store the three input DataFrames and create the unfitted transformers."""
        self.content_data = content_data
        self.user_data = user_data
        self.interaction_data = interaction_data
        # Populated by preprocess_data().
        self.content_embeddings = None
        self.user_embeddings = None
        self.tfidf = TfidfVectorizer(stop_words='english')
        self.scaler = StandardScaler()

    def preprocess_data(self):
        """Fit the transformers and (re)compute all derived features.

        Adds ``text_features`` to ``content_data`` and rewrites ``timestamp``
        / adds ``recency`` on ``interaction_data``; the stored DataFrames are
        modified in place.
        """
        # Process content data. Missing titles/descriptions are treated as
        # empty text: a NaN document would make the vectorizer raise.
        titles = self.content_data['title'].fillna('').astype(str)
        descriptions = self.content_data['description'].fillna('').astype(str)
        self.content_data['text_features'] = titles + ' ' + descriptions
        tfidf_matrix = self.tfidf.fit_transform(self.content_data['text_features'])
        self.content_embeddings = tfidf_matrix.toarray()

        # Process user data
        user_features = self.user_data[['age']].values
        self.user_embeddings = self.scaler.fit_transform(user_features)

        # Process interaction data
        self.interaction_data['timestamp'] = pd.to_datetime(self.interaction_data['timestamp'])
        self.interaction_data['recency'] = (pd.Timestamp.now() - self.interaction_data['timestamp']).dt.days

    def get_user_profile(self, user_id):
        """Return a recency-weighted average of the content vectors a user touched.

        Only the user's 100 most recent interactions are considered. Each
        distinct item counts once, positioned at its most recent interaction;
        the newest item gets weight 1, the one before it 1/2, then 1/3, ...

        Returns a zero vector when the user has no interactions. Requires
        ``preprocess_data()`` to have been called first.
        """
        user_interactions = (
            self.interaction_data[self.interaction_data['user_id'] == user_id]
            .sort_values('timestamp')
            .tail(100)
        )
        if user_interactions.empty:
            return np.zeros(self.content_embeddings.shape[1])

        # De-duplicate ONCE and use the same array for both the weights and
        # the embedding lookup. Previously the weights were sized from the
        # unique ids while the lookup used every interaction row, so any
        # repeated item caused a shape-mismatch ValueError.
        content_ids = user_interactions['content_id'].drop_duplicates(keep='last').to_numpy()
        weights = 1 / (1 + np.arange(len(content_ids))[::-1])

        weighted_sum = np.sum(self.content_embeddings[content_ids] * weights[:, np.newaxis], axis=0)
        return weighted_sum / np.sum(weights)

    def get_content_embeddings(self):
        """Return the dense TF-IDF matrix (``None`` before preprocessing)."""
        return self.content_embeddings

    def get_user_embeddings(self):
        """Return the standardized user feature matrix (``None`` before preprocessing)."""
        return self.user_embeddings

In [4]:
# 3. Enhanced Training and Evaluation
class ModelTrainer:
    """Fits and scores a Keras model on the interaction log of a data processor."""

    def __init__(self, model, data_processor):
        """Remember the model to train and the processor that supplies features."""
        self.data_processor = data_processor
        self.model = model

    def prepare_data(self):
        """Build the model inputs from the processor's interaction table.

        Returns a 5-tuple ``(users, items, user_features, item_features,
        labels)``. The feature arrays are obtained by indexing the processor's
        embedding matrices with the raw ``user_id`` / ``content_id`` values.
        """
        log = self.data_processor.interaction_data
        user_ids = log['user_id'].values
        item_ids = log['content_id'].values
        targets = log['interaction'].values
        user_matrix = self.data_processor.get_user_embeddings()
        item_matrix = self.data_processor.get_content_embeddings()
        return user_ids, item_ids, user_matrix[user_ids], item_matrix[item_ids], targets

    def train(self, epochs=20, batch_size=256, validation_split=0.1):
        """Compile the model (Adam, binary cross-entropy) and fit it.

        Returns the object produced by ``model.fit``.
        """
        user_ids, item_ids, user_feats, item_feats, targets = self.prepare_data()
        optimizer = Adam(learning_rate=0.001)
        self.model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        fit_options = {
            'epochs': epochs,
            'batch_size': batch_size,
            'validation_split': validation_split,
            'verbose': 2,
        }
        return self.model.fit([user_ids, item_ids, user_feats, item_feats], targets, **fit_options)

    def evaluate(self, test_data):
        """Score the model on ``test_data`` (same 5-tuple layout as ``prepare_data``).

        Returns a dict with ``precision`` and ``recall`` computed on the
        rounded predictions, and ``ndcg`` computed on the raw predictions
        treated as a single ranking.
        """
        user_ids, item_ids, user_feats, item_feats, targets = test_data
        scores = self.model.predict([user_ids, item_ids, user_feats, item_feats])
        hard_labels = scores.round()
        results = {}
        results['precision'] = precision_score(targets, hard_labels)
        results['recall'] = recall_score(targets, hard_labels)
        results['ndcg'] = ndcg_score(targets.reshape(1, -1), scores.reshape(1, -1))
        return results

In [5]:
# 4. Improved Real-time Update System
class RealTimeUpdater:
    """Applies incremental updates to the data held by a data processor."""

    def __init__(self, model, data_processor):
        """Keep references to the model and the data processor to update."""
        self.model = model
        self.data_processor = data_processor

    def update_user_preferences(self, user_id, interaction_data):
        """Append new interactions and keep the latest 1000 per user.

        Args:
            user_id: Accepted for interface compatibility; the rows in
                ``interaction_data`` carry their own ``user_id``.
            interaction_data: Anything ``pd.DataFrame`` accepts (e.g. a list
                of dicts) with the same columns as the stored interactions.
        """
        new_interactions = pd.DataFrame(interaction_data)
        frames = [self.data_processor.interaction_data]
        if not new_interactions.empty:
            if 'timestamp' in new_interactions.columns:
                # Parse up front so string timestamps cannot end up mixed
                # with datetimes in one column (sorting that would raise).
                new_interactions = new_interactions.assign(
                    timestamp=pd.to_datetime(new_interactions['timestamp'])
                )
            frames.append(new_interactions)

        # ignore_index avoids duplicate index labels after the append.
        combined = pd.concat(frames, ignore_index=True)
        combined['timestamp'] = pd.to_datetime(combined['timestamp'])
        if 'recency' in combined.columns:
            # Same formula as the preprocessing step; without this the newly
            # appended rows would carry NaN recency.
            combined['recency'] = (pd.Timestamp.now() - combined['timestamp']).dt.days

        # Keep only the latest 1000 interactions per user
        self.data_processor.interaction_data = (
            combined.sort_values('timestamp').groupby('user_id').tail(1000)
        )

    def update_content_features(self, new_content):
        """Append new content rows and recompute all derived features.

        NOTE(review): ``preprocess_data()`` refits the TF-IDF vectorizer, so
        the embedding width can change when new vocabulary appears; a model
        built for the previous width would then need rebuilding -- confirm
        this is intended.
        """
        new_content_df = pd.DataFrame(new_content)
        frames = [self.data_processor.content_data]
        if not new_content_df.empty:
            frames.append(new_content_df)
        # ignore_index keeps row labels unique and equal to row positions.
        self.data_processor.content_data = pd.concat(frames, ignore_index=True)
        self.data_processor.preprocess_data()  # Recompute content embeddings

    def adjust_recommendations(self, user_id, recommendations, recent_interactions):
        """Drop recommendations for items the user interacted with recently.

        Args:
            user_id: Unused; kept for interface compatibility.
            recommendations: Iterable of ``(item, score)`` pairs.
            recent_interactions: Table with a ``content_id`` column.

        Returns:
            List of the remaining ``(item, score)`` pairs in their original order.
        """
        recent_items = set(recent_interactions['content_id'])
        return [(item, score) for item, score in recommendations if item not in recent_items]

In [6]:
# 5. Enhanced Main Recommendation Pipeline
class RecommendationSystem:
    """Facade combining the model-based and content-based recommenders.

    Holds the data processor, the recommender (whose ``predict`` and ``model``
    are used here), the trainer and the real-time updater, and exposes
    recommend / train / update / evaluate / save / load entry points.
    """

    def __init__(self, data_processor, recommender, trainer, real_time_updater):
        """Store the four collaborating components."""
        self.data_processor = data_processor
        self.recommender = recommender
        self.trainer = trainer
        self.real_time_updater = real_time_updater

    def get_recommendations(self, user_id, top_n=10):
        """Return up to ``top_n`` ``(content_id, score)`` pairs for a user.

        Blends the recommender's scores with content-based similarity, then
        removes items found in the user's 100 most recent interactions.
        ``user_id`` is used as a row index into the user embedding matrix.
        """
        user_features = self.data_processor.get_user_embeddings()[user_id]
        item_features = self.data_processor.get_content_embeddings()
        cf_recs = self.recommender.predict(user_id, user_features, item_features)
        cb_recs = self.content_based_recommendations(user_id)
        combined_recs = self.combine_recommendations(cf_recs, cb_recs)

        interactions = self.data_processor.interaction_data
        recent_interactions = (
            interactions[interactions['user_id'] == user_id]
            .sort_values('timestamp')
            .tail(100)
        )
        final_recs = self.real_time_updater.adjust_recommendations(user_id, combined_recs, recent_interactions)
        return final_recs[:top_n]

    def content_based_recommendations(self, user_id):
        """Score every content row by cosine similarity to the user's profile.

        Returns a list of ``(row_index, similarity)`` pairs, one per content row.
        """
        user_profile = self.data_processor.get_user_profile(user_id)
        content_embeddings = self.data_processor.get_content_embeddings()
        content_similarities = cosine_similarity([user_profile], content_embeddings)[0]
        return list(enumerate(content_similarities))

    def combine_recommendations(self, cf_recs, cb_recs, cf_weight=0.7, cb_weight=0.3):
        """Blend two ``(content_id, score)`` lists into one ranked list.

        Each item's final score is ``cf_weight * cf_score + cb_weight *
        cb_score``; an item missing from one list contributes 0 for that
        part. The result is sorted by score, highest first.
        """
        combined = {}
        for content_id, score in cf_recs:
            combined[content_id] = score * cf_weight
        for content_id, score in cb_recs:
            if content_id in combined:
                combined[content_id] += score * cb_weight
            else:
                combined[content_id] = score * cb_weight
        return sorted(combined.items(), key=lambda x: x[1], reverse=True)

    def train_model(self, epochs=20, batch_size=256):
        """Train via the trainer and return its history object."""
        print("Starting model training...")
        history = self.trainer.train(epochs=epochs, batch_size=batch_size)
        print("Model training completed.")
        return history

    def update_system(self, new_data):
        """Apply new interactions and new content.

        ``new_data`` must provide the keys ``'user_id'``, ``'interactions'``
        and ``'new_content'``.
        """
        print("Updating system with new data...")
        self.real_time_updater.update_user_preferences(new_data['user_id'], new_data['interactions'])
        self.real_time_updater.update_content_features(new_data['new_content'])
        print("System update completed.")

    def evaluate_recommendations(self, test_data):
        """Evaluate via the trainer, print the metrics and return them."""
        print("Evaluating recommendation quality...")
        metrics = self.trainer.evaluate(test_data)
        print(f"Evaluation metrics: {metrics}")
        return metrics

    def save_model(self, filepath):
        """Save the Keras model to ``filepath`` and the data processor next to it."""
        self.recommender.model.save(filepath)
        joblib.dump(self.data_processor, filepath + '_data_processor.joblib')

    def load_model(self, filepath):
        """Restore the model and data processor written by ``save_model``.

        The restored objects are also handed to the trainer and the real-time
        updater. Previously those kept pointing at the pre-load model and
        data processor, so later training and updates acted on stale state.
        """
        model = tf.keras.models.load_model(filepath)
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load artifacts from a trusted location.
        data_processor = joblib.load(filepath + '_data_processor.joblib')

        self.recommender.model = model
        self.data_processor = data_processor
        for component in (self.trainer, self.real_time_updater):
            if component is not None:
                component.model = model
                component.data_processor = data_processor

In [20]:
if __name__ == "__main__":
    # End-to-end demo on synthetic data: build features, train, recommend,
    # then push one incremental update through the system.

    # Seed every random source so a fresh-kernel re-run reproduces the data
    # and the model initialization.
    SEED = 42
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

    # Initialize components with dummy data
    n_users, n_items, n_interactions = 1000, 1000, 10000
    dummy_content_data = pd.DataFrame({
        'content_id': range(n_items),
        'title': [f'Item {i}' for i in range(n_items)],
        'description': [f'Description for item {i}' for i in range(n_items)]
    })
    dummy_user_data = pd.DataFrame({
        'user_id': range(n_users),
        'age': np.random.randint(18, 80, n_users),
        'gender': np.random.choice(['M', 'F'], n_users)
    })
    dummy_interaction_data = pd.DataFrame({
        'user_id': np.random.randint(0, n_users, n_interactions),
        'content_id': np.random.randint(0, n_items, n_interactions),
        'interaction': np.random.choice([0, 1], n_interactions),
        # Spread the timestamps over a fixed past window. One row per day
        # starting in 2020 would run ~27 years into the future, giving
        # negative recency and making "now" interactions not the latest.
        'timestamp': pd.date_range(start='2020-01-01', end='2023-12-31', periods=n_interactions)
    })

    data_processor = DataProcessor(dummy_content_data, dummy_user_data, dummy_interaction_data)
    data_processor.preprocess_data()

    recommender = HybridRecommender()
    model = recommender.build_model(n_users, n_items, data_processor.user_embeddings.shape[1], data_processor.content_embeddings.shape[1])
    trainer = ModelTrainer(model, data_processor)
    real_time_updater = RealTimeUpdater(model, data_processor)

    # Create recommendation system
    rec_system = RecommendationSystem(data_processor, recommender, trainer, real_time_updater)

    # Train the model
    rec_system.train_model(epochs=5, batch_size=256)

    # Get recommendations for a user
    user_id = 123
    recommendations = rec_system.get_recommendations(user_id, top_n=10)
    print(f"Top 10 recommendations for user {user_id}:")
    for content_id, score in recommendations:
        print(f"Content ID: {content_id}, Score: {score}")

    # Update system with new data
    new_data = {
        'user_id': 123,
        'interactions': [{'user_id': 123, 'content_id': 456, 'interaction': 1, 'timestamp': pd.Timestamp.now()}],
        # Content ids double as row positions in the embedding matrix, so the
        # next free id is n_items (the previous value 1001 skipped row 1000).
        'new_content': [{'content_id': n_items, 'title': 'New Video', 'description': 'A brand new video'}]
    }
    rec_system.update_system(new_data)

Starting model training...
Epoch 1/5
36/36 - 2s - loss: 0.6931 - accuracy: 0.5014 - val_loss: 0.6931 - val_accuracy: 0.5080 - 2s/epoch - 54ms/step
Epoch 2/5
36/36 - 0s - loss: 0.6925 - accuracy: 0.5983 - val_loss: 0.6931 - val_accuracy: 0.5100 - 190ms/epoch - 5ms/step
Epoch 3/5
36/36 - 0s - loss: 0.6916 - accuracy: 0.6042 - val_loss: 0.6931 - val_accuracy: 0.5070 - 197ms/epoch - 5ms/step
Epoch 4/5
36/36 - 0s - loss: 0.6898 - accuracy: 0.8381 - val_loss: 0.6932 - val_accuracy: 0.4920 - 205ms/epoch - 6ms/step
Epoch 5/5
36/36 - 0s - loss: 0.6865 - accuracy: 0.9519 - val_loss: 0.6932 - val_accuracy: 0.4930 - 210ms/epoch - 6ms/step
Model training completed.
Top 10 recommendations for user 123:
Content ID: 7, Score: 0.4713316280120209
Content ID: 4, Score: 0.4708054560893372
Content ID: 0, Score: 0.47077616636689457
Content ID: 5, Score: 0.47003244941170963
Content ID: 8, Score: 0.4696873981231049
Content ID: 6, Score: 0.4693278271430329
Content ID: 3, Score: 0.4685968357795075
Content ID: 1