# Genre-Based Recommendation Analysis

This notebook analyzes the effectiveness of genre-enhanced recommendation systems by comparing:
- Original recommendations
- Adventure-enhanced recommendations (1000 & 2000 books)
- Mystery-enhanced recommendations (1000 & 2000 books)

For each user, we count how many Adventure and Mystery books appear in the top-15, top-25 and top-35 recommendation lists produced by each system.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import os

# Plot appearance: start from matplotlib's defaults, then apply seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Every relative path below (input CSVs, saved tables and figures) resolves
# against the results folder, so make it the working directory
results_dir = '/home/moshtasa/Research/phd-svd-recsys/SVD/Book_Clean/results/svd_analysis'
os.chdir(results_dir)

print(f"Working directory: {os.getcwd()}")
csv_files = [name for name in os.listdir('.') if name.endswith('.csv')]
print(f"Available files: {csv_files}")

## 1. Data Loading and Genre Counting

In [None]:
def load_and_count_genres():
    """Count Adventure and Mystery recommendations per user in every results file.

    Reads ``<system>_recommendations_top<N>.csv`` from the current working
    directory for each of the five systems (original, adventure_1000,
    adventure_2000, mystery_1000, mystery_2000) and each N in 15, 25, 35.
    Every file must provide ``user_id`` and ``genre`` columns.

    Returns:
        Nested dict ``counts[system]['top<N>'][genre_key][user_id]`` where
        ``genre_key`` is ``'adventure'`` or ``'mystery'``. Every user that
        appears in a file gets an entry; it is 0 when none of that user's
        rows carry the genre.
    """
    systems = ('original', 'adventure_1000', 'adventure_2000',
               'mystery_1000', 'mystery_2000')
    list_sizes = (15, 25, 35)

    def _per_user_counts(frame, genre_label):
        # Rows of the requested genre, tallied per user
        tally = frame.loc[frame['genre'] == genre_label].groupby('user_id').size()
        # Users without any matching row are absent from the tally -> report 0
        return {uid: tally.get(uid, 0) for uid in frame['user_id'].unique()}

    genre_counts = {}
    for system in systems:
        per_size = {}
        for size in list_sizes:
            csv_name = f'{system}_recommendations_top{size}.csv'
            print(f"Processing {csv_name}...")

            frame = pd.read_csv(csv_name)
            per_size[f'top{size}'] = {
                'adventure': _per_user_counts(frame, 'Adventure'),
                'mystery': _per_user_counts(frame, 'Mystery'),
            }
        genre_counts[system] = per_size

    return genre_counts

# Build the nested per-user genre counts once; the cells below all take
# `genre_counts` as their input
print("Loading data and counting genres...")
genre_counts = load_and_count_genres()
print("Data loading complete!")

## 2. Create Summary Tables

Generate tables showing raw counts of adventure/mystery books for each user across different systems.

In [None]:
def create_summary_tables(genre_counts):
    """Build, save and summarise per-user raw genre-count tables.

    For each list size (top15, top25, top35) two tables are produced: one
    with each user's Adventure count under the original, adventure_1000 and
    adventure_2000 systems, and one with the Mystery count under the
    original, mystery_1000 and mystery_2000 systems.

    Args:
        genre_counts: nested dict indexed as
            ``genre_counts[system][top_n][genre][user_id]``.

    Returns:
        Dict mapping ``'adventure_<top_n>'`` / ``'mystery_<top_n>'`` to the
        corresponding DataFrame. Each table is also written to
        ``<genre>_counts_<top_n>.csv`` in the current working directory and
        its ``describe()`` statistics are printed.
    """
    # The user list comes from the original top-15 adventure counts and is
    # reused for every table, so each user must exist in every system
    user_ids = sorted(genre_counts['original']['top15']['adventure'])

    def _count_rows(cutoff, genre):
        # One row per user: baseline count plus the two enhanced systems
        baseline = genre_counts['original'][cutoff][genre]
        boosted_1000 = genre_counts[f'{genre}_1000'][cutoff][genre]
        boosted_2000 = genre_counts[f'{genre}_2000'][cutoff][genre]
        return [
            {
                'user_id': uid,
                'original': baseline[uid],
                f'{genre}_1000': boosted_1000[uid],
                f'{genre}_2000': boosted_2000[uid],
            }
            for uid in user_ids
        ]

    summary_tables = {}

    for cutoff in ('top15', 'top25', 'top35'):
        adventure_df = pd.DataFrame(_count_rows(cutoff, 'adventure'))
        mystery_df = pd.DataFrame(_count_rows(cutoff, 'mystery'))

        # Persist both tables, then keep them for the later analysis cells
        adventure_df.to_csv(f'adventure_counts_{cutoff}.csv', index=False)
        mystery_df.to_csv(f'mystery_counts_{cutoff}.csv', index=False)
        summary_tables[f'adventure_{cutoff}'] = adventure_df
        summary_tables[f'mystery_{cutoff}'] = mystery_df

        print(f"Saved adventure_counts_{cutoff}.csv and mystery_counts_{cutoff}.csv")

        # Descriptive statistics for the three count columns of each table
        adventure_cols = ['original', 'adventure_1000', 'adventure_2000']
        print(f"\nAdventure {cutoff} - Summary Statistics:")
        print(adventure_df[adventure_cols].describe())

        mystery_cols = ['original', 'mystery_1000', 'mystery_2000']
        print(f"\nMystery {cutoff} - Summary Statistics:")
        print(mystery_df[mystery_cols].describe())
        print("-" * 50)

    return summary_tables

# Build the raw-count tables (also written to CSV); `summary_tables` is read
# again by the summary-statistics cell
summary_tables = create_summary_tables(genre_counts)

## 3. Create Percentage Increase Tables

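Express each enhanced system's genre count as a percentage of the original system's count for the same user (original = 100%), and report the difference from that 100% baseline as the percentage increase.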

In [None]:
def _pct_of_baseline(baseline, enhanced):
    """Return *enhanced* as a percentage of *baseline* (the baseline is 100).

    A zero baseline has no defined ratio. In that case the change is measured
    against a notional base of one book, so 0 -> 0 stays at 100 (no change)
    and 0 -> k becomes 100 + k * 100. Clamping the baseline to 1 and dividing
    would instead report 0 -> 0 as a 100% drop and 0 -> 1 as no change.
    """
    if baseline > 0:
        return (enhanced / baseline) * 100
    return 100.0 + enhanced * 100.0


def create_percentage_increase_tables(genre_counts):
    """Build, save and summarise per-user percentage-increase tables.

    For each list size (top15, top25, top35) and each genre, every user's
    count under the ``<genre>_1000`` and ``<genre>_2000`` systems is expressed
    as a percentage of that user's count under the original system
    (original = 100). ``increase_*`` is that percentage minus 100.

    Users whose original count is 0 are handled by ``_pct_of_baseline``:
    unchanged users get an increase of 0 and every gained book counts as
    +100 percentage points.

    Args:
        genre_counts: nested dict indexed as
            ``genre_counts[system][top_n][genre][user_id]``.

    Returns:
        Dict mapping ``'adventure_<top_n>'`` / ``'mystery_<top_n>'`` to a
        DataFrame with columns ``user_id``, ``original_pct``,
        ``<genre>_1000_pct``, ``<genre>_2000_pct``, ``increase_1000`` and
        ``increase_2000``. Each table is also written to
        ``<genre>_percentage_<top_n>.csv`` in the current working directory
        and its increase statistics are printed.
    """
    # The user list comes from the original top-15 adventure counts and is
    # reused for every table, so each user must exist in every system
    all_users = sorted(genre_counts['original']['top15']['adventure'])

    percentage_tables = {}

    for top_n in ['top15', 'top25', 'top35']:
        frames = {}

        for genre in ('adventure', 'mystery'):
            baseline = genre_counts['original'][top_n][genre]
            enhanced_1000 = genre_counts[f'{genre}_1000'][top_n][genre]
            enhanced_2000 = genre_counts[f'{genre}_2000'][top_n][genre]

            rows = []
            for user in all_users:
                pct_1000 = _pct_of_baseline(baseline[user], enhanced_1000[user])
                pct_2000 = _pct_of_baseline(baseline[user], enhanced_2000[user])
                rows.append({
                    'user_id': user,
                    'original_pct': 100.0,
                    f'{genre}_1000_pct': pct_1000,
                    f'{genre}_2000_pct': pct_2000,
                    'increase_1000': pct_1000 - 100,
                    'increase_2000': pct_2000 - 100,
                })

            # Naming the columns keeps the table well-formed with no users
            frames[genre] = pd.DataFrame(rows, columns=[
                'user_id', 'original_pct',
                f'{genre}_1000_pct', f'{genre}_2000_pct',
                'increase_1000', 'increase_2000',
            ])

        adventure_pct_df = frames['adventure']
        mystery_pct_df = frames['mystery']

        # Save percentage tables
        adventure_pct_df.to_csv(f'adventure_percentage_{top_n}.csv', index=False)
        mystery_pct_df.to_csv(f'mystery_percentage_{top_n}.csv', index=False)

        # Store for analysis
        percentage_tables[f'adventure_{top_n}'] = adventure_pct_df
        percentage_tables[f'mystery_{top_n}'] = mystery_pct_df

        print(f"Saved adventure_percentage_{top_n}.csv and mystery_percentage_{top_n}.csv")

        # Show percentage increase statistics
        print(f"\nAdventure {top_n} - Percentage Increase Statistics:")
        print(adventure_pct_df[['increase_1000', 'increase_2000']].describe())

        print(f"\nMystery {top_n} - Percentage Increase Statistics:")
        print(mystery_pct_df[['increase_1000', 'increase_2000']].describe())
        print("-" * 50)

    return percentage_tables

# Build the percentage tables (also written to CSV); `percentage_tables` is
# read again by the distribution-plot and summary-statistics cells
percentage_tables = create_percentage_increase_tables(genre_counts)

## 4. Create Binned Visualizations

Users are sorted by ID and split into 10 contiguous bins. For each genre and list size, a grouped bar chart shows the average genre count per bin under the original system and the two enhanced systems.

In [None]:
def create_binned_visualizations(genre_counts):
    """Plot average genre counts per user bin for each genre and list size.

    Users (taken from the original top-15 adventure counts) are sorted and
    split into 10 contiguous bins whose sizes differ by at most one. A 2x3
    grid of grouped bar charts (rows: adventure, mystery; columns: top15,
    top25, top35) shows, per bin, the mean count under the original system
    and under the ``<genre>_1000`` / ``<genre>_2000`` systems.

    The figure is saved to ``genre_analysis_visualization.png`` in the
    current working directory and then shown.

    Args:
        genre_counts: nested dict indexed as
            ``genre_counts[system][top_n][genre][user_id]``.
    """
    n_bins = 10

    all_users = sorted(genre_counts['original']['top15']['adventure'])
    n_users = len(all_users)
    bin_size = n_users // n_bins  # smallest bin size; some bins hold one more

    print(f"Total users: {n_users}, Users per bin: {bin_size}")

    # Spread the remainder over the bins instead of piling it all into the
    # last one, so no bin is more than one user larger than another
    edges = [(i * n_users) // n_bins for i in range(n_bins + 1)]
    user_bins = [all_users[start:stop] for start, stop in zip(edges[:-1], edges[1:])]

    def _bin_average(dataset, top_n, genre, bin_users):
        # With fewer than 10 users some bins are empty; np.mean([]) would
        # return NaN with a RuntimeWarning, so draw those bins as 0
        if not bin_users:
            return 0.0
        per_user = genre_counts[dataset][top_n][genre]
        return np.mean([per_user[user] for user in bin_users])

    # Generate plots
    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    fig.suptitle('Genre Recommendations Analysis: Average Counts by User Bins', fontsize=16, y=0.98)

    colors = ['#3498db', '#e74c3c', '#2ecc71']  # Blue, Red, Green

    for genre_idx, genre in enumerate(['adventure', 'mystery']):
        for top_idx, top_n in enumerate(['top15', 'top25', 'top35']):

            ax = axes[genre_idx, top_idx]

            # Average count per bin for the baseline and both enhanced systems
            bin_data = {
                dataset: [_bin_average(dataset, top_n, genre, bin_users) for bin_users in user_bins]
                for dataset in ('original', f'{genre}_1000', f'{genre}_2000')
            }

            # Three bars side by side at every bin position
            x = np.arange(n_bins)
            width = 0.25

            bars1 = ax.bar(x - width, bin_data['original'], width,
                           label='Original', alpha=0.8, color=colors[0])
            bars2 = ax.bar(x, bin_data[f'{genre}_1000'], width,
                           label=f'{genre.title()} 1000', alpha=0.8, color=colors[1])
            bars3 = ax.bar(x + width, bin_data[f'{genre}_2000'], width,
                           label=f'{genre.title()} 2000', alpha=0.8, color=colors[2])

            ax.set_xlabel('User Bins (10 equal groups)', fontsize=10)
            ax.set_ylabel(f'Average {genre.title()} Count', fontsize=10)
            ax.set_title(f'{genre.title()} - Top {top_n[3:]}', fontsize=12, fontweight='bold')
            ax.set_xticks(x)
            ax.set_xticklabels([f'{i+1}' for i in range(n_bins)])
            ax.legend(fontsize=9)
            ax.grid(True, alpha=0.3)

            # Add value labels just above each bar
            for bars in [bars1, bars2, bars3]:
                for bar in bars:
                    height = bar.get_height()
                    ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                            f'{height:.1f}', ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.savefig('genre_analysis_visualization.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("Visualization saved as 'genre_analysis_visualization.png'")

# Draw, save and show the binned bar charts
create_binned_visualizations(genre_counts)

## 5. Additional Analysis: Distribution of Improvements

In [None]:
# Density histograms of the per-user percentage increase: one panel per
# genre (rows) and list size (columns), with both enhanced systems overlaid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Distribution of Percentage Improvements', fontsize=16)

for row, genre in enumerate(['adventure', 'mystery']):
    for col, top_n in enumerate(['top15', 'top25', 'top35']):
        ax = axes[row, col]
        df = percentage_tables[f'{genre}_{top_n}']
        genre_label = genre.title()

        # Overlay the 1000-book (red) and 2000-book (green) distributions
        for suffix, color in (('1000', 'red'), ('2000', 'green')):
            ax.hist(df[f'increase_{suffix}'], bins=50, alpha=0.7,
                    label=f'{genre_label} {suffix}', color=color, density=True)

        ax.set_xlabel('Percentage Increase (%)')
        ax.set_ylabel('Density')
        ax.set_title(f'{genre_label} - Top {top_n[3:]}')
        ax.legend()
        ax.grid(True, alpha=0.3)

        # Mark each mean with a dashed line and report both in a text box
        mean_1000 = df['increase_1000'].mean()
        mean_2000 = df['increase_2000'].mean()
        for mean_value, color in ((mean_1000, 'red'), (mean_2000, 'green')):
            ax.axvline(mean_value, color=color, linestyle='--', alpha=0.8)

        stats_box = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        ax.text(0.02, 0.98, f'Mean 1000: {mean_1000:.1f}%\nMean 2000: {mean_2000:.1f}%',
                transform=ax.transAxes, verticalalignment='top', bbox=stats_box)

plt.tight_layout()
plt.savefig('improvement_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Summary Statistics Table

In [None]:
# One summary row per genre / list-size pair: mean counts, mean percentage
# increase, and how many users saw any increase at all
summary_stats = []

for genre in ['adventure', 'mystery']:
    for top_n in ['top15', 'top25', 'top35']:
        table_key = f'{genre}_{top_n}'
        df_counts = summary_tables[table_key]
        df_pct = percentage_tables[table_key]
        total_users = len(df_pct)

        row = {'Genre': genre.title(), 'Top_N': top_n[3:]}

        # Mean raw counts for the baseline and both enhanced systems
        mean_columns = (
            ('Original_Mean', 'original'),
            ('Enhanced_1000_Mean', f'{genre}_1000'),
            ('Enhanced_2000_Mean', f'{genre}_2000'),
        )
        for label, column in mean_columns:
            column_mean = df_counts[column].mean()
            row[label] = f'{column_mean:.2f}'

        # Mean percentage increase over the baseline
        for size in ('1000', '2000'):
            increase_mean = df_pct[f'increase_{size}'].mean()
            row[f'Avg_Improvement_{size}_%'] = f'{increase_mean:.1f}%'

        # Users whose increase is strictly positive, as a count and a share
        for size in ('1000', '2000'):
            improved = (df_pct[f'increase_{size}'] > 0).sum()
            row[f'Users_Improved_{size}'] = f'{improved}/{total_users} ({improved/total_users*100:.1f}%)'

        summary_stats.append(row)

summary_df = pd.DataFrame(summary_stats)
summary_df.to_csv('comprehensive_summary.csv', index=False)

print("Comprehensive Summary Statistics:")
print(summary_df.to_string(index=False))
print("\nSummary saved as 'comprehensive_summary.csv'")

## 7. Key Findings and Conclusions

The tables and figures above are designed to answer the following questions:

1. **Genre Enhancement Effectiveness**: How well do the genre-enhanced systems increase the number of relevant genre recommendations?

2. **User Distribution**: How are the improvements distributed across different user groups (bins)?

3. **Scale Effects**: How does the impact of 1000 enhanced books compare with that of 2000?

4. **Top-N Effects**: How does the recommendation list size (15, 25, 35) affect the improvements?

The visualizations and tables generated provide comprehensive insights into the effectiveness of genre-based recommendation enhancement strategies.

In [None]:
banner = "=" * 50
print("\n" + banner)
print("GENRE ANALYSIS COMPLETE")
print(banner)

# CSV tables in the working directory whose names carry one of the markers
# used by this notebook's outputs
print("Generated files:")
table_markers = ('counts_', 'percentage_', 'comprehensive_')
generated_files = [
    name for name in os.listdir('.')
    if name.endswith('.csv') and any(marker in name for marker in table_markers)
]
for file in sorted(generated_files):
    print(f"  - {file}")

# Every PNG in the working directory is listed, not only the two saved above
print("\nGenerated visualizations:")
viz_files = [name for name in os.listdir('.') if name.endswith('.png')]
for file in sorted(viz_files):
    print(f"  - {file}")