In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json
import os
import pickle
import joblib
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime

SAVE_DIR = "Saved_files_new"



# ===== TOPIC VISUALIZATION FUNCTIONS =====

In [2]:


def get_topic_name(topic_idx, topic_names):
    """Fetch topic name using string keys and clean up quotes."""
    names = topic_names.get('all', topic_names)
    name = names.get(str(topic_idx), f"Topic {topic_idx}")
    return name.strip('"')

def plot_topic_keywords_with_names(topic_keywords, topic_names, top_words=10):
    """Plot keyword weights for each topic with generated names"""
    num_topics = len(topic_keywords)
    fig, axes = plt.subplots(num_topics, 1, figsize=(14, 5*num_topics))
    
    if num_topics == 1:
        axes = [axes]
    
    for idx, (topic_idx, keywords) in enumerate(topic_keywords.items()):
        words, weights = zip(*keywords['word_weights'][:top_words])
        ax = axes[idx]
        
        bars = ax.barh(words, weights, color='steelblue', alpha=0.7)
        topic_name = get_topic_name(topic_idx, topic_names)
        ax.set_title(f'Topic {topic_idx}: {topic_name}', fontsize=14, fontweight='bold')
        ax.set_xlabel('Word Weight', fontsize=12)
        ax.invert_yaxis()
        
        for bar, weight in zip(bars, weights):
            ax.text(weight + 0.001, bar.get_y() + bar.get_height()/2, 
                   f'{weight:.3f}', va='center', fontsize=10)
    
    plt.tight_layout()
    plt.show()

def plot_topic_importance_with_names(df_analyzed, topic_names):
    """Plot topic importance using actual topic assignments"""
    topic_counts = df_analyzed['Primary_Topic'].value_counts()
    
    plt.figure(figsize=(12, 8))
    bars = plt.bar(range(len(topic_counts)), topic_counts.values, 
                   color='lightcoral', alpha=0.7)
    
    plt.xticks(range(len(topic_counts)), topic_counts.index, rotation=45, ha='right')
    plt.title('Topic Importance (Number of Papers per Topic)', fontsize=16, fontweight='bold')
    plt.xlabel('Topics', fontsize=12)
    plt.ylabel('Number of Papers', fontsize=12)
    
    for bar, count in zip(bars, topic_counts.values):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, 
                str(count), ha='center', va='bottom', fontsize=10)
    
    plt.tight_layout()
    plt.show()

def plot_topic_trends_over_time(df, topic_col='Primary_Topic', year_col='year', top_n=10):
    """Plot topic trends over time"""
    topic_year = df.groupby([year_col, topic_col]).size().reset_index(name='count')
    top_topics = topic_year.groupby(topic_col)['count'].sum().nlargest(top_n).index
    topic_year = topic_year[topic_year[topic_col].isin(top_topics)]
    
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=topic_year, x=year_col, y='count', hue=topic_col, marker='o')
    plt.title('Topic Trends Over Time')
    plt.ylabel('Number of Papers')
    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()



# ===== METHOD VISUALIZATION FUNCTIONS =====

In [3]:

def plot_method_importance(df_analyzed, method_col='Method_Detected'):
    """Plot the number of papers assigned to each method."""
    # Filter out 'No_Method_Found' for cleaner visualization
    df_methods = df_analyzed[df_analyzed[method_col] != 'No_Method_Found']
    
    if df_methods.empty:
        print("No methods found for visualization")
        return
    
    method_counts = df_methods[method_col].value_counts().sort_values(ascending=False)
    
    plt.figure(figsize=(12, 8))
    bars = plt.bar(range(len(method_counts)), method_counts.values, 
                   color='mediumseagreen', alpha=0.7)
    plt.xticks(range(len(method_counts)), method_counts.index, rotation=45, ha='right')
    plt.title('Method Importance (Number of Papers per Method)', fontsize=16, fontweight='bold')
    plt.xlabel('Methods', fontsize=12)
    plt.ylabel('Number of Papers', fontsize=12)
    
    for bar, count in zip(bars, method_counts.values):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, 
                str(count), ha='center', va='bottom', fontsize=10)
    
    plt.tight_layout()
    plt.show()

def plot_method_trends_over_time(df_analyzed, method_col='Method_Detected', year_col='year', top_n=8):
    """Plot method trends over time"""
    # Filter out 'No_Method_Found'
    df_methods = df_analyzed[df_analyzed[method_col] != 'No_Method_Found']
    
    if df_methods.empty:
        print("No methods found for visualization")
        return
    
    method_year = df_methods.groupby([year_col, method_col]).size().reset_index(name='count')
    top_methods = method_year.groupby(method_col)['count'].sum().nlargest(top_n).index
    method_year = method_year[method_year[method_col].isin(top_methods)]
    
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=method_year, x=year_col, y='count', hue=method_col, marker='o')
    plt.title('Method Trends Over Time')
    plt.ylabel('Number of Papers')
    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

def visualize_method_development(df_analyzed):
    """Create visualizations showing the development of identified methods over years"""
    # Filter out documents without methods
    df_methods = df_analyzed[df_analyzed['Method_Detected'] != 'No_Method_Found'].copy()
    
    if df_methods.empty:
        print("No methods found for visualization")
        return None
    
    # Ensure we have publication year data
    if 'publication_year' not in df_methods.columns:
        if 'year' in df_methods.columns:
            df_methods['publication_year'] = df_methods['year']
        else:
            print("No publication year data available")
            return None
    
    # Clean and prepare data
    df_methods = df_methods.dropna(subset=['publication_year', 'Method_Detected'])
    df_methods['publication_year'] = pd.to_numeric(df_methods['publication_year'], errors='coerce')
    df_methods = df_methods.dropna(subset=['publication_year'])
    
    print(f"Visualizing {len(df_methods)} papers with methods across {df_methods['publication_year'].nunique()} years")
    
    # Group by year and method
    method_year_counts = df_methods.groupby(['publication_year', 'Method_Detected']).size().reset_index(name='count')
    
    # Get top methods for better visualization
    top_methods = df_methods['Method_Detected'].value_counts().head(10).index.tolist()
    method_year_filtered = method_year_counts[method_year_counts['Method_Detected'].isin(top_methods)]
    
    return method_year_counts, method_year_filtered, top_methods

def create_method_timeline_plots(df_analyzed, output_dir=None):
    """Create multiple visualizations for method development over time"""
    result = visualize_method_development(df_analyzed)
    if result is None:
        return
    
    method_year_counts, method_year_filtered, top_methods = result
    
    # 1. Interactive Line Chart
    fig_line = px.line(
        method_year_filtered, 
        x='publication_year', 
        y='count',
        color='Method_Detected',
        title='Development of Research Methods Over Time',
        labels={
            'publication_year': 'Publication Year',
            'count': 'Number of Papers',
            'Method_Detected': 'Research Method'
        },
        markers=True
    )
    
    fig_line.update_layout(width=1200, height=600, hovermode='x unified')
    
    # 2. Method Emergence Timeline
    method_first_appearance = method_year_counts.groupby('Method_Detected')['publication_year'].min().reset_index()
    method_first_appearance = method_first_appearance.sort_values('publication_year')
    
    fig_emergence = px.scatter(
        method_first_appearance,
        x='publication_year',
        y='Method_Detected',
        size=[10] * len(method_first_appearance),
        title='Method Emergence Timeline',
        labels={'publication_year': 'First Appearance Year', 'Method_Detected': 'Research Method'}
    )
    
    fig_emergence.update_layout(width=1200, height=800)
    
    # Display plots
    print("=== METHOD DEVELOPMENT VISUALIZATIONS ===")
    fig_line.show()
    fig_emergence.show()
    
    # Save plots if output directory specified
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        fig_line.write_html(os.path.join(output_dir, "method_timeline.html"))
        fig_emergence.write_html(os.path.join(output_dir, "method_emergence.html"))
        print(f"✓ Visualizations saved to {output_dir}")
    
    # Summary statistics
    print("\n=== METHOD DEVELOPMENT SUMMARY ===")
    total_methods = len(method_year_counts['Method_Detected'].unique())
    year_range = f"{method_year_counts['publication_year'].min():.0f}-{method_year_counts['publication_year'].max():.0f}"
    
    print(f"Total unique methods identified: {total_methods}")
    print(f"Year range: {year_range}")
    print("Most common methods:")
    
    method_totals = method_year_counts.groupby('Method_Detected')['count'].sum().sort_values(ascending=False)
    for method, count in method_totals.head(5).items():
        print(f"  {method}: {count} papers")
    
    return fig_line, fig_emergence



# ===== AUTHOR AND VENUE VISUALIZATION =====

In [4]:


def plot_frequencies(file_prefix, n_items=10, save=False):
    """Plot author and venue frequencies"""
    try:
        file_prefix = file_prefix.replace('.csv', '')
        author_file = os.path.join(SAVE_DIR, f"{file_prefix}_author_analysis.csv")
        venue_file = os.path.join(SAVE_DIR, f"{file_prefix}_venue_frequencies.csv")
        
        try:
            authors_df = pd.read_csv(author_file, sep=';', encoding='utf-8')
            venues_df = pd.read_csv(venue_file, sep=';', encoding='utf-8')
        except FileNotFoundError as e:
            print(f"File not found: {e}")
            return
        
        top_authors = authors_df.nlargest(n_items, ['Frequency'])
        top_venues = venues_df.nlargest(n_items, ['Frequency'])
      
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
        
        # Plot authors
        sns.barplot(data=top_authors, x='Author', y='Frequency', ax=ax1)
        ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right', fontsize=14)
        ax1.set_title(f"Top {n_items} Authors")
        ax1.set_xlabel('')
        ax1.set_ylabel('Number of Publications')
        
        # Plot venues
        sns.barplot(data=top_venues, x='Venue', y='Frequency', ax=ax2)
        ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right', fontsize=14)
        ax2.set_title(f"Top {n_items} Venues")
        ax2.set_xlabel('')
        ax2.set_ylabel('Number of Publications')
        
        plt.tight_layout()
        if save:
            output_file = os.path.join(SAVE_DIR, f"{file_prefix}_frequency_plots.png")
            plt.savefig(output_file, dpi=300, bbox_inches='tight')
            print(f"Plot saved as: {output_file}")
        plt.show()
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")



# ===== DATA LOADING FUNCTIONS =====

In [5]:


def load_complete_analysis_data(date_string):
    """Load all saved analysis data including LDA model"""
    print(f"Loading complete analysis data for {date_string}...")
    
    try:
        # Load the analyzed dataframe
        df_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{date_string}_results.csv")
        df_analyzed = pd.read_csv(df_filename, sep=';', encoding='utf-8')
        
        # Load topic names
        topic_names_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{date_string}_topic_names.json")
        with open(topic_names_filename, 'r', encoding='utf-8') as f:
            topic_names = json.load(f)
        
        # Load topic keywords
        topic_keywords_filename = os.path.join(SAVE_DIR, f"topic_keywords_{date_string}.json")
        with open(topic_keywords_filename, 'r', encoding='utf-8') as f:
            topic_keywords_data = json.load(f)
        
        # Convert back to the expected format
        topic_keywords = {}
        for topic_idx, data in topic_keywords_data.items():
            topic_keywords[int(topic_idx)] = data
        
        print(f"Successfully loaded all components for {date_string}")
        
        return {
            'df_analyzed': df_analyzed,
            'topic_names': topic_names,
            'topic_keywords': topic_keywords
        }
        
    except Exception as e:
        print(f"Error loading data: {e}")
        return None



# ===== MAIN VISUALIZATION FUNCTION =====

In [6]:
# ===== MAIN VISUALIZATION FUNCTION =====

def create_complete_visualization_suite(df_analyzed, topic_names=None, topic_keywords=None, 
                                      file_prefix=None, output_dir=None):
    """Create complete visualization suite for your analysis results"""
    
    print("=== CREATING COMPLETE VISUALIZATION SUITE ===\n")
    
    # 1. Topic Visualizations (if topic data available)
    if topic_names and topic_keywords:
        print("1. Topic Analysis Visualizations:")
        plot_topic_keywords_with_names(topic_keywords, topic_names)
        plot_topic_importance_with_names(df_analyzed, topic_names)
        
        if 'year' in df_analyzed.columns:
            plot_topic_trends_over_time(df_analyzed, year_col='year')
    
    # 2. Method Visualizations
    if 'Method_Detected' in df_analyzed.columns:
        print("2. Method Analysis Visualizations:")
        plot_method_importance(df_analyzed)
        
        if 'year' in df_analyzed.columns:
            plot_method_trends_over_time(df_analyzed, year_col='year')
            
        # Method timeline plots
        create_method_timeline_plots(df_analyzed, output_dir)
    
    # 3. Author and Venue Analysis (if file prefix provided)
    if file_prefix:
        print("3. Author and Venue Analysis:")
        plot_frequencies(file_prefix, n_items=15, save=True)
    
    print("\n✓ Complete visualization suite created successfully!")





# ===== DO ANALYSIS =====

In [9]:

"""Main function to run visualizations"""
    
    # Load your analysis results
date_string = "2025_06_25reliability_resilience_power_systems"  # Update this
    
    # Option 1: Load from saved files
analysis_data = load_complete_analysis_data(date_string)
if analysis_data:
    create_complete_visualization_suite(
        df_analyzed=analysis_data['df_analyzed'],
        topic_names=analysis_data['topic_names'],
        topic_keywords=analysis_data['topic_keywords'],
        file_prefix=f"semantic_scholar_{date_string}",
        output_dir=os.path.join(SAVE_DIR, f"visualizations_{date_string}")
    )
    
    # Option 2: Use existing dataframe (if already loaded)
    # create_complete_visualization_suite(
    #     df_analyzed=your_df_analyzed,
    #     topic_names=your_topic_names,
    #     topic_keywords=your_topic_keywords,
    #     file_prefix="your_file_prefix",
    #     output_dir="your_output_dir"
    # )

Loading complete analysis data for 2025_06_25reliability_resilience_power_systems...
Error loading data: [Errno 2] No such file or directory: 'Saved_files_new\\semantic_scholar_2025_06_25reliability_resilience_power_systems_topic_names.json'
