In [None]:
!pip install mygene

In [None]:
!pip install goatools

In [None]:
import requests
import pandas as pd
import mygene
from collections import defaultdict

class KEGGGOIntegrator:
    """Combine KEGG pathway membership with GO annotations for human genes.

    The KEGG REST service supplies gene -> pathway links and pathway names;
    mygene supplies gene symbols and GO terms. Rows are joined on the gene ID
    from the KEGG link table (with the ``hsa:`` prefix stripped).
    """

    # Seconds to wait on a KEGG REST call, so a stalled server cannot hang the run.
    REQUEST_TIMEOUT = 60

    # Column layout of the integrated table; also applied when no rows are
    # produced so downstream column access keeps working.
    INTEGRATED_COLUMNS = [
        'gene_id', 'gene_symbol', 'kegg_pathway_id', 'kegg_pathway_name',
        'go_category', 'go_id', 'go_term',
    ]

    def __init__(self):
        self.mg = mygene.MyGeneInfo()

    def _fetch_kegg_pairs(self, url):
        """Download a tab-separated KEGG table and return its first two columns.

        Raises:
            requests.HTTPError: if KEGG answers with an error status.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of parsing an error page as if it were data.
        response.raise_for_status()

        pairs = []
        for line in response.text.splitlines():
            fields = line.split('\t')
            # Blank or malformed lines (e.g. an empty body) are skipped.
            if len(fields) < 2:
                continue
            pairs.append((fields[0], fields[1]))
        return pairs

    def get_kegg_pathways(self):
        """Get KEGG pathway to gene mappings for human.

        Returns:
            defaultdict(list) mapping gene ID (without ``hsa:``) to the list of
            pathway IDs (without ``path:``) it is linked to.
        """
        print("Fetching KEGG pathway mappings...")
        pathway_dict = defaultdict(list)
        for gene, pathway in self._fetch_kegg_pairs("http://rest.kegg.jp/link/pathway/hsa"):
            pathway_dict[gene.replace('hsa:', '')].append(pathway.replace('path:', ''))
        return pathway_dict

    def get_pathway_names(self):
        """Get KEGG pathway names.

        Returns:
            dict mapping pathway ID (without ``path:``) to its description.
        """
        print("Fetching KEGG pathway names...")
        return {
            pathway_id.replace('path:', ''): description
            for pathway_id, description
            in self._fetch_kegg_pairs("http://rest.kegg.jp/list/pathway/hsa")
        }

    def get_gene_info(self, gene_ids):
        """Get gene information including GO terms using mygene.

        Args:
            gene_ids: iterable of gene IDs passed straight to ``getgenes``.

        Returns:
            Whatever ``MyGeneInfo.getgenes`` returns for the requested fields.
        """
        print("Fetching gene information and GO terms...")
        return self.mg.getgenes(
            gene_ids,
            fields=['name', 'symbol', 'go.BP', 'go.MF', 'go.CC'],
            species='human'
        )

    def process_go_terms(self, go_data, category):
        """Process GO terms from a mygene response.

        Args:
            go_data: the ``go`` sub-document of one gene (may be empty/None).
            category: GO category key to extract, e.g. ``'BP'``.

        Returns:
            List of ``{'id': ..., 'term': ...}`` dicts; entries lacking either
            key are dropped.
        """
        if not go_data or category not in go_data:
            return []

        terms = go_data[category]
        if not terms:
            return []
        # A single annotation arrives as a bare dict rather than a list.
        if isinstance(terms, dict):
            terms = [terms]

        return [
            {'id': term.get('id'), 'term': term.get('term')}
            for term in terms
            if isinstance(term, dict) and 'id' in term and 'term' in term
        ]

    def create_integrated_dataset(self):
        """Create integrated dataset with KEGG pathways and GO terms.

        Returns:
            DataFrame with one row per (gene, pathway, GO term) combination and
            the columns listed in ``INTEGRATED_COLUMNS``. The columns are
            present even when no rows could be built.
        """
        # Get KEGG data
        pathway_dict = self.get_kegg_pathways()
        pathway_names = self.get_pathway_names()

        # Get gene information including GO terms
        gene_info_list = self.get_gene_info(list(pathway_dict.keys()))

        integrated_data = []
        for gene_info in gene_info_list:
            # Entries flagged 'notfound' carry no '_id' and no annotations.
            if not gene_info or gene_info.get('notfound'):
                continue

            # Prefer the echoed query ID so the lookup uses the same key that
            # came from KEGG; fall back to '_id' when 'query' is absent.
            # NOTE(review): assumes mygene echoes the requested ID in 'query'
            # for batch lookups — confirm against the mygene docs.
            gene_id = gene_info.get('query', gene_info.get('_id'))
            if gene_id is None:
                continue
            gene_id = str(gene_id)

            pathways = pathway_dict.get(gene_id, [])
            if not pathways:
                continue

            symbol = gene_info.get('symbol', '')
            go_terms = gene_info.get('go', {})

            # GO terms do not depend on the pathway, so parse them once per gene.
            go_by_category = [
                (category, self.process_go_terms(go_terms, category))
                for category in ['BP', 'MF', 'CC']
            ]

            for pathway in pathways:
                pathway_name = pathway_names.get(pathway, '')
                for category, go_category_terms in go_by_category:
                    for go_term in go_category_terms:
                        integrated_data.append({
                            'gene_id': gene_id,
                            'gene_symbol': symbol,
                            'kegg_pathway_id': pathway,
                            'kegg_pathway_name': pathway_name,
                            'go_category': category,
                            'go_id': go_term['id'],
                            'go_term': go_term['term']
                        })

        return pd.DataFrame(integrated_data, columns=self.INTEGRATED_COLUMNS)

    def analyze_radiation_response(self, df):
        """Analyze radiation response related pathways and terms.

        A row is kept when its KEGG pathway name or its GO term matches one of
        the radiation keywords (case-insensitive). Missing names/terms never
        match.

        Args:
            df: integrated table as produced by ``create_integrated_dataset``.

        Returns:
            Tuple ``(radiation_df, pathway_summary, go_summary)`` where the
            summaries hold the number of distinct genes per pathway and per
            GO term in ``radiation_df``.
        """
        # Phrases that are matched anywhere inside the text.
        substring_keywords = [
            'dna repair', 'damage response', 'cell cycle', 'apoptosis',
            'p53', 'radiation', 'oxidative stress',
            'double-strand break', 'homologous recombination'
        ]
        # Short gene symbols must match as whole words; as plain substrings
        # they also hit unrelated text such as "matrix" or "treatment".
        symbol_keywords = ['atm', 'atr']
        pattern = '|'.join(
            substring_keywords + [rf'\b{symbol}\b' for symbol in symbol_keywords]
        )

        def matches(column):
            # Missing values become '' so the mask stays strictly boolean.
            text = df[column].fillna('').astype(str).str.lower()
            return text.str.contains(pattern, regex=True).astype(bool)

        radiation_mask = matches('kegg_pathway_name') | matches('go_term')
        radiation_df = df[radiation_mask].copy()

        # Distinct genes per pathway and per GO term
        pathway_summary = radiation_df.groupby(
            ['kegg_pathway_id', 'kegg_pathway_name']
        )['gene_id'].nunique().reset_index()

        go_summary = radiation_df.groupby(
            ['go_category', 'go_id', 'go_term']
        )['gene_id'].nunique().reset_index()

        return radiation_df, pathway_summary, go_summary

def main():
    """Build the KEGG-GO table, extract the radiation subset and write CSVs.

    Writes ``kegg_go_integrated.csv`` plus three ``radiation_response_*.csv``
    files to the working directory and prints a short summary.
    """
    kegg_go = KEGGGOIntegrator()

    # Full integrated table
    print("Creating integrated KEGG-GO dataset...")
    full_table = kegg_go.create_integrated_dataset()
    full_table.to_csv('kegg_go_integrated.csv', index=False)
    print("Saved integrated dataset to kegg_go_integrated.csv")

    # Radiation-response subset and its two summaries
    print("\nAnalyzing radiation response pathways and terms...")
    hits, by_pathway, by_go = kegg_go.analyze_radiation_response(full_table)

    outputs = (
        (hits, 'radiation_response_integrated.csv'),
        (by_pathway, 'radiation_response_pathways.csv'),
        (by_go, 'radiation_response_go_terms.csv'),
    )
    for table, target in outputs:
        table.to_csv(target, index=False)

    # Summary statistics
    print("\nSummary:")
    totals = (
        ("Total genes", 'gene_id'),
        ("Total KEGG pathways", 'kegg_pathway_id'),
        ("Total GO terms", 'go_id'),
    )
    for label, column in totals:
        print(f"{label}: {full_table[column].nunique()}")

    print("\nRadiation response related:")
    print(f"Pathways: {len(by_pathway)}")
    print(f"GO terms: {len(by_go)}")

    # Pathways with the most distinct genes first
    print("\nTop radiation response pathways:")
    ranked = by_pathway.sort_values('gene_id', ascending=False)
    print(ranked.head())

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict

class DoseTimeVariantAnalyzer:
    """Score genes by variant impact across dose/timepoint conditions.

    Conditions are named ``d<dose>_<timepoint>`` (e.g. ``dA_W1``) and are
    built from ``self.doses`` and ``self.timepoints``.
    """

    # Consequence keywords per impact score, checked from most to least
    # severe; the first tier with a matching keyword wins.
    _IMPACT_TIERS = (
        (4, ('stop_gained', 'frameshift', 'splice_acceptor', 'splice_donor')),
        (3, ('missense', 'inframe')),
        (2, ('synonymous', 'splice_region')),
    )

    def __init__(self):
        self.doses = ['A', 'B', 'C', 'D', 'E']
        self.timepoints = ['W1', 'W2', 'W3']

    def load_vep_files(self, input_dir):
        """Load all annotated VEP files found in ``input_dir``.

        Args:
            input_dir: directory holding files named
                ``annotated_d<dose>_<timepoint>_filtered_vep_results.csv``.

        Returns:
            dict mapping ``(dose, timepoint)`` to a DataFrame; conditions whose
            file is missing are reported and left out.
        """
        data_dict = {}

        for dose in self.doses:
            for timepoint in self.timepoints:
                filename = f"annotated_d{dose}_{timepoint}_filtered_vep_results.csv"
                filepath = os.path.join(input_dir, filename)

                if not os.path.exists(filepath):
                    print(f"Warning: {filename} not found")
                    continue

                print(f"Loading {filename}")
                df = pd.read_csv(filepath)
                # Remove rows where Gene_Name is NA or the literal 'N/A'
                df = df[df['Gene_Name'].notna() & (df['Gene_Name'] != 'N/A')]
                data_dict[(dose, timepoint)] = df

        return data_dict

    def calculate_variant_impact(self, consequence):
        """Calculate impact score for a single variant.

        Args:
            consequence: consequence text; matching is case-insensitive and by
                substring. Non-string values (e.g. NaN from an empty CSV cell)
                are treated as having no known consequence.

        Returns:
            4 (high), 3 (moderate), 2 (low) or 1 (modifier / unknown).
        """
        if not isinstance(consequence, str):
            return 1

        lowered = consequence.lower()
        for score, keywords in self._IMPACT_TIERS:
            if any(keyword in lowered for keyword in keywords):
                return score
        return 1

    def calculate_gene_scores(self, data_dict):
        """Calculate gene scores for each condition.

        Args:
            data_dict: mapping ``(dose, timepoint)`` -> DataFrame with
                ``Gene_Name`` and ``VEP_Consequence`` columns.

        Returns:
            Tuple ``(gene_scores, gene_variants)``; both map gene -> condition
            -> value, holding the summed impact score and the number of rows
            respectively.
        """
        gene_scores = defaultdict(lambda: defaultdict(float))
        gene_variants = defaultdict(lambda: defaultdict(int))

        for (dose, timepoint), df in data_dict.items():
            condition = f"d{dose}_{timepoint}"

            for gene_name, gene_df in df.groupby('Gene_Name'):
                impacts = gene_df['VEP_Consequence'].map(self.calculate_variant_impact)
                gene_scores[gene_name][condition] = int(impacts.sum())
                gene_variants[gene_name][condition] = len(gene_df)

        return gene_scores, gene_variants

    def analyze_temporal_patterns(self, gene_scores):
        """Analyze temporal patterns for each dose.

        Returns:
            dict gene -> dose -> list of scores ordered like
            ``self.timepoints``; missing conditions count as 0.
        """
        return {
            gene: {
                dose: [scores.get(f"d{dose}_{t}", 0) for t in self.timepoints]
                for dose in self.doses
            }
            for gene, scores in gene_scores.items()
        }

    def analyze_dose_response(self, gene_scores):
        """Analyze dose-response patterns for each timepoint.

        Returns:
            dict gene -> timepoint -> list of scores ordered like
            ``self.doses``; missing conditions count as 0.
        """
        return {
            gene: {
                timepoint: [scores.get(f"d{d}_{timepoint}", 0) for d in self.doses]
                for timepoint in self.timepoints
            }
            for gene, scores in gene_scores.items()
        }

    def export_results(self, temporal_patterns, dose_patterns, gene_variants, output_dir):
        """Export analysis results as CSV files into ``output_dir``.

        Writes ``temporal_patterns.csv``, ``dose_patterns.csv`` and
        ``variant_counts.csv``. Score column names are derived from
        ``self.timepoints`` / ``self.doses`` (``W1_score``, ``dA_score``, ...),
        and the header is written even when there are no rows.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Temporal patterns: one row per (gene, dose)
        temporal_cols = ['gene', 'dose'] + [f'{t}_score' for t in self.timepoints]
        temporal_rows = []
        for gene, patterns in temporal_patterns.items():
            for dose, values in patterns.items():
                row = {'gene': gene, 'dose': dose}
                row.update(
                    (f'{t}_score', value) for t, value in zip(self.timepoints, values)
                )
                temporal_rows.append(row)
        pd.DataFrame(temporal_rows, columns=temporal_cols).to_csv(
            os.path.join(output_dir, 'temporal_patterns.csv'),
            index=False
        )

        # Dose patterns: one row per (gene, timepoint)
        dose_cols = ['gene', 'timepoint'] + [f'd{d}_score' for d in self.doses]
        dose_rows = []
        for gene, patterns in dose_patterns.items():
            for timepoint, values in patterns.items():
                row = {'gene': gene, 'timepoint': timepoint}
                row.update(
                    (f'd{d}_score', value) for d, value in zip(self.doses, values)
                )
                dose_rows.append(row)
        pd.DataFrame(dose_rows, columns=dose_cols).to_csv(
            os.path.join(output_dir, 'dose_patterns.csv'),
            index=False
        )

        # Variant counts: genes as rows, conditions as columns
        variant_df = pd.DataFrame.from_dict(
            {gene: dict(counts) for gene, counts in gene_variants.items()},
            orient='index'
        )
        variant_df.to_csv(os.path.join(output_dir, 'variant_counts.csv'))

def main():
    """Run the dose/time variant analysis from VEP files to exported CSVs.

    Reads from ``annotated_vep``, writes to ``gene_analysis`` and prints the
    ten genes with the most variants summed over all conditions.
    """
    # The analyzer class defined in this cell is DoseTimeVariantAnalyzer;
    # the previously referenced DoseTimePathwayAnalyzer does not exist here.
    analyzer = DoseTimeVariantAnalyzer()

    # Set directories
    input_dir = "annotated_vep"
    output_dir = "gene_analysis"

    # Load data
    print("Loading VEP files...")
    data_dict = analyzer.load_vep_files(input_dir)

    # Calculate gene scores
    print("Calculating gene scores...")
    gene_scores, gene_variants = analyzer.calculate_gene_scores(data_dict)

    # Analyze patterns
    print("Analyzing temporal patterns...")
    temporal_patterns = analyzer.analyze_temporal_patterns(gene_scores)

    print("Analyzing dose-response patterns...")
    dose_patterns = analyzer.analyze_dose_response(gene_scores)

    # Export results
    print("Exporting results...")
    analyzer.export_results(temporal_patterns, dose_patterns, gene_variants, output_dir)

    # Print summary statistics
    print("\nAnalysis Summary:")
    print(f"Total genes analyzed: {len(gene_scores)}")
    print("\nTop genes by variant count:")
    # Total variants per gene across every condition
    variant_counts = {
        gene: sum(conditions.values())
        for gene, conditions in gene_variants.items()
    }
    top_genes = sorted(variant_counts.items(), key=lambda x: x[1], reverse=True)[:10]
    for gene, count in top_genes:
        print(f"{gene}: {count} variants")

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

class DoseTimeVisualizer:
    """Plot the temporal, dose and variant-count tables loaded via load_data."""

    def __init__(self):
        self.temporal_data = None
        self.dose_data = None
        self.variant_data = None
        plt.style.use('seaborn-v0_8')

    def load_data(self, temporal_file, dose_file, variant_file):
        """Read the three result CSVs; the variant file uses its first column as index."""
        self.temporal_data = pd.read_csv(temporal_file)
        self.dose_data = pd.read_csv(dose_file)
        self.variant_data = pd.read_csv(variant_file, index_col=0)

    def plot_temporal_heatmap(self, top_n=50):
        """Draw one W1/W2/W3 heatmap per dose for the ``top_n`` highest-scoring genes.

        Genes are ranked by their score summed over all timepoints and doses.
        Returns the matplotlib figure.
        """
        week_cols = ['W1_score', 'W2_score', 'W3_score']
        per_gene = self.temporal_data.groupby('gene')[week_cols].sum()
        top_genes = per_gene.sum(axis=1).nlargest(top_n).index

        # One genes x weeks matrix per dose; genes without a row get 0
        data_by_dose = {}
        for dose in sorted(self.temporal_data['dose'].unique()):
            rows = self.temporal_data[self.temporal_data['dose'] == dose]
            rows = rows[rows['gene'].isin(top_genes)]
            by_gene = rows.set_index('gene')

            matrix = pd.DataFrame(index=top_genes)
            for week in ('W1', 'W2', 'W3'):
                matrix[week] = by_gene[f'{week}_score']
            data_by_dose[dose] = matrix.fillna(0)

        n_doses = len(data_by_dose)
        fig, axes = plt.subplots(1, n_doses, figsize=(5*n_doses, 12))
        if n_doses == 1:
            axes = [axes]

        for i, dose in enumerate(data_by_dose):
            panel = axes[i]
            sns.heatmap(data_by_dose[dose], ax=panel, cmap='YlOrRd',
                        cbar_kws={'label': 'Score'})
            panel.set_title(f'Dose {dose}')
            # Only the first panel keeps its y-label
            if i > 0:
                panel.set_ylabel('')

        plt.tight_layout()
        return fig

    def plot_dose_response_curves(self, timepoint='W1', top_n=10):
        """Plot score versus dose at ``timepoint`` for the ``top_n`` rows by total score.

        Returns the matplotlib figure.
        """
        dose_cols = ['dA_score', 'dB_score', 'dC_score', 'dD_score', 'dE_score']
        dose_labels = ['A', 'B', 'C', 'D', 'E']

        at_timepoint = self.dose_data[self.dose_data['timepoint'] == timepoint].copy()
        at_timepoint['total_score'] = at_timepoint[dose_cols].sum(axis=1)
        strongest = at_timepoint.nlargest(top_n, 'total_score')

        fig, ax = plt.subplots(figsize=(12, 8))
        for _, record in strongest.iterrows():
            ax.plot(dose_labels,
                    record[dose_cols].values,
                    marker='o', label=record['gene'])

        ax.set_xlabel('Dose')
        ax.set_ylabel('Score')
        ax.set_title(f'Dose Response Curves at {timepoint}')
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        return fig

    def plot_variant_distribution(self, top_n=20):
        """Bar-plot total variant counts for the ``top_n`` genes; returns the figure."""
        per_gene_total = self.variant_data.sum(axis=1)
        leaders = per_gene_total.sort_values(ascending=False).head(top_n)

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(x=leaders.index, y=leaders.values, ax=ax)
        ax.set_xlabel('Gene')
        ax.set_ylabel('Total Variants')
        ax.set_title('Distribution of Variants Across Top Genes')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        return fig

    def plot_temporal_profiles_all_doses(self):
        """Plot W1-W3 profiles of the five top-scoring genes, one panel per dose A-E.

        Returns the matplotlib figure (2x3 grid with the sixth panel removed).
        """
        week_cols = ['W1_score', 'W2_score', 'W3_score']
        per_gene = self.temporal_data.groupby('gene')[week_cols].sum()
        top_genes = per_gene.sum(axis=1).nlargest(5).index

        fig, axes = plt.subplots(2, 3, figsize=(20, 12))
        axes = axes.flatten()

        # Same colour per gene in every panel
        colors = plt.cm.Set2(np.linspace(0, 1, len(top_genes)))

        for idx, dose in enumerate(['A', 'B', 'C', 'D', 'E']):
            ax = axes[idx]

            for gene, color in zip(top_genes, colors):
                is_gene = self.temporal_data['gene'] == gene
                is_dose = self.temporal_data['dose'] == dose
                match = self.temporal_data[is_gene & is_dose]
                if len(match) > 0:
                    scores = [match[col].iloc[0] for col in week_cols]
                    ax.plot(['W1', 'W2', 'W3'], scores,
                            marker='o', label=gene, color=color)

            ax.set_xlabel('Time Point')
            ax.set_ylabel('Score')
            ax.set_title(f'Temporal Profiles (Dose {dose})')
            # Legend only on the first panel
            if idx == 0:
                ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            ax.grid(True, alpha=0.3)

        # Five doses on a six-panel grid: drop the unused last panel
        axes[-1].remove()

        plt.suptitle('Temporal Profiles Across All Doses', fontsize=16, y=1.02)
        plt.tight_layout()
        return fig

    def plot_gene_comparison(self, genes_of_interest, dose='A'):
        """Plot W1-W3 scores of the given gene(s) at one dose; returns the figure.

        ``genes_of_interest`` may be a list or a single value; genes without a
        row for ``dose`` are left out of the plot.
        """
        if not isinstance(genes_of_interest, list):
            genes_of_interest = [genes_of_interest]

        wanted = self.temporal_data['gene'].isin(genes_of_interest)
        at_dose = self.temporal_data['dose'] == dose
        data = self.temporal_data[wanted & at_dose]

        fig, ax = plt.subplots(figsize=(10, 6))

        for gene in genes_of_interest:
            match = data[data['gene'] == gene]
            if len(match) > 0:
                values = [match[col].iloc[0]
                          for col in ('W1_score', 'W2_score', 'W3_score')]
                ax.plot(['W1', 'W2', 'W3'], values,
                        marker='o', linewidth=2, label=gene)

        ax.set_xlabel('Time Point')
        ax.set_ylabel('Score')
        ax.set_title(f'Gene Comparison (Dose {dose})')
        ax.legend()
        plt.grid(True)
        plt.tight_layout()
        return fig

    def analyze_and_plot(self, output_dir='visualization_results'):
        """Create every standard plot and save each as a 300-dpi PNG in ``output_dir``."""
        import os
        os.makedirs(output_dir, exist_ok=True)

        # All figures are built first, then saved and closed one by one
        plots = {'temporal_heatmap.png': self.plot_temporal_heatmap()}
        for timepoint in ('W1', 'W2', 'W3'):
            plots[f'dose_response_{timepoint}.png'] = self.plot_dose_response_curves(timepoint)
        plots['variant_distribution.png'] = self.plot_variant_distribution()
        plots['temporal_profiles_all_doses.png'] = self.plot_temporal_profiles_all_doses()

        for filename, fig in plots.items():
            target = os.path.join(output_dir, filename)
            fig.savefig(target, dpi=300, bbox_inches='tight')
            plt.close(fig)

def main():
    """Load the gene_analysis CSVs and render every standard plot."""
    temporal_csv = 'gene_analysis/temporal_patterns.csv'
    dose_csv = 'gene_analysis/dose_patterns.csv'
    variant_csv = 'gene_analysis/variant_counts.csv'

    visualizer = DoseTimeVisualizer()
    visualizer.load_data(temporal_csv, dose_csv, variant_csv)

    # Plots go to the method's default output directory
    visualizer.analyze_and_plot()

    print("Visualization complete! Check the 'visualization_results' directory.")

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

class DosePatternAnalyzer:
    """Find and summarise genes whose score stands out at particular doses.

    Works on the temporal-patterns table (one row per gene and dose with
    ``W1_score``/``W2_score``/``W3_score`` columns).
    """

    # Threshold used when a table carries no 'significant' column.
    DEFAULT_ZSCORE_THRESHOLD = 1.5

    def __init__(self):
        self.temporal_data = None
        plt.style.use('seaborn-v0_8')

    def load_data(self, temporal_file):
        """Load temporal patterns data from a CSV file."""
        self.temporal_data = pd.read_csv(temporal_file)

    def _significant_rows(self, dose_specific):
        """Return the rows flagged significant, or those with |z| above the default."""
        if 'significant' in dose_specific.columns:
            return dose_specific[dose_specific['significant'].astype(bool)]
        return dose_specific[
            dose_specific['zscore'].abs() > self.DEFAULT_ZSCORE_THRESHOLD
        ]

    def identify_dose_specific_genes(self, zscore_threshold=1.5):
        """Z-score every gene's scores across doses, separately per timepoint.

        Every (gene, dose, timepoint) combination is returned, not only the
        outliers; the ``significant`` column marks rows whose absolute
        z-score exceeds ``zscore_threshold``.

        Args:
            zscore_threshold: cut-off on |z| used for the ``significant`` flag.

        Returns:
            DataFrame with columns gene, dose, timepoint, score, zscore,
            mean_score, std_score and significant. Genes with fewer than two
            doses are skipped. The columns exist even if no row is produced.
        """
        columns = ['gene', 'dose', 'timepoint', 'score', 'zscore',
                   'mean_score', 'std_score', 'significant']
        records = []

        # groupby walks the table once instead of re-filtering it per gene.
        for gene, gene_rows in self.temporal_data.groupby('gene', sort=False):
            # First row per dose, doses in sorted order for consistency
            per_dose = gene_rows.drop_duplicates('dose').sort_values('dose')
            if len(per_dose) < 2:
                continue
            doses = per_dose['dose'].tolist()

            for timepoint in ['W1', 'W2', 'W3']:
                scores = per_dose[f'{timepoint}_score'].to_numpy()
                mean_score = np.mean(scores)
                std_score = np.std(scores)

                if std_score > 0:
                    zscores = stats.zscore(scores)
                else:
                    # Identical scores at every dose: no deviation, so report
                    # 0 instead of the NaN a division by zero would give.
                    zscores = np.zeros(len(scores))

                for dose, score, zscore in zip(doses, scores, zscores):
                    records.append({
                        'gene': gene,
                        'dose': dose,
                        'timepoint': timepoint,
                        'score': score,
                        'zscore': zscore,
                        'mean_score': mean_score,
                        'std_score': std_score,
                        'significant': bool(abs(zscore) > zscore_threshold)
                    })

        return pd.DataFrame(records, columns=columns)

    def plot_dose_patterns(self, dose_specific):
        """Create comprehensive visualization of dose-specific patterns.

        Panels: z-score heatmap of the 30 most variable genes, z-score box
        plot per dose, and a count of significant rows per timepoint and dose.

        Returns:
            The matplotlib figure, or None when ``dose_specific`` is empty.
        """
        if len(dose_specific) == 0:
            print("No dose-specific patterns found!")
            return None

        fig = plt.figure(figsize=(20, 15))
        gs = plt.GridSpec(2, 2)

        # 1. Heatmap of z-scores across dose/timepoint combinations
        ax1 = fig.add_subplot(gs[0, :])
        pivot_data = pd.pivot_table(
            dose_specific,
            values='zscore',
            index='gene',
            columns=['dose', 'timepoint'],
            aggfunc='first',
            fill_value=0
        )

        # Sort genes by overall variation and keep the top 30 for readability
        gene_var = pivot_data.std(axis=1)
        pivot_data = pivot_data.loc[gene_var.sort_values(ascending=False).index]
        sns.heatmap(pivot_data.head(30), cmap='RdBu_r', center=0,
                    cbar_kws={'label': 'Z-score'}, ax=ax1)
        ax1.set_title('Top 30 Genes with Dose-specific Patterns')

        # 2. Distribution of z-scores by dose
        ax2 = fig.add_subplot(gs[1, 0])
        sns.boxplot(data=dose_specific, x='dose', y='zscore', ax=ax2)
        ax2.set_title('Distribution of Z-scores by Dose')

        # 3. Count of significant patterns by timepoint and dose
        ax3 = fig.add_subplot(gs[1, 1])
        sig_patterns = self._significant_rows(dose_specific)
        # fill_value keeps the counts integral; NaN for absent combinations
        # would turn them into floats, which the 'd' annotation format rejects.
        pattern_counts = sig_patterns.groupby(
            ['timepoint', 'dose']
        ).size().unstack(fill_value=0)
        if pattern_counts.empty:
            # A heatmap cannot be drawn from an empty table.
            ax3.text(0.5, 0.5, 'No significant patterns',
                     ha='center', va='center', transform=ax3.transAxes)
        else:
            sns.heatmap(pattern_counts, annot=True, fmt='d', cmap='YlOrRd', ax=ax3)
        ax3.set_title('Number of Significant Patterns')

        plt.tight_layout()
        return fig

    def analyze_dose_response(self, dose_specific):
        """Group genes by the shape of their score across sorted doses.

        For each gene and timepoint the scores are classified as strictly
        'increasing', strictly 'decreasing' or 'complex'. A gene is listed at
        most once per group, even if several timepoints share the same shape.

        Returns:
            defaultdict(list) mapping pattern name to gene names.
        """
        # Imported here so the method does not rely on an import made elsewhere.
        from collections import defaultdict

        pattern_groups = defaultdict(list)
        if len(dose_specific) == 0:
            return pattern_groups

        for gene, gene_data in dose_specific.groupby('gene', sort=False):
            labels_seen = set()

            for timepoint in ['W1', 'W2', 'W3']:
                timepoint_data = gene_data[gene_data['timepoint'] == timepoint]
                scores = timepoint_data.sort_values('dose')['score'].values
                if len(scores) <= 1:
                    continue

                steps = np.diff(scores)
                if np.all(steps > 0):
                    label = 'increasing'
                elif np.all(steps < 0):
                    label = 'decreasing'
                else:
                    label = 'complex'

                if label not in labels_seen:
                    labels_seen.add(label)
                    pattern_groups[label].append(gene)

        return pattern_groups

    def generate_detailed_summary(self, dose_specific, pattern_groups):
        """Generate detailed summary of dose-specific patterns.

        The "significant" figures count only rows flagged significant (or with
        |z| above the default threshold when no flag column is present).

        Returns:
            The summary as a single newline-joined string.
        """
        summary = []
        significant = self._significant_rows(dose_specific)

        # Overall statistics
        summary.append("=== Overall Statistics ===")
        summary.append(f"Total genes analyzed: {len(dose_specific['gene'].unique())}")
        summary.append(f"Total significant patterns: {len(significant)}")
        summary.append(f"Patterns by dose: {significant.groupby('dose').size().to_dict()}")
        summary.append(f"Patterns by timepoint: {significant.groupby('timepoint').size().to_dict()}")

        # Pattern groups
        summary.append("\n=== Dose Response Patterns ===")
        for pattern, genes in pattern_groups.items():
            summary.append(f"\n{pattern.title()} pattern:")
            summary.append(f"Number of genes: {len(genes)}")
            summary.append("Top genes: " + ", ".join(str(gene) for gene in genes[:10]))

        # Top responding genes: largest absolute z-score per gene
        summary.append("\n=== Top Responding Genes ===")
        top_genes = (
            dose_specific['zscore'].abs()
            .groupby(dose_specific['gene']).max()
            .sort_values(ascending=False)
            .head(10)
        )
        for gene, score in top_genes.items():
            summary.append(f"{gene}: max |z-score| = {score:.2f}")

        return '\n'.join(summary)

def main():
    """Run the dose-pattern analysis and write its figure and text summary.

    Reads ``gene_analysis/temporal_patterns.csv``; writes
    ``dose_patterns_analysis.png`` (when a figure is produced) and
    ``dose_patterns_summary.txt`` to the working directory.
    """
    from collections import defaultdict

    analyzer = DosePatternAnalyzer()
    analyzer.load_data('gene_analysis/temporal_patterns.csv')

    print("Identifying dose-specific patterns...")
    dose_specific = analyzer.identify_dose_specific_genes(zscore_threshold=1.5)

    print("Creating visualizations...")
    figure = analyzer.plot_dose_patterns(dose_specific)
    # No figure is returned when there is nothing to plot
    if figure is not None:
        figure.savefig('dose_patterns_analysis.png', bbox_inches='tight', dpi=300)

    print("Analyzing dose response patterns...")
    pattern_groups = analyzer.analyze_dose_response(dose_specific)

    print("Generating summary...")
    report = analyzer.generate_detailed_summary(dose_specific, pattern_groups)
    with open('dose_patterns_summary.txt', 'w') as handle:
        handle.write(report)

    for message in (
        "\nAnalysis complete! Check:",
        "- dose_patterns_analysis.png for visualizations",
        "- dose_patterns_summary.txt for detailed analysis",
    ):
        print(message)

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from collections import defaultdict
from statsmodels.stats.multitest import fdrcorrection

def adjust_pvalues(pvalues):
    """Return FDR-corrected p-values, or an empty list when none are given."""
    if not len(pvalues):
        return []
    # fdrcorrection yields a pair; only the second item (adjusted values) is used
    correction = fdrcorrection(pvalues)
    return correction[1]
    
class IntegratedEnrichmentAnalyzer:
    """KEGG pathway and GO term enrichment among temporally varying genes.

    ``integrated_data`` is the annotation table (columns read here:
    ``gene_symbol``, ``kegg_pathway_id``, ``kegg_pathway_name``,
    ``go_category``, ``go_id``, ``go_term``).  ``temporal_data`` is the score
    table (columns read here: ``gene``, ``W1_score``, ``W2_score``,
    ``W3_score``); the set of genes it contains is the background universe of
    every Fisher's exact test.
    """

    # GO namespaces that are tested and FDR-corrected independently.
    GO_CATEGORIES = ('BP', 'MF', 'CC')

    def __init__(self):
        """Create an analyzer with no data loaded and select the plot style."""
        self.integrated_data = None
        self.temporal_data = None
        plt.style.use('seaborn-v0_8')

    def load_data(self, integrated_file, temporal_file):
        """Load integrated KEGG-GO mapping and temporal data from CSV files."""
        self.integrated_data = pd.read_csv(integrated_file)
        self.temporal_data = pd.read_csv(temporal_file)
        print(f"Loaded {len(self.integrated_data)} integrated annotations")
        print(f"Loaded {len(self.temporal_data)} temporal data points")

    def get_significant_genes(self, zscore_threshold=1.5):
        """Return the set of genes whose W1/W2/W3 mean scores vary strongly.

        For each gene the three per-week mean scores (missing means count as
        0) are z-scored against each other; the gene is selected when the
        largest absolute z-score exceeds ``zscore_threshold``.  Genes whose
        three means are identical are never selected.

        NOTE: with only three values the population z-score cannot exceed
        sqrt(2) (about 1.414), so the default threshold of 1.5 selects no
        genes; callers need to pass a lower value to obtain any.
        """
        mean_scores = (
            self.temporal_data
            .groupby('gene')[['W1_score', 'W2_score', 'W3_score']]
            .mean()
            .fillna(0)
        )

        sig_genes = set()
        for gene, scores in zip(mean_scores.index, mean_scores.to_numpy()):
            # A flat profile has zero spread; its z-score is undefined.
            if not np.std(scores) > 0:
                continue
            if np.max(np.abs(stats.zscore(scores))) > zscore_threshold:
                sig_genes.add(gene)

        return sig_genes

    @staticmethod
    def _fisher_enrichment(sig_genes, member_genes, universe):
        """Fisher's exact test of ``sig_genes`` against one gene set.

        Both sets are first restricted to ``universe`` so that the four cells
        of the 2x2 table partition the background exactly and can never be
        negative.

        Returns:
            ``(n_overlap, n_members, odds_ratio, pvalue)`` where the counts
            refer to genes inside ``universe`` and the p-value is two-sided.
        """
        sig = set(sig_genes) & universe
        members = set(member_genes) & universe
        n_overlap = len(sig & members)
        n_sig_only = len(sig) - n_overlap
        n_members_only = len(members) - n_overlap
        n_neither = len(universe) - n_overlap - n_sig_only - n_members_only

        contingency = np.array([
            [n_overlap, n_sig_only],
            [n_members_only, n_neither]
        ])
        odds_ratio, pvalue = stats.fisher_exact(contingency)
        return n_overlap, len(members), odds_ratio, pvalue

    def analyze_kegg_enrichment(self, sig_genes):
        """Test every annotated KEGG pathway for enrichment of ``sig_genes``.

        Returns a DataFrame with one row per pathway (``pathway_id``,
        ``pathway_name``, ``genes_in_pathway``, ``total_pathway_genes``,
        ``pvalue``, ``odds_ratio``, ``adjusted_pvalue``) sorted by adjusted
        p-value, or an empty DataFrame when no pathway is annotated.
        ``total_pathway_genes`` counts all annotated genes of the pathway;
        the test itself only uses the ones present in the background.
        """
        kegg_data = self.integrated_data.dropna(subset=['kegg_pathway_id'])
        universe = set(self.temporal_data['gene'])

        # Name taken from the first annotation row of each pathway.
        pathway_names = (
            kegg_data.drop_duplicates('kegg_pathway_id')
            .set_index('kegg_pathway_id')['kegg_pathway_name']
        )

        results = []
        grouped = kegg_data.groupby('kegg_pathway_id', sort=False)['gene_symbol']
        for pathway_id, symbols in grouped:
            pathway_genes = set(symbols)
            n_overlap, _, odds_ratio, pvalue = self._fisher_enrichment(
                sig_genes, pathway_genes, universe
            )
            results.append({
                'pathway_id': pathway_id,
                'pathway_name': pathway_names.loc[pathway_id],
                'genes_in_pathway': n_overlap,
                'total_pathway_genes': len(pathway_genes),
                'pvalue': pvalue,
                'odds_ratio': odds_ratio
            })

        results_df = pd.DataFrame(results)
        if len(results_df) > 0:
            results_df['adjusted_pvalue'] = adjust_pvalues(results_df['pvalue'])
            results_df = results_df.sort_values('adjusted_pvalue')

        return results_df

    def analyze_go_enrichment(self, sig_genes, min_overlap=5, max_pvalue=0.05):
        """Test GO terms for enrichment of ``sig_genes``, per GO category.

        Args:
            sig_genes: set of gene identifiers to test.
            min_overlap: minimum number of ``sig_genes`` a term must contain
                to be tested at all (terms with no overlap are never tested).
            max_pvalue: terms are reported only when their raw p-value is
                below this value.

        Returns:
            DataFrame with ``go_id``, ``go_term``, ``category``,
            ``genes_in_term``, ``total_term_genes``, ``pvalue``,
            ``odds_ratio``, ``fold_enrichment`` and ``adjusted_pvalue``,
            sorted by adjusted p-value; empty when nothing qualifies.
            ``total_term_genes`` counts all annotated genes of the term,
            while the statistics use only genes present in the background.
        """
        universe = set(self.temporal_data['gene'])
        sig = set(sig_genes) & universe
        n_sig = len(sig)
        n_background = len(universe)

        tested = []
        for category in self.GO_CATEGORIES:
            category_data = self.integrated_data[
                self.integrated_data['go_category'] == category
            ]
            # Name taken from the first annotation row of each term.
            term_names = (
                category_data.drop_duplicates('go_id')
                .set_index('go_id')['go_term']
            )

            grouped = category_data.groupby('go_id', sort=False)['gene_symbol']
            for go_id, symbols in grouped:
                term_genes = set(symbols)
                n_overlap = len(sig & term_genes)
                # Size filter applied before testing; it does not look at
                # the p-value, unlike the significance cut further down.
                if n_overlap == 0 or n_overlap < min_overlap:
                    continue

                n_overlap, n_term, odds_ratio, pvalue = self._fisher_enrichment(
                    sig, term_genes, universe
                )
                tested.append({
                    'go_id': go_id,
                    'go_term': term_names.loc[go_id],
                    'category': category,
                    'genes_in_term': n_overlap,
                    'total_term_genes': len(term_genes),
                    'pvalue': pvalue,
                    'odds_ratio': odds_ratio,
                    'fold_enrichment': (n_overlap / n_sig) / (n_term / n_background)
                })

        results_df = pd.DataFrame(tested)
        if len(results_df) == 0:
            return results_df

        significant = results_df['pvalue'] < max_pvalue
        if not significant.any():
            return pd.DataFrame()

        # FDR is computed over every term that was tested in a category and
        # only afterwards are non-significant rows dropped; correcting an
        # already p-value-filtered list would understate the adjustment.
        for category in self.GO_CATEGORIES:
            mask = results_df['category'] == category
            if mask.any():
                results_df.loc[mask, 'adjusted_pvalue'] = fdrcorrection(
                    results_df.loc[mask, 'pvalue']
                )[1]

        results_df = results_df[significant].reset_index(drop=True)
        return results_df.sort_values('adjusted_pvalue')

    def calculate_enrichment(self, zscore_threshold=1.5):
        """Select significant genes and run both enrichment analyses.

        Returns ``(kegg_results, go_results)``; both are empty DataFrames
        when no gene passes ``zscore_threshold``.
        """
        # Get significant genes
        sig_genes = self.get_significant_genes(zscore_threshold)
        print(f"Found {len(sig_genes)} significant genes")

        if len(sig_genes) == 0:
            print("No significant genes found. Try lowering the zscore_threshold.")
            return pd.DataFrame(), pd.DataFrame()

        # Analyze KEGG pathway enrichment
        print("Analyzing KEGG pathway enrichment...")
        kegg_results = self.analyze_kegg_enrichment(sig_genes)
        print(f"Found {len(kegg_results)} enriched KEGG pathways")

        # Analyze GO term enrichment
        print("Analyzing GO term enrichment...")
        go_results = self.analyze_go_enrichment(sig_genes)
        print(f"Found {len(go_results)} enriched GO terms")

        return kegg_results, go_results

    def plot_enrichment_results(self, kegg_results, go_results, output_dir):
        """Plot whichever of the two result tables is non-empty."""
        if len(kegg_results) == 0 and len(go_results) == 0:
            print("No enrichment results to plot.")
            return

        # Plot KEGG results
        if len(kegg_results) > 0:
            self.plot_kegg_enrichment(kegg_results, output_dir)

        # Plot GO results
        if len(go_results) > 0:
            self.plot_go_enrichment(go_results, output_dir)

    def plot_kegg_enrichment(self, kegg_results, output_dir, top_n=20):
        """Save a bar chart of the ``top_n`` pathways by adjusted p-value.

        The figure is written to ``<output_dir>/kegg_enrichment.png``.
        """
        plt.figure(figsize=(15, 10))
        top_pathways = kegg_results.nsmallest(top_n, 'adjusted_pvalue')

        # Create bar plot
        y_pos = np.arange(len(top_pathways))
        plt.barh(y_pos, -np.log10(top_pathways['adjusted_pvalue']))

        # Tick labels keep only the text before the first ' - ' of each name.
        short_names = [name.split(' - ')[0] for name in top_pathways['pathway_name']]
        plt.yticks(y_pos, short_names, fontsize=10)
        plt.xlabel('-log10(Adjusted p-value)')
        plt.title('Top Enriched KEGG Pathways')

        # Add gene counts
        for bar_pos, n_genes in enumerate(top_pathways['genes_in_pathway']):
            plt.text(0.1, bar_pos, f'n={n_genes}', fontsize=8, va='center')

        plt.tight_layout()
        plt.savefig(f"{output_dir}/kegg_enrichment.png",
                    dpi=300, bbox_inches='tight')
        plt.close()

    def plot_go_enrichment(self, go_results, output_dir, top_n=10):
        """Save one bar-chart panel per GO category with its top terms.

        Categories without results leave their panel empty.  The figure is
        written to ``<output_dir>/go_enrichment.png``.
        """
        categories = self.GO_CATEGORIES

        fig, axes = plt.subplots(len(categories), 1,
                                 figsize=(15, 6 * len(categories)))

        for ax, category in zip(axes, categories):
            cat_results = go_results[go_results['category'] == category]
            if len(cat_results) == 0:
                continue

            # Get top terms
            top_terms = cat_results.nsmallest(top_n, 'adjusted_pvalue')

            # Create bar plot with -log10 transformed p-values
            y_pos = np.arange(len(top_terms))
            ax.barh(y_pos, -np.log10(top_terms['adjusted_pvalue']))

            # Customize plot
            ax.set_yticks(y_pos)
            ax.set_yticklabels(top_terms['go_term'], fontsize=8)
            ax.set_xlabel('-log10(Adjusted p-value)')
            ax.set_title(f'Top Enriched {category} Terms')

            # Add gene counts and fold enrichment next to each bar
            for bar_pos, (_, row) in enumerate(top_terms.iterrows()):
                ax.text(0.1, bar_pos,
                        f"n={row['genes_in_term']} "
                        f"(FE={row['fold_enrichment']:.1f})",
                        fontsize=8,
                        va='center')

        plt.tight_layout()
        plt.savefig(f"{output_dir}/go_enrichment.png",
                    dpi=300, bbox_inches='tight')
        plt.close()

def main():
    """Run the integrated enrichment workflow and store plots and tables.

    Everything is written below the 'enrichment_visualization' directory;
    a result table is saved only when it contains rows.
    """
    import os

    # Destination for figures and CSV tables
    output_dir = "enrichment_visualization"
    os.makedirs(output_dir, exist_ok=True)

    # Analyzer with both input tables loaded
    analyzer = IntegratedEnrichmentAnalyzer()
    analyzer.load_data(
        'kegg_go_integrated.csv',
        'gene_analysis/temporal_patterns.csv'
    )

    # Enrichment analysis followed by the figures
    kegg_results, go_results = analyzer.calculate_enrichment(zscore_threshold=1.0)
    analyzer.plot_enrichment_results(kegg_results, go_results, output_dir)

    # Numerical results, skipping empty tables
    tables = (
        (kegg_results, f"{output_dir}/kegg_enrichment_results.csv"),
        (go_results, f"{output_dir}/go_enrichment_results.csv"),
    )
    for table, csv_path in tables:
        if len(table) > 0:
            table.to_csv(csv_path, index=False)

    print("\nAnalysis complete! Check the enrichment_visualization directory")


if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.multitest import fdrcorrection

class RadiationPathwayAnalyzer:
    """Enrichment tests restricted to a fixed list of radiation-related sets.

    ``integrated_data`` is the annotation table (columns read here:
    ``gene_symbol``, ``kegg_pathway_id``, ``go_id``) and ``temporal_data``
    supplies the background universe through its ``gene`` column.
    """

    def __init__(self):
        """Define the KEGG pathways and GO terms of interest."""
        # Populated by load_data().
        self.integrated_data = None
        self.temporal_data = None

        # Define radiation-relevant KEGG pathways
        self.radiation_pathways = {
            'hsa03410': 'Base excision repair',
            'hsa03420': 'Nucleotide excision repair',
            'hsa03430': 'Mismatch repair',
            'hsa03440': 'Homologous recombination',
            'hsa03450': 'Non-homologous end-joining',
            'hsa03460': 'Fanconi anemia pathway',
            'hsa04115': 'p53 signaling pathway',
            'hsa04110': 'Cell cycle',
            'hsa04210': 'Apoptosis',
            'hsa04630': 'JAK-STAT signaling pathway',
            'hsa04066': 'HIF-1 signaling pathway',
            'hsa04064': 'NF-kappa B signaling pathway',
            'hsa04010': 'MAPK signaling pathway',
            'hsa04151': 'PI3K-Akt signaling pathway',
            'hsa04218': 'Cellular senescence',
            'hsa04216': 'Ferroptosis',
            'hsa04915': 'Estrogen signaling pathway'
        }

        # Define radiation-relevant GO terms
        self.radiation_go_terms = {
            # DNA Damage Response
            'GO:0006974': 'cellular response to DNA damage stimulus',
            'GO:0000077': 'DNA damage checkpoint',
            'GO:0006302': 'double-strand break repair',
            'GO:0006281': 'DNA repair',
            # Oxidative Stress
            'GO:0006979': 'response to oxidative stress',
            'GO:0000302': 'response to reactive oxygen species',
            # Cell Death
            'GO:0012501': 'programmed cell death',
            'GO:0097193': 'intrinsic apoptotic signaling pathway',
            # Cell Cycle
            'GO:0007049': 'cell cycle',
            'GO:0000075': 'cell cycle checkpoint',
            # Stress Response
            'GO:0033554': 'cellular response to stress',
            'GO:0080135': 'regulation of cellular response to stress'
        }

    def load_data(self, integrated_file, temporal_file):
        """Load integrated pathway and temporal data from CSV files."""
        self.integrated_data = pd.read_csv(integrated_file)
        self.temporal_data = pd.read_csv(temporal_file)

    @staticmethod
    def _fisher_enrichment(sig_genes, member_genes, universe):
        """Fisher's exact test of ``sig_genes`` against one gene set.

        Both sets are first restricted to ``universe`` so that the four cells
        of the 2x2 table partition the background exactly.

        Returns:
            ``(n_overlap, n_members, odds_ratio, pvalue)`` where the counts
            refer to genes inside ``universe`` and the p-value is two-sided.
        """
        sig = set(sig_genes) & universe
        members = set(member_genes) & universe
        n_overlap = len(sig & members)
        n_sig_only = len(sig) - n_overlap
        n_members_only = len(members) - n_overlap
        n_neither = len(universe) - n_overlap - n_sig_only - n_members_only

        contingency = np.array([
            [n_overlap, n_sig_only],
            [n_members_only, n_neither]
        ])
        odds_ratio, pvalue = stats.fisher_exact(contingency)
        return n_overlap, len(members), odds_ratio, pvalue

    def _score_gene_sets(self, sig_genes, id_column, gene_sets,
                         id_key, name_key, hits_key, total_key):
        """Run the enrichment test for every entry of ``gene_sets``.

        Args:
            sig_genes: set of gene identifiers to test.
            id_column: column of ``integrated_data`` holding the set IDs.
            gene_sets: mapping of set ID to display name.
            id_key, name_key, hits_key, total_key: output column names for
                the ID, the name, the overlap count and the annotated size.

        Returns:
            DataFrame sorted by ``adjusted_pvalue``; empty when none of the
            sets has annotated genes.  Sets without any annotated gene are
            skipped.  The annotated size counts all annotated genes, while
            the test only uses those present in the background.
        """
        universe = set(self.temporal_data['gene'])
        annotations = self.integrated_data

        rows = []
        for set_id, set_name in gene_sets.items():
            members = set(
                annotations.loc[annotations[id_column] == set_id, 'gene_symbol']
            )
            if not members:
                continue

            n_overlap, _, odds_ratio, pvalue = self._fisher_enrichment(
                sig_genes, members, universe
            )
            rows.append({
                id_key: set_id,
                name_key: set_name,
                hits_key: n_overlap,
                total_key: len(members),
                'pvalue': pvalue,
                'odds_ratio': odds_ratio
            })

        results_df = pd.DataFrame(rows)
        if len(results_df) > 0:
            results_df['adjusted_pvalue'] = fdrcorrection(results_df['pvalue'])[1]
            results_df = results_df.sort_values('adjusted_pvalue')

        return results_df

    def analyze_radiation_pathways(self, sig_genes):
        """Analyze enrichment specifically for radiation-related pathways.

        Returns a DataFrame with ``pathway_id``, ``pathway_name``,
        ``genes_in_pathway``, ``total_pathway_genes``, ``pvalue``,
        ``odds_ratio`` and ``adjusted_pvalue``.
        """
        return self._score_gene_sets(
            sig_genes, 'kegg_pathway_id', self.radiation_pathways,
            'pathway_id', 'pathway_name',
            'genes_in_pathway', 'total_pathway_genes'
        )

    def analyze_radiation_go_terms(self, sig_genes):
        """Analyze enrichment specifically for radiation-related GO terms.

        Returns a DataFrame with ``go_id``, ``go_term``, ``genes_in_term``,
        ``total_term_genes``, ``pvalue``, ``odds_ratio`` and
        ``adjusted_pvalue``.
        """
        return self._score_gene_sets(
            sig_genes, 'go_id', self.radiation_go_terms,
            'go_id', 'go_term',
            'genes_in_term', 'total_term_genes'
        )

    @staticmethod
    def _save_barplot(results, label_column, count_column, title, out_path):
        """Save a horizontal -log10(adjusted p) bar chart of ``results``."""
        plt.figure(figsize=(12, 8))

        # Create bar plot
        y_pos = np.arange(len(results))
        plt.barh(y_pos, -np.log10(results['adjusted_pvalue']))

        # Customize plot
        plt.yticks(y_pos, results[label_column], fontsize=10)
        plt.xlabel('-log10(Adjusted p-value)')
        plt.title(title)

        # Add gene counts
        for bar_pos, n_genes in enumerate(results[count_column]):
            plt.text(0.1, bar_pos, f'n={n_genes}', fontsize=8)

        plt.tight_layout()
        plt.savefig(out_path, dpi=300, bbox_inches='tight')
        plt.close()

    def plot_radiation_enrichment(self, pathway_results, go_results, output_dir):
        """Create visualizations for radiation-specific enrichment.

        One PNG per non-empty result table is written to ``output_dir``.
        """
        # Plot radiation pathway enrichment
        if len(pathway_results) > 0:
            self._save_barplot(
                pathway_results, 'pathway_name', 'genes_in_pathway',
                'Radiation-Related KEGG Pathways',
                f"{output_dir}/radiation_kegg_enrichment.png"
            )

        # Plot radiation GO term enrichment
        if len(go_results) > 0:
            self._save_barplot(
                go_results, 'go_term', 'genes_in_term',
                'Radiation-Related GO Terms',
                f"{output_dir}/radiation_go_enrichment.png"
            )

def main():
    import os
    
    # Create output directory
    output_dir = "radiation_enrichment"
    os.makedirs(output_dir, exist_ok=True)
    
    # Initialize analyzer
    analyzer = RadiationPathwayAnalyzer()
    
    # Load data
    analyzer.load_data(
        'kegg_go_integrated.csv',
        'gene_analysis/temporal_patterns.csv'
    )
    
    # Get significant genes (using the same method as before)
    sig_genes = set(analyzer.temporal_data['gene'].unique())
    
    # Analyze radiation-specific pathways
    pathway_results = analyzer.analyze_radiation_pathways(sig_genes)
    go_results = analyzer.analyze_radiation_go_terms(sig_genes)
    
    # Create visualizations
    analyzer.plot_radiation_enrichment(pathway_results, go_results, output_dir)
    
    # Save results
    pathway_results.to_csv(f"{output_dir}/radiation_pathways.csv", index=False)
    go_results.to_csv(f"{output_dir}/radiation_go_terms.csv", index=False)
    
    print("Analysis complete! Check the radiation_enrichment directory")

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

class RadiationVariantAnalyzer:
    """Heatmaps of per-gene scores for keyword-selected radiation pathways.

    Columns read here: ``pathway_id``/``pathway_name``/``adjusted_pvalue``
    from ``kegg_results``, ``go_term`` from ``go_results``,
    ``gene``/``dose``/``W1_score``/``W2_score``/``W3_score`` from
    ``variant_data`` and ``kegg_pathway_id``/``gene_symbol`` from
    ``integrated_data``.
    """

    # Substrings (matched case-insensitively) that mark a pathway or GO term
    # name as radiation-related.
    RADIATION_KEYWORDS = (
        'repair', 'damage', 'cycle', 'apoptosis', 'p53',
        'stress', 'checkpoint', 'senescence', 'death',
        'oxidative', 'DNA', 'radiation'
    )

    # Exposure value shown on heatmap labels for each (dose, week) pair.
    EXPOSURE_MAP = {
        ('A', 'W1'): 0.168, ('A', 'W2'): 0.336, ('A', 'W3'): 0.504,
        ('B', 'W1'): 1.68, ('B', 'W2'): 3.36, ('B', 'W3'): 5.04,
        ('C', 'W1'): 16.8, ('C', 'W2'): 33.6, ('C', 'W3'): 50.4,
        ('D', 'W1'): 168, ('D', 'W2'): 336, ('D', 'W3'): 504,
        ('E', 'W1'): 336, ('E', 'W2'): 672, ('E', 'W3'): 1008
    }

    def __init__(self):
        """Create an analyzer with no data loaded."""
        self.kegg_results = None
        self.go_results = None
        self.variant_data = None
        self.integrated_data = None

    def load_data(self, kegg_file, go_file, variant_file, integrated_file):
        """Load all required data files (CSV)."""
        self.kegg_results = pd.read_csv(kegg_file)
        self.go_results = pd.read_csv(go_file)
        self.variant_data = pd.read_csv(variant_file)  # VEP analysis results
        self.integrated_data = pd.read_csv(integrated_file)

    def get_radiation_related_pathways(self):
        """Select KEGG pathways and GO terms whose name contains a keyword.

        Matching is case-insensitive on both sides, so the 'DNA' keyword
        matches names such as 'DNA replication'.  Missing or non-string
        names never match.

        Returns:
            ``(radiation_kegg, radiation_go)`` - the matching rows of
            ``kegg_results`` and ``go_results``.
        """
        keywords = tuple(keyword.lower() for keyword in self.RADIATION_KEYWORDS)

        def is_radiation_related(name):
            if not isinstance(name, str):
                return False
            lowered = name.lower()
            return any(keyword in lowered for keyword in keywords)

        # astype(bool) keeps the mask a boolean indexer even for empty tables.
        kegg_mask = self.kegg_results['pathway_name'].map(
            is_radiation_related).astype(bool)
        go_mask = self.go_results['go_term'].map(
            is_radiation_related).astype(bool)

        return self.kegg_results[kegg_mask], self.go_results[go_mask]

    def get_genes_for_pathway(self, pathway_id):
        """Get the set of gene symbols annotated to a KEGG pathway."""
        annotations = self.integrated_data
        in_pathway = annotations['kegg_pathway_id'] == pathway_id
        return set(annotations.loc[in_pathway, 'gene_symbol'])

    def create_variant_impact_matrix(self, genes):
        """Collect the score rows of ``genes`` in long format.

        Returns a DataFrame with columns ``gene``, ``dose``, ``W1_variants``,
        ``W2_variants`` and ``W3_variants`` (the ``W*_score`` columns
        renamed), in the row order of ``variant_data``; an empty DataFrame
        when none of the genes has rows.
        """
        rows = self.variant_data[self.variant_data['gene'].isin(genes)]
        rows = rows.dropna(subset=['gene'])
        if len(rows) == 0:
            return pd.DataFrame()

        selected = rows[['gene', 'dose', 'W1_score', 'W2_score', 'W3_score']]
        return selected.rename(columns={
            'W1_score': 'W1_variants',
            'W2_score': 'W2_variants',
            'W3_score': 'W3_variants'
        }).reset_index(drop=True)

    def _exposure_labels(self, columns):
        """Build 'dose-week / exposure' tick labels for pivot columns.

        ``columns`` holds ``(value_column, dose)`` pairs as produced by the
        pivot tables below; the week is the part of the value column name
        before the first underscore.
        """
        labels = []
        for value_column, dose in columns:
            week = value_column.split('_')[0]
            exposure = self.EXPOSURE_MAP[(dose, week)]
            labels.append(f"{dose}-{week}\n{exposure}mG")
        return labels

    @staticmethod
    def _pivot_by_dose(variant_matrix):
        """Pivot the long-format matrix to genes x (week value, dose)."""
        return pd.pivot_table(
            variant_matrix,
            values=['W1_variants', 'W2_variants', 'W3_variants'],
            index='gene',
            columns='dose'
        )

    def plot_pathway_variant_heatmap(self, pathway_id, pathway_name, output_dir):
        """Create heatmap showing variant impacts for genes in a pathway.

        Writes ``pathway_<id>_variants.png`` and ``.csv`` to ``output_dir``;
        nothing is written when no pathway gene has score rows.
        """
        genes = self.get_genes_for_pathway(pathway_id)
        variant_matrix = self.create_variant_impact_matrix(genes)

        if len(variant_matrix) == 0:
            return

        plt.figure(figsize=(15, len(genes) * 0.3 + 2))

        pivot_data = self._pivot_by_dose(variant_matrix)

        g = sns.heatmap(pivot_data, cmap='RdBu_r', center=0,
                        xticklabels=True, yticklabels=True)

        g.set_xticklabels(self._exposure_labels(pivot_data.columns),
                          rotation=45, ha='right')
        plt.title(f'Variant Impact Scores in {pathway_name}')
        plt.tight_layout()
        plt.savefig(f"{output_dir}/pathway_{pathway_id}_variants.png",
                    dpi=300, bbox_inches='tight')
        plt.close()

        variant_matrix.to_csv(f"{output_dir}/pathway_{pathway_id}_variants.csv",
                              index=False)

    def plot_radiation_variant_patterns(self):
        """Plot patterns of variants in radiation-related pathways.

        Writes a text summary, one heatmap per selected KEGG pathway and a
        clustered overview heatmap to 'radiation_variant_patterns'.
        """
        import os

        rad_kegg, rad_go = self.get_radiation_related_pathways()
        output_dir = "radiation_variant_patterns"
        os.makedirs(output_dir, exist_ok=True)

        # Create summary file
        with open(f"{output_dir}/radiation_pathway_summary.txt", 'w') as f:
            f.write("Radiation-Related Pathways Variant Analysis\n")
            f.write("=========================================\n\n")

            # KEGG Pathways
            f.write("KEGG Pathways:\n")
            for _, row in rad_kegg.iterrows():
                genes = self.get_genes_for_pathway(row['pathway_id'])
                # Sorted so the ten listed genes are the same on every run.
                listed = sorted(g for g in genes if isinstance(g, str))[:10]
                f.write(f"\n{row['pathway_name']}:\n")
                f.write(f"Total genes with variants: {len(genes)}\n")
                f.write(f"Pathway enrichment p-value: {row['adjusted_pvalue']:.2e}\n")
                f.write("Affected genes: " + ", ".join(listed) + "\n")

                # Create heatmap for this pathway
                self.plot_pathway_variant_heatmap(
                    row['pathway_id'],
                    row['pathway_name'],
                    output_dir
                )

        # Create overview heatmap of all radiation-related genes
        all_rad_genes = set()
        for pathway_id in rad_kegg['pathway_id']:
            all_rad_genes.update(self.get_genes_for_pathway(pathway_id))

        if len(all_rad_genes) == 0:
            return

        variant_matrix = self.create_variant_impact_matrix(all_rad_genes)
        # Without any score rows there is nothing to pivot or cluster.
        if len(variant_matrix) == 0:
            return

        pivot_data = self._pivot_by_dose(variant_matrix)

        g = sns.clustermap(pivot_data, cmap='RdBu_r', center=0,
                           xticklabels=True, yticklabels=True,
                           figsize=(15, len(all_rad_genes) * 0.2 + 2))

        # Labels must follow the column order chosen by the clustering.
        clustered_cols = g.dendrogram_col.reordered_ind
        column_order = [pivot_data.columns[i] for i in clustered_cols]

        g.ax_heatmap.set_xticklabels(self._exposure_labels(column_order),
                                     rotation=45, ha='right')
        plt.savefig(f"{output_dir}/all_radiation_genes_variants.png",
                    dpi=300, bbox_inches='tight')
        plt.close()

def main():
    """Load the enrichment and score tables and draw the variant heatmaps."""
    # Order matches load_data(kegg_file, go_file, variant_file, integrated_file)
    input_files = (
        'enrichment_visualization/kegg_enrichment_results.csv',
        'enrichment_visualization/go_enrichment_results.csv',
        'gene_analysis/temporal_patterns.csv',  # VEP analysis results
        'kegg_go_integrated.csv',
    )

    analyzer = RadiationVariantAnalyzer()
    analyzer.load_data(*input_files)

    # Create visualizations
    analyzer.plot_radiation_variant_patterns()

    print("Analysis complete! Check the radiation_variant_patterns directory")


if __name__ == "__main__":
    main()

In [None]:
def plot_top_radiation_genes(temporal_file, kegg_results_file, integrated_file, output_file, top_n=15):
   # Load data
   temporal_data = pd.read_csv(temporal_file)
   kegg_results = pd.read_csv(kegg_results_file)
   integrated_data = pd.read_csv(integrated_file)
   
   # Define radiation-related pathways
   radiation_keywords = ['repair', 'damage', 'cycle', 'apoptosis', 'p53', 
                       'stress', 'checkpoint', 'DNA', 'radiation']
   
   rad_kegg = kegg_results[
       kegg_results['pathway_name'].str.lower().apply(
           lambda x: any(keyword in x.lower() for keyword in radiation_keywords)
       )
   ]
   
   # Get genes from radiation pathways
   rad_genes = set()
   for pathway_id in rad_kegg['pathway_id']:
       pathway_genes = set(integrated_data[
           integrated_data['kegg_pathway_id'] == pathway_id
       ]['gene_symbol'])
       rad_genes.update(pathway_genes)
   
   # Get variant scores for these genes
   gene_data = temporal_data[temporal_data['gene'].isin(rad_genes)]
   
   # Calculate total impact per gene
   gene_scores = gene_data.groupby('gene')[['W1_score', 'W2_score', 'W3_score']].sum().sum(axis=1)
   top_genes = gene_scores.nlargest(top_n).index
   
   # Create matrix for top genes
   top_data = temporal_data[temporal_data['gene'].isin(top_genes)]
   
   # Create pivot table 
   pivot_data = pd.pivot_table(
       top_data,
       values=['W1_score', 'W2_score', 'W3_score'],
       index='gene',
       columns='dose'
   )
   
   # Define exposure mapping
   exposure_map = {
       ('A', 'W1'): 0.168, ('A', 'W2'): 0.336, ('A', 'W3'): 0.504,
       ('B', 'W1'): 1.68, ('B', 'W2'): 3.36, ('B', 'W3'): 5.04,
       ('C', 'W1'): 16.8, ('C', 'W2'): 33.6, ('C', 'W3'): 50.4,
       ('D', 'W1'): 168, ('D', 'W2'): 336, ('D', 'W3'): 504,
       ('E', 'W1'): 336, ('E', 'W2'): 672, ('E', 'W3'): 1008
   }

   # Plot clustered heatmap
   g = sns.clustermap(pivot_data, 
                     cmap='RdBu_r', 
                     center=0,
                     xticklabels=True, 
                     yticklabels=True,
                     figsize=(12, 8))

   # Get clustered column order
   clustered_cols = g.dendrogram_col.reordered_ind
   column_order = [pivot_data.columns[i] for i in clustered_cols]

   # Create exposure labels maintaining cluster order
   exposure_labels = []
   for col in column_order:
       dose = col[1]  # Get dose from multi-index
       week = col[0].split('_')[0]  # Extract week from column name
       exposure = exposure_map[(dose, week)]
       exposure_labels.append(f"{dose}-{week}\n{exposure}mG")

   # Update x-axis labels
   ax = g.ax_heatmap
   ax.set_xticklabels(exposure_labels, rotation=45, ha='right')
   g.ax_heatmap.collections[0].colorbar.set_label('Variant Impact Score')
   plt.savefig(output_file, dpi=300, bbox_inches='tight')
   plt.close()
   
   return top_data, rad_kegg[rad_kegg['pathway_name'].str.contains('|'.join(radiation_keywords))]

# Draw the heatmap of top radiation-pathway genes and keep both outputs
top_data, rad_pathways = plot_top_radiation_genes(
    temporal_file='gene_analysis/temporal_patterns.csv',
    kegg_results_file='enrichment_visualization/kegg_enrichment_results.csv',
    integrated_file='kegg_go_integrated.csv',
    output_file='radiation_genes_heatmap.png',
)

# Show the selected pathways with their adjusted p-values
print("\nRadiation-related pathways:")
pathway_overview = rad_pathways[['pathway_name', 'adjusted_pvalue']]
print(pathway_overview.to_string())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def _count_radiation_go_terms(df, keywords, categories, top_n):
    """Return the ``top_n`` keyword-matching GO terms per category, or None.

    For every category the rows whose ``go_term`` contains one of
    ``keywords`` are grouped by (go_id, go_term, go_category) and the number
    of distinct ``gene_symbol`` values is counted.  The per-category results
    are concatenated in the order given by ``categories``.
    """
    pattern = '|'.join(keywords)
    per_category = []
    for category in categories:
        category_terms = df[df['go_category'] == category]

        # case=False: several keywords contain capitals ('DNA damage',
        # 'DNA repair'), so the match has to ignore case on both sides.
        matches = category_terms[
            category_terms['go_term'].str.contains(pattern, case=False, na=False)
        ]

        counts = (
            matches.groupby(['go_id', 'go_term', 'go_category'])['gene_symbol']
            .nunique()
            .reset_index()
            # Stable sort keeps tied terms in their grouped (go_id) order.
            .sort_values('gene_symbol', ascending=False, kind='mergesort')
            .head(top_n)
        )
        if not counts.empty:
            per_category.append(counts)

    if not per_category:
        return None
    return pd.concat(per_category)


def _draw_go_category_bars(ax, terms, category, color):
    """Draw one category's gene counts as horizontal bars labelled ``n=...``."""
    bars = ax.barh(terms['go_term'], terms['gene_symbol'], color=color)
    ax.set_title(f'Top Enriched {category} Terms')

    # Anchor each count label to the end of its own bar instead of a fixed
    # x position, so it stays readable whatever the bar length is.
    for bar, count in zip(bars, terms['gene_symbol']):
        ax.annotate(
            f"n={int(count)}",
            xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
            xytext=(3, 0),
            textcoords='offset points',
            va='center',
            fontsize=8,
        )

    ax.grid(axis='x', linestyle='--', alpha=0.7)
    ax.set_xlabel('Number of Genes')


def plot_go_terms_from_integrated(input_file='kegg_go_integrated.csv',
                                  output_file='go_terms_from_integrated.png',
                                  top_n=10):
    """Plot the radiation-related GO terms with the most genes per category.

    Reads the integrated table, keeps rows whose ``go_term`` contains one of
    the radiation-related keywords (case-insensitively), counts the distinct
    ``gene_symbol`` values per GO term and draws one horizontal bar chart per
    GO category (BP, MF, CC).  The figure is saved to ``output_file`` and the
    plotted counts are printed.

    Args:
        input_file: CSV file providing the columns ``go_id``, ``go_term``,
            ``go_category`` and ``gene_symbol``.
        output_file: Path the figure is written to.
        top_n: Maximum number of terms kept per category.

    Returns:
        A DataFrame with the columns ``go_id``, ``go_term``, ``go_category``
        and ``gene_symbol`` (the latter holding the distinct-gene count),
        ordered BP, MF, CC and by descending count within each category;
        ``None`` if no term matched, in which case nothing is plotted.
    """
    df = pd.read_csv(input_file)

    # Radiation-related GO terms we're interested in
    radiation_keywords = [
        'DNA damage', 'DNA repair', 'response to radiation',
        'cell cycle', 'apoptotic', 'stress response',
        'oxidative stress', 'double-strand break',
        'checkpoint', 'senescence', 'telomere'
    ]
    category_colors = {
        'BP': 'royalblue',
        'MF': 'forestgreen',
        'CC': 'indianred',
    }
    categories = list(category_colors)

    top_terms = _count_radiation_go_terms(df, radiation_keywords, categories, top_n)
    if top_terms is None:
        print("No radiation-related GO terms found in the data")
        return None

    # One subplot per category; only this figure is created, so closing it
    # below leaves no stray figure open.
    fig, axes = plt.subplots(len(categories), 1, figsize=(12, 20))
    for ax, category in zip(axes, categories):
        category_terms = top_terms[top_terms['go_category'] == category]
        _draw_go_category_bars(ax, category_terms, category, category_colors[category])

    fig.tight_layout()
    fig.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Print the data
    print("\nGO Term Statistics:")
    for category in categories:
        print(f"\n{category} Terms:")
        category_data = top_terms[top_terms['go_category'] == category]
        print(category_data[['go_term', 'gene_symbol']].to_string())

    return top_terms

# Run the analysis; go_term_stats receives the returned table of top GO terms
# per category, or None when no radiation-related GO terms were found
go_term_stats = plot_go_terms_from_integrated()