In [23]:
import zipfile
import os

# Archive uploaded alongside the notebook, and the folder it is unpacked into
zip_path = 'assesment_dataset_converter.zip'
extract_path = 'assesment_dataset_converter/'

# Make sure the destination folder is present (no error if it already exists)
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive into the destination folder
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# Names of the entries now found in the destination folder
# (uncomment the last line to display them)
extracted_files = os.listdir(extract_path)
##extracted_files

In [24]:
import pandas as pd
from scipy.stats import ttest_ind

# Differential-expression screen: one Welch t-test per gene comparing the mock
# replicates with the sars_cov replicates, keeping the genes whose raw p-value
# falls below the cutoff and exporting them to CSV.

# Replicate columns of the two conditions being compared
MOCK_COLUMNS = ['mock_rep1', 'mock_rep2', 'mock_rep3']
SARS_COV_COLUMNS = ['sars_cov_rep1', 'sars_cov_rep2', 'sars_cov_rep3']

# Cutoff applied to the raw (unadjusted) p-values
P_VALUE_THRESHOLD = 0.05

# Load the dataset (tab-separated)
dataset_path = 'assesment_dataset_converter/assesment_dataset.tsv'
dataset = pd.read_csv(dataset_path, sep='\t')

# Keep rows with a non-zero value in at least one column after the first.
# The first column is skipped — presumably the gene identifier; TODO confirm.
# .copy() makes the result independent of `dataset` so a column can be added.
filtered_dataset = dataset[(dataset.iloc[:, 1:] != 0).any(axis=1)].copy()

# Display the filtered dataset
##filtered_dataset.head()

# Separate the conditions
mock_reps = filtered_dataset[MOCK_COLUMNS]
sars_cov_reps = filtered_dataset[SARS_COV_COLUMNS]

# Row-wise (axis=1) t-test without the equal-variance assumption
# (equal_var=False, i.e. Welch's test); one p-value per gene.
p_values = ttest_ind(mock_reps, sars_cov_reps, axis=1, equal_var=False).pvalue

# Add p-values to the dataset using .loc
filtered_dataset.loc[:, 'p_value'] = p_values

# Keep genes below the cutoff. A NaN p-value compares False here, so any row
# for which the test could not produce a p-value is dropped as well.
# NOTE(review): these p-values are not corrected for multiple testing; consider
# an FDR adjustment (e.g. Benjamini-Hochberg) before interpreting the list.
significant_genes = filtered_dataset[filtered_dataset['p_value'] < P_VALUE_THRESHOLD].copy()

# Display the significant genes
##significant_genes.head()

# Export the significant genes to a CSV file (row index omitted)
output_path = 'assesment_dataset_converter/significant_genes.csv'
significant_genes.to_csv(output_path, index=False)

# Display the path to the saved file
##output_path

In [25]:
##pip install gprofiler-official

In [26]:
#from gprofiler import GProfiler

# Initialize the gProfiler
##gp = GProfiler(return_dataframe=True)

# Extract the list of significant genes (Ensembl IDs)
##significant_genes_list = significant_genes['converted_alias'].tolist()

# Perform the enrichment analysis
##enrich_results = gp.profile(organism='hsapiens', query=significant_genes_list, sources=['GO:MF', 'GO:BP', 'KEGG', 'REAC'])

# Save the enrichment results to a CSV file
##enrich_results.to_csv('assesment_dataset_converter/enrichment_results.csv', index=False)

# Display the first few rows of the results
##enrich_results.head()


In [27]:
# Build the list of Ensembl IDs to paste into the gProfiler website.
# Only identifiers that start with "EN" are kept: missing values are dropped
# first, so neither NaN nor any other placeholder ends up as a line in the file.
alias_values = significant_genes['converted_alias'].dropna().astype(str)
ensembl_ids = alias_values[alias_values.str.startswith('EN')]

# Convert to a list
ensembl_ids_list = ensembl_ids.tolist()

# Display the list
##ensembl_ids_list

output_file_path = 'assesment_dataset_converter/ensembl_ids.txt'

# Write one identifier per line
with open(output_file_path, 'w') as file:
    file.writelines(f"{ensembl_id}\n" for ensembl_id in ensembl_ids_list)


In [28]:
# Map the significant genes from Ensembl IDs to Entrez IDs via the converter table.
#   Ensembl IDs -> 'converted_alias' (in significant_genes and in the converter)
#   Entrez IDs  -> 'initial_alias'   (in the converter)

# The converter file is read as comma-separated (with '"' as the quote
# character) even though it carries a .tsv extension.
converter_path = "assesment_dataset_converter/converter.tsv"
converter = pd.read_csv(converter_path, sep=',', quotechar='"')

# Save a tab-separated copy of the converter, then load that copy back.
# The path is defined once and used for both the write and the read.
converter_corrected_path = 'assesment_dataset_converter/converter_corrected.tsv'
converter.to_csv(converter_corrected_path, sep='\t', index=False)
converter_corrected = pd.read_csv(converter_corrected_path, sep='\t')

# Merge the significant genes with the converter to get Entrez IDs.
# Joining with on='converted_alias' keeps a single key column (passing whole
# Series as keys adds a 'key_0' column and '_x'/'_y' duplicates of the key).
# Rows without an Ensembl ID are dropped first because pandas matches null keys
# against each other, which would pair such genes with unrelated converter rows.
merged_df = significant_genes.dropna(subset=['converted_alias']).merge(
    converter_corrected, on='converted_alias'
)

# Extract Entrez IDs
entrez_ids = merged_df['initial_alias'].tolist()

# Display the list of Entrez IDs
##entrez_ids

output_file_path_entrez = 'assesment_dataset_converter/entrez_ids.txt'

# Write one identifier per line
with open(output_file_path_entrez, 'w') as file:
    file.writelines(f"{entrez_id}\n" for entrez_id in entrez_ids)


In [29]:
# Install bioservices if not already installed

##!pip install bioservices


In [30]:
# to map Entrez IDs to KEGG pathways:

##from bioservices import KEGG

# Create a KEGG service instance
##k = KEGG()

# Convert integers to strings using map() and list()
##string_entrez_ids = list(map(str, entrez_ids))
#print(string_entrez_ids)  

# Organism code (e.g., hsa for Homo sapiens)
##organism = "hsa"

# Fetch KEGG pathway mappings

# Open file for writing
##output_file = "pathways_output.txt"

##with open(output_file, 'w') as f:
##    for gene_id in string_entrez_ids:
##        try:
##            pathways = k.get_pathway_by_gene(gene_id, organism)
##            f.write(f"Entrez ID: {gene_id}\n")
##            for pathway in pathways:
##                f.write(f"{pathway}\n")
##        except Exception as e:
##            f.write(f"Error fetching pathway for gene {gene_id}: {e}\n")

##print(f"Results written to {output_file}")
