In [6]:
import pandas as pd
import gffutils

In [7]:
# Filesystem location of the pre-built annotation database
# (the file is named for GENCODE v36; it is opened with gffutils below)
db_file = r"C:\Users\pg22\PycharmProjects\pythonProject\gencode_v36.db"

In [8]:
# Open the database file at `db_file` as a gffutils FeatureDB handle
db = gffutils.FeatureDB(db_file)

In [9]:
# Load the per-gene length table; the first CSV column becomes the row index
gene_lengths_df = pd.read_csv(
    'gene_lengths.csv',
    index_col=0,
)
# Preview the first rows
gene_lengths_df.head()

Unnamed: 0,gene_length_kb
ENSG00000223972.5,2.289
ENSG00000227232.5,1.351
ENSG00000278267.1,0.068
ENSG00000243485.5,1.247
ENSG00000284332.1,0.138


In [10]:
# Plain {index label -> gene_length_kb} mapping taken from the length table
gene_lengths = gene_lengths_df['gene_length_kb'].to_dict()

In [11]:
# Location of one sample's augmented STAR gene-count table (CSV)
rna_seq_file = r"C:\Users\pg22\OneDrive - King's College London\Documents\PhD Data\TCGA_TCIA\Genome\TCGA_70_Matched Transcrptome data\70 T data\1c091fe9-f2ca-478c-95be-b924f30c4c75\c93e41f3-3a23-4757-af8c-af11fab60b52.rna_seq.augmented_star_gene_counts.csv"
# Load it with the first column as the row index
rna_seq_data = pd.read_csv(
    rna_seq_file,
    index_col=0,
)
# Preview the first rows
rna_seq_data.head()

Unnamed: 0_level_0,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000000003.15,TSPAN6,protein_coding,5095,2519,2576,93.3645,30.8501,34.7426
ENSG00000000005.6,TNMD,protein_coding,30,17,14,1.6895,0.5582,0.6287
ENSG00000000419.13,DPM1,protein_coding,1463,734,729,100.7507,33.2907,37.4911
ENSG00000000457.14,SCYL3,protein_coding,413,406,401,4.9875,1.648,1.8559
ENSG00000000460.17,C1orf112,protein_coding,346,379,379,4.8174,1.5918,1.7926


In [12]:
# Attach gene_length_kb to every count row: the count table is matched on
# 'gene_id' and the length table on its index. The join type is pandas'
# default (inner), so only identifiers present in both tables are kept.
merged_data = rna_seq_data.merge(
    gene_lengths_df,
    left_on='gene_id',
    right_index=True,
)

In [13]:
# Calculate FPKM
# FPKM = fragments / (gene length in kb * library size in millions).
# The library size is the sum of the chosen count column over the merged rows.
total_mapped_reads = merged_data['stranded_first'].sum() / 1e6  # replace 'stranded_first' with 'stranded_second' if necessary
# Whole-column arithmetic replaces the row-by-row apply/lambda: same formula,
# evaluated in one vectorised pass. A zero denominator yields inf/NaN under
# NumPy division rules.
merged_data['fpkm'] = merged_data['stranded_first'] / (merged_data['gene_length_kb'] * total_mapped_reads)

In [14]:
# log2(FPKM + 1): the +1 offset maps an FPKM of zero to log2(1) = 0
import math
merged_data['log2_fpkm'] = merged_data['fpkm'].add(1).apply(math.log2)

In [15]:
# Calculate TPM
# TPM_i = (count_i / length_i) / sum_j(count_j / length_j) * 1e6, so the values
# of one sample add up to one million.
# The previous expression multiplied count/length by 1e6 / total_mapped_reads,
# which is FPKM * 1e6 rather than a per-million share of the length-normalised
# counts.
length_normalised = merged_data['stranded_first'] / merged_data['gene_length_kb']
merged_data['tpm'] = length_normalised / length_normalised.sum() * 1e6

In [16]:
# Output the result
# The count table was read with index_col=0, so gene_id is the row index, and
# the merge on left_on='gene_id' / right_index=True carries that index through.
# Writing with index=False would therefore drop the gene identifiers from the
# file; keep the index so each row stays traceable to its gene_id.
merged_data.to_csv('TCGA1_strandedfpkm.csv', index=True)

In [17]:
# Collapse rows that share a gene_name by summing their FPKM and TPM values
gene_fpkm = merged_data['fpkm'].groupby(merged_data['gene_name']).sum()
gene_tpm = merged_data['tpm'].groupby(merged_data['gene_name']).sum()

In [18]:
# Keep only genes whose summed value is strictly positive
gene_fpkm = gene_fpkm.loc[gene_fpkm.gt(0)]
gene_tpm = gene_tpm.loc[gene_tpm.gt(0)]

In [19]:
# Write each gene-level series to its own CSV file
for _series, _path in (
    (gene_fpkm, 'TCGA1_gene_fpkm.csv'),
    (gene_tpm, 'TCGA1_gene_tpm.csv'),
):
    _series.to_csv(_path)

In [16]:
# Total FPKM across every row of the merged table
total_fpkm = merged_data['fpkm'].sum()

# TPM from FPKM: each row's share of the total FPKM, scaled to one million
merged_data['tpm'] = merged_data['fpkm'] / total_fpkm * 1e6

In [ ]:
import os
import pandas as pd
import gffutils
import math
# Batch version of the single-sample workflow: for every count CSV directly
# inside `main_dir`, compute FPKM, log2(FPKM + 1) and TPM per row, then the
# per-gene_name sums, and write all three tables into a dedicated sub-folder.

# Main directory
main_dir = r"C:\Users\pg22\OneDrive - King's College London\Documents\PhD Data\TCGA_TCIA\Genome\TCGA_70_Matched Transcrptome data\70 T data"

# Create a new folder within the main directory
new_folder = os.path.join(main_dir, 'TCGA_70T_stranded_fpkm')
os.makedirs(new_folder, exist_ok=True)

# Path to the existing GTF database
db_file = r"C:\Users\pg22\PycharmProjects\pythonProject\gencode_v36.db"

# Load the existing database
# NOTE(review): neither `db` nor `gene_lengths` is read anywhere in this cell;
# both are kept only in case other cells rely on them.
db = gffutils.FeatureDB(db_file)
# Read the gene lengths from a CSV file
gene_lengths_df = pd.read_csv('gene_lengths.csv', index_col=0)

# Convert the DataFrame to a dictionary
gene_lengths = gene_lengths_df.to_dict()['gene_length_kb']

# Count column used for normalisation; switch to 'stranded_second' if necessary
count_column = 'stranded_first'

# File-name endings produced by this workflow. Such files are results, not
# count tables, and must never be picked up as inputs.
derived_suffixes = ('_strandedfpkm.csv', '_gene_fpkm.csv', '_gene_tpm.csv')

# Get a list of all CSV files in the main directory
# NOTE(review): os.scandir only lists direct children of main_dir, while the
# single-sample path used earlier in this notebook sits in a sub-folder;
# confirm the count CSVs are present at the top level.
csv_files = [
    f.path
    for f in os.scandir(main_dir)
    if f.is_file() and f.name.endswith('.csv') and not f.name.endswith(derived_suffixes)
]

# Loop over each CSV file
for csv_file in csv_files:
    # Read the RNA-seq data (first column becomes the index)
    rna_seq_data = pd.read_csv(csv_file, index_col=0)

    # Merge the gene lengths with the RNA-seq data (default inner join on gene_id)
    merged_data = pd.merge(rna_seq_data, gene_lengths_df, left_on='gene_id', right_index=True)

    # Length-normalised counts (fragments per kilobase), computed column-wise
    # instead of with a per-row apply/lambda
    counts_per_kb = merged_data[count_column] / merged_data['gene_length_kb']

    # Calculate FPKM: fragments per kilobase per million counted fragments
    total_mapped_reads = merged_data[count_column].sum() / 1e6
    merged_data['fpkm'] = counts_per_kb / total_mapped_reads

    # Calculate log2(FPKM + 1)
    merged_data['log2_fpkm'] = merged_data['fpkm'].add(1).apply(math.log2)

    # Calculate TPM: share of the summed length-normalised counts, per million.
    # (Multiplying by 1e6 / total_mapped_reads, as before, gives FPKM * 1e6.)
    merged_data['tpm'] = counts_per_kb / counts_per_kb.sum() * 1e6

    # Base name of the input file without its extension, shared by all outputs
    sample_stem = os.path.splitext(os.path.basename(csv_file))[0]

    # Output the result; gene_id is the row index after the merge, so the
    # index is written to keep the gene identifiers in the file
    output_file = os.path.join(new_folder, sample_stem + '_strandedfpkm.csv')
    merged_data.to_csv(output_file, index=True)

    # Calculate gene level fpkm and tpm
    gene_fpkm = merged_data.groupby('gene_name')['fpkm'].sum()
    gene_tpm = merged_data.groupby('gene_name')['tpm'].sum()

    # Remove genes with zero FPKM
    gene_fpkm = gene_fpkm[gene_fpkm > 0]
    gene_tpm = gene_tpm[gene_tpm > 0]

    # Output the result
    output_file_fpkm = os.path.join(new_folder, sample_stem + '_gene_fpkm.csv')
    output_file_tpm = os.path.join(new_folder, sample_stem + '_gene_tpm.csv')
    gene_fpkm.to_csv(output_file_fpkm)
    gene_tpm.to_csv(output_file_tpm)

In [ ]:
import os
import pandas as pd
import gffutils
import math

# Batch version of the single-sample workflow that writes its results next to
# each input file: for every count CSV directly inside `main_dir`, compute
# FPKM, log2(FPKM + 1) and TPM per row, then the per-gene_name sums.

# Main directory
main_dir = r"C:\Users\pg22\OneDrive - King's College London\Documents\PhD Data\TCGA_TCIA\Genome\TCGA_70_Matched Transcrptome data\70 T data"

# Path to the existing GTF database
db_file = r"C:\Users\pg22\PycharmProjects\pythonProject\gencode_v36.db"

# Load the existing database
# NOTE(review): neither `db` nor `gene_lengths` is read anywhere in this cell;
# both are kept only in case other cells rely on them.
db = gffutils.FeatureDB(db_file)

# Read the gene lengths from a CSV file
gene_lengths_df = pd.read_csv('gene_lengths.csv', index_col=0)

# Convert the DataFrame to a dictionary
gene_lengths = gene_lengths_df.to_dict()['gene_length_kb']

# Count column used for normalisation; switch to 'stranded_second' if necessary
count_column = 'stranded_first'

# File-name endings written by this cell. Because the outputs land in
# `main_dir` with a .csv extension, a second run would otherwise treat them as
# count tables and fail when merging on gene_id.
derived_suffixes = ('_strandedfpkm.csv', '_gene_fpkm.csv', '_gene_tpm.csv')

# Get a list of all input CSV files in the main directory
# NOTE(review): os.scandir only lists direct children of main_dir, while the
# single-sample path used earlier in this notebook sits in a sub-folder;
# confirm the count CSVs are present at the top level.
csv_files = [
    f.path
    for f in os.scandir(main_dir)
    if f.is_file() and f.name.endswith('.csv') and not f.name.endswith(derived_suffixes)
]

# Loop over each CSV file
for csv_file in csv_files:
    # Read the RNA-seq data (first column becomes the index)
    rna_seq_data = pd.read_csv(csv_file, index_col=0)

    # Merge the gene lengths with the RNA-seq data (default inner join on gene_id)
    merged_data = pd.merge(rna_seq_data, gene_lengths_df, left_on='gene_id', right_index=True)

    # Length-normalised counts (fragments per kilobase), computed column-wise
    # instead of with a per-row apply/lambda
    counts_per_kb = merged_data[count_column] / merged_data['gene_length_kb']

    # Calculate FPKM: fragments per kilobase per million counted fragments
    total_mapped_reads = merged_data[count_column].sum() / 1e6
    merged_data['fpkm'] = counts_per_kb / total_mapped_reads

    # Calculate log2(FPKM + 1)
    merged_data['log2_fpkm'] = merged_data['fpkm'].add(1).apply(math.log2)

    # Calculate TPM: share of the summed length-normalised counts, per million.
    # (Multiplying by 1e6 / total_mapped_reads, as before, gives FPKM * 1e6.)
    merged_data['tpm'] = counts_per_kb / counts_per_kb.sum() * 1e6

    # Input path without its extension; every output name is built from it
    output_stem = os.path.splitext(csv_file)[0]

    # Output the result; gene_id is the row index after the merge, so the
    # index is written to keep the gene identifiers in the file
    output_file = output_stem + '_strandedfpkm.csv'
    merged_data.to_csv(output_file, index=True)

    # Calculate gene level fpkm and tpm
    gene_fpkm = merged_data.groupby('gene_name')['fpkm'].sum()
    gene_tpm = merged_data.groupby('gene_name')['tpm'].sum()

    # Remove genes with zero FPKM
    gene_fpkm = gene_fpkm[gene_fpkm > 0]
    gene_tpm = gene_tpm[gene_tpm > 0]

    # Output the result
    output_file_fpkm = output_stem + '_gene_fpkm.csv'
    output_file_tpm = output_stem + '_gene_tpm.csv'
    gene_fpkm.to_csv(output_file_fpkm)
    gene_tpm.to_csv(output_file_tpm)