# RNA-Seq Pipeline
## Step 3: Differential Expression Analysis

This notebook performs differential expression analysis between two conditions.

Inputs:
- Normalized count matrix
- A sample metadata table

Outputs (per gene):
- log2 fold change
- Adjusted p-values


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

### Load normalized counts (TPM or filtered counts)

In [None]:
# Read the normalized expression table; the first CSV column becomes the
# row index (iterated as gene identifiers further down).
tpm_matrix = pd.read_csv(
    '../results/tpm_normalized_matrix.csv',
    index_col=0,
)
# Preview the first five rows.
tpm_matrix.head(n=5)

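### Load metadata (manually created or downloaded)

The table must contain a `sample` column (names matching the count-matrix columns) and a `condition` column with the values `control` and `treatment`.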

In [None]:
# Read the sample annotation table; rows keep pandas' default integer index.
metadata = pd.read_csv(
    '../data/metadata.csv',
)
# Display the full table for a visual sanity check.
metadata

### Split samples by condition

In [None]:
# Sample names per condition: group1 = control, group2 = treatment.
# Each is a Series of `sample` values, used below to select matrix columns.
group1 = metadata.loc[metadata['condition'].eq('control'), 'sample']
group2 = metadata.loc[metadata['condition'].eq('treatment'), 'sample']

### Differential Expression

In [None]:
# Per-gene differential expression between the two sample groups.
#
# Inputs (from earlier cells): `tpm_matrix` (rows = genes, columns = samples)
# and `group1` / `group2` (control / treatment sample names).
# Output: `deg_df`, one row per gene, columns: gene, log2FC, pvalue, padj.
control_values = tpm_matrix.loc[:, list(group1)]
treatment_values = tpm_matrix.loc[:, list(group2)]

# log2 fold change of the group means (treatment over control). The +1
# pseudocount keeps the ratio finite when a group mean is zero.
log2_fc = np.log2(
    (treatment_values.mean(axis=1) + 1) / (control_values.mean(axis=1) + 1)
)

# Welch's t-test (unequal variances), computed for every gene at once along
# the sample axis instead of looping over genes in Python.
pvalues = ttest_ind(
    control_values.to_numpy(),
    treatment_values.to_numpy(),
    axis=1,
    equal_var=False,
).pvalue

deg_df = pd.DataFrame({
    'gene': tpm_matrix.index,
    'log2FC': log2_fc.to_numpy(),
    'pvalue': np.asarray(pvalues, dtype=float),
})

# FDR correction (Benjamini-Hochberg).
# The raw p * n / rank values are neither monotone in p nor bounded by 1, so
# they are additionally (a) made non-decreasing by taking a running minimum
# from the largest p-value downwards and (b) capped at 1.
# Genes whose test produced NaN (e.g. no variance in either group) are left
# as NaN and are not counted in the number of tests.
raw_p = deg_df['pvalue'].to_numpy(dtype=float)
padj = np.full(raw_p.shape, np.nan)
tested = ~np.isnan(raw_p)
n_tests = int(tested.sum())
if n_tests > 0:
    tested_p = raw_p[tested]
    order = np.argsort(tested_p)
    scaled = tested_p[order] * n_tests / np.arange(1, n_tests + 1)
    # Running minimum from the right enforces padj[i] <= padj[i + 1].
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]
    adjusted = np.empty(n_tests)
    adjusted[order] = np.minimum(monotone, 1.0)
    padj[tested] = adjusted
deg_df['padj'] = padj

deg_df.head()

### Save results

In [None]:
# Write the per-gene results table to CSV, omitting the integer row index.
deg_df.to_csv(
    '../results/differential_expression_results.csv',
    index=False,
)
# Display the full results table.
deg_df