# RNA-Seq Pipeline
## Step 2: Quality Control & Normalization

This notebook performs the following steps:
- Load the count matrix
- Run basic quality control
- Filter low-expression genes
- Normalize counts (TPM + DESeq2-style)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load GEO count matrix

In [None]:
# Point the path below at your own downloaded file.
# The file is parsed as tab-separated text; anything following a '!' on a line
# is dropped by the parser, and the first column is used as the row index.
count_matrix = pd.read_csv(
    filepath_or_buffer='../data/GSE183947_series_matrix.txt',
    index_col=0,
    comment='!',
    sep='\t',
)
count_matrix.head()

### Quality Control: Boxplot

In [None]:
# QC overview: one box per column of the matrix, drawn on log(1 + x) values
# so that large values do not flatten the rest of the distribution.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=np.log1p(count_matrix), ax=ax)
# Turn the x tick labels vertical so long column names stay readable.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title("Log Transformed Counts - QC Boxplot")
plt.show()

### Filter low-expression genes

In [None]:
# Keep only the rows whose total across all columns is strictly greater than 10.
filtered = count_matrix.loc[count_matrix.sum(axis="columns").gt(10)]
filtered.shape

### TPM Normalization

In [None]:
def tpm(df, gene_lengths=None):
    """Scale a genes-by-samples count matrix to transcripts per million (TPM).

    TPM is computed per sample (column) as::

        rpk_g = count_g / (length_g / 1000)
        tpm_g = rpk_g / sum(rpk) * 1e6

    The previous implementation divided each gene by its own total across
    samples instead of by its length, which is not TPM and erases
    between-gene differences.

    Parameters
    ----------
    df : pandas.DataFrame
        Count matrix with one row per gene and one column per sample.
    gene_lengths : pandas.Series, dict or array-like, optional
        Gene lengths in base pairs. A Series or dict is aligned to
        ``df.index`` by label; any other array-like must have one entry per
        row of ``df``, in row order. When omitted, no length correction is
        possible, so every gene is treated as having the same length and the
        result is counts per million (CPM); a ``UserWarning`` is emitted.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``df``; each column with a non-zero total
        sums to 1e6.

    Raises
    ------
    ValueError
        If ``gene_lengths`` lacks a length for some gene in ``df``, contains
        a non-positive length, or (array-like case) has the wrong number of
        entries.
    """
    if gene_lengths is None:
        import warnings

        warnings.warn(
            "tpm() called without gene_lengths: returning counts per million "
            "(no gene-length correction applied).",
            UserWarning,
            stacklevel=2,
        )
        rpk = df
    else:
        if isinstance(gene_lengths, (pd.Series, dict)):
            # Align by gene label so the caller's ordering does not matter.
            lengths = pd.Series(gene_lengths, dtype=float).reindex(df.index)
        else:
            # Positional input: must match the rows of df one-to-one.
            lengths = pd.Series(np.asarray(gene_lengths, dtype=float), index=df.index)
        if lengths.isna().any() or (lengths <= 0).any():
            raise ValueError(
                "gene_lengths must provide a positive length for every gene in df"
            )
        # Reads per kilobase: counts divided by gene length expressed in kb.
        rpk = df.div(lengths / 1e3, axis=0)

    # Per-sample scaling factor so that each column sums to one million.
    scaling = rpk.sum(axis=0) / 1e6
    return rpk.div(scaling, axis=1)

# Run the normalization on the filtered matrix and preview the first five rows.
tpm_matrix = filtered.pipe(tpm)
tpm_matrix.head(5)