### RefSeq genes preparation for metaplots generation

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd

In [None]:
# Names for the twelve columns of the loaded table; they are assigned
# positionally, so the order here must match the column order of the file.
BED_col_names = (
    'chrom chromStart chromEnd name '
    'score strand thickStart thickEnd '
    'itemRgb blockCount blockSizes blockStarts'
).split()

In [None]:
# Directory the input BED file is read from and the filtered BED file is written to.
# NOTE(review): this looks like a placeholder path — set it to the real location before running.
SRC_DIR_PATH: Path = Path(r"/path/to/genomes/")

In [None]:
# Load the RefSeq annotation table: tab-separated, no header row
# (columns are numbered positionally until they are renamed).
genes = pd.read_csv(
    SRC_DIR_PATH / "UCSC_hg38_NCBI_RefSeq_UCSC.bed",
    sep='\t',
    header=None,
    decimal='.',
)

In [None]:
# Replace the positional column labels with the BED field names.
# The assignment is positional and fails if the number of names differs
# from the number of columns.
genes.columns = BED_col_names

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the loaded table.
genes.info()
# genes.head(10)

#### Step 1
Remove non-canonical chromosomes

In [None]:
# Chromosomes to keep: chr1-chr22 plus chrM, chrX and chrY.
_autosomes = ['chr' + str(number) for number in range(1, 23)]
chromosomes = _autosomes[:9] + ['chrM', 'chrX', 'chrY'] + _autosomes[9:]

In [None]:
# Keep only the records located on one of the listed chromosomes.
on_listed_chrom = genes['chrom'].isin(chromosomes)
filter_1_genes = genes[on_listed_chrom]

In [None]:
# Summary of the table after the chromosome filter.
filter_1_genes.info()

In [None]:
# (rows, columns) after the chromosome filter.
filter_1_genes.shape

#### Step 2
Among genes on the same chromosome and strand that have identical start and end coordinates, keep only the first one.

In [None]:
# Records identical in chromosome, start, end and strand are collapsed to
# the first one encountered; a new frame is returned.
filter_2_genes = filter_1_genes.drop_duplicates(
    subset=['chrom', 'chromStart', 'chromEnd', 'strand'],
    keep='first',
)

In [None]:
# (rows, columns) after removing exact start/end duplicates.
filter_2_genes.shape

In [None]:
# Preview the first ten remaining records.
filter_2_genes.head(10)

In [None]:
# Save this step if needed (disabled by default).
# NOTE: the commented-out save after Step 4 uses the same file name, so
# enabling both would make the later one overwrite this output.
# filter_2_genes.to_csv(SRC_DIR_PATH.joinpath("UCSC_hg38_NCBI_RefSeq_UCSC_filtered.bed"), 
#                       header=None, sep='\t', decimal='.', index=False)

#### Intermediate step
Add column containing gene length. This will be used in the following filtering steps.

In [None]:
# Build an independent copy of the table with an extra geneLength column
# (chromEnd - chromStart); filter_2_genes itself is left untouched.
filter_2_genes_copy = filter_2_genes.assign(
    geneLength=lambda frame: frame['chromEnd'] - frame['chromStart']
)

In [None]:
# Preview the table with the new geneLength column.
filter_2_genes_copy.head()

#### Step 3
Find genes that are on the same strand and have the same start position. 
Filter out all of them but the longest one.

In [None]:
# Every record that shares (chrom, chromStart, strand) with at least one
# other record; keep=False flags all members of such a group.
shares_start = filter_2_genes_copy.duplicated(subset=['chrom', 'chromStart', 'strand'], keep=False)
tmp_subset_genes = filter_2_genes_copy[shares_start]

In [None]:
# Number of records involved in a shared-start group.
tmp_subset_genes.shape

In [None]:
# Preview the records that share a start position.
tmp_subset_genes.head()

In [None]:
# Chromosomes that actually occur among the shared-start records,
# in order of first appearance.
subset_chroms = tmp_subset_genes['chrom'].unique()

In [None]:
# For every (chrom, strand, chromStart) group of records sharing a start,
# collect the index label of the longest record.
# Order of idxs: chromosomes follow subset_chroms, '-' is handled before '+',
# and start positions follow their order of first appearance.
# Records whose strand is neither '-' nor '+' are not considered.
idxs = []
for chrom in subset_chroms:
    # Filter by chromosome and strand once, instead of re-masking the whole
    # table for every single position.
    chrom_genes = tmp_subset_genes[tmp_subset_genes['chrom'] == chrom]
    for strand in ('-', '+'):
        strand_genes = chrom_genes[chrom_genes['strand'] == strand]
        # sort=False keeps the groups in order of first appearance.
        for _, same_start_genes in strand_genes.groupby('chromStart', sort=False):
            # idxmax returns the first label when several records tie.
            idxs.append(same_start_genes['geneLength'].idxmax())


In [None]:
# Simple check: number of shared-start groups, i.e. one selected record per group.
len(idxs)

In [None]:
# The longest record of each shared-start group, selected by index label.
max_common_start_genes = tmp_subset_genes.loc[idxs]

In [None]:
# Preview the selected longest records.
max_common_start_genes.head()

In [None]:
# Records whose (chrom, chromStart, strand) combination occurs exactly once:
# keep=False discards every member of a duplicated group.
start_keys = ['chrom', 'chromStart', 'strand']
filter_4_genes = filter_2_genes_copy.drop_duplicates(subset=start_keys, keep=False)

In [None]:
# Table size before and after discarding the shared-start groups, one per line.
print(filter_2_genes_copy.shape, filter_4_genes.shape, sep='\n')

In [None]:
# Unique-start records followed by the longest record of each shared-start
# group; the selected records are appended after all rows of filter_4_genes.
filter_5_genes = pd.concat([filter_4_genes, max_common_start_genes])

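#### Step 4
Find genes that are on the same strand and have the same end position. 
Filter out all of them but the longest one.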

In [None]:
# Every record that shares (chrom, chromEnd, strand) with at least one
# other record; keep=False flags all members of such a group.
shares_end = filter_5_genes.duplicated(subset=['chrom', 'chromEnd', 'strand'], keep=False)
tmp_subset_genes_2 = filter_5_genes[shares_end]

In [None]:
# Number of records involved in a shared-end group.
tmp_subset_genes_2.shape

In [None]:
# Chromosomes that actually occur among the shared-end records,
# in order of first appearance.
subset_chroms_2 = tmp_subset_genes_2['chrom'].unique()

In [None]:
# For every (chrom, strand, chromEnd) group of records sharing an end,
# collect the index label of the longest record.
# Order of idxs_2: chromosomes follow subset_chroms_2, '-' is handled before
# '+', and end positions follow their order of first appearance.
# Records whose strand is neither '-' nor '+' are not considered.
idxs_2 = []
for chrom in subset_chroms_2:
    # Filter by chromosome and strand once, instead of re-masking the whole
    # table for every single position.
    chrom_genes = tmp_subset_genes_2[tmp_subset_genes_2['chrom'] == chrom]
    for strand in ('-', '+'):
        strand_genes = chrom_genes[chrom_genes['strand'] == strand]
        # sort=False keeps the groups in order of first appearance.
        for _, same_end_genes in strand_genes.groupby('chromEnd', sort=False):
            # idxmax returns the first label when several records tie.
            idxs_2.append(same_end_genes['geneLength'].idxmax())


In [None]:
# Simple check: number of shared-end groups, i.e. one selected record per group.
len(idxs_2)

In [None]:
# The longest record of each shared-end group, selected by index label.
max_common_end_genes = tmp_subset_genes_2.loc[idxs_2]

In [None]:
# Records whose (chrom, chromEnd, strand) combination occurs exactly once:
# keep=False discards every member of a duplicated group.
end_keys = ['chrom', 'chromEnd', 'strand']
filter_6_genes = filter_5_genes.drop_duplicates(subset=end_keys, keep=False)

In [None]:
# Unique-end records followed by the longest record of each shared-end
# group; the selected records are appended after all rows of filter_6_genes.
filter_7_genes = pd.concat([filter_6_genes, max_common_end_genes])

In [None]:
# (rows, columns) after the shared-start and shared-end filtering.
filter_7_genes.shape

In [None]:
# Row counts at three stages: after the chromosome filter (filter_1_genes),
# after removing exact start/end duplicates (filter_2_genes), and after the
# shared-start / shared-end filtering (filter_7_genes).
print('Initial gene count :', filter_1_genes.shape[0])
print('Gene count with no duplicates (start, end): ', filter_2_genes.shape[0])
print('Gene count max length only (start or end): ', filter_7_genes.shape[0])

In [None]:
# Drop the helper geneLength column again so only the BED columns remain.
final_genes = filter_7_genes.drop(columns=['geneLength'])

In [None]:
# Save this step if necessary (disabled by default).
# NOTE: the commented-out save after Step 2 uses the same file name, so
# enabling both would make this one overwrite the earlier output.
# final_genes.to_csv(SRC_DIR_PATH.joinpath("UCSC_hg38_NCBI_RefSeq_UCSC_filtered.bed"), 
#                       header=None, sep='\t', decimal='.', index=False)

In [None]:
# Preview filter_7_genes (this frame still carries the geneLength column).
filter_7_genes.head(5)

#### Step 5
Find genes that have exactly the same CDS coordinates (`thickStart` and `thickEnd`). Keep only the longest version of a gene.

In [None]:
# Every record that shares (chrom, thickStart, thickEnd, strand) with at
# least one other record; keep=False flags all members of such a group.
shares_cds = filter_7_genes.duplicated(subset=['chrom', 'thickStart', 'thickEnd', 'strand'], keep=False)
tmp_subset_genes_3 = filter_7_genes[shares_cds]

In [None]:
# Take the chromosome list from this subset itself (in order of first
# appearance) so that only chromosomes that actually have duplicated CDS
# records are visited; per the original note this avoids a KeyError.
subset_chroms_3 = tmp_subset_genes_3['chrom'].unique()

In [None]:
# For every group of records with identical CDS coordinates
# (chrom, strand, thickStart, thickEnd), collect the index label of the
# longest record.
# Groups are keyed on BOTH thickStart and thickEnd: two different CDS that
# merely share a thickStart must stay separate groups, otherwise only one
# record would be selected for both and the other CDS would be lost here.
# Order of idxs_3: chromosomes follow subset_chroms_3, '-' is handled before
# '+', and CDS groups follow their order of first appearance.
# Records whose strand is neither '-' nor '+' are not considered.
idxs_3 = []
for chrom in subset_chroms_3:
    # Filter by chromosome and strand once, instead of re-masking the whole
    # table for every single position.
    chrom_genes = tmp_subset_genes_3[tmp_subset_genes_3['chrom'] == chrom]
    for strand in ('-', '+'):
        strand_genes = chrom_genes[chrom_genes['strand'] == strand]
        # sort=False keeps the groups in order of first appearance.
        for _, same_cds_genes in strand_genes.groupby(['thickStart', 'thickEnd'], sort=False):
            # idxmax returns the first label when several records tie.
            idxs_3.append(same_cds_genes['geneLength'].idxmax())

In [None]:
# Number of records selected from the duplicated-CDS subset.
len(idxs_3)

In [None]:
# The records selected from the duplicated-CDS subset, looked up by index label.
max_common_cds_genes = tmp_subset_genes_3.loc[idxs_3]

In [None]:
# Records whose (chrom, thickStart, thickEnd, strand) combination occurs
# exactly once: keep=False discards every member of a duplicated group.
cds_keys = ['chrom', 'thickStart', 'thickEnd', 'strand']
filter_8_genes = filter_7_genes.drop_duplicates(subset=cds_keys, keep=False)

In [None]:
# Merge the two frames above: unique-CDS records followed by the records
# selected from the duplicated-CDS subset.
filter_9_genes = pd.concat([filter_8_genes, max_common_cds_genes])

In [None]:
# Row counts: before Step 5, unique-CDS records only, and after re-adding the selected records.
print(len(filter_7_genes), len(filter_8_genes), len(filter_9_genes))

In [None]:
# Sanity checks (disabled): when enabled they verify that filter_9_genes has
# no records duplicated on (chrom, chromStart, chromEnd, strand) and none
# duplicated on (chrom, thickStart, thickEnd, strand).
# assert len(filter_9_genes.loc[filter_9_genes.duplicated(subset=['chrom', 'chromStart', 'chromEnd', 'strand'], keep=False)]) == 0
# assert len(filter_9_genes.loc[filter_9_genes.duplicated(subset=['chrom', 'thickStart', 'thickEnd', 'strand'], keep=False)]) == 0

In [None]:
# (rows, columns) after the identical-CDS filtering.
filter_9_genes.shape

#### Find genes that share one of the CDS coordinates. Keep only the longest version of a gene.
Repeat the procedure used for `chromStart`/`chromEnd` in Steps 3 and 4, now on `thickStart` (Step 6) and `thickEnd` (Step 7).

#### Step 6
Genes on the same chromosome and strand that share the same `thickStart`: keep only the longest one.

In [None]:
# Every record that shares (chrom, strand, thickStart) with at least one
# other record; keep=False flags all members of such a group.
shares_thick_start = filter_9_genes.duplicated(subset=['chrom', 'strand', 'thickStart'], keep=False)
tmp_subset_genes_4 = filter_9_genes[shares_thick_start]

In [None]:
# Number of records involved in a shared-thickStart group.
tmp_subset_genes_4.shape

In [None]:
# Chromosomes that actually occur among the shared-thickStart records,
# in order of first appearance.
subset_chroms_4 = tmp_subset_genes_4['chrom'].unique()

In [None]:
# For every (chrom, strand, thickStart) group of records sharing a
# thickStart, collect the index label of the longest record.
# Order of idxs_4: chromosomes follow subset_chroms_4, '-' is handled before
# '+', and thickStart values follow their order of first appearance.
# Records whose strand is neither '-' nor '+' are not considered.
idxs_4 = []
for chrom in subset_chroms_4:
    # Filter by chromosome and strand once, instead of re-masking the whole
    # table for every single position.
    chrom_genes = tmp_subset_genes_4[tmp_subset_genes_4['chrom'] == chrom]
    for strand in ('-', '+'):
        strand_genes = chrom_genes[chrom_genes['strand'] == strand]
        # sort=False keeps the groups in order of first appearance.
        for _, same_thick_start_genes in strand_genes.groupby('thickStart', sort=False):
            # idxmax returns the first label when several records tie.
            idxs_4.append(same_thick_start_genes['geneLength'].idxmax())


In [None]:
# Number of shared-thickStart groups, i.e. one selected record per group.
len(idxs_4)

In [None]:
# The longest record of each shared-thickStart group, selected by index label.
max_common_thick_start_genes = tmp_subset_genes_4.loc[idxs_4]

In [None]:
# Records whose (chrom, strand, thickStart) combination occurs exactly once:
# keep=False discards every member of a duplicated group.
thick_start_keys = ['chrom', 'strand', 'thickStart']
filter_10_genes = filter_9_genes.drop_duplicates(subset=thick_start_keys, keep=False)

In [None]:
# Unique-thickStart records followed by the longest record of each
# shared-thickStart group.
filter_11_genes = pd.concat([filter_10_genes, max_common_thick_start_genes])

In [None]:
# (rows, columns) after the shared-thickStart filtering.
filter_11_genes.shape

#### Step 7
Genes on the same chromosome and strand that share the same `thickEnd`: keep only the longest one.

In [None]:
# Every record that shares (chrom, strand, thickEnd) with at least one
# other record; keep=False flags all members of such a group.
shares_thick_end = filter_11_genes.duplicated(subset=['chrom', 'strand', 'thickEnd'], keep=False)
tmp_subset_genes_5 = filter_11_genes[shares_thick_end]

In [None]:
# Number of records involved in a shared-thickEnd group.
tmp_subset_genes_5.shape

In [None]:
# Chromosomes that actually occur among the shared-thickEnd records,
# in order of first appearance.
subset_chroms_5 = tmp_subset_genes_5['chrom'].unique()

In [None]:
# For every (chrom, strand, thickEnd) group of records sharing a thickEnd,
# collect the index label of the longest record.
# Order of idxs_5: chromosomes follow subset_chroms_5, '-' is handled before
# '+', and thickEnd values follow their order of first appearance.
# Records whose strand is neither '-' nor '+' are not considered.
idxs_5 = []
for chrom in subset_chroms_5:
    # Filter by chromosome and strand once, instead of re-masking the whole
    # table for every single position.
    chrom_genes = tmp_subset_genes_5[tmp_subset_genes_5['chrom'] == chrom]
    for strand in ('-', '+'):
        strand_genes = chrom_genes[chrom_genes['strand'] == strand]
        # sort=False keeps the groups in order of first appearance.
        for _, same_thick_end_genes in strand_genes.groupby('thickEnd', sort=False):
            # idxmax returns the first label when several records tie.
            idxs_5.append(same_thick_end_genes['geneLength'].idxmax())


In [None]:
# Number of shared-thickEnd groups, i.e. one selected record per group.
len(idxs_5)

In [None]:
# The longest record of each shared-thickEnd group, selected by index label.
max_common_thick_end_genes = tmp_subset_genes_5.loc[idxs_5]

In [None]:
# Records whose (chrom, strand, thickEnd) combination occurs exactly once:
# keep=False discards every member of a duplicated group.
thick_end_keys = ['chrom', 'strand', 'thickEnd']
filter_12_genes = filter_11_genes.drop_duplicates(subset=thick_end_keys, keep=False)

In [None]:
# Unique-thickEnd records followed by the longest record of each
# shared-thickEnd group.
filter_13_genes = pd.concat([filter_12_genes, max_common_thick_end_genes])

In [None]:
# (rows, columns) of the fully filtered table.
filter_13_genes.shape

In [None]:
# Drop the helper geneLength column so only the BED columns are written out.
final_genes_2 = filter_13_genes.drop(columns=['geneLength'])

In [None]:
# Write the final table as a tab-separated file with no header row and no
# index column.
final_genes_2.to_csv(
    SRC_DIR_PATH / "UCSC_hg38_NCBI_RefSeq_UCSC_filtered_29k.bed",
    sep='\t',
    header=None,
    index=False,
    decimal='.',
)