In [3]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import os, sys
import pandas as pd
import pyarrow

In [3]:
PATH = "/wynton/group/ahituv/fongsl/src/MPRAflow"
FILE = os.path.join(PATH, "null_mpra_barcodes_per_candidate-no_repeats-no_jackpots.feather")

In [26]:
df = pd.read_feather(FILE)

df.head()

Unnamed: 0,coord,n_barcodes
0,15-firstorder.63414|WTC11_seq51996_F|inactive,193
1,15-firstorder.2355935|HepG2_DNasePeakNoPromote...,149
2,15-firstorder.1412646|K562_peak57048|inactive,300
3,15-firstorder.192228|WTC11_seq41747_F|inactive,256
4,15-firstorder.1426742|HepG2_DNasePeakNoPromote...,305


In [27]:
df["n_barcodes"].sum()

10563139

In [28]:
df.describe()

Unnamed: 0,n_barcodes
count,56504.0
mean,186.944977
std,120.58553
min,0.0
25%,105.0
50%,165.0
75%,244.0
max,1124.0


In [29]:
len(df['coord'].unique())

56504

In [30]:
df[["null_id", "peak_id", "activity"]] = df["coord"].str.split('|', expand=True)

In [31]:
df.head()

Unnamed: 0,coord,n_barcodes,null_id,peak_id,activity
0,15-firstorder.63414|WTC11_seq51996_F|inactive,193,15-firstorder.63414,WTC11_seq51996_F,inactive
1,15-firstorder.2355935|HepG2_DNasePeakNoPromote...,149,15-firstorder.2355935,HepG2_DNasePeakNoPromoter50640,inactive
2,15-firstorder.1412646|K562_peak57048|inactive,300,15-firstorder.1412646,K562_peak57048,inactive
3,15-firstorder.192228|WTC11_seq41747_F|inactive,256,15-firstorder.192228,WTC11_seq41747_F,inactive
4,15-firstorder.1426742|HepG2_DNasePeakNoPromote...,305,15-firstorder.1426742,HepG2_DNasePeakNoPromoter11362,inactive


In [32]:
list(df)

['coord', 'n_barcodes', 'null_id', 'peak_id', 'activity']

In [33]:
df["n_barcodes"].astype(int)
df.dtypes

coord         object
n_barcodes     int64
null_id       object
peak_id       object
activity      object
dtype: object

In [34]:
df.loc[df["coord"].str.contains("C:SLEA"), "activity"] = "ctrl"

In [35]:
df.shape

(56504, 5)

In [36]:
df.groupby('activity')["n_barcodes"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
active,1188.0,1.439394,3.130769,0.0,1.0,1.0,1.0,73.0
ctrl,51.0,207.27451,145.698604,3.0,137.0,175.0,244.5,944.0
inactive,54013.0,190.834781,118.630436,0.0,109.0,168.0,246.0,1124.0


In [24]:
df.loc[df["activity"].astype(str).str.contains("V_COUPTF_Q6")]

Unnamed: 0,coord,n_barcodes,null_id,peak_id,activity


# investigate design

In [40]:
DESIGN = "/wynton/group/ahituv/fongsl/src/MPRAflow/work/e8/06701317043d44bccc7e92326b2e79/design_rmIllegalChars.fa"

In [41]:
v = 0
with open(DESIGN, "r") as reader:
    for value in SimpleFastaParser(reader):
        name, seq = value
        print(seq[:15], seq[-15:], name)
        v +=1
        if v> 100:
            break

AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter10535|active
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter10586|active
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter10714|inactive
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter10849|inactive
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter11263|active
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter11270|active
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter11362|inactive
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter12111|inactive
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter1269|active
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter13177|active
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter14822|active
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter15346|inactive
AGGACCGGATCAACT CATTGCGTGAACCGA endog|HepG2_DNasePeakNoPromoter16515|inactive
A

# no jackpots or repeats

In [19]:
NO_JACKPOTS = os.path.join(PATH, "null_mpra_barcodes_per_candidate-no_repeats.feather")

In [20]:
nj = pd.read_feather(NO_JACKPOTS)
nj.describe()

Unnamed: 0,n_barcodes
count,56504.0
mean,188.933686
std,121.437834
min,1.0
25%,106.0
50%,167.0
75%,246.0
max,1130.0


In [21]:
len(nj['coord'].unique())

56504

# pickle

In [30]:
PICKLE = pd.read_pickle(os.path.join(PATH, "null_mpra_coords_to_barcodes.pickle"))

In [31]:
p = pd.DataFrame(PICKLE.items())

In [32]:
p.columns =["bc", "n"]

In [33]:
p.head()

Unnamed: 0,bc,n
0,15-firstorder.63414|WTC11_seq51996_F|inactive,"[AAACATTAGTTTCTT, GAAGAGGGCCAAACC, GCAGTTGTGTA..."
1,15-firstorder.2355935|HepG2_DNasePeakNoPromote...,"[TTGGAATGACCAACA, TGACTGAGAAGGCTA, TACTTGGACAT..."
2,15-firstorder.1412646|K562_peak57048|inactive,"[TTTCTTTGGGATTAA, ATATTGCTCCCTGCG, TTAAGGAGATG..."
3,15-firstorder.192228|WTC11_seq41747_F|inactive,"[CTATGTTATTACAAA, GATCCTACATAGAGT, TACATTACGTG..."
4,15-firstorder.1426742|HepG2_DNasePeakNoPromote...,"[GAACTTGTCTGGCTA, CTATGAAAGAGATCC, CAAAACATAGT..."


In [34]:
len(set(p["bc"]))

56504

In [None]:
# there are 10358215 bc

# FASTP run 

In [2]:
FP="/wynton/group/ahituv/fongsl/projects/nullomers/data/20230815_nullomer_MPRA/assoc_fastp/null_mpra_coords_to_barcodes.pickle"

In [4]:
pickle = pd.read_pickle(FP)
p = pd.DataFrame(pickle.items())

In [5]:
p.shape

(1, 2)

In [6]:
p.head()

Unnamed: 0,0,1
0,endog|HepG2_DNasePeakNoPromoter10535|active,"[AAACATTAGTTTCTT, TTGGAATGACCAACA, TTTCTTTGGGA..."


In [3]:
a='TGATTCTTACTTCGTGCTCCAAGACGTCTGTTCTTTCCTGCTTTGTGACGCAGGCCTCCGTGCTGGCATTTGTGAAATGCAGATAACCTGTCAAACGTTGTCGACGGCCAGTACTCTGGTTGTAATCTCCTGAGGCCAGGAGCTGGGTCTTCAGGTCTTCACTTTCTCTCTATGTCCCCAACACAGCTCTGGGGGCGGGG'
len(a)

200