# Smaller dataset: samples are randomised and each sample type is limited to at most 5 instances (fewer if fewer are available)


In [19]:
import pandas as pd

# CSV read into `data` by pd.read_csv below.
# NOTE(review): the name suggests gene counts from experiments B34, B44, B51, B61 and B64 — confirm.
file_name = "gene_counts_exp_B34_B44_B51_B61_B64.csv"

In [20]:
# Load the count table, then discard the single sample that was flagged
# for low sequencing depth.
data = pd.read_csv(file_name)
low_depth_rows = data["sample"] == "BA028_IMDM_03"
data = data.loc[~low_depth_rows]

In [21]:
# Sanity check: (rows, columns) of the table after the low-depth sample was removed.
data.shape

(128, 19993)

In [22]:
# Substrings used to pick rows by their "sample" value. Each marker is wrapped
# in underscores and is matched with a plain `in` test against the sample name
# (see select_samples), so e.g. "_LPS_" matches "BA029_LPS_08".
included_sample_names = [
    "_LPS_",
    "_IMDM_",
    "_Pam3_",
    "_PGN_",
    "_LTA_",
    "_R848_",
    "_Fla-PA_",
    "_CL-307_",
    "_CRX-527_",
]

To keep the data set balanced, we limit each sample type to at most 5 instances.

In [44]:
import random


def select_samples(name, n=None, shuffle=True, samples=None, seed=None):
    """Return the positions of the samples whose description contains ``name``.

    Args:
        name: Substring to look for in each sample description
            (e.g. ``"_LPS_"``).
        n: Maximum number of positions to return. ``None`` (the default)
            returns every match.
        shuffle: If true, the matching positions are put in random order
            before being truncated to ``n``, so the kept subset is random.
        samples: Iterable of sample descriptions (strings). Required.
        seed: Optional seed for a private random generator, giving a
            reproducible shuffle. When ``None`` the module-level ``random``
            state is used, exactly as before.

    Returns:
        A list of zero-based positions (as produced by ``enumerate``), not
        index labels; use them with positional indexing such as
        ``DataFrame.iloc``.

    Raises:
        TypeError: If ``samples`` is not supplied.
    """
    if samples is None:
        # Fail with an actionable message instead of the opaque
        # "'NoneType' object is not iterable" raised by enumerate(None).
        raise TypeError(
            "select_samples() requires `samples`: an iterable of sample names"
        )

    index_list = [
        position
        for position, sample_description in enumerate(samples)
        if name in sample_description
    ]

    if shuffle:
        if seed is None:
            random.shuffle(index_list)
        else:
            # A private generator keeps the global random state untouched.
            random.Random(seed).shuffle(index_list)

    # Slicing with n=None keeps the whole list.
    return index_list[:n]

In [None]:
# Collect up to five randomly chosen row positions for every marker in
# included_sample_names, in the order the markers are listed.
samples = data["sample"]

index_list = [
    position
    for name in included_sample_names
    for position in select_samples(name, n=5, samples=samples, shuffle=True)
]

In [56]:
# select_samples() returns row positions (from enumerate), hence the positional .iloc lookup.
data_for_ml = data.iloc[index_list]

In [57]:
# Sanity check: (rows, columns) of the balanced selection.
data_for_ml.shape

(41, 19993)

In [26]:
# Remove "double pyrogen" rows from `data` in place.
#
# The row labels are looked up on the current data["sample"] column rather than
# on a previously captured Series, so re-running this cell after rows have
# already been dropped does not raise KeyError for labels that no longer exist.
#
# NOTE(review): the markers are built as f"{name}-" (e.g. "_LPS_-"). Confirm
# that double-pyrogen sample names really contain that exact substring;
# if they do not, nothing is dropped here.
double_pyrogen_markers = [f"{name}-" for name in included_sample_names]

indices_of_double_pyrogens = [
    index
    for index, sample in data["sample"].items()
    if any(marker in sample for marker in double_pyrogen_markers)
]

data.drop(index=indices_of_double_pyrogens, inplace=True)

In [None]:
# Randomise the row order (frac=1 keeps every row) and renumber the rows
# from zero so the old index does not survive the shuffle.
row_order_randomised = data_for_ml.sample(frac=1)
shuffled_data = row_order_randomised.reset_index(drop=True)

In [78]:
def _second_name_field(sample_name):
    """Return the second underscore-separated field of ``sample_name``."""
    return sample_name.split("_", 2)[1]


# Overwrite each sample name with its second field only.
shuffled_data["sample"] = shuffled_data["sample"].apply(_second_name_field)

In [None]:
# Preview the shuffled, relabelled table.
shuffled_data

In [80]:
# Save the shuffled table; index=False leaves the pandas row index out of the file.
shuffled_data.to_csv("gene_counts_NN_training.csv", index=False)

Here, I would like to have all samples included.

In [None]:
# Re-read the sample names from the current `data`. Rows may have been dropped
# from `data` in place since `samples` was last captured, and select_samples()
# returns row positions that are then used with data.iloc, so both must
# describe exactly the same rows.
samples = data["sample"]

# n is left at its default (None), so every matching sample is kept.
index_list = []
for name in included_sample_names:
    index_list.extend(select_samples(name, samples=samples, shuffle=True))

In [88]:
# Positional lookup: index_list holds row positions returned by select_samples().
# NOTE(review): the positions are only valid if `samples` was taken from `data`
# in its current state — confirm no rows were dropped in between.
data_for_ml = data.iloc[index_list]

In [89]:
# Randomise the row order (frac=1 keeps every row), then renumber the rows
# from zero.
data_for_ml = data_for_ml.sample(frac=1)
data_for_ml = data_for_ml.reset_index(drop=True)

In [73]:
# Overwrite each sample name with its second underscore-separated field
# (e.g. "BA029_LPS_08" -> "LPS").
# NOTE(review): after this line the "sample" column no longer holds the full
# sample IDs, yet it is what later cells write to the DESeq2 counts and metadata
# files, while the table preview in this notebook still shows full IDs —
# confirm the intended cell order.
data_for_ml["sample"] = data_for_ml["sample"].apply(lambda name: name.split("_", 2)[1])

In [93]:
# Preview the table.
data_for_ml

Unnamed: 0,sample,HUNK,KLHL8,ZNF576,UGT1A5,FCF1,SLC2A7,FABP12,TAF12,GLRX,...,ASIC2,TSN,EVA1A,NFATC1,PORCN,GALNT17,DEPDC4,REL,RPL23,AHSP
0,BA029_LPS_08,0.0,139.0,52.0,0.0,303.0,0.0,0.0,97.0,110.0,...,0.0,393.0,0.0,492.0,32.0,0.0,0.0,3464.0,3584.0,1.0
1,BA029_LPS_06,0.0,146.0,50.0,0.0,243.0,0.0,0.0,67.0,110.0,...,0.0,355.0,0.0,380.0,39.0,0.0,0.0,2784.0,2734.0,0.0
2,BA046_CRX-527_11,0.0,35.0,5.0,0.0,63.0,0.0,0.0,14.0,20.0,...,0.0,55.0,0.0,94.0,2.0,0.0,0.0,484.0,746.0,0.0
3,BA028_R848_13,0.0,72.0,31.0,0.0,138.0,0.0,0.0,50.0,68.0,...,0.0,155.0,0.0,150.0,11.0,0.0,0.0,2048.0,1699.0,0.0
4,BA063_Pam3_17,0.0,277.0,95.0,0.0,532.0,0.0,0.0,157.0,184.0,...,0.0,802.0,0.0,909.0,60.0,0.0,1.0,5576.0,6945.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,BA029_PGN_16,0.0,76.0,22.0,0.0,170.0,0.0,0.0,88.0,55.0,...,0.0,237.0,0.0,397.0,18.0,0.0,0.0,1887.0,2432.0,1.0
61,BA046_CRX-527_10,0.0,68.0,14.0,0.0,102.0,0.0,0.0,25.0,23.0,...,0.0,116.0,0.0,159.0,22.0,0.0,0.0,876.0,872.0,1.0
62,BA042_IMDM_1,0.0,80.0,48.0,0.0,181.0,0.0,0.0,87.0,64.0,...,0.0,304.0,0.0,335.0,23.0,0.0,0.0,2209.0,2601.0,0.0
63,BA028_Pam3_10,0.0,142.0,33.0,0.0,292.0,0.0,0.0,83.0,115.0,...,0.0,430.0,0.0,485.0,23.0,0.0,1.0,2608.0,4372.0,0.0


In [92]:
# Save the table; index=False leaves the pandas row index out of the file.
# NOTE(review): the file name says 65 samples, but the preview in this notebook
# ends at row label 63 — confirm the row count.
data_for_ml.to_csv("gene_counts_DESeq2_65_samples.csv", index=False)

In [75]:
# Start from an empty frame with the two metadata columns; they are filled in by the following cells.
deseq2_metadata = pd.DataFrame(columns=["condition", "group"])

In [77]:
# The condition of each row is whatever the "sample" column of data_for_ml currently holds.
deseq2_metadata["condition"] = data_for_ml["sample"]

In [81]:
# Label every row as negative control ("IMDM") or stimulated sample with one
# vectorised column assignment. The previous per-row chained assignment
# (deseq2_metadata["group"][index] = ...) may write to a temporary copy and
# has no effect at all under pandas Copy-on-Write.
is_negative_control = deseq2_metadata["condition"] == "IMDM"
deseq2_metadata["group"] = is_negative_control.map(
    {True: "negative_control", False: "sample"}
)

In [90]:
# Put a "sample" column first (position 0), taken from data_for_ml.
# NOTE(review): "condition" was filled from the same data_for_ml["sample"] column,
# so both columns are identical unless data_for_ml changed in between — confirm
# that "sample" is meant to carry the full sample IDs.
deseq2_metadata.insert(0, "sample", data_for_ml["sample"])

In [94]:
# Save the metadata table; index=False leaves the pandas row index out of the file.
deseq2_metadata.to_csv("metadata_DESeq2_65_samples.csv", index=False)