# Dataset Processing
Creates the CSV file for use in machine learning from OSD datasets

In [37]:
import zipfile
from urllib.request import urlretrieve

import matplotlib.pyplot as plt
import pandas as pd

pd.set_option('display.max_columns', None)

In [38]:
# Per-assay containers filled in by later cells:
# `data` holds the measurement tables, `meta` the study metadata tables.
data, meta = {}, {}

In [39]:
def read_meta_data(dataset):
    """Download and load the ISA sample table of an OSD study.

    The study's ISA metadata archive is fetched from the NASA OSDR download
    endpoint, saved as ``<dataset>-meta.zip`` in the working directory and
    extracted there (existing files are overwritten). The tab-separated
    sample file ``s_OSD-<dataset>.txt`` from the archive is then parsed.

    Parameters
    ----------
    dataset : str or int
        Numeric part of the OSD study accession, e.g. ``'514'`` for OSD-514.

    Returns
    -------
    pandas.DataFrame
        Contents of ``s_OSD-<dataset>.txt``.

    Raises
    ------
    urllib.error.URLError
        If the download fails.
    zipfile.BadZipFile
        If the downloaded file is not a valid zip archive.
    """
    # Normalise once so integer IDs work in every concatenation below.
    dataset = str(dataset)

    url = (
        'https://osdr.nasa.gov/geode-py/ws/studies/OSD-' + dataset
        + '/download?source=datamanager&file=OSD-' + dataset
        + '_metadata_OSD-' + dataset + '-ISA.zip'
    )
    filename = dataset + '-meta.zip'
    urlretrieve(url, filename)

    # Extract with the standard library instead of shelling out to `unzip`:
    # portable, no shell interpolation, and a corrupt download raises here
    # rather than surfacing later as a missing-file error.
    with zipfile.ZipFile(filename) as archive:
        archive.extractall()

    return pd.read_csv('s_OSD-' + dataset + '.txt', sep='\t', header=0)

In [40]:
# Sample metadata per assay; the arguments are the OSD study numbers.
meta.update({
    "RNA-SEQ": read_meta_data('514'),
    "IHC": read_meta_data('592'),
})

In [41]:
# Measurement tables per assay, read from CSV files that must already be
# present in the working directory (this notebook does not download them).
data.update({
    "RNA-SEQ": pd.read_csv("GLDS-514_rna_seq_Normalized_Counts_GLbulkRNAseq.csv"),
    "IHC": pd.read_csv("LSDS-31_IHC_LSDS-31_IHC_LSDS-31_Immunohistochemistry_mhatreTRANSFORMED.csv"),
})

In [42]:
# Columns to discard from the IHC table: everything from the fifth column
# onwards, plus the "alsda_id" column.
drop_cols = [*data["IHC"].columns[4:], "alsda_id"]

In [43]:
def fill_na_with_group_avg(df, cols=('th_positive_cells', 'repo_glial_cells')):
    """Fill missing values with the mean of the row's group, in place.

    A row's group is the part of its ``source_name`` before the first
    underscore (``"Earth_F1"`` -> ``"Earth"``). For every column in ``cols``,
    NaN entries are replaced by the mean of the non-missing values of that
    column within the same group. A group with no observed value stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``source_name`` column and every column listed in
        ``cols``. It is modified in place; no other column is touched.
    cols : iterable of str, optional
        Columns to fill. Defaults to the two IHC measurement columns.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``source_name`` or one of ``cols`` is missing from ``df``.
    """
    # Keep the grouping key as a standalone Series rather than a temporary
    # 'group' column, so a pre-existing 'group' column in df is never
    # overwritten and dropped.
    group_keys = df['source_name'].str.split('_').str[0]

    for col in cols:
        # Per-row mean of the row's group (NaNs are skipped by 'mean').
        group_avg = df[col].groupby(group_keys).transform('mean')
        df[col] = df[col].fillna(group_avg)

In [44]:
# Column labels from the third column onwards, captured before any are dropped
cols = data["IHC"].columns[2:]

data["IHC"] = (
    data["IHC"]
    # discard the columns that are not needed
    .drop(columns=drop_cols)
    # strip the first three characters of every source name
    .assign(source_name=lambda frame: frame["source_name"].str[3:])
    # discard rows whose source name contains '25' (post-return values)
    .loc[lambda frame: ~frame["source_name"].str.contains('25')]
    .copy()
)

# impute missing measurements with the per-group mean (modifies the frame in place)
fill_na_with_group_avg(data["IHC"])

# keep only the first row for each source name
data["IHC"] = data["IHC"].drop_duplicates(subset='source_name', keep='first').copy()

data["IHC"]

Unnamed: 0,source_name,th_positive_cells,repo_glial_cells
0,Earth_F1,101.0,429.714286
1,Earth_F2,103.0,401.0
2,Earth_F3,86.0,351.0
3,Earth_F4,101.0,515.0
4,Earth_F5,98.0,429.714286
5,Earth_F6,110.0,351.0
6,Earth_F7,115.0,397.0
7,Earth_F8,97.0,550.0
8,Earth_F9,102.0,295.0
9,Earth_M1,85.0,456.0


# RNA-Seq Dataset Description
- FBgn... (FlyBase Genes)
  - [FlyBase](http://flybase.org/) is the primary repository of genetic and molecular data for the insect family Drosophilidae
- RR..._transposable_element
  - Presumably also from FlyBase (FB); not confirmed

In [45]:
# Display the full normalized-count table (pandas truncates long frames).
# `.head(df.size)` was dropped: `size` is rows x columns, not a row count,
# so it only ever returned the whole frame by accident.
data["RNA-SEQ"]

Unnamed: 0.1,Unnamed: 0,Earth_M1,Earth_M2,Earth_M3,Earth_M4,Earth_F1,Earth_F2,Earth_F3,Earth_F4,SFug_M1,SFug_M2,SFug_M3,SFug_M4,SFug_F1,SFug_F2,SFug_F3,SFug_F4,SF1g_M1,SF1g_M2,SF1g_M3,SF1g_M4,SF1g_F1,SF1g_F2,SF1g_F3,SF1g_F4
0,FBgn0000003,268.326940,289.610821,307.904018,767.736468,363.014096,223.520592,237.296136,434.895552,617.512755,397.275093,494.114191,102.733961,540.691165,1227.123863,905.229891,414.117771,1487.063462,1557.208531,1377.858569,1120.621546,234.505650,395.502182,350.925252,450.650643
1,FBgn0000008,1211.805611,1163.875378,1055.794866,1091.560477,1066.283531,1169.359160,986.334465,1181.038372,1340.080214,1189.849152,1132.596904,1302.634134,972.791161,1163.510890,1248.041362,1258.849049,1173.876133,1119.218149,1097.009587,1118.483285,1319.362577,1321.783049,1167.903969,1464.003454
2,FBgn0000014,3.721252,6.748211,6.457626,0.000000,0.810243,4.260959,2.195389,2.844821,4.401245,6.346743,1.979916,0.000000,0.000000,3.030498,0.000000,0.000000,5.328744,0.000000,14.785793,3.172961,6.647281,0.000000,1.016876,2.664084
3,FBgn0000015,0.000000,1.572493,9.666983,1.898347,14.056209,2.461707,13.225248,7.404185,7.305775,5.718576,3.049767,2.560166,2.050603,4.914730,4.797240,9.462406,1.212701,2.167621,3.200357,3.612382,5.795728,2.208960,16.095113,27.152112
4,FBgn0000017,5792.651152,6025.273956,5669.638583,5646.796932,4716.378972,4988.898115,5071.258761,5147.098499,6702.973198,5317.186322,6582.326137,7718.806957,4004.071358,4965.857566,5206.478374,6145.212985,5983.464767,5956.888079,5813.034554,6157.063014,5643.030168,6117.758433,5151.717030,5756.307903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16670,RR51007_transposable_element,509.601736,666.527630,526.283078,599.806296,1756.944820,1187.817505,958.243298,1092.694204,893.252793,1337.368491,1276.860919,1264.340096,2573.488747,1830.530951,2445.944358,2307.839620,916.500071,574.561196,650.926226,600.877625,959.500452,917.583120,1840.085761,1383.760349
16671,RR51048_transposable_element,0.000000,0.000000,4.138006,0.000000,0.000000,0.000000,0.000000,0.845202,1.121934,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.928045,0.000000,0.000000,4.043984,0.000000,0.000000,0.000000,0.773629
16672,RR51093_transposable_element,241.838826,137.937182,0.000000,47.196831,56.801274,231.254518,267.554499,0.000000,11.222187,0.000000,0.000000,40.612327,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.521423,87.510941,17.216557,83.533404
16673,RR51475_transposable_element,219.591935,189.546519,256.521685,307.164944,331.171984,214.531486,319.041196,388.738942,230.046175,233.036031,388.234729,469.276630,227.867665,484.523064,385.516275,489.361768,231.908087,195.666005,261.678927,186.813936,262.355013,221.269969,300.773415,439.338250


# IHC Dataset Description
- Anti-Elav (Developmental Studies Hybridoma Bank, Catalog # 7E8A10):
> This is a rat monoclonal antibody against the Elav (embryonic lethal abnormal vision) protein in Drosophila. Elav is a marker for most differentiated neuronal cells in the central and peripheral nervous system. It's used to identify neurons at all stages of development once they have differentiated. This antibody is particularly useful for studying neuronal development and differentiation in the central nervous system.
- Anti-Cc3 (Cell Signaling Technology, Catalog # 9661):
> This antibody targets cleaved caspase-3 (Cc3), an indicator of apoptosis (this description reflects the antibody's typical use rather than a dataset-specific source). In the context of the central nervous system, it can be used to study programmed cell death in neurons and other neural cells during development or in pathological conditions.
- Anti-8-oxo-dG (R&D Systems, Catalog # 4354-MC050):
> This antibody targets 8-oxo-2'-deoxyguanosine (8-oxo-dG), a marker of oxidative DNA damage. In the central nervous system, it can be used to study oxidative stress in neurons and glial cells, which is relevant in various neurodegenerative diseases and aging processes.
- Anti-Repo (Developmental Studies Hybridoma Bank, Catalog # 8D12):
> This antibody targets the Repo (Reversed polarity) protein, which is a marker for glial cells. It's often used in contrast with Elav staining to distinguish between neurons and glial cells in the central nervous system. This antibody is valuable for studying glial cell development and function in the nervous system.
- Anti-TH (EMD Millipore, Catalog # AB152):
> This antibody targets tyrosine hydroxylase (TH), the rate-limiting enzyme in the synthesis of catecholamines. In the central nervous system, it's used to identify dopaminergic and noradrenergic neurons. This antibody is particularly useful for studying disorders involving these neurotransmitter systems, such as Parkinson's disease.


In [46]:
# Preview the first five rows of the cleaned IHC table
data["IHC"].head(n=5)

Unnamed: 0,source_name,th_positive_cells,repo_glial_cells
0,Earth_F1,101.0,429.714286
1,Earth_F2,103.0,401.0
2,Earth_F3,86.0,351.0
3,Earth_F4,101.0,515.0
4,Earth_F5,98.0,429.714286


# Dataset for ML 
- Aligned RNA-SEQ & IHC Data

In [47]:
pd.set_option('display.max_columns', 10)

# One row per sample, one column per gene.
# The first column of the RNA-seq table (the gene / transposable-element IDs)
# is made the index *before* transposing. Transposing while that string column
# is still part of the frame would turn every gene column into object dtype.
df = (
    data["RNA-SEQ"]
    .set_index(data["RNA-SEQ"].columns[0])
    .T
    .rename_axis(index="source_name", columns="")
)

# sample names in RNA-seq order
source_names = pd.Series(df.index, name='source_name')

# IHC rows for the RNA-seq samples, aligned to the RNA-seq row order.
# Samples that have no IHC row get NaN measurements after the reindex.
filtered_IHC = (
    data["IHC"]
    .loc[lambda frame: frame['source_name'].isin(df.index)]
    .set_index('source_name')
    .reindex(df.index)
    .reset_index()
)

# append the IHC measurements, then replace the sample names with a 0..n-1 index
df = df.join(filtered_IHC.set_index('source_name')).reset_index(drop=True)

df.to_csv("filtered_data.csv")
df

Unnamed: 0,FBgn0000003,FBgn0000008,FBgn0000014,FBgn0000015,FBgn0000017,...,RR51093_transposable_element,RR51475_transposable_element,RR51477_transposable_element,th_positive_cells,repo_glial_cells
0,268.32694,1211.805611,3.721252,0.0,5792.651152,...,241.838826,219.591935,13.899158,85.0,456.0
1,289.610821,1163.875378,6.748211,1.572493,6025.273956,...,137.937182,189.546519,16.018844,75.0,332.0
2,307.904018,1055.794866,6.457626,9.666983,5669.638583,...,0.0,256.521685,7.447111,95.0,424.0
3,767.736468,1091.560477,0.0,1.898347,5646.796932,...,47.196831,307.164944,10.164972,90.0,368.0
4,363.014096,1066.283531,0.810243,14.056209,4716.378972,...,56.801274,331.171984,4.813748,101.0,429.714286
5,223.520592,1169.35916,4.260959,2.461707,4988.898115,...,231.254518,214.531486,17.646249,103.0,401.0
6,237.296136,986.334465,2.195389,13.225248,5071.258761,...,267.554499,319.041196,2.238993,86.0,351.0
7,434.895552,1181.038372,2.844821,7.404185,5147.098499,...,0.0,388.738942,4.225253,101.0,515.0
8,617.512755,1340.080214,4.401245,7.305775,6702.973198,...,11.222187,230.046175,5.611235,80.0,454.0
9,397.275093,1189.849152,6.346743,5.718576,5317.186322,...,0.0,233.036031,7.768467,70.0,454.0
