# Metadata Preparation
This notebook carries out the semi-manual metadata cleaning steps that aim to correct any noticed errors remaining after the RiboSeq Metadata [R scripts](https://github.com/Roleren/riboseq_metadata)

In [1]:
import pandas as pd 


In [2]:
latest_metadata_original_path = "/home/jack/projects/Metadata/data/standardized_columns_final_2024-04-22.csv"

In [3]:
latest_metadata_new_path = "/home/jack/projects/riboseqorg_metadata/temp_files/standardized_columns_final_2024-10-10.csv"

In [4]:
original_df = pd.read_csv(latest_metadata_original_path)
new_df = pd.read_csv(latest_metadata_new_path)


  original_df = pd.read_csv(latest_metadata_original_path)
  new_df = pd.read_csv(latest_metadata_new_path)


In [5]:
# human run accessions
original_human = original_df[original_df['ScientificName'] == 'Homo sapiens']
new_human = new_df[new_df['ScientificName'] == 'Homo sapiens']

original_human_accessions = set(original_human['Run'])
new_human_accessions = set(new_human['Run'])

In [6]:
combined = new_human_accessions | original_human_accessions
len(combined)

4448

In [7]:
base_path = '/home/DATA/RiboSeqOrg-DataPortal-Files/RiboSeqOrg/collapsed_reads'

In [8]:
with open('human_runs.txt', 'w') as f:
    for i in combined:
        f.write(f"{i}\n")

In [9]:

new_not_in_original = new_df[~new_df['Run'].isin(original_df['Run'])]
original_not_in_new = original_df[~original_df['Run'].isin(new_df['Run'])]

In [10]:
print(len(new_not_in_original))
print(len(original_not_in_new))

525
1428


In [11]:
latest_metadata = pd.concat([original_df, new_not_in_original], ignore_index=True)

In [12]:
latest_metadata.head()

Unnamed: 0,Run,spots,bases,avgLength,size_MB,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,...,Single-end/Pair-end,Sample_inf,collection_method,inhibitors,knock-out,hours in fe-deficient conditions,sample pair,nuclease treatment,rna or_ribo,ScientificName.1
0,DRR244662,50599092,3845530992,76.0,1273,Ribosome profiling,RNA-Seq,RANDOM,TRANSCRIPTOMIC,SINGLE,...,,,,,,,,,,
1,DRR244663,50768935,3858439060,76.0,1269,Ribosome profiling,RNA-Seq,RANDOM,TRANSCRIPTOMIC,SINGLE,...,,,,,,,,,,
2,DRR255640,53029459,1837645023,34.0,792,,RNA-Seq,size fractionation,TRANSCRIPTOMIC,SINGLE,...,,,,,,,,,,
3,DRR255641,60864576,2145161510,35.0,874,,RNA-Seq,size fractionation,TRANSCRIPTOMIC,SINGLE,...,,,,,,,,,,
4,DRR255642,55434616,1923586815,34.0,828,,RNA-Seq,size fractionation,TRANSCRIPTOMIC,SINGLE,...,,,,,,,,,,


In [13]:
ribocrypt_metadata = "/home/jack/Downloads/RiboCrypt_Metadata_13_09_24.csv"

## Trips-Viz Metadata Matching

In [14]:
files = pd.read_csv('/home/jack/projects/RiboSeqOrg-DataPortal/data/trips_files.csv',
                    names=['file_id', 'organism_id', 'study_id', 'file_name', 'file_description', 'file_type', 'owner', 'mapped_reads', 'control', 'cell_line'])
studies = pd.read_csv('/home/jack/projects/RiboSeqOrg-DataPortal/data/trips_studies.csv',
                      names=['study_id', 'organism_id', 'study_name', 'paper_authors', 'srp_nos', 'paper_year', 'paper_pmid', 'paper_link', 'gse_nos', 'adapters', 'paper_title', 'description', 'private', 'owner'])
# filtered = pd.read_csv(latest_metadata_path)
ribocrypt = pd.read_csv(ribocrypt_metadata)

In [15]:
files.shape

(12725, 10)

In [16]:
studies.head()

Unnamed: 0,study_id,organism_id,study_name,paper_authors,srp_nos,paper_year,paper_pmid,paper_link,gse_nos,adapters,paper_title,description,private,owner
,0,1,2,3,4,5.0,6.0,7,8,9,10,11,12,13
0.0,-1,0,,,,,,,,,,,0,0
1.0,1,10,Nedialkova15,"Nedialkova DD, Leidel SA",SRP056647,2015.0,26052047.0,"""https://linkinghub.elsevier.com/retrieve/pii/...",GSE67387,,Optimization of Codon Translation Rates via tR...,Ribosome profiling of wild-type and tRNA modif...,0,1
2.0,2,7,Dunn13,"Dunn JG, Foo CK, Belletier NG, Gavis ER, Weiss...",SRP028243,2013.0,24302569.0,"""https://doi.org/10.7554/eLife.01179""",GSE49197,CTGTAGGCACCATCAAT,Ribosome profiling reveals pervasive and regul...,12 samples of Drosophila ribosome profiling an...,0,1
3.0,3,8,Nakahigashi16,"Nakahigashi K, Takai Y, Kimura M, Abe N, Nakay...",DRP003075,2016.0,,https://academic.oup.com/dnaresearch/article-l...,Nakahigashi16,,Comprehensive identification of translation st...,,0,1


In [17]:
public_studies = studies[studies['private'] == 0]
public_study_ids = list(public_studies['study_id'].values)

In [18]:
files = files[files['study_id'].isin(public_study_ids)]
files = files[files['file_type'] == 'riboseq']
files.shape

(2649, 10)

In [19]:
organisms = pd.read_csv('/home/jack/projects/RiboSeqOrg-DataPortal/data/trips_orgnaims.csv',
                        names=['organism_id', 'organism_name', 'transcriptome_list', 'gwips_databasename', 'gwips_clade', 'gwips_organism', 'gwips_database', 'default_transcript', 'private', 'owner'])
                        

In [20]:
organisms.head()

Unnamed: 0,organism_id,organism_name,transcriptome_list,gwips_databasename,gwips_clade,gwips_organism,gwips_database,default_transcript,private,owner
0,1,saccharomyces_cerevisiae,sgd,yeast,yeast,S.+cerevisiae,sacCer3,YPR122W,0,1
1,2,mycoplasma_hyorhinis,Ensembl_release37,mycoplasma_hyorhinis,mycoplasma_hyorhinis,M.+hyorinis,mh,AFX74150,1,1
2,3,mus_musculus,Gencode_M14,mouse,mammal,Mouse,mm10,ENSMUST00000037796,0,1
3,4,homo_sapiens,Gencode_v25,homo_sapiens,mammal,Human,hg38,ENST00000558401,0,1
4,5,homo_sapiens_polio,Ensembl_2011,homo_sapiens,mammal,Human,hg38,polio,1,1


In [21]:
with open('/home/jack/projects/RiboSeqOrg-DataPortal/data/trips_model.tsv', 'w') as f:
    for row in files.iterrows():
        match = latest_metadata[latest_metadata['Run'] == row[1]['file_name'].split('.')[0]]
        if len(match['Run'].values) == 0:
            run = 'NA'
            bioproject = 'NA'
        else:
            run = match['Run'].values[0]
            bioproject = match['BioProject'].values[0]
        organism = organisms[organisms['organism_id'] == row[1]['organism_id']]
        if organism.shape[0] == 0:
            continue
        study = studies[studies['study_id'] == row[1]['study_id']]
        f.write(f"{bioproject}\t{run}\t{row[1]['file_id']}\t{row[1]['file_name'].split('.')[0]}\t{study['study_name'].values[0]}\t{study['srp_nos'].values[0]}\t{study['gse_nos'].values[0]}\t{study['paper_pmid'].values[0]}\t{organism['organism_name'].values[0]}\t{organism['transcriptome_list'].values[0]}\n")

## Metadata Cleanup

In [22]:
latest_metadata.shape

(15365, 143)

In [23]:
# List of column pairs to update
column_pairs = [('TISSUE', 'TISSUE_st'), ('CELL_LINE', 'CELL_LINE_st'), ('INHIBITOR', 'INHIBITOR_st'),
                ('CONDITION', 'CONDITION_st'), ('REPLICATE', 'REPLICATE_st'), ('LIBRARYTYPE', 'LIBRARYTYPE_st'),
                ('FRACTION', 'FRACTION_st'), ('TIMEPOINT', 'TIMEPOINT_st'), ('ScientificName', 'scientific_name')]

for main_col, source_col in column_pairs:
    # Update the main column with non-NaN values from the source column
    if source_col in latest_metadata.columns:
        latest_metadata.loc[latest_metadata[source_col].notnull(), main_col] = latest_metadata.loc[latest_metadata[source_col].notnull(), source_col]

# then drop the source columns
latest_metadata.drop(columns=[col for _, col in column_pairs if col in latest_metadata.columns], inplace=True)

In [24]:
latest_metadata['CELL_LINE'].count()

6257

In [25]:
import numpy as np

In [26]:
# join the ribocrypt metadata with the latest metadata based on Run column
merged = pd.merge(latest_metadata, ribocrypt[['Run', 'CELL_LINE', 'INHIBITOR', 'CONDITION', 'AUTHOR']], on='Run', how='inner')

for idx, row in merged.iterrows():
    if row['CELL_LINE_x'] != row['CELL_LINE_y'] and row['CELL_LINE_y'] is not np.nan and row['CELL_LINE_x'] is np.nan and row['CELL_LINE_y'] != "NONE":
        latest_metadata.loc[latest_metadata['Run'] == row['Run'], 'CELL_LINE'] = row['CELL_LINE_y']
    if row['INHIBITOR_x'] != row['INHIBITOR_y'] and row['INHIBITOR_y'] is not np.nan and row['INHIBITOR_x'] is np.nan and row['INHIBITOR_y'] != "NONE":
        latest_metadata.loc[latest_metadata['Run'] == row['Run'], 'INHIBITOR'] = row['INHIBITOR_y']
    if row['AUTHOR_x'] != row['AUTHOR_y'] and row['AUTHOR_y'] is not np.nan and row['AUTHOR_x'] is np.nan and row['AUTHOR_y'] != "NONE":
        latest_metadata.loc[latest_metadata['Run'] == row['Run'], 'AUTHOR'] = row['AUTHOR_y']
    if row['CONDITION_x'] != row['CONDITION_y'] and row['CONDITION_y'] is not np.nan and row['CONDITION_x'] is np.nan and row['CONDITION_y'] != "NONE":
        if row['CONDITION_y'] == 'WT':
            latest_metadata.loc[latest_metadata['Run'] == row['Run'], 'CONDITION'] = 'Control'
        else:
            latest_metadata.loc[latest_metadata['Run'] == row['Run'], 'CONDITION'] = 'Test'


In [27]:
import numpy as np

In [28]:
latest_metadata.loc[latest_metadata['Study_Pubmed_id'] == 1, 'Study_Pubmed_id'] = np.nan
latest_metadata.loc[latest_metadata['AUTHOR'] == 'Makar', 'AUTHOR'] = np.nan
latest_metadata['AUTHOR'].value_counts()


AUTHOR
Atger          884
Bazzini        526
Marcel         200
Chou           165
Wang           161
              ... 
Cai              2
Weinberg         1
Hanson           1
Gupta            1
Nakahigashi      1
Name: count, Length: 508, dtype: int64

In [29]:
nan_percentages = latest_metadata.isna().mean() * 100

#print any that are not 0% na 
na_percent_df = nan_percentages[nan_percentages > 0]
na_percent_df.to_csv("../data/na_percent.csv")

In [30]:
len(latest_metadata['CELL_LINE'].unique())

525

In [31]:
# replace anything in the CELL_LINE column that with lymphoblastoid in cell line 
# There were many with GMXXXXlymphoblasoid cell line as the cell line 
# Create a boolean mask without NaN/NA values
mask = latest_metadata['CELL_LINE'].str.contains('lymphoblastoid', case=False, na=False)

# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = 'lymphoblastoid'

In [32]:
latest_metadata['CELL_LINE'].value_counts().to_csv("test.tsv", sep="\t")

In [33]:
len(latest_metadata.columns)

134

In [34]:
mask = latest_metadata['CELL_LINE'].str.contains('fibroblast', case=False, na=False)

# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = 'Fibroblast'

In [35]:
mask = latest_metadata['CELL_LINE'].str.contains('neuroblast', case=False, na=False)

# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = 'Neuroblast'

In [36]:
mask = latest_metadata['CELL_LINE'].str.contains('myoblast', case=False, na=False)

# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = 'Myoblast'

In [37]:
mask = latest_metadata['CELL_LINE'].str.contains('mouse lymphoid Ba/F3 cells', case=False, na=False)

# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = 'Ba/F3'

In [38]:
latest_metadata.loc[latest_metadata['translation inhibitor'] == "8 µg/mL harringtonine for 10 min, then 0.1 mg/mL cycloheximide in lysis buffer", 'translation inhibitor'] = 'harr'

In [39]:
# drop columns that are not helpful
unwanted = [ 
    'Run.1',
    'BioProject.1',
    'name',
    'not_unique',
    'translation inhibitor',
    'mouse line',
    'type',
    'disease state',
    'diagnosis',
    'cell status',
    'isolates',
    'lncrna probes',
    'sequencing type',
    'knockdown or knockout',
    'input',
    'fragmentation',
    'lentivirus',
    'animal group',
    'cell stage',
    'biol replicate',
    'tech replicate',
    'tretment',
    'progenitor cell type',
    'geographic location (country and/or sea)',
    'specimen with known storage state',
    'rnasei treatment',
    'model',
    'identity',
    'cdna type',
    'primer set',
    'lentivirally transduced transgenes',
    'genotype/variaion',
    ]

drop_list = [col for col in latest_metadata.columns if col in unwanted]

for col in latest_metadata.columns:
    if "Experimental" in col:
        drop_list.append(col)
latest_metadata.drop(columns=drop_list, inplace=True)

In [40]:
len(latest_metadata.columns)

87

In [41]:
latest_metadata.to_csv("../data/standardised_10-10-24.csv", index=False)

In [42]:
high_priority_columns = [
    'Run',
    'BioProject',
    'Study_Pubmed_id',
    'AUTHOR',
    'TISSUE',
    'CELL_LINE',
    'INHIBITOR',
    'CONDITION',
    'REPLICATE',
    'LIBRARYTYPE',
    'FRACTION',
    'TIMEPOINT',
    'ScientificName',
    'Sex',
]

In [43]:
combined = latest_metadata['CELL_LINE'] + "-" + latest_metadata['TISSUE'] + '-' + latest_metadata['ScientificName']
counts = combined.value_counts()
counts.to_csv('../data/counts.tsv', sep='\t', header=False)

## Specific Corrections

S2, a Drosophila embryonic cell line is often misidentified when Sample 2 is meant. As a result all non drosophila S2 cell line entries can be overwritten

In [44]:
import numpy as np

In [45]:
# Create a boolean mask without NaN/NA values where cell line is "S2" and organism is not Drosophila melanogaster
mask = (latest_metadata['CELL_LINE'] == 'S2') & (latest_metadata['ScientificName'] != 'Drosophila melanogaster')

# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata.loc[mask, 'TISSUE'] = np.nan



In [46]:
mask = latest_metadata['CELL_LINE'] == "TSC2"

latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata.loc[mask, 'TISSUE'] = np.nan


In [47]:
# Create a boolean mask without NaN/NA values where cell line is "S2" and organism is not Drosophila melanogaster
mask = (latest_metadata['CELL_LINE'] == 'H1') & (latest_metadata['ScientificName'] == 'Escherichia coli')

# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata.loc[mask, 'TISSUE'] = np.nan

In [48]:
mask = (latest_metadata['CELL_LINE'] == 'PC3') & (latest_metadata['ScientificName'] == 'Neurospora crassa')

# latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata.loc[mask, 'TISSUE'] = np.nan

In [49]:
mask = (latest_metadata['CELL_LINE'] == 'H1') & (latest_metadata['TISSUE'] == 'embryo') & (latest_metadata['ScientificName'] != 'Homo sapiens')

latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata.loc[mask, 'TISSUE'] = np.nan


In [50]:
mask = (latest_metadata['TISSUE'] == 'stem') & (latest_metadata['ScientificName'] == 'Mus musculus')

latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata.loc[mask, 'TISSUE'] = np.nan

In [51]:
mask = (latest_metadata['TISSUE'] == 'stem') & (latest_metadata['ScientificName'] == 'Homo sapiens')

latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata.loc[mask, 'TISSUE'] = np.nan

In [52]:
mask = (latest_metadata['TISSUE'] == 'leaf') & (latest_metadata['CELL_LINE'] == 'H9')

latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata.loc[mask, 'TISSUE'] = np.nan

In [53]:
mask = (latest_metadata['TISSUE'] != 'kidney') & (latest_metadata['CELL_LINE'] == 'HEK293')

latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata.loc[mask, 'TISSUE'] = np.nan

In [54]:
mask = (latest_metadata['TISSUE'] != 'cervix') & (latest_metadata['CELL_LINE'] == 'HeLa')

latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata.loc[mask, 'TISSUE'] = np.nan

In [55]:
mask = (latest_metadata['TISSUE'] == 'EBV-transformed lymphoblastoid cells') 

latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan

# latest_metadata.loc[mask, 'TISSUE'] = np.nan

In [56]:
mask = (latest_metadata['CELL_LINE'] == 'HEK293') & (latest_metadata['TISSUE'] == 'kidney') & (latest_metadata['ScientificName'] != 'Homo sapiens') 

latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan

latest_metadata.loc[mask, 'TISSUE'] = np.nan

In [57]:
mask = (latest_metadata['CELL_LINE'].str.endswith("-cell")) & (latest_metadata['TISSUE'] == "embryo")
mask = mask.fillna(False)
latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]


Unnamed: 0,Run,CELL_LINE,TISSUE,ScientificName
9708,SRR18113903,,embryo,Homo sapiens
9709,SRR18113904,,embryo,Homo sapiens
9710,SRR18113900,,embryo,Homo sapiens
9711,SRR18113901,,embryo,Homo sapiens
9712,SRR18113897,,embryo,Homo sapiens
9713,SRR18113898,,embryo,Homo sapiens
9725,SRR18113797,,embryo,Homo sapiens
9726,SRR18113798,,embryo,Homo sapiens
9727,SRR18113795,,embryo,Homo sapiens
9728,SRR18113796,,embryo,Homo sapiens


In [58]:
mask = (latest_metadata['CELL_LINE'].str.endswith("derived tumor")) & (latest_metadata['TISSUE'] == "Glioblastoma")
mask = mask.fillna(False)
latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
latest_metadata.loc[mask, 'TISSUE'] = np.nan

In [59]:
mask = (latest_metadata['CELL_LINE'] == "HCT116") & (latest_metadata['TISSUE'] != "colon")
mask = mask.fillna(False)
latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]
# Update the 'CELL_LINE' column for rows where the mask is True
# latest_metadata.loc[mask, 'CELL_LINE'] = np.nan
# latest_metadata.loc[mask, 'TISSUE'] = np.nan

Unnamed: 0,Run,CELL_LINE,TISSUE,ScientificName
8208,SRR13679796,HCT116,oocyte,Homo sapiens
8209,SRR13679797,HCT116,oocyte,Homo sapiens
8224,SRR13679812,HCT116,oocyte,Homo sapiens
8225,SRR13679813,HCT116,oocyte,Homo sapiens
8226,SRR13679814,HCT116,oocyte,Homo sapiens
8227,SRR13679815,HCT116,oocyte,Homo sapiens


In [60]:
latest_metadata.to_csv("../data/standardised_10-10-24_updated.csv", index=False)

In [61]:
combined = latest_metadata['CELL_LINE'] + "-" + latest_metadata['TISSUE'] + '-' + latest_metadata['ScientificName']
counts = combined.value_counts()
counts.to_csv('../data/counts.tsv', sep='\t', header=False)

In [62]:
mask = (latest_metadata['TISSUE'] == "embryo") & (latest_metadata['ScientificName'] == "Mus musculus") & (latest_metadata['CELL_LINE'] == np.nan)
mask = mask.fillna(False)
latest_metadata[mask][['Run', 'CELL_LINE', 'TISSUE', 'ScientificName']]

Unnamed: 0,Run,CELL_LINE,TISSUE,ScientificName


In [63]:
# convert all INHIBITOR to lowercase
latest_metadata['INHIBITOR'] = latest_metadata['INHIBITOR'].str.lower()

In [64]:
mask = (latest_metadata['INHIBITOR'] == "no treatment") | (latest_metadata['INHIBITOR'] == "none") | (latest_metadata['INHIBITOR'] == "no erythromycin")
latest_metadata.loc[mask, 'INHIBITOR'] = "untreated"

In [65]:
mask = (latest_metadata['INHIBITOR'].str.endswith("thapsigargin"))
mask = mask.fillna(False)
latest_metadata.loc[mask, 'INHIBITOR'] = "thapsigargin"

  mask = mask.fillna(False)


In [66]:
# Make all INHIBITOR entries that are not 'chx', 'harr', ''lactim' np.nan
accepted_list = [
    'untreated',
    'chx',
    'harr',
    'lactim',
    'chx_harr',
    'chx_lactim',
    'frozen',
    'tetracycline',
    'thapsigargin',
    'anisomycin',
    'tunicamycin',
]
mask = ~(latest_metadata['INHIBITOR'].isin(accepted_list) | latest_metadata['INHIBITOR'].str.endswith("in"))
mask = mask.fillna(False)

latest_metadata.loc[mask, 'INHIBITOR'] = np.nan

In [67]:
mask = latest_metadata['INHIBITOR'].str.endswith("min")
mask = mask.fillna(False)

latest_metadata.loc[mask, 'INHIBITOR'] = np.nan


  mask = mask.fillna(False)


In [68]:
latest_metadata['INHIBITOR'].value_counts().to_csv('../data/inhib_counts.tsv', sep='\t', header=False)

In [69]:
latest_metadata.to_csv("../data/standardised_10-10-24_updated.csv", index=False)

In [70]:
mask = latest_metadata['LIBRARYTYPE'].str.startswith("Ribosome")
mask = mask.fillna(False)
latest_metadata.loc[mask, 'LIBRARYTYPE'] = "RFP"

  mask = mask.fillna(False)


In [71]:
mask = latest_metadata['LIBRARYTYPE'].str.contains("ibosome", case=False)
mask = mask.fillna(False)
latest_metadata.loc[mask, 'LIBRARYTYPE'] = "RFP"

  mask = mask.fillna(False)


In [72]:
mask = latest_metadata['LIBRARYTYPE'].str.startswith("40S")
mask = mask.fillna(False)
latest_metadata.loc[mask, 'LIBRARYTYPE'] = "SSU"

  mask = mask.fillna(False)


In [73]:
mask = latest_metadata['LIBRARYTYPE'].str.startswith("small ribosomal subunit")
mask = mask.fillna(False)
latest_metadata.loc[mask, 'LIBRARYTYPE'] = "SSU"

  mask = mask.fillna(False)


In [74]:
mask = latest_metadata['LIBRARYTYPE'].str.startswith("80S")
mask = mask.fillna(False)
latest_metadata.loc[mask, 'LIBRARYTYPE'] = "LSU"

  mask = mask.fillna(False)


In [75]:
mask = latest_metadata['LIBRARYTYPE'].str.startswith("large ribosomal subunit ")
mask = mask.fillna(False)
latest_metadata.loc[mask, 'LIBRARYTYPE'] = "LSU"

  mask = mask.fillna(False)


In [76]:
mask = latest_metadata['LIBRARYTYPE'].str.startswith("Ribotag")
mask = mask.fillna(False)
latest_metadata.loc[mask, 'LIBRARYTYPE'] = "RiboTag"

  mask = mask.fillna(False)


In [77]:
latest_metadata['LIBRARYTYPE'].value_counts().to_csv('../data/libtype_counts.tsv', sep='\t', header=False)

In [78]:
latest_metadata.to_csv("../data/standardised_10-10-24_updated.csv", index=False)

In [79]:
organism_list = [
    "Salmonella enterica",
    "Escherichia coli",
    "Saccharomyces cerevisiae",
    "Zymomonas mobilis",
    "Oryza sativa",
    "Streptomyces avermitilis",
    "Mycobacterium tuberculosis",
    "Streptomyces tsukubensis",
    "Staphylococcus aureus",
    "Trypanosoma cruzi",
    "Lacticaseibacillus rhamnosus",
    "Bacillus subtilis",
    "Caulobacter vibrioides",
    "Pseudomonas aeruginosa",
    "Mycobacteroides abscessus",
    "Schizosaccharomyces pombe",
    "Mycoplasmoides gallisepticum",
    "Plasmodium falciparum",
    "Streptomyces coelicolor",
    "Flavobacterium johnsoniae",
    "Mycoplasma pneumoniae",
    "Cryptococcus neoformans",
    "Mycolicibacterium smegmatis",
    "Sinorhizobium meliloti",
    "Bacteroides thetaiotaomicron",
    "Vibrio natriegens",
    "Vibrio vulnificus"
]

for organism in organism_list:
    mask = latest_metadata['ScientificName'].str.startswith(organism)
    mask = mask.fillna(False)
    latest_metadata.loc[mask, 'ScientificName'] = organism


In [80]:
mask = latest_metadata['ScientificName'].str.startswith("Severe acute respiratory syndrome coronavirus 2")
mask = mask.fillna(False)
latest_metadata.loc[mask, 'ScientificName'] = "SARS-CoV2"

In [81]:
latest_metadata['ScientificName'].value_counts().to_csv('../data/ScientificName_counts.tsv', sep='\t', header=False)

In [82]:
latest_metadata['INHIBITOR'].value_counts().to_csv('../data/inhib_counts.tsv', sep='\t', header=False)

In [None]:
latest_metadata

In [83]:
latest_metadata.to_csv("../data/standardised_10-10-24_updated.csv", index=False)

In [84]:
import pandas as pd

# df = pd.read_csv("/home/jack/HEK_metadata.csv")

df = pd.read_csv("/home/jack/Downloads/RiboSeqOrg_Metadata_C57.csv")

df.columns

  df = pd.read_csv("/home/jack/Downloads/RiboSeqOrg_Metadata_C57.csv")


Index(['process_status', 'FASTA_file', 'BioProject', 'GEO', 'Run', 'spots',
       'bases', 'avgLength', 'size_MB', 'Experiment', 'LibraryName',
       'LibraryStrategy', 'LibrarySelection', 'LibrarySource', 'LibraryLayout',
       'InsertSize', 'InsertDev', 'Platform', 'Model', 'SRAStudy',
       'Study_Pubmed_id', 'Sample', 'BioSample', 'SampleType', 'TaxID',
       'ScientificName', 'SampleName', 'CenterName', 'Submission', 'MONTH',
       'YEAR', 'AUTHOR', 'sample_source', 'sample_title', 'LIBRARYTYPE',
       'REPLICATE', 'CONDITION', 'INHIBITOR', 'BATCH', 'TIMEPOINT', 'TISSUE',
       'CELL_LINE', 'FRACTION', 'ENA_first_public', 'ENA_last_update',
       'INSDC_center_alias', 'INSDC_center_name', 'INSDC_first_public',
       'INSDC_last_update', 'INSDC_status', 'ENA_checklist', 'GEO_Accession',
       'Experiment_Date', 'date_sequenced', 'submission_date', 'date', 'STAGE',
       'GENE', 'Sex', 'Strain', 'Age', 'Infected', 'Disease', 'Genotype',
       'Feeding', 'Temperature', '

In [85]:
# subeset so that process status is Complete and LIBRARYTYPE is Ribo-Seq 

df = df[(df['process_status'] == 'Completed') & (df['LIBRARYTYPE'] == 'Ribo-Seq') & (df['Genotype'] == 'WT')]

In [86]:
df['BioProject'].value_counts()

BioProject
PRJNA697913     64
PRJNA725118     38
PRJNA604580     30
PRJNA809587     16
PRJNA1049048    16
                ..
PRJNA927820      1
PRJNA748058      1
PRJNA503400      1
PRJNA358868      1
PRJNA923627      1
Name: count, Length: 61, dtype: int64