## Calculating Proteomes CAIR

* This module reads CSV-format files, groups the entries by organism, and calculates the CAIR for each UniProt Organism ID.

##### Two output files will be generated:
> A) "All species CAIRs.csv", containing all Organism IDs, their corresponding CAIRs, and overall residue frequencies.

> B) "Complete proteome CAIRs.csv", containing non-redundant proteomes and their corresponding CAIRs.

In [2]:
import pandas as pd
from numpy import log2

In [8]:
def entry_to_species(sprot_input_file='Entries sprot.csv', trembl_input_file='Entries trembl.csv',
                     outfile='All species residues.csv', chunksize=8000000, merge='True'): # refer to runCAIR
    """Sum per-entry columns into per-organism totals and write them to a CSV file.

    The TrEMBL file is read in chunks of ``chunksize`` rows (to avoid RAM
    insufficiency). In every chunk the ``CAIR`` column is dropped, the remaining
    columns are summed per ``Organism_ID``, and the partial sums are finally
    combined into one row per organism.

    Parameters
    ----------
    sprot_input_file : path of the Swiss-Prot entries CSV (only read when merging).
    trembl_input_file : path of the TrEMBL entries CSV.
    outfile : path of the CSV to write; ``Organism_ID`` is written as the index column.
    chunksize : number of TrEMBL rows loaded per chunk.
    merge : ``'True'`` (historical string spelling) or boolean ``True`` to add the
        Swiss-Prot entries to the TrEMBL totals; any other value skips them.

    Raises
    ------
    ValueError
        If there is no data to aggregate.
    """
    partial_sums = []  # one per-organism sum per chunk, concatenated once at the end

    for chunk in pd.read_csv(trembl_input_file, chunksize=chunksize):
        chunk = chunk.drop(columns=['CAIR'])  # per-protein CAIRs are not needed anymore
        partial_sums.append(chunk.groupby('Organism_ID').sum())

    # str() lets both the legacy 'True' string and a real boolean True enable the merge.
    if str(merge) == 'True':
        sprot = pd.read_csv(sprot_input_file).drop(columns=['CAIR'])
        partial_sums.append(sprot.groupby('Organism_ID').sum())

    if not partial_sums:
        raise ValueError('No entries to aggregate: %r yielded no data' % (trembl_input_file,))

    # The same organism can occur in several chunks (and in Swiss-Prot), so the
    # partial sums are grouped once more on the Organism_ID index.
    out = pd.concat(partial_sums).groupby(level='Organism_ID').sum()
    out.to_csv(outfile)
    

# Residue count columns that enter the CAIR calculation (22 one-letter codes).
_RESIDUES = ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
             'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y')

# Second-hierarchy names that all share one code: Candidate Phyla Radiation (CPR).
_CPR_CODE = '56'
_CPR_PHYLA = (
    'Candidatus Saccharibacteria',
    'candidate division SR1',
    'Candidatus Dependentiae',
    'Candidatus Gracilibacteria',
    'Candidatus Atribacteria',
    'Candidatus Parcubacteria',
    'Candidatus Bipolaricaulota',
    'Candidatus Poribacteria',
    'unclassified Parcubacteria group',
    'Candidatus Aminicenantes',
    'Candidatus Coatesbacteria',
    'Candidatus Eisenbacteria',
    'Candidatus Aerophobetes',
    'Candidatus Riflebacteria',
    'Candidatus Wolfebacteria',
    'Candidatus Nomurabacteria',
    'Candidatus Roizmanbacteria',
    'Candidatus Uhrbacteria',
    'Candidatus Yanofskybacteria',
    'Candidatus Levybacteria',
    'Candidatus Colwellbacteria',
    'candidate division WWE3',
    'Candidatus Sungbacteria',
    'Candidatus Woykebacteria',
    'Candidatus Komeilibacteria',
    'Candidatus Falkowbacteria',
    'Candidatus Moranbacteria',
    'Candidatus Curtissbacteria',
    'Candidatus Giovannonibacteria',
    'Candidatus Dojkabacteria',
    'Candidatus Harrisonbacteria',
    'Candidatus Magasanikbacteria',
    'Candidatus Beckwithbacteria',
    'Candidatus Fraserbacteria',
    'Candidatus Kaiserbacteria',
    'Candidatus Niyogibacteria',
    'Candidatus Yonathbacteria',
    'Candidatus Azambacteria',
    'Candidatus Portnoybacteria',
    'Candidatus Berkelbacteria',
    'Candidatus Doudnabacteria',
    'Candidatus Zambryskibacteria',
    'Candidatus Staskawiczbacteria',
    'Candidatus Woesebacteria',
    'Candidatus Lloydbacteria',
    'Candidatus Nealsonbacteria',
    'Candidatus Microgenomates',
    'Candidatus Taylorbacteria',
    'Candidatus Vogelbacteria',
    'Candidatus Buchananbacteria',
    'Candidatus Gottesmanbacteria',
    'Candidatus Jorgensenbacteria',
    'Candidatus Rokubacteria',
    'Candidatus Peregrinibacteria',
    'Candidatus Dadabacteria',
    'Candidatus Kerfeldbacteria',
    'Candidatus Desantisbacteria',
    'Candidatus Ryanbacteria',
    'Candidatus Pacebacteria',
    'Candidatus Daviesbacteria',
    'Candidatus Amesbacteria',
    'Candidatus Tagabacteria',
    'Candidatus Shapirobacteria',
    'candidate division CPR1',
    'Candidatus Adlerbacteria',
    'Candidatus Spechtbacteria',
    'candidate division Kazan-3B-28',
    'Candidatus Terrybacteria',
    'Candidatus Wildermuthbacteria',
    'candidate division NC10',
    'Candidatus Campbellbacteria',
    'Candidatus Collierbacteria',
    'Candidatus Wirthbacteria',
    'Candidatus Brennerbacteria',
    'Candidatus Kuenenbacteria',
    'Candidatus Veblenbacteria',
    'candidate division KSB1',
    'Candidatus Glassbacteria',
    'Candidatus Firestonebacteria',
    'Candidatus Delongbacteria',
    'Candidatus Lindowbacteria',
    'candidate division TA06',
    'Candidatus Liptonbacteria',
    'Candidatus Jacksonbacteria',
    'Candidatus Blackburnbacteria',
    'Candidatus Abawacabacteria',
    'Candidatus Wallbacteria',
    'Candidatus Schekmanbacteria',
    'Candidatus Hydrothermae',
    'candidate division WOR-3',
    'Candidatus Sumerlaeota',
    'candidate division KSB3',
    'Candidatus Andersenbacteria',
    'candidate division WS5',
    'Candidatus Edwardsbacteria',
    'Candidatus Chisholmbacteria',
    'Candidatus Fischerbacteria',
    'candidate division KD3-62',
    'candidate division CPR3',
    'Candidatus Handelsmanbacteria',
    'candidate division CPR2',
    'Candidatus Cerribacteria',
    'Candidatus Raymondbacteria',
    'Candidatus Goldbacteria',
)

# Analysis codes assigned according to the tree of life (second hierarchy -> code).
_PHYLUM_CODES = {
    'Proteobacteria': '1',
    'Candidatus Hydrogenedentes': '2',
    'Candidatus Abyssubacteria': '3',
    'Spirochaetes': '4',
    'Deferribacteres': '5',
    'Chrysiogenetes': '6',
    'Acidobacteria': '7',
    'Thermodesulfobacteria': '8',
    'Nitrospirae': '9',
    'Nitrospinae/Tectomicrobia group': '10',
    'Elusimicrobia': '11',
    'Candidatus Omnitrophica': '12',
    'Planctomycetes': '13',
    'Chlamydiae': '14',
    'Lentisphaerae': '15',
    'Candidatus Aureabacteria': '16',
    'Kiritimatiellaeota': '17',
    'Verrucomicrobia': '18',
    'Candidatus Aegiribacteria': '19',
    'Candidatus Latescibacteria': '20',
    'Gemmatimonadetes': '21',
    'Candidatus Fermentibacteria': '22',
    'Candidatus Marinimicrobia': '23',
    'candidate division LCP-89': '24',
    'Calditrichaeota': '25',
    'Rhodothermaeota': '26',
    'Balneolaeota': '27',
    'Ignavibacteriae': '28',
    'Candidatus Kryptonia': '29',
    'Chlorobi': '30',
    'Bacteroidetes': '31',
    'Candidatus Kapabacteria': '32',
    'Candidatus Cloacimonetes': '33',
    'Fibrobacteres': '34',
    'Synergistetes': '35',
    'Fusobacteria': '36',
    'Deinococcus-Thermus': '37',
    'Coprothermobacterota': '38',
    'Thermotogae': '39',
    'Aquificae': '40',
    'Caldiserica/Cryosericota group': '41',
    'Dictyoglomi': '42',
    'Firmicutes': '43',
    'Tenericutes': '44',
    'Candidatus Eremiobacteraeota': '45',
    'Abditibacteriota': '46',
    'Armatimonadetes': '47',
    'Thermobaculum': '48',
    'Chloroflexi': '49',
    'Candidatus Dormibacteraeota': '50',
    'Actinobacteria': '51',
    'Cyanobacteria': '52',
    'Candidatus Melainabacteria': '53',
    'Candidatus Margulisbacteria': '54',
    'Candidatus Saganbacteria': '55',
    'Candidatus Hydrothermarchaeota': '57',
    'Candidatus Altiarchaeota': '58',
    'Candidatus Micrarchaeota': '59',
    'Candidatus Diapherotrites': '60',
    'Candidatus Aenigmarchaeota': '61',
    'Candidatus Huberarchaea': '62',
    'Nanoarchaeota': '63',
    'Candidatus Parvarchaeota': '64',
    'Candidatus Pacearchaeota': '65',
    'Candidatus Woesearchaeota': '66',
    'Euryarchaeota': '67',
    'Candidatus Bathyarchaeota': '68',
    'Thaumarchaeota': '69',
    'Candidatus Geothermarchaeota': '70',
    'Candidatus Korarchaeota': '71',
    'Candidatus Nezhaarchaeota': '72',
    'Candidatus Verstraetearchaeota': '73',
    'Candidatus Marsarchaeota': '74',
    'Crenarchaeota': '75',
    'Asgard group': '76',
    'Euglenozoa': '77',
    'Heterolobosea': '78',
    'Metamonada': '79',
    'Apusozoa': '80',
    'Rotosphaerida': '81',
    'Fungi': '82',
    'Ichthyosporea': '83',
    'Filasterea': '84',
    'Choanoflagellata': '85',
    'Metazoa': '86',
    'Amoebozoa': '87',
    'Haptista': '88',
    'Cryptophyceae': '89',
    'Sar': '90',
    'Viridiplantae': '91',
    'Rhodophyta': '92',
}
_PHYLUM_CODES.update(dict.fromkeys(_CPR_PHYLA, _CPR_CODE))

# Code given to every second-hierarchy name that is not in the table. The following
# six were not matched, thus excluded: Haloplasmatales, environmental samples,
# Candidatus Vecturithrix, Vampirococcus, Natronospirillum, unclassified DPANN group
_UNMATCHED_CODE = '0'


def species_cair(input_file='All species residues.csv', outfile='Complete proteome CAIRs.csv', proteomes_file="proteomes-redundant_no.tab"): # refer to runCAIR
    """Compute one CAIR per organism, attach taxonomy and write the result as CSV.

    For every row of ``input_file`` the residue frequencies ``p = count / Len``
    of the 22 residue columns are turned into ``sum(-p * log2(p)) / log2(22)``.
    The values are then merged with the proteomes list, viruses are removed and
    an analysis code is assigned from the second level of the taxonomic lineage.

    Parameters
    ----------
    input_file : CSV with an ``Organism_ID`` column, a ``Len`` column and one
        count column per residue; every column is read as float.
    outfile : path of the CSV to write (columns ``Organism_ID``, ``CAIR``, any
        extra columns of the proteomes file, ``Second_hierarchy`` and ``code``).
    proteomes_file : tab-separated file with the columns ``Proteome ID``,
        ``Organism ID`` and ``Taxonomic lineage`` (levels separated by ``', '``).
    """
    data = pd.read_csv(input_file, dtype=float)
    length = data['Len']  # total residue count, the denominator of every frequency

    # log2(p ** -p) equals -p * log2(p), but yields 0 instead of NaN for p == 0
    # because 0 ** -0 evaluates to 1. Terms are added in column order.
    entropy = sum(log2((data[residue] / length) ** (-data[residue] / length))
                  for residue in _RESIDUES)
    organism_cair = pd.DataFrame({'Organism_ID': data['Organism_ID'],
                                  'CAIR': entropy / log2(len(_RESIDUES))})

    proteomes = pd.read_csv(proteomes_file, sep="\t")  # reading the proteomes list file
    # Merging taxonomy data with species CAIRs; an organism listed under several
    # proteomes keeps only its last match.
    sp_cair = organism_cair.merge(proteomes, left_on="Organism_ID", right_on="Organism ID")
    sp_cair = sp_cair.drop_duplicates(subset='Organism_ID', keep='last')

    # Split the lineage once. reindex guarantees that columns 0 and 1 exist even
    # when no retained lineage has a second level (or nothing matched at all).
    lineage = sp_cair['Taxonomic lineage'].str.split(', ', expand=True).reindex(columns=[0, 1])
    sp_cair['First_hierarchy'] = lineage[0]  # first hierarchy of organisms (superkingdoms)
    sp_cair['Taxonomic lineage'] = lineage[1]  # second hierarchy (phyla in most cases)

    # Filtering viruses out, dropping unnecessary columns and renaming.
    sp_cair = (sp_cair[sp_cair['First_hierarchy'] != 'Viruses']
               .drop(columns=['First_hierarchy', 'Proteome ID', 'Organism ID'])
               .rename(columns={'Taxonomic lineage': 'Second_hierarchy'})
               .reset_index(drop=True))

    sp_cair['code'] = [_PHYLUM_CODES.get(phylum, _UNMATCHED_CODE)
                       for phylum in sp_cair['Second_hierarchy']]
    sp_cair.to_csv(outfile, index=False)  # writing the CSV output file