In [None]:
%load_ext watermark

In [None]:
%watermark -a Schmelling,Nicolas -u -d -v -p matplotlib,numpy,pandas,biopython

---
Any comments and suggestions or questions?     
Please feel free to contact me via [twitter](https://twitter.com/DerSchmelling) or [email](mailto:Nicolas.Schmelling@hhu.de).

---

## Additional Analyses ##

In the previous [IPython notebook](1_KaiABC_BLAST_Data_Collection_and_Perprocessing.ipynb) we collected the data and performed some preprocessing steps.

_In this notebook we continued to preprocess the data such that it fits the desired analyses._

### _Preprocessing_ ###

1. _Grouping the genera into four groups according to their taxonomy. The groups are Cyanobacteria, Proteobacteria, Archaea, and Other Bacteria. A numeric identifier is assigned to each group for coloring._
2. _Calculating the identity mean for each protein and genus. A counter of each genus to record the number of individual organisms per genus will be created._
3. _Merging the dataframes for each protein into a single dataframe containing information about protein identity for each protein and genus._
4. _Creating another dataframe containing the homologous copy number for each protein in cyanobacteria._

In [None]:
import re

import matplotlib.pyplot as plt
from matplotlib import ticker
import numpy as np
import pandas as pd

from Bio import SeqIO

%matplotlib inline

In [None]:
# Setting 'mode.chained_assignment' to None switches off pandas'
# SettingWithCopyWarning for the rest of the notebook.
pd.set_option('mode.chained_assignment',None)

In [None]:
def categories_taxonomy(df):
    '''
    Group the organisms into four groups according to their taxonomy and
    assign a numeric identifier to each group for later coloring.

    The text in df['taxonomy'] is searched case-insensitively; the first
    matching group wins:

        group             taxonomy   colormap
        Cyanobacteria     1          3
        Proteobacteria    2          2
        Archaea           3          1
        Other Bacteria    4          9   (everything else, incl. missing)

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'taxonomy' column. The frame is modified in place: the
        'taxonomy' column is overwritten with the group code and a float
        'colormap' column is added.

    Returns
    -------
    pandas.DataFrame
        The same (modified) dataframe.

    Note: the function is not idempotent. Calling it again on its own result
    puts every row into group 4, because the lineage text is gone by then.
    '''
    lineage = df['taxonomy'].astype(str).str.lower()

    # One boolean mask per group, in priority order. Working on whole columns
    # (instead of writing cell by cell through a positional counter) keeps
    # the result correct for any index and avoids chained assignment.
    conditions = [lineage.str.contains(term, regex=False, na=False)
                         .to_numpy(dtype=bool)
                  for term in ('cyanobacteria', 'proteobacteria', 'archaea')]

    df['colormap'] = np.select(conditions, [3, 2, 1], default=9).astype(float)
    df['taxonomy'] = np.select(conditions, [1, 2, 3], default=4)

    return df

def create_count_df(path, prot_name, df_merge, how='left'):
    '''
    Count the homologs of one protein per cyanobacterial organism and merge
    the counts into an existing dataframe.

    The CSV at `path` is read, classified with categories_taxonomy() and
    reduced to cyanobacteria (group 1). Distinct (name, length) rows with a
    length above zero are counted per organism name and stored in the column
    '<prot_name>_count', which is then merged into `df_merge`.

    Parameters
    ----------
    path : CSV file holding at least 'name', 'taxonomy' and
        '<prot_name>_length'.
    prot_name : protein name used as column prefix.
    df_merge : dataframe the counts are merged into.
    how : merge strategy handed to pandas.merge (default 'left').
    '''
    length_col = prot_name + '_length'
    count_col = prot_name + '_count'

    hits = categories_taxonomy(pd.read_csv(path))

    # Cyanobacteria only, reduced to organism name and protein length.
    cyano = hits.loc[hits.taxonomy == 1, ['name', length_col]]
    cyano = cyano[cyano[length_col] > 0].drop_duplicates()

    # Number of remaining rows per organism.
    cyano[count_col] = cyano.groupby(['name']).transform('count')
    counts = cyano[['name', count_col]].drop_duplicates()

    return pd.merge(df_merge, counts, how=how)

def prepare_df(df, prot):
    '''
    Classify the hits of one protein and average the identity per organism.

    Steps:
    1. categories_taxonomy() replaces the lineage by a group code and adds
       the 'colormap' column (this modifies the dataframe passed in).
    2. Special characters are removed from the genus. Organisms labeled
       'Candidatus' get the second word of their name as genus instead.
    3. The identity is averaged per name/genus/taxonomy/colormap/genome_id
       and afterwards per name/genus/taxonomy.
    4. '<prot>_count' records the number of distinct organism names per
       genus.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'name', 'genus', 'taxonomy', 'identity' and
        'genome_id'.
    prot : str
        Protein name, used as prefix for the result columns.

    Returns
    -------
    pandas.DataFrame
        Columns: name, genus, taxonomy, colormap, '<prot>_identity',
        '<prot>_count'.
    '''
    identity_col = '%s_identity' % prot
    count_col = '%s_count' % prot

    df = categories_taxonomy(df)
    # Explicit copy: the genus column is rewritten below and must not be
    # written through a slice of the original frame.
    df = df[['name', 'genus', 'taxonomy', 'colormap', 'identity',
             'genome_id']].copy()

    # Column-wise clean-up instead of cell-by-cell chained assignment.
    # Rows with a missing genus stay missing instead of raising in re.sub.
    cleaned = df['genus'].str.replace(r'[^a-zA-Z0-9 \n.]', '', regex=True)
    is_candidatus = df['genus'] == 'Candidatus'
    second_word = df['name'].str.split(' ', n=2).str[1]
    # Keep the cleaned genus if the name has no second word.
    df['genus'] = cleaned.mask(is_candidatus, second_word.fillna(cleaned))

    df = df.groupby(['name', 'genus', 'taxonomy', 'colormap',
                     'genome_id']).mean()
    df = df.reset_index()

    # Count each organism name once per genus. Rows removed by
    # drop_duplicates() receive NaN and are ignored by the mean below.
    df[count_col] = (df.drop_duplicates(subset=['name'])
                       .groupby('genus')['name']
                       .transform('count'))

    df = df[['name', 'genus', 'taxonomy', 'colormap', 'identity', count_col]]
    df = df.rename(columns={'identity': identity_col})
    df = df.groupby(['name', 'genus', 'taxonomy']).mean()
    df = df.reset_index()

    return df

def process_df(prot):
    '''
    Load '../data/<prot>.csv' and return the result of prepare_df() for it.
    '''
    raw = pd.read_csv('../data/%s.csv' % prot)
    return prepare_df(raw, prot)

def _print_group_share(label, hits, totals, codes):
    '''
    Print which percentage of the organisms of one group has a hit.

    label  : text printed in front of the value.
    hits   : value_counts() Series of the rows with a hit, indexed by
             taxonomy code.
    totals : value_counts() Series of all rows, indexed by taxonomy code.
    codes  : taxonomy codes that together form the group.

    None is printed instead of a number when the group has no hit or no
    member at all.
    '''
    n_hits = sum(hits.get(code, 0) for code in codes)
    n_total = sum(totals.get(code, 0) for code in codes)
    if n_hits == 0 or n_total == 0:
        print(label, None)
    else:
        print(label, n_hits / n_total * 100)


def taxa_per_protein(protein, option):
    '''
    Print the percentage of organisms that contain a particular protein,
    separated into the four taxonomic groups.

    Parameters
    ----------
    protein : str
        Protein name; the column '<protein>_identity' is inspected.
    option : str
        '7942' reads df_Sy7942_sorted, '6803' reads df_Sy6803_sorted.

    Raises
    ------
    ValueError
        If `option` is neither '7942' nor '6803'.

    Taxonomy code 5 is counted together with the cyanobacteria (code 1).
    '''
    if option == '7942':
        source = df_Sy7942_sorted
    elif option == '6803':
        source = df_Sy6803_sorted
    else:
        raise ValueError("option must be '7942' or '6803', got %r" % (option,))

    # NOTE(review): the totals come from df_Sy7942_sorted for both options,
    # as before - presumably so that both tables share one denominator;
    # confirm that this is intended for '6803'.
    all_count = df_Sy7942_sorted['taxonomy'].value_counts()

    # Rows without an identity value have no hit for this protein.
    cut = source[['taxonomy', '%s_identity' % protein]].dropna()
    cut = cut['taxonomy'].value_counts()

    print('###############', protein, '###############')
    _print_group_share('Cyanobacteria:\t', cut, all_count, (1, 5))
    _print_group_share('Proteobacteria:\t', cut, all_count, (2,))
    _print_group_share('Archaea:\t', cut, all_count, (3,))
    _print_group_share('Other Bacteria:\t', cut, all_count, (4,))

In [None]:
# Pre-process every Synechococcus protein with process_df() and store the
# result in a notebook-level variable named after the protein (kaiA, kaiB, ...).
for protein in ('kaiA', 'kaiB', 'kaiC', 'cpmA', 'ircA', 'pex',
                'cdpA', 'ldpA', 'labA', 'lalA', 'cikA', 'sasA',
                'nhtA', 'prkE', 'crm', 'rpaA', 'rpaB'):
    globals()[protein] = process_df(protein)

In [None]:
# Genus labels that are excluded from the analysis.
not_found = ['Oscillatoriales', 'Opitutaceae', 'halophilic',
             'Candidatus', 'Aquifex', 'Filomicrobium', 'Ziziphus']

# Every genus with a KaiC hit, minus the excluded labels.
kaiC_orgs = list(set(kaiC.genus))
orgs = list(set(kaiC_orgs).difference(not_found))
len(orgs)

In [None]:
# Merge the dataframes of each protein into one dataframe.
# Start from KaiC, restricted to the genera in the organism list created
# before, and left-merge every other protein onto it.
kaiC = kaiC[kaiC.genus.isin(orgs)]

df_Sy7942 = kaiC
for other in (kaiA, kaiB, cpmA, ircA, pex, cdpA, ldpA, labA, lalA,
              cikA, sasA, prkE, nhtA, crm, rpaA, rpaB):
    df_Sy7942 = pd.merge(df_Sy7942, other, how='left')

# Average per genus. numeric_only=True leaves out the text column 'name',
# which newer pandas versions refuse to average.
df_Sy7942 = df_Sy7942.groupby(['genus','taxonomy']).mean(numeric_only=True)
df_Sy7942 = df_Sy7942.reset_index()

df_Sy7942['rank'] = np.zeros(len(df_Sy7942))

# Change the genus of a special cyanobacterium (first matching row only).
# .loc writes into the dataframe itself; the guard skips the step when the
# genus is not present.
endo_rows = df_Sy7942.index[df_Sy7942.genus == 'cyanobacterium']
if len(endo_rows) > 0:
    df_Sy7942.loc[endo_rows[0], 'genus'] = 'Endosymbiont'
    df_Sy7942.loc[endo_rows[0], 'taxonomy'] = 5

# Reduce the number of columns down to the one needed for the heatmap.
df_Sy7942 = df_Sy7942[['genus', 'taxonomy', 'colormap', 'rank',
                       'kaiA_identity', 'kaiB_identity', 'kaiC_identity',
                       'pex_identity', 'ldpA_identity', 'prkE_identity',
                       'nhtA_identity', 'ircA_identity', 'cdpA_identity',
                       'cikA_identity', 'sasA_identity', 'labA_identity',
                       'lalA_identity', 'crm_identity', 'rpaA_identity',
                       'rpaB_identity', 'cpmA_identity', 'kaiC_count']]

In [None]:
# Pre-process every Synechocystis KaiB/KaiC homolog with process_df() and
# store the result in a notebook-level variable named after the protein.
for protein in ('kaiB1', 'kaiB2', 'kaiB3',
                'kaiC1', 'kaiC2', 'kaiC3'):
    globals()[protein] = process_df(protein)

In [None]:
# Outer-merge the six KaiB/KaiC homolog dataframes, so that an organism is
# kept when it has a hit for any of them.
df_Sy6803 = kaiB1
for other in (kaiB2, kaiB3, kaiC1, kaiC2, kaiC3):
    df_Sy6803 = pd.merge(df_Sy6803, other, how='outer')
df_Sy6803 = df_Sy6803.drop_duplicates()
df_Sy6803 = df_Sy6803[df_Sy6803.genus.isin(orgs)]

# Average per genus. numeric_only=True leaves out the text column 'name',
# which newer pandas versions refuse to average.
df_Sy6803 = df_Sy6803.groupby(['genus','taxonomy']).mean(numeric_only=True)
df_Sy6803 = df_Sy6803.reset_index()

df_Sy6803['rank'] = np.zeros(len(df_Sy6803))

# Change the genus of a special cyanobacterium (first matching row only).
# .loc writes into the dataframe itself; the guard skips the step when the
# genus is not present.
endo_rows = df_Sy6803.index[df_Sy6803.genus == 'cyanobacterium']
if len(endo_rows) > 0:
    df_Sy6803.loc[endo_rows[0], 'genus'] = 'Endosymbiont'
    df_Sy6803.loc[endo_rows[0], 'taxonomy'] = 5

# Reduce the number of columns down to the one needed for the heatmap.
df_Sy6803 = df_Sy6803[['genus', 'taxonomy', 'colormap', 'rank',
                       'kaiB1_identity', 'kaiB2_identity', 'kaiB3_identity',
                       'kaiC1_identity', 'kaiC2_identity', 'kaiC3_identity']]

In [None]:
# Order the Synechococcus table by taxonomic group (ascending) and, within
# each group, by KaiC identity (descending).
df_Sy7942_sorted = df_Sy7942.sort_values(by=['taxonomy', 'kaiC_identity'],
                                         ascending=[True, False])

# Position of every genus in that order.
sorterIndex = {genus: position
               for position, genus in enumerate(df_Sy7942_sorted.genus)}

# Give the Synechocystis table the same genus order through its 'rank' column.
df_Sy6803_sorted = (df_Sy6803
                    .assign(rank=df_Sy6803.genus.map(sorterIndex))
                    .sort_values(by='rank'))

### Distribution of protein across taxonomic groups

In [None]:
# Number of rows (one per genus and taxonomy code) for each taxonomy code.
df_Sy7942_sorted['taxonomy'].value_counts()

In [None]:
# Print, for every Synechococcus protein, the share of organisms per
# taxonomic group that contain it.
for protein in ('kaiA', 'kaiB', 'kaiC', 'pex', 'ldpA', 'cikA',
                'prkE', 'nhtA', 'cdpA', 'ircA', 'sasA', 'rpaA',
                'rpaB', 'labA', 'lalA', 'crm', 'cpmA'):
    taxa_per_protein(protein, '7942')

In [None]:
# Print, for every Synechocystis KaiB/KaiC homolog, the share of organisms
# per taxonomic group that contain it.
for protein in ('kaiB1', 'kaiC1', 'kaiB2',
                'kaiC2', 'kaiB3', 'kaiC3'):
    taxa_per_protein(protein, '6803')

### Clock Count Analysis ###
__Number of Analyzed Cyanobacteria__

In [None]:
# Load the clock protein counts; the unnamed first CSV column is the index.
df = pd.read_csv('../clock_counts.csv', index_col='Unnamed: 0')
# Number of rows in the table.
df.shape[0]

In [None]:
# Column overview of the complete table ...
df.info()
# ... and of the table without its first column, after dropping duplicated rows.
df.iloc[:,1:].drop_duplicates().info()

In [None]:
# Sort the organisms by their protein counts, highest first; earlier columns
# in the list take precedence over later ones.
df = df.sort_values(
    by=['kaiA_count', 'kaiC1_count', 'kaiB1_count', 'kaiB3_count',
        'kaiC3_count', 'kaiB2_count', 'kaiC2_count', 'cpmA_count',
        'ircA_count', 'pex_count', 'nhtA_count', 'prkE_count',
        'cdpA_count', 'ldpA_count', 'cikA_count', 'labA_count',
        'sasA_count', 'lalA_count', 'rpaA_count', 'rpaB_count',
        'crm_count'],
    ascending=False)

__Percentage of Clock Proteins in Cyanobacteria__

In [None]:
# NOTE(review): duplicates are judged on every column except the first one -
# confirm that this only removes repeated organisms.
cyano_clock = df.iloc[:, 1:].drop_duplicates()

print('In Cyanobacteria the following proteins are found in X % of all analyzed Cyanobacteria')

n_rows = len(cyano_clock)
for protein in cyano_clock.columns:
    # Rows with a value in this column count as "protein found".
    n_found = len(cyano_clock[protein].dropna())
    print(protein.split('_')[0] + ':\t', n_found / n_rows * 100, '%')

__Multiple KaiBCs__

In [None]:
# Organism name plus the counts of the additional KaiB/KaiC homologs.
multi = df[['name','kaiB2_count','kaiC2_count','kaiB3_count','kaiC3_count']]

In [None]:
# Sum only the count columns: the first column of `multi` is the organism
# name, and a row-wise sum over text and numbers fails on newer pandas.
homolog_counts = multi.iloc[:, 1:]
has_extra_homolog = homolog_counts.sum(axis=1) > 0

# NOTE(review): the denominator is the full table `df`, not the
# de-duplicated `cyano_clock` used for the percentages above - confirm.
multi_bc = len(homolog_counts[has_extra_homolog])/len(df)*100

print('%.2f' % multi_bc,'% of all Cyanobacteria show multiple Kai proteins.')

__All KaiBC homologs__

In [None]:
# A list of Cyanobacteria that have all known Kai homologs:
# rows of `multi` with a value in every column (KaiB2, KaiC2, KaiB3, KaiC3).
multi.dropna()

### Major Kai Homolog outside of Cyanobacteria ###

__Bacteria__

In [None]:
# Rows with taxonomy code 2 (Proteobacteria) or 4 (Other Bacteria).
all_bac = df_Sy6803_sorted[df_Sy6803_sorted['taxonomy'].isin([2, 4])]
len(all_bac)

In [None]:
# Column overview (non-null counts and dtypes) of the bacterial rows.
all_bac.info()

In [None]:
# Bacterial rows with a KaiC3 hit ...
main_kaiC = all_bac['kaiC3_identity'].count()

# ... relative to the rows with any KaiC hit (identity sum above zero).
any_kaiC = all_bac[['kaiC1_identity', 'kaiC2_identity',
                    'kaiC3_identity']].sum(axis=1) > 0
all_kaiC = len(all_bac[any_kaiC])

share_kaiC3 = main_kaiC / all_kaiC * 100
print('KaiC3 is with %.2f' % share_kaiC3,
      '% the main KaiC homolog in Bacteria.')

In [None]:
# Bacterial rows that have KaiC3 and at least one of KaiC1/KaiC2.
has_kaiC3 = all_bac['kaiC3_identity'].notnull()
has_other_kaiC = all_bac[['kaiC1_identity', 'kaiC2_identity']].sum(axis=1) > 0
additional_kaiC = all_bac.loc[has_kaiC3 & has_other_kaiC,
                              ['kaiC1_identity', 'kaiC2_identity']]

print('%.2f' % (len(additional_kaiC) / main_kaiC * 100),
      '% of those organisms have additional KaiC homologs (either KaiC1 or KaiC2).')

In [None]:
# Bacterial rows with a KaiB2 hit ...
main_kaiB = all_bac['kaiB2_identity'].count()

# ... relative to the rows with any KaiB hit (identity sum above zero).
any_kaiB = all_bac[['kaiB1_identity', 'kaiB2_identity',
                    'kaiB3_identity']].sum(axis=1) > 0
all_kaiB = len(all_bac[any_kaiB])

share_kaiB2 = main_kaiB / all_kaiB * 100
print('However, KaiB2 is with %.2f' % share_kaiB2,
      '% the main KaiB homolog in Bacteria.')

In [None]:
# Bacterial rows that have KaiB2 and at least one of KaiB1/KaiB3.
has_kaiB2 = all_bac['kaiB2_identity'].notnull()
has_other_kaiB = all_bac[['kaiB1_identity', 'kaiB3_identity']].sum(axis=1) > 0
additional_kaiB = all_bac.loc[has_kaiB2 & has_other_kaiB,
                              ['kaiB1_identity', 'kaiB3_identity']]

print('%.2f' % (len(additional_kaiB) / main_kaiB * 100),
      '% of those organisms have additional KaiB homologs (either KaiB1 or KaiB3).')

__Archaea__

In [None]:
# Rows with taxonomy code 3 (Archaea) and their number.
all_arc = df_Sy6803_sorted[df_Sy6803_sorted['taxonomy'] == 3]
len(all_arc)

In [None]:
# Column overview (non-null counts and dtypes) of the archaeal rows.
all_arc.info()

In [None]:
# Archaeal rows with a KaiC3 hit ...
main_kaiC_arc = all_arc['kaiC3_identity'].count()

# ... relative to the rows with any KaiC hit (identity sum above zero).
any_kaiC_arc = all_arc[['kaiC1_identity', 'kaiC2_identity',
                        'kaiC3_identity']].sum(axis=1) > 0
all_kaiC_arc = len(all_arc[any_kaiC_arc])

share_kaiC3_arc = main_kaiC_arc / all_kaiC_arc * 100
print('KaiC3 is with %.2f' % share_kaiC3_arc,
      '% the main KaiC homolog in Archaea.')

In [None]:
# Archaeal rows that have KaiC3 and at least one of KaiC1/KaiC2.
has_kaiC3_arc = all_arc['kaiC3_identity'].notnull()
has_other_kaiC_arc = all_arc[['kaiC1_identity',
                              'kaiC2_identity']].sum(axis=1) > 0
additional_kaiC_arc = all_arc.loc[has_kaiC3_arc & has_other_kaiC_arc,
                                  ['kaiC1_identity', 'kaiC2_identity']]

print('%.2f' % (len(additional_kaiC_arc) / main_kaiC_arc * 100),
      '% of those organisms have additional KaiC homologs (either KaiC1 or KaiC2).')

In [None]:
# Archaeal rows with a KaiB2 hit, relative to the rows with any KaiB hit.
main_kaiB_arc = all_arc['kaiB2_identity'].count()
all_kaiB_arc = len(all_arc[all_arc[['kaiB1_identity',
                                    'kaiB2_identity',
                                    'kaiB3_identity']].sum(axis=1) > 0])

# This cell analyses `all_arc`, so the message has to name Archaea
# (it said "Bacteria", copied from the bacterial cell).
print('However, KaiB2 is with %.2f' % (main_kaiB_arc/all_kaiB_arc*100),
      '% the main KaiB homolog in Archaea.')

In [None]:
# Archaeal rows that have KaiB2 and at least one of KaiB1/KaiB3.
has_kaiB2_arc = all_arc['kaiB2_identity'].notnull()
has_other_kaiB_arc = all_arc[['kaiB1_identity',
                              'kaiB3_identity']].sum(axis=1) > 0
additional_kaiB_arc = all_arc.loc[has_kaiB2_arc & has_other_kaiB_arc,
                                  ['kaiB1_identity', 'kaiB3_identity']]

print('%.2f' % (len(additional_kaiB_arc) / main_kaiB_arc * 100),
      '% of those organisms have additional KaiB homologs (either KaiB1 or KaiB3).')

### Unusual Kai Proteins in Cyanobacteria ###

In [None]:
# Reload the raw KaiA/KaiB/KaiC hits, classify them by taxonomy and keep only
# name, group code, protein length and sequence. Length and sequence columns
# get the protein name as prefix so the three frames can be merged.
kaiA = (categories_taxonomy(pd.read_csv('../data/kaiA.csv'))
        [['name', 'taxonomy', 'length', 'seq']]
        .rename(columns={'length': 'kaiA_length', 'seq': 'kaiA_seq'}))

kaiB = (categories_taxonomy(pd.read_csv('../data/kaiB.csv'))
        [['name', 'taxonomy', 'length', 'seq']]
        .rename(columns={'length': 'kaiB_length', 'seq': 'kaiB_seq'}))

kaiC = (categories_taxonomy(pd.read_csv('../data/kaiC.csv'))
        [['name', 'taxonomy', 'length', 'seq']]
        .rename(columns={'length': 'kaiC_length', 'seq': 'kaiC_seq'}))

In [None]:
# Combine the three frames, keeping every organism from any of them, and
# drop repeated rows.
KaiABC = (kaiA.merge(kaiB, how='outer')
              .merge(kaiC, how='outer')
              .drop_duplicates())

# Only rows with a KaiC length are of interest.
KaiABC = KaiABC.dropna(subset=['kaiC_length'])

In [None]:
# Replace every missing value by 0, so that a missing protein shows up with
# length 0, then preview the first rows.
ABC = KaiABC.fillna(0)
ABC.head()

In [None]:
# Per group: total number of KaiC proteins and number of "outliers", that is
# proteins shorter than 400 or longer than 600 amino acids.
kaiC = categories_taxonomy(pd.read_csv('../data/kaiC.csv'))

is_outlier = (kaiC['length'] < 400) | (kaiC['length'] > 600)

for tax, group in ((1, 'Cyanobacteria'), (2, 'Proteobacteria'),
                   (3, 'Archaea'), (4, 'Other Bacteria')):
    in_group = kaiC['taxonomy'] == tax
    print('###############', group, '###############')
    print('Total:\t\t', len(kaiC[in_group]))
    print('Outliers:\t', len(kaiC[in_group & is_outlier]))

In [None]:
# KaiB length in the presence of an unusual KaiC (shorter than 400 or longer
# than 600) for the rows with taxonomy code 2.
ABC.loc[(ABC['taxonomy'] == 2) &
        ((ABC['kaiC_length'] < 400) | (ABC['kaiC_length'] > 600)),
        'kaiB_length']

In [None]:
# Cyanobacterial genera with elongated KaiB1 (longer than 200)
# (only KaiB homolog with an elongated version)
kaiB1 = pd.read_csv('../data/kaiB1.csv')

long_kaiB1 = kaiB1[(kaiB1['taxonomy'].str.contains('Cyano')) &
                   (kaiB1['length'] > 200)]

# The sixth '; '-separated entry of the lineage text is collected per hit.
tax_list = [tax.split('; ', 6)[5] for tax in long_kaiB1.taxonomy]

unique_taxa = set(tax_list)
print(unique_taxa)

# Number of hits per collected entry.
for tax in unique_taxa:
    print(tax + '\t', tax_list.count(tax))

In [None]:
# Cyanobacterial genera with a short KaiA (shorter than 200)
kaiA = pd.read_csv('../data/kaiA.csv')

short_kaiA = kaiA[(kaiA['taxonomy'].str.contains('Cyano')) &
                  (kaiA['length'] < 200)]

# The sixth '; '-separated entry of the lineage text is collected per hit.
tax_list = [tax.split('; ', 6)[5] for tax in short_kaiA.taxonomy]

unique_taxa = set(tax_list)
print(unique_taxa)

# Number of hits per collected entry.
for tax in unique_taxa:
    print(tax + '\t', tax_list.count(tax))

In [None]:
# Cyanobacteria (taxonomy code 1) that combine a short KaiA (below 200)
# with a long KaiB (above 200).
ABC.loc[(ABC['kaiA_length'] < 200) &
        (ABC['kaiB_length'] > 200) &
        (ABC['taxonomy'] == 1)]

### Get Sequences for Multialignments

`mkdir align`


#### KaiA

#### KaiB and KaiC (all and cyanobacteria only)

### Clustal Omega Alignment

`docker pull biodckr/clustal-omega`

`docker run -it -v ~/align:/data/align biodckr/clustal-omega`

1.

`clustalo --in align/kaiA_alignment.fasta --infmt fasta --seqtype Protein --dealign --out align/kaiA_aligned.fasta --outfmt fasta --iter 20 --max-guidetree-iterations 1 --force`


2.

`for f in align/*_cyanos.fasta`

`do`

`clustalo --in "$f" --infmt fasta --seqtype Protein --dealign --out "${f%_alignment_cyanos.fasta}_aligned_cyanos.fasta" --outfmt fasta --iter 20 --max-guidetree-iterations 1 --force`

`done`

3.

`for f in align/*_all.fasta`

`do`

`clustalo --in "$f" --infmt fasta --seqtype Protein --dealign --out "${f%_alignment_all.fasta}_aligned_all.fasta" --outfmt fasta --iter 20 --max-guidetree-iterations 1 --force`

`done`

---
### Previous ###

+ [Data Collection and Processing](1_KaiABC_BLAST_Data_Collection_and_Perprocessing.ipynb)
+ [Distribution of Circadian Clock Proteins](2_KaiABC_BLAST_Heatmap.ipynb)
+ [Length Distribution of KaiA, KaiB, KaiC](3_KaiABC_BLAST_Scatterplot.ipynb)
+ [Co-Occurrence of Circadian Clock Proteins in Cyanobacteria](4_KaiABC_BLAST_FisherTest.ipynb)
---