In [1]:
%load_ext watermark

In [2]:
%watermark -a Schmelling,Nicolas -u -d -v -p matplotlib,numpy,pandas,scipy

Schmelling,Nicolas 
last updated: 2017-01-03 

CPython 3.5.2
IPython 4.1.1

matplotlib 1.5.1
numpy 1.10.4
pandas 0.18.0
scipy 0.17.0


---
Any comments and suggestions or questions?     
Please feel free to contact me via [twitter](https://twitter.com/DerSchmelling) or [email](mailto:Nicolas.Schmelling@hhu.de).

---

## Length distribution of KaiA, KaiB, KaiC ##

In the previous [IPython notebook](1_KaiABC_BLAST_Data_Collection_and_Perprocessing.ipynb) we collected the data and performed some preprocessing steps. In this notebook we continue to preprocess the data so that it fits the desired analyses. Afterwards we visualize the data in three scatter plots that show the sequence lengths of KaiA, KaiB and KaiC together with the kernel density of the length of each protein.

### Preprocessing ###

1. Group the genera into four groups according to their taxonomy. The groups are Cyanobacteria, Proteobacteria, Archaea, and Other Bacteria.
2. Slice the KaiA, KaiB and KaiC dataframes such that only organism, taxonomy, sequence, and sequence length are left.
3. Merge the three dataframes into one dataframe and filter out some genera.

In [None]:
import re

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as s

%matplotlib inline

In [None]:
# Disable pandas' chained-assignment check so that no SettingWithCopy
# warning is emitted anywhere in this notebook.
pd.set_option('mode.chained_assignment',None)

In [None]:
def categories_taxonomy(df):
    """Replace the taxonomy strings of ``df`` with numeric group identifiers.

    Every entry of the ``taxonomy`` column is converted to a string,
    lower-cased and searched for a group keyword. The first keyword that
    matches decides the identifier, which is used later for coloring:

    * 1 -- contains 'cyanobacteria'
    * 2 -- contains 'proteobacteria'
    * 3 -- contains 'archaea'
    * 4 -- everything else (Other Bacteria), including missing values

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with a ``taxonomy`` column. The column is overwritten
        in place.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object that was passed in.
    """
    def group_id(lineage):
        # str() keeps non-string entries (e.g. NaN) from raising; they
        # match no keyword and therefore fall into group 4.
        text = str(lineage).lower()
        if 'cyanobacteria' in text:
            return 1
        if 'proteobacteria' in text:
            return 2
        if 'archaea' in text:
            return 3
        return 4

    # Assign the whole column at once. Writing single cells through
    # chained indexing (df.taxonomy[i] = ...) only works for a default
    # 0..n-1 index and is not guaranteed to write back into ``df``.
    df['taxonomy'] = [group_id(lineage) for lineage in df['taxonomy']]

    return df

In [None]:
# Load each Kai table from its CSV file, turn the taxonomy column into
# numeric group ids with categories_taxonomy(), keep five columns and
# give the length/sequence columns a protein-specific name.
kept_columns = ['name','genus','taxonomy','length','seq']

kaiA = categories_taxonomy(pd.read_csv('../data/kaiA.csv'))[kept_columns]
kaiA = kaiA.rename(columns={'length': 'kaiA_length', 'seq': 'kaiA_seq'})

kaiB = categories_taxonomy(pd.read_csv('../data/kaiB.csv'))[kept_columns]
kaiB = kaiB.rename(columns={'length': 'kaiB_length', 'seq': 'kaiB_seq'})

kaiC = categories_taxonomy(pd.read_csv('../data/kaiC.csv'))[kept_columns]
kaiC = kaiC.rename(columns={'length': 'kaiC_length', 'seq': 'kaiC_seq'})

In [None]:
# Remove special characters from genus name and change Candidatus
# genus name
def _clean_genus(genus, name):
    """Return the cleaned genus of one KaiC row.

    A genus that is exactly 'Candidatus' is replaced by the second
    space-separated word of the organism ``name``. For every other genus
    all characters except letters, digits, spaces, newlines and dots
    are removed.
    """
    if genus == 'Candidatus':
        return name.split(' ', 2)[1]
    # Raw string: '\.' is an invalid escape in a normal string literal.
    return re.sub(r'[^a-zA-Z0-9 \n\.]', '', genus)

# Build the whole column in one pass instead of writing single cells via
# chained indexing (kaiC['genus'][i] = ...), which depends on a default
# 0..n-1 index and is not guaranteed to modify the dataframe.
kaiC = kaiC.assign(genus=[_clean_genus(genus, name)
                          for genus, name in zip(kaiC['genus'],
                                                 kaiC['name'])])

# Create an organisms list without duplicates
kaiC_orgs = list(set(kaiC.genus))

# Merge the dataframes (outer join on the columns they share), drop
# duplicates and remove entries without KaiC.
# NOTE(review): only the kaiC genus names were cleaned above; kaiA/kaiB
# rows whose genus contains special characters or is 'Candidatus' will
# not match their kaiC counterpart in this merge -- confirm intended.
KaiABC = pd.merge(kaiA, kaiB, how='outer')
KaiABC = pd.merge(KaiABC, kaiC, how='outer')

KaiABC = KaiABC.drop_duplicates()
KaiABC = KaiABC[KaiABC.kaiC_length.notnull()]

# Genera that are excluded from the analysis.
not_found = ['Oscillatoriales', 'Opitutaceae', 'halophilic',
             'Candidatus', 'Aquifex', 'Filomicrobium', 'Ziziphus']

orgs = list(set(kaiC_orgs) - set(not_found))

# Keep only those organisms in the dataframe that occur in
# the organisms list
KaiABC = KaiABC[KaiABC.genus.isin(orgs)]

# Fill nan values with 0 (a length of 0 therefore marks a protein
# that is missing for that organism)
ABC = KaiABC.fillna(0)

### Scatterplot ###

In [None]:
# 5 x 5 grid without spacing between cells. Rows 0/1 and 3/4 as well as
# columns 0/1 and 3/4 hold a scatter plot with its marginal density
# plots; row 2 and column 2 are not used by any subplot and act as gaps.
row_heights = [0.5, 2.5, 0.55, 0.5, 2.5]
column_widths = [2.5, 0.5, 0.55, 2.5, 0.5]

f = plt.figure(figsize=(7.5,6.0))

plots = gridspec.GridSpec(len(row_heights), len(column_widths),
                          wspace=0.0, hspace=0.0,
                          height_ratios=row_heights,
                          width_ratios=column_widths)

def clean_axis(ax):
    """Hide all spines (the plot frame lines) of the axes ``ax``."""
    for side in ax.spines:
        ax.spines[side].set_visible(False)
        
# Font keyword arguments passed to the tick labels, axis labels and
# panel letters of the figure.
hfont = {'fontname':'Arial'}

############ KaiAC Scatter Plot ############
AC = f.add_subplot(plots[1,0])

# Remove the tick marks and place labels at bottom and left.
# tick_params documents these switches as booleans; the "on"/"off"
# strings are deprecated since matplotlib 2.2.
AC.tick_params(axis="both", which="both", bottom=False, top=False,
               left=False, right=False,
               labelbottom=True, labelleft=True)

clean_axis(AC)

# Plot KaiA sequence length vs. KaiC sequence length
# for each group. Groups are drawn in list order, so later
# entries are painted on top of earlier ones.
for group, color, size, alpha, label, marker in [
        (4, 'k', 10, 0.3, 'Other', 'd'),
        (3, '#e31a1c', 15, 0.5, 'Archaea', '^'),
        (2, '#1f78b4', 10, 0.5, 'Proteobacteria', 's'),
        (1, '#33a02c', 10, 0.5, 'Cyanobacteria', 'o')]:
    in_group = ABC.taxonomy == group
    # edgecolor='none' draws no marker outline; the empty string is
    # not a valid color specification in current matplotlib.
    AC.scatter(ABC.kaiA_length[in_group],
               ABC.kaiC_length[in_group],
               color=color, edgecolor='none', s=size,
               alpha=alpha, label=label, marker=marker)

AC.set_xticks(range(0, 401, 100))
AC.set_xticklabels([str(x) for x in range(0, 401, 100)],
                   fontsize=7, **hfont)

AC.set_yticks(range(0, 701, 100))
AC.set_yticklabels([str(x) for x in range(0, 701, 100)],
                   fontsize=7, **hfont)

AC.set_ylabel('KaiC Length [AA]', fontsize=8, **hfont)
AC.set_xlabel('KaiA Length [AA]', fontsize=8, **hfont)

AC.set_xlim(-10,400)
AC.set_ylim(-10,770)

# Plot custom grid lines (horizontal first, then vertical)
for y in range(0, 701, 100):
    AC.plot(range(-7, 400), [y] * len(range(-7, 400)), ":", lw=0.25,
            color="black", alpha=0.3)

for x in range(0, 301, 100):
    AC.plot([x] * len(range(-7, 800)), range(-7, 800), ":", lw=0.25,
            color="black", alpha=0.3)

# Plot custom axis: a thin grey frame (bottom, right, top, left)
AC.plot((-7, 400), (-7,-7), "-", lw=0.5, color="grey")
AC.plot((400,400), (-7,770), "-", lw=0.5, color="grey")
AC.plot((-7, 400), (770,770), "-", lw=0.5, color="grey")
AC.plot((-7,-7), (-7,770), "-", lw=0.5, color="grey")

# Panel letter, placed in data coordinates above the upper left corner
AC.text(-50,850, 'A', fontsize=15, **hfont)

############ KaiC length distribution ############
# Marginal plot to the right of the KaiAC scatter plot; shares its
# y-axis (KaiC length).
C1_dist = f.add_subplot(plots[1,1], sharey=AC)

# No tick marks and no tick labels. Booleans are the documented type;
# the "on"/"off" strings are deprecated since matplotlib 2.2.
C1_dist.tick_params(axis="both", which="both", bottom=False, top=False,
                    left=False, right=False,
                    labelbottom=False, labelleft=False)

clean_axis(C1_dist)

data = ABC.sort_values(by=['kaiC_length'])

# Calculate the kernel density for KaiC length larger than 0
density = s.gaussian_kde(data['kaiC_length'][data['kaiC_length'] > 0],
                         bw_method='scott')

x = np.linspace(0,800,10000)

# Evaluate the density once and reuse it for the outline and the fill.
density_x = density(x)

# Plot the calculated density distribution (density on the horizontal
# axis, length on the vertical axis) and fill the area underneath.
C1_dist.plot(density_x, x, color='#969696', lw=0.5)
C1_dist.fill_betweenx(x, 1e-4, density_x, color='#969696',
                      alpha=1, lw=0)

C1_dist.set_ylim(-10,770)
C1_dist.set_xlim(-0.0002, 0.008)

############ KaiA length distribution ############
# Marginal plot above the KaiAC scatter plot; shares its x-axis
# (KaiA length).
A1_dist = f.add_subplot(plots[0,0], sharex=AC)

# No tick marks and no tick labels. Booleans are the documented type;
# the "on"/"off" strings are deprecated since matplotlib 2.2.
A1_dist.tick_params(axis="both", which="both", bottom=False, top=False,
                    left=False, right=False,
                    labelbottom=False, labelleft=False)

clean_axis(A1_dist)

data = ABC.sort_values(by=['kaiA_length'])

# Calculate the kernel density for KaiA length larger than 0
density = s.gaussian_kde(data['kaiA_length'][data['kaiA_length'] > 0],
                         bw_method='scott')

x = np.linspace(0,410,10000)

# Evaluate the density once and reuse it for the outline and the fill.
density_x = density(x)

A1_dist.plot(x, density_x, color='#969696', lw=0.5)
A1_dist.fill_between(x, 1e-4, density_x, color='#969696',
                     alpha=0.25, lw=0)

A1_dist.set_xlim(-10,400)
A1_dist.set_ylim(-0.0005, 0.025)

############ KaiBC Scatter Plot ############
BC = f.add_subplot(plots[1,3])

# Remove the tick marks and place labels at bottom and left.
# tick_params documents these switches as booleans; the "on"/"off"
# strings are deprecated since matplotlib 2.2.
BC.tick_params(axis="both", which="both", bottom=False, top=False,
               left=False, right=False,
               labelbottom=True, labelleft=True)

clean_axis(BC)

# Plot KaiB sequence length vs. KaiC sequence length
# for each group. Groups are drawn in list order, so later
# entries are painted on top of earlier ones.
for group, color, size, alpha, label, marker in [
        (4, 'k', 10, 0.3, 'Other', 'd'),
        (3, '#e31a1c', 15, 0.5, 'Archaea', '^'),
        (2, '#1f78b4', 10, 0.5, 'Proteobacteria', 's'),
        (1, '#33a02c', 10, 0.5, 'Cyanobacteria', 'o')]:
    in_group = ABC.taxonomy == group
    # edgecolor='none' draws no marker outline; the empty string is
    # not a valid color specification in current matplotlib.
    BC.scatter(ABC.kaiB_length[in_group],
               ABC.kaiC_length[in_group],
               color=color, edgecolor='none', s=size,
               alpha=alpha, label=label, marker=marker)

BC.set_xticks(range(0, 401, 100))
BC.set_xticklabels([str(x) for x in range(0, 401, 100)],
                   fontsize=7, **hfont)

BC.set_yticks(range(0, 701, 100))
BC.set_yticklabels([str(x) for x in range(0, 701, 100)],
                   fontsize=7, **hfont)

BC.set_ylabel('KaiC Length [AA]', fontsize=8, **hfont)
BC.set_xlabel('KaiB Length [AA]', fontsize=8, **hfont)

BC.set_xlim(-10,400)
BC.set_ylim(-10,770)

# Plot custom grid lines (horizontal first, then vertical)
for y in range(0, 701, 100):
    BC.plot(range(-7, 400), [y] * len(range(-7, 400)), ":", lw=0.25,
            color="black", alpha=0.3)

for x in range(0, 301, 100):
    BC.plot([x] * len(range(-7, 800)), range(-7, 800), ":", lw=0.25,
            color="black", alpha=0.3)

# Plot custom axis: a thin grey frame (bottom, right, top, left)
BC.plot((-7, 400), (-7,-7), "-", lw=0.5, color="grey")
BC.plot((400,400), (-7,770), "-", lw=0.5, color="grey")
BC.plot((-7, 400), (770,770), "-", lw=0.5, color="grey")
BC.plot((-7,-7), (-7,770), "-", lw=0.5, color="grey")

# Panel letter, placed in data coordinates above the upper left corner
BC.text(-50,850, 'B', fontsize=15, **hfont)

############ KaiC length distribution ############
# Marginal plot to the right of the KaiBC scatter plot; shares its
# y-axis (KaiC length).
C2_dist = f.add_subplot(plots[1,4], sharey=BC)

# No tick marks and no tick labels. Booleans are the documented type;
# the "on"/"off" strings are deprecated since matplotlib 2.2.
C2_dist.tick_params(axis="both", which="both", bottom=False, top=False,
                    left=False, right=False,
                    labelbottom=False, labelleft=False)

clean_axis(C2_dist)

data = ABC.sort_values(by=['kaiC_length'])

# Calculate the kernel density for KaiC length larger than 0
density = s.gaussian_kde(data['kaiC_length'][data['kaiC_length'] > 0],
                         bw_method='scott')

x = np.linspace(0,800,10000)

# Evaluate the density once and reuse it for the outline and the fill.
density_x = density(x)

C2_dist.plot(density_x, x, color='#969696', lw=0.5)
C2_dist.fill_betweenx(x, 1e-4, density_x, color='#969696',
                      alpha=1, lw=0)

C2_dist.set_ylim(-10,770)
C2_dist.set_xlim(-0.0002, 0.008)

############ KaiB length distribution ############
# Marginal plot above the KaiBC scatter plot; shares its x-axis
# (KaiB length).
B1_dist = f.add_subplot(plots[0,3], sharex=BC)

# No tick marks and no tick labels. Booleans are the documented type;
# the "on"/"off" strings are deprecated since matplotlib 2.2.
B1_dist.tick_params(axis="both", which="both", bottom=False, top=False,
                    left=False, right=False,
                    labelbottom=False, labelleft=False)

clean_axis(B1_dist)

data = ABC.sort_values(by=['kaiB_length'])

# Calculate the kernel density for KaiB length larger than 0
density = s.gaussian_kde(data['kaiB_length'][data['kaiB_length'] > 0],
                         bw_method='scott')

x = np.linspace(0,410,10000)

# Evaluate the density once and reuse it for the outline and the fill.
density_x = density(x)

B1_dist.plot(x, density_x, color='#969696', lw=0.5)
B1_dist.fill_between(x, 1e-4, density_x, color='#969696',
                     alpha=0.65, lw=0)

B1_dist.set_xlim(-10,400)
B1_dist.set_ylim(-0.0005, 0.02)

############ KaiAB Scatter Plot ############
AB = f.add_subplot(plots[4,0])

# Remove the tick marks and place labels at bottom and left.
# tick_params documents these switches as booleans; the "on"/"off"
# strings are deprecated since matplotlib 2.2.
AB.tick_params(axis="both", which="both", bottom=False, top=False,
               left=False, right=False,
               labelbottom=True, labelleft=True)

clean_axis(AB)

# Plot KaiA sequence length vs. KaiB sequence length
# for each group. Groups are drawn in list order, so later
# entries are painted on top of earlier ones.
for group, color, size, alpha, label, marker in [
        (4, 'k', 10, 0.3, 'Other', 'd'),
        (3, '#e31a1c', 15, 0.5, 'Archaea', '^'),
        (2, '#1f78b4', 10, 0.5, 'Proteobacteria', 's'),
        (1, '#33a02c', 10, 0.5, 'Cyanobacteria', 'o')]:
    in_group = ABC.taxonomy == group
    # edgecolor='none' draws no marker outline; the empty string is
    # not a valid color specification in current matplotlib.
    AB.scatter(ABC.kaiA_length[in_group],
               ABC.kaiB_length[in_group],
               color=color, edgecolor='none', s=size,
               alpha=alpha, label=label, marker=marker)

AB.set_xticks(range(0, 401, 100))
AB.set_xticklabels([str(x) for x in range(0, 401, 100)],
                   fontsize=7, **hfont)

AB.set_yticks(range(0, 401, 100))
AB.set_yticklabels([str(x) for x in range(0, 401, 100)],
                   fontsize=7, **hfont)

AB.set_ylabel('KaiB Length [AA]', fontsize=8, **hfont)
AB.set_xlabel('KaiA Length [AA]', fontsize=8, **hfont)

AB.set_xlim(-10,400)
AB.set_ylim(-10,400)

# Plot custom grid lines (horizontal first, then vertical)
for y in range(0, 301, 100):
    AB.plot(range(-7, 400), [y] * len(range(-7, 400)), ":", lw=0.25,
            color="black", alpha=0.3)

for x in range(0, 301, 100):
    AB.plot([x] * len(range(-7, 400)), range(-7, 400), ":", lw=0.25,
            color="black", alpha=0.3)

# Plot custom axis: a thin grey frame (bottom, right, top, left)
AB.plot((-7, 400), (-7,-7), "-", lw=0.5, color="grey")
AB.plot((400,400), (-7,400), "-", lw=0.5, color="grey")
AB.plot((-7, 400), (400,400), "-", lw=0.5, color="grey")
AB.plot((-7,-7), (-7,400), "-", lw=0.5, color="grey")

# Panel letter, placed in data coordinates above the upper left corner
AB.text(-50,450, 'C', fontsize=15, **hfont)

############ KaiB length distribution ############
# Marginal plot to the right of the KaiAB scatter plot; shares its
# y-axis (KaiB length).
B2_dist = f.add_subplot(plots[4,1], sharey=AB)

# No tick marks and no tick labels. Booleans are the documented type;
# the "on"/"off" strings are deprecated since matplotlib 2.2.
B2_dist.tick_params(axis="both", which="both", bottom=False, top=False,
                    left=False, right=False,
                    labelbottom=False, labelleft=False)

clean_axis(B2_dist)

data = ABC.sort_values(by=['kaiB_length'])

# Calculate the kernel density for KaiB length larger than 0
density = s.gaussian_kde(data['kaiB_length'][data['kaiB_length'] > 0],
                         bw_method='scott')

x = np.linspace(0,400,10000)

# Evaluate the density once and reuse it for the outline and the fill.
density_x = density(x)

B2_dist.plot(density_x, x, color='#969696', lw=0.5)
B2_dist.fill_betweenx(x, 1e-4, density_x, color='#969696',
                      alpha=0.65, lw=0)

B2_dist.set_ylim(-10,400)
B2_dist.set_xlim(-0.0005, 0.02)

############ KaiA length distribution ############
# Marginal plot above the KaiAB scatter plot; shares its x-axis
# (KaiA length).
A2_dist = f.add_subplot(plots[3,0], sharex=AB)

# No tick marks and no tick labels. Booleans are the documented type;
# the "on"/"off" strings are deprecated since matplotlib 2.2.
A2_dist.tick_params(axis="both", which="both", bottom=False, top=False,
                    left=False, right=False,
                    labelbottom=False, labelleft=False)

clean_axis(A2_dist)

data = ABC.sort_values(by=['kaiA_length'])

# Calculate the kernel density for KaiA length larger than 0
density = s.gaussian_kde(data['kaiA_length'][data['kaiA_length'] > 0],
                         bw_method='scott')

x = np.linspace(0,400,10000)

# Evaluate the density once and reuse it for the outline and the fill.
density_x = density(x)

A2_dist.plot(x, density_x, color='#969696', lw=0.5)
A2_dist.fill_between(x, 0, density_x, color='#969696',
                     alpha=0.25, lw=0)

A2_dist.set_xlim(-10,400)
A2_dist.set_ylim(-0.0005, 0.025)

############ Legend ############
# The legend gets its own, otherwise empty, axes in the lower right
# block of the grid.
ax_led = f.add_subplot(plots[4,3])

# No tick marks and no tick labels. Booleans are the documented type;
# the "on"/"off" strings are deprecated since matplotlib 2.2.
ax_led.tick_params(axis="both", which="both", bottom=False, top=False,
                   left=False, right=False,
                   labelbottom=False, labelleft=False)

clean_axis(ax_led)

# Empty scatter plots only provide the legend handles. Marker sizes
# are larger here than in the scatter panels.
for color, size, alpha, label, marker in [
        ('k', 40, 0.3, 'Other', 'd'),
        ('#e31a1c', 60, 0.5, 'Archaea', '^'),
        ('#1f78b4', 40, 0.5, 'Proteobacteria', 's'),
        ('#33a02c', 40, 0.5, 'Cyanobacteria', 'o')]:
    # edgecolor='none' draws no marker outline; the empty string is
    # not a valid color specification in current matplotlib.
    ax_led.scatter([], [],
                   color=color, edgecolor='none', s=size,
                   alpha=alpha, label=label, marker=marker)

handles, labels = ax_led.get_legend_handles_labels()

# Reverse the order so that Cyanobacteria are listed first.
ax_led.legend(handles[::-1], labels[::-1], scatterpoints=1, fontsize=8,
              loc='upper left').draw_frame(False)

plt.savefig('../scatter.pdf', format='pdf',
            bbox_inches='tight', dpi=1000)

plt.show()

---
### Previous ###

+ [Data Collection and Processing](1_KaiABC_BLAST_Data_Collection_and_Perprocessing.ipynb)
+ [Distribution of Circadian Clock Proteins](2_KaiABC_BLAST_Heatmap.ipynb)

### Next ###

+ [Co-occurrence of Circadian Clock Proteins in Cyanobacteria](4_KaiABC_BLAST_FisherTest.ipynb)
+ [Additional Analyses](5_KaiABC_BLAST_Other.ipynb)

---