In [143]:
import pandas as pd
import numpy as np
import skbio
from collections import Counter

In [None]:
#need to calculate shannon's index

In [4]:
# Relative path to the mapped-motif BED file; it is read into `df` in the next cell.
mapped_motif_bed = "../../data/FIMO/responsivepromoters_motifs_mapped.bed"

In [9]:
# Column labels for the BED file, in file order.
cols = [
    'chr', 'start', 'stop', 'name_rep', 'score', 'strand',
    'promoter_AGI', 'p-value', 'q-value', 'matched_sequence',
    'TF_name', 'TF_AGI',
]
# The file is read without a header row (header=None), so the labels are attached afterwards.
df = pd.read_csv(mapped_motif_bed, sep='\t', header=None)
df.columns = cols

In [35]:
# Show the loaded motif table (pandas truncates the middle rows in the rendered output).
df

Unnamed: 0,chr,start,stop,name_rep,score,strand,promoter_AGI,p-value,q-value,matched_sequence,TF_name,TF_AGI
0,1,33629,33647,ABI3VP1_tnt.VRN1_colamp_a,7.86301,-,AT1G01050,0.000080,0.04650,AAGTTTTGTCTTTTGGTGT,VRN1,AT3G18990
1,1,33961,33979,ABI3VP1_tnt.VRN1_colamp_a,10.71230,+,AT1G01050,0.000030,0.02870,GTGGATGTTTTTTTTTTAC,VRN1,AT3G18990
2,1,33962,33982,REM_tnt.REM19_colamp_a,11.16180,-,AT1G01050,0.000034,0.02860,GAGGTAAAAAAAAAACATCCA,REM19,AT1G49480
3,1,33964,33982,ABI3VP1_tnt.VRN1_colamp_a,10.35620,+,AT1G01050,0.000034,0.03080,GATGTTTTTTTTTTACCTC,VRN1,AT3G18990
4,1,34064,34084,ND_tnt.FRS9_col_a,7.51562,+,AT1G01050,0.000014,0.03250,ACTTGGCTCTCTTTCACTTCC,FRS9,AT4G38170
...,...,...,...,...,...,...,...,...,...,...,...,...
16242,5,26957500,26957518,C2H2_tnt.TF3A_col_a,11.18640,-,AT5G67590,0.000018,0.04110,GCTTCTTCTCCTCCTTCCT,TF3A,AT1G72050
16243,5,26957503,26957521,C2H2_tnt.TF3A_col_a,12.82200,-,AT5G67590,0.000008,0.02690,CATGCTTCTTCTCCTCCTT,TF3A,AT1G72050
16244,5,26957535,26957563,BBRBPC_tnt.BPC5_colamp_a,3.36047,+,AT5G67590,0.000009,0.01770,AACCAAAGGAGAGTTTTCGAGAGAGAGAC,BPC5,AT4G38910
16245,5,26957541,26957564,BBRBPC_tnt.BPC1_col_a,1.12500,+,AT5G67590,0.000034,0.04670,AGGAGAGTTTTCGAGAGAGAGACT,BPC1,AT2G01930


##### The Shannon index is an information statistic index, which means it assumes all species are represented in a sample and that they are randomly sampled. 
##### In the Shannon index, $H = -\sum_{i=1}^{s} p_i \ln p_i$, p is the proportion (n/N) of individuals belonging to one particular species, i.e. the number of individuals of that species found (n) divided by the total number of individuals found (N); ln is the natural log, Σ is the sum of the calculations, and s is the number of species.
##### The Simpson index is a dominance index because it gives more weight to common or dominant species.  In this case, a few rare species with only a few representatives will not affect the diversity. 
##### In the Simpson index, p is the proportion (n/N) of individuals belonging to one particular species, i.e. the number of individuals of that species found (n) divided by the total number of individuals found (N); Σ is still the sum of the calculations, and s is the number of species.

### calculate shannon's index

#### variables:
#### H = Shannon's diversity index
#### S = total no. of unique TFs binding a promoter (species richness: the number of species in a community)
#### Pi = proportion of all TF binding sites in a promoter (N individuals) made up of the ith TF (species), i.e. n_i/N
#### E_H = equitability (evenness)

### first split df by promoter

#### get names of each promoter

In [12]:
# Check the dtype pandas inferred for each column of the motif table.
print(df.dtypes)

chr                   int64
start                 int64
stop                  int64
name_rep             object
score               float64
strand               object
promoter_AGI         object
p-value             float64
q-value             float64
matched_sequence     object
TF_name              object
TF_AGI               object
dtype: object


In [18]:
# Distinct promoter IDs, in order of first appearance in df.
promoters = df['promoter_AGI'].unique()

#### turn into a df

In [20]:
# One-column frame of promoter IDs; no column label is supplied, so pandas names the column 0.
shannon_df = pd.DataFrame(data=promoters)

In [23]:
# Show the promoter frame that the diversity statistics will be attached to.
shannon_df

Unnamed: 0,0
0,AT1G01050
1,AT1G01780
2,AT1G01910
3,AT1G02000
4,AT1G03070
...,...
394,AT5G65360
395,AT5G65470
396,AT5G66250
397,AT5G67210


### calculate S

#### groupby promoter, and include only unique TFs within each promoter group. Preserve column names.

In [77]:
# Number of distinct TFs (TF_AGI) per promoter, as a Series indexed by promoter_AGI.
# NOTE(review): the following cell rebinds `groupby_promoter` to a DataFrame version of the same count.
groupby_promoter = (
    df.groupby('promoter_AGI')['TF_AGI']
    .nunique()
    .copy()
)


In [57]:
# S per promoter: count the distinct TF_AGI values in each promoter group.
# as_index=False keeps promoter_AGI as an ordinary column next to the count.
groupby_promoter = (
    df.groupby('promoter_AGI', as_index=False)
    .agg({'TF_AGI': 'nunique'})
)

In [58]:
# Inspect the per-promoter unique-TF counts (the count column is still named TF_AGI here).
print(groupby_promoter)

    promoter_AGI  TF_AGI
0      AT1G01050       4
1      AT1G01780      19
2      AT1G01910      16
3      AT1G02000       7
4      AT1G03070      43
..           ...     ...
394    AT5G65360       5
395    AT5G65470       3
396    AT5G66250       1
397    AT5G67210       9
398    AT5G67590       7

[399 rows x 2 columns]


##### rename cols

In [59]:
# Give the count column a descriptive name.
# Rebind to the renamed result instead of using inplace=True, so the frame
# produced by the aggregation cell is not mutated after it has been built.
groupby_promoter = groupby_promoter.rename(columns={'TF_AGI': 'S_unique_TF_count'})

In [60]:
# Confirm the count column is now called S_unique_TF_count.
print(groupby_promoter)

    promoter_AGI  S_unique_TF_count
0      AT1G01050                  4
1      AT1G01780                 19
2      AT1G01910                 16
3      AT1G02000                  7
4      AT1G03070                 43
..           ...                ...
394    AT5G65360                  5
395    AT5G65470                  3
396    AT5G66250                  1
397    AT5G67210                  9
398    AT5G67590                  7

[399 rows x 2 columns]


## Pi = proportion of all TF binding sites in a promoter (N) made up of the ith TF (species), i.e. n_i/N

In [114]:
# Count how many motif hits each TF has in each promoter (n_i per promoter).
# as_index=False is deliberately not passed: combined with .size() it yields a
# Series on pandas < 1.1 but a DataFrame on pandas >= 1.1. The plain groupby
# returns a Series indexed by (promoter_AGI, TF_AGI) on every version.
groupby_promoter_counts = df.groupby(['promoter_AGI', 'TF_AGI']).size()

In [142]:
# Show the per-promoter, per-TF hit counts.
groupby_promoter_counts

promoter_AGI  TF_AGI   
AT1G01050     AT1G49480    1
              AT3G18990    3
              AT4G38170    1
              AT4G38910    2
AT1G01780     AT1G29160    1
                          ..
AT5G67590     AT2G01930    4
              AT3G18990    2
              AT4G38170    1
              AT4G38910    9
              AT5G66940    1
Length: 4264, dtype: int64

In [131]:
# Wrap the per-promoter/per-TF counts in a DataFrame.
# NOTE(review): `counts` is not referenced again in the visible part of this notebook — confirm it is still needed.
counts = pd.DataFrame(groupby_promoter_counts)

In [144]:
# Count the hits of each TF within each promoter, then spread the TFs into columns
# (rows = promoters, columns = TFs); promoter/TF pairs with no hit are filled with 0.
groupby_promoter_counts = (
    df.groupby('promoter_AGI')['TF_AGI']
    .value_counts()
    .unstack(fill_value=0)
)

In [148]:
# Hit-count table with one row per TF and one column per promoter;
# cells for TFs with no hit in a promoter are filled with 0.
groupby_promoter_counts = pd.pivot_table(
    df,
    index='TF_AGI',
    columns='promoter_AGI',
    aggfunc='size',
    fill_value=0,
)

In [149]:
# Show the TF x promoter hit-count table.
groupby_promoter_counts

promoter_AGI,AT1G01050,AT1G01780,AT1G01910,AT1G02000,AT1G03070,AT1G03400,AT1G03457,AT1G03560,AT1G03920,AT1G04250,...,AT5G60490,AT5G61360,AT5G62650,AT5G64050,AT5G64813,AT5G65360,AT5G65470,AT5G66250,AT5G67210,AT5G67590
TF_AGI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT1G01250,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AT1G02230,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AT1G03800,0,0,0,0,1,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
AT1G06180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AT1G09540,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AT5G65210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AT5G66940,0,1,0,0,0,0,0,1,0,0,...,2,1,0,0,0,0,0,0,1,1
AT5G67000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AT5G67190,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [153]:
# Sanity check on a toy community before applying the index to the motif data:
# for the counts in this file, Shannon's diversity H with the natural log (base e) is ~1.373.
test = pd.read_csv('../../data/test.csv')
test

Unnamed: 0,species,n
0,rabbit,6
1,plant,5
2,human,1
3,monkey,3
4,cat,12


In [155]:
# Shannon index of the toy counts (column n) with the natural log, to compare against the known value.
skbio.diversity.alpha.shannon(test['n'], base=np.e)

1.3731530496710267

### this is correct

In [156]:
# Base 2 is used for my calculations (the choice of logarithm base is arbitrary and can be chosen freely).
# Shannon index for a single promoter, AT1G01050, taken as one column of the TF x promoter count table.

skbio.diversity.alpha.shannon(groupby_promoter_counts['AT1G01050'], base=2)

1.8423709931771086

In [None]:
#read in promoter.bed file