In [29]:
%matplotlib inline

In [30]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
data = pd.read_excel('data.xlsx', header=1)

## Внутривидовая изменчивость по количественным признакам

In [31]:
data_num = data[['L1p1l','L1p2l','W1p1l','W1p2l','L2p3l','L2p4l','W2p3l','W2p4l', 'S2p3l','S2p4l','Lkd','Wkd','OtnWLkd','Dvsh','Dosh','Lp','Dpl','Lns','Wns','Lvs','Wvs'
]]
data_nonnum =  data[['Dp','Dvl','Dnl','Dc','Dvns','Dnns','Dvvs','Dnvs']]
num_null = data_num.isnull().sum(axis=1)
nonnum_null = data_nonnum.isnull().sum(axis=1)
sp_var = data['Sp']
pop_var = data['idCP']
num_features = ['L1p1l','L1p2l','W1p1l','W1p2l','L2p3l','L2p4l','W2p3l','W2p4l', 'S2p3l','S2p4l','Lkd','Wkd','OtnWLkd','Dvsh','Dosh','Lp','Dpl','Lns','Wns','Lvs','Wvs'
]
nonnum_features = ['Dp','Dvl','Dnl','Dc','Dvns','Dnns','Dvvs','Dnvs']

In [32]:
data_num = data_num[num_null==0]
sp_var = sp_var[num_null==0]

### Анализ изменчивости для каждого вида в отдельности

In [33]:
most_constant = []
most_variable = []
for sp in sp_var.unique():
    sel = data_num[sp_var==sp]
    cvars = sel.var(axis=0) / sel.mean(axis=0) * 100.0
    cvars.sort_values(inplace=True)
    print('Coefficient of variations for %s' % sp, 'Sample size: ', np.sum(sp_var==sp))
    print(cvars)
    print('Top 10 Most constant features:')
    print(cvars.index[:10].tolist(),'\n'*2)
    most_constant.append(cvars.index[:10].tolist())
    most_variable.append(cvars.index[-10:].tolist())
print('Most constant features as matrix:\n', np.matrix(most_constant),'\n' * 2)
print('Most variable features as matrix:\n', np.matrix(most_variable),'\n' * 2)
print('\n\nCommon constant features:', set.intersection(*map(lambda x: set(x), most_constant)))
print('\n\nCommon variable features:', set.intersection(*map(lambda x: set(x), most_variable)))

Coefficient of variations for Df Sample size:  383
OtnWLkd     0.563678
S2p4l       4.169237
Wvs         5.141845
S2p3l       5.422698
Wns         9.047363
Lvs         9.795145
W2p4l      10.508721
Wkd        10.540564
W2p3l      10.636286
W1p2l      11.314903
W1p1l      11.349886
Lns        11.454953
Dpl        11.867544
Dvsh       20.226930
Dosh       26.068964
L1p2l      28.918738
L1p1l      29.346109
L2p4l      30.178134
L2p3l      30.511250
Lkd        30.891143
Lp         61.841022
dtype: float64
Top 10 Most constant features:
['OtnWLkd', 'S2p4l', 'Wvs', 'S2p3l', 'Wns', 'Lvs', 'W2p4l', 'Wkd', 'W2p3l', 'W1p2l'] 


Coefficient of variations for Dp Sample size:  20
OtnWLkd    0.190843
Wvs        0.380625
Lvs        1.499871
S2p4l      1.594441
S2p3l      1.642105
Wns        2.485036
Wkd        2.951147
Dpl        3.187465
W1p1l      3.458033
W1p2l      3.616379
Dvsh       4.360428
Lns        4.371019
Lkd        4.989844
W2p3l      5.235260
Dosh       5.883148
L1p1l      6.316665
W2p4

Таким образом, виды имеют приблизительно одинаковые наборы как изменчивых, так и констанстантных признаков, среди всех количественных признаков. При этом общими констанстными признаками являются {'Wns', 'S2p4l', 'Wvs', 'S2p3l', 'OtnWLkd'}, а общими, наиболее вариабельными - {'Dosh', 'L1p2l', 'L2p3l', 'Lkd', 'Lp', 'L2p4l'}.

## Внутривидовая изменчивость по качественным признакам

In [83]:
from IPython.core.display import display, HTML
uniqus = []
allfeatures = []
for sp in data['Sp'].unique():
    sel = data.loc[data.Sp==sp, nonnum_features]
    res = pd.get_dummies(sel)
    freq = res.sum(axis=0) / sum(data.Sp==sp)
    freq.sort_values(inplace=True)
    display(HTML('<strong> Current species: %s, total=%s</strong><br>' % (sp, sum(data.Sp == sp))), freq)
    uniqus += freq.index[freq == 1.0].tolist()
    allfeatures += freq.index.tolist()
display(HTML('<H2>Unique features, counts: </H2>'))
for f in set(uniqus):
    print('Unique: ', f, allfeatures.count(f))

Dvns_O      0.002611
Dc_OG       0.007833
Dnvs_G      0.007833
Dnl_G       0.010444
Dvl_G       0.010444
Dvns_OG     0.013055
Dvns_G      0.015666
Dp_OG       0.015666
Dvl_OG      0.031332
Dnvs_OG     0.033943
Dnns_OG     0.054830
Dnns_O      0.054830
Dnl_OG      0.065274
Dp_G        0.070496
Dnl_O       0.104439
Dvvs_O      0.156658
Dvvs_Re     0.164491
Dc_Re       0.172324
Dvns_ORe    0.221932
Dp_ORe      0.229765
Dvl_ORe     0.263708
Dnns_ORe    0.276762
Dnl_Re      0.289817
Dnvs_ORe    0.357702
Dnl_ORe     0.530026
Dnvs_Re     0.600522
Dnns_Re     0.613577
Dvvs_ORe    0.678851
Dp_Re       0.684073
Dvl_Re      0.694517
Dvns_Re     0.746736
Dc_G        0.817232
dtype: float64

Dvl_G       0.10
Dnl_G       0.10
Dp_Re       0.15
Dnl_ORe     0.20
Dnvs_G      0.20
Dnns_ORe    0.20
Dvl_ORe     0.30
Dc_OG       0.30
Dvns_ORe    0.30
Dvl_Re      0.60
Dc_G        0.70
Dnl_Re      0.70
Dvns_Re     0.70
Dnvs_Re     0.80
Dnns_Re     0.80
Dp_G        0.85
Dvvs_ORe    1.00
dtype: float64

Dvns_Re     0.012346
Dnns_Re     0.012346
Dvvs_Re     0.012346
Dc_G        0.246914
Dvns_O      0.246914
Dc_Re       0.259259
Dc_ORe      0.493827
Dnns_O      0.493827
Dnns_ORe    0.493827
Dvns_ORe    0.740741
Dvvs_ORe    0.987654
Dp_O        1.000000
Dvl_O       1.000000
Dnl_O       1.000000
Dnvs_ORe    1.000000
dtype: float64

Dc_O        0.047619
Dvns_Re     0.047619
Dvvs_O      0.047619
Dc_ORe      0.952381
Dvns_ORe    0.952381
Dvvs_ORe    0.952381
Dp_O        1.000000
Dvl_O       1.000000
Dnl_O       1.000000
Dnns_ORe    1.000000
Dnvs_ORe    1.000000
dtype: float64

Dc_Re      0.015385
Dnl_ORe    0.015385
Dvl_Re     0.015385
Dnvs_Re    0.015385
Dp_Re      0.015385
Dvns_Re    0.046154
Dnns_Re    0.046154
Dvvs_Re    0.061538
Dvns_G     0.307692
Dvvs_OG    0.323077
Dvl_G      0.323077
Dp_OG      0.338462
Dnns_OG    0.338462
Dnl_OG     0.353846
Dc_G       0.353846
Dnvs_OG    0.369231
Dvvs_G     0.615385
Dnvs_G     0.615385
Dnns_G     0.615385
Dc_OG      0.630769
Dnl_G      0.630769
Dvns_OG    0.646154
Dp_G       0.646154
Dvl_OG     0.661538
dtype: float64

Dvl_Re      0.047619
Dc_G        0.047619
Dvns_Re     0.047619
Dnns_Re     0.047619
Dvvs_Re     0.047619
Dvl_ORe     0.952381
Dc_Re       0.952381
Dvns_ORe    0.952381
Dnns_ORe    0.952381
Dvvs_ORe    0.952381
Dp_ORe      1.000000
Dnl_ORe     1.000000
Dnvs_ORe    1.000000
dtype: float64

Unique:  Dnns_ORe 5
Unique:  Dvl_O 2
Unique:  Dnvs_ORe 4
Unique:  Dnl_O 3
Unique:  Dp_ORe 2
Unique:  Dvvs_ORe 5
Unique:  Dnl_ORe 4
Unique:  Dp_O 2
