# Statistiques descriptives

In [1]:
import numpy as np
import scipy.stats as stat
import pandas as pd

In [2]:
d = np.array([0.553,0.57,0.576,0.601,0.606,0.606,0.609,0.611,0.615,0.628,0.654,0.662,0.668,0.67,0.672,0.69,0.693,0.749])

In [3]:
stat_des = stat.describe(d)
print(stat_des)

DescribeResult(nobs=18, minmax=(0.553, 0.749), mean=0.6351666666666667, variance=0.0025368529411764714, skewness=0.38763289979752136, kurtosis=-0.3587369048751916)


In [4]:
print(stat_des[0])

18


In [5]:
#médiane de NumPy
print(np.median(d)) # 0.6215

#fonction de répartition empirique
print(stat.percentileofscore(d,0.6215))

0.6214999999999999
50.0


In [6]:
#loi normale centrée et réduite
print(stat.norm.ppf(0.95,loc=0,scale=1)) # quantile d’ordre 0.95 de la loi normale CR = 1.64485
print
print(stat.norm.cdf(1.96,loc=0,scale=1)) # 0.975

1.6448536269514722
0.9750021048517795


In [7]:
#loi de Student - ddl = 30
print(stat.t.ppf(0.95,df=30)) # 1.6972
print
print(stat.t.cdf(1.96,df=30)) # 0.9703

1.6972608943617378
0.9703288435519748


In [8]:
#loi du khi-2 - ddl = 10
print(stat.chi.ppf(0.95,df=10)) # 4.2787
print
print(stat.chi.cdf(4.84,df=10)) # 0.9907

4.278672463892877
0.9907199233490717


In [9]:
#loi de Fisher, ddl numérateur = 1, ddl dénominateur = 30
print(stat.f.ppf(0.95,dfn=1,dfd=30)) # 4.1709
print
print(stat.f.cdf(3.48,dfn=1,dfd=30)) # 0.9281

4.170876785766691
0.9280786614651921


In [10]:
#test de normalité d'Agostino
ag = stat.normaltest(d) # message d’avertissement, n est trop faible pour un test fiable
print(ag) # (0.714, 0.699), statistique de test et p-value (si p-value < α, rejet de l’hyp. de normalité)

#test de Normalité Shapiro-Wilks
sp = stat.shapiro(d)
print(sp) # (0.961, 0.628), statistique et p-value
print

#test d'adéquation d'Anderson-Darling
ad = stat.anderson(d,dist="norm") # test possible pour autre loi que « norm »
print(ad) # (0.3403, array([ 0.503, 0.573, 0.687, 0.802, 0.954]), array([ 15. , 10. , 5. , 2.5, 1. ]))  stat de test, seuils critiques pour chaque niveau de risque, on constate ici que la p-value est sup. à 15%

NormaltestResult(statistic=0.7143907939185827, pvalue=0.699635777674135)
(0.9613386988639832, 0.6276763677597046)
AndersonResult(statistic=0.34029632368620355, critical_values=array([0.503, 0.573, 0.687, 0.802, 0.954]), significance_level=array([15. , 10. ,  5. ,  2.5,  1. ]))


  "anyway, n=%i" % int(n))


In [11]:
#génération de valeurs aléatoires - loi normale (0, 1)
alea1 = stat.norm.rvs(loc=0,scale=1,size=30)
print(stat.normaltest(alea1)) # (2.16, 0.338), compatible avec la loi normale (heureusement !)
print

#génération - loi exponentielle
alea2 = stat.expon.rvs(size=30)
print(stat.normaltest(alea2)) # (17.62, 0.00015), non compatible (bien sûr)
print

#Numpy aussi propose un générateur
alea3 = np.random.normal(loc=0,scale=1,size=30)
print(stat.normaltest(alea3)) # (2.41, 0.299), compatible
print

#échantillonnage de m obs. parmi n
d1 = np.random.choice(d,size=5,replace=False) #sans remise
print(d1) # (0.69 0.628 0.606 0.662 0.668)
print

d2 = np.random.choice(d,size=5,replace=True) #avec remise
print(d2) # (0.654 0.67 0.628 0.654 0.609)

NormaltestResult(statistic=0.287557486580451, pvalue=0.8660793527295024)
NormaltestResult(statistic=4.3739354768060394, pvalue=0.112256624512113)
NormaltestResult(statistic=0.1765290850356297, pvalue=0.9155186510761848)
[0.662 0.606 0.628 0.672 0.611]
[0.67  0.576 0.693 0.672 0.553]


In [12]:
#test de conformité de la moyenne
print(stat.ttest_1samp(d,popmean=0.618))
print

# (1.446, 0.166), stat. de test et p-value, p-value < α, rejet de H0
#*** si l'on s’amuse à détailler les calculs ***
#moyenne
m = np.mean(d) # 0.6352

#écart-type – ddof = 1 pour effectuer le calcul : 1/(n-1)
sigma = np.std(d,ddof=1) # 0.0504

#stat. de test t
import math
t = (m - 0.618)/(sigma/math.sqrt(d.size))
print(t) # 1.446, on retrouve bien la bonne valeur de la stat de test
print
#p-value – c’est un test bilatéral
#t distribution de Student, cdf() : cumulative distribution function

p = 2.0 * (1.0 - stat.t.cdf(math.fabs(t),d.size-1))
print(p) # 0.166, et la bonne p-value

Ttest_1sampResult(statistic=1.4460209354443436, pvalue=0.16635614378468758)
1.4460209354443436
0.16635614378468744


In [13]:
import numpy as np
import scipy.stats as stat

#treated – valeurs pour échantillon des individus ayant suivi le traitement
dt = np.array([24,43,58,71,43,49,61,44,67,49,53,56,59,52,62,54,57,33,46,43,57])

#control – échantillon de contrôle
dc = np.array([42,43,55,26,62,37,33,41,19,54,20,85,46,10,17,60,53,42,37,42,55,28,48])

#t-test – comparaison de param. de localisation – hyp. de variances égales
print(stat.ttest_ind(dt,dc)) # (t = 2.2665, p-value = 0.0286)
print

#t-test de Welch – comparaison de moyennes – hyp. de variances inégales
print(stat.ttest_ind(dt,dc,equal_var=False)) # (2.3109, 0.0264)
print

#test de Mann-Whitney - non paramétrique - avec correction de continuité
print(stat.mannwhitneyu(dt,dc)) # (stat. U = 135, p-value unilatérale = 0.00634)
print

#test de Bartlett – comparaison de paramètres d’échelle (variance)
print(stat.bartlett(dt,dc)) # (stat. = 3.8455, p-value = 0.0498)
print

#test de Ansari Bradley
print(stat.ansari(dt,dc)) # (stat. = 266, p-value = 0.2477)
print

#test de Levene
print(stat.levene(dt,dc)) # (stat. = 2.342, p-value = 0.1334)
print

#test de Kolomogorov-Smirnov – écart entre les fonctions de répartition empiriques
print(stat.ks_2samp(dt,dc)) # (stat. = 0.4699, p-value = 0.0099)

Ttest_indResult(statistic=2.266551599585943, pvalue=0.028629482832245753)
Ttest_indResult(statistic=2.310889197854228, pvalue=0.02638241282442478)
MannwhitneyuResult(statistic=135.0, pvalue=0.006338907890604657)
BartlettResult(statistic=3.8455356421051388, pvalue=0.04987858739511786)
AnsariResult(statistic=266.0, pvalue=0.24768048701691592)
LeveneResult(statistic=2.3418185975754007, pvalue=0.133440294168933)
Ks_2sampResult(statistic=0.46997929606625255, pvalue=0.009902040336386678)




### Comparaison de populations

In [14]:
#paired samples test
d1968 = np.array([0.42,0.5,0.52,0.45,0.43,0.55,0.45,0.34,0.45,0.54,0.42,0.51,0.49,0.54,0.5,0.58,0.49,0.56,0.63])
d1972 = np.array([0.45,0.5,0.52,0.45,0.46,0.55,0.60,0.49,0.35,0.55,0.52,0.53,0.57,0.53,0.59,0.64,0.5,0.57,0.64])

#t-test related samples - paramétrique
print(stat.ttest_rel(d1968,d1972))# (stat.test = -2.45, p-value = 0.024)
print

#test des rangs signés – non paramétrique
print(stat.wilcoxon(d1968,d1972)) # (stat = 16, p-value = 0.0122)

Ttest_relResult(statistic=-2.457703815601802, pvalue=0.024352597586836344)
WilcoxonResult(statistic=16.0, pvalue=0.0122241123197889)


### Associations

In [15]:
#données pour corrélation et régression (Irlande du Nord non incluse)
dalc = np.array([6.47,6.13,6.19,4.89,5.63,4.52,5.89,4.79,5.27,6.08])
dtob = np.array([4.03,3.76,3.77,3.34,3.47,2.92,3.2,2.71,3.53,4.51])

#corrélation de Pearson
print(stat.pearsonr(dalc,dtob)) # (r = 0.7843, p-value pour test t = 0.0072)
print

#corrélation de Spearman - basé sur les rangs
print(stat.spearmanr(dalc,dtob)) # (rho = 0.8303, p-value = 0.0029)
print

#tau de Kendall - concordance et discordance
print(stat.kendalltau(dalc,dtob)) # (tau = 0.6444, p-value = 0.0095)
print

#régression linéaire simple
print(stat.linregress(dalc,dtob)) # (pente = 0.6115, const = 0.1081, r = 0.7843, p-value test signif. pente = 0.0072, sigma err = 0.1710)

(0.7842873201538677, 0.007234358351729502)
SpearmanrResult(correlation=0.8303030303030302, pvalue=0.0029402270232795065)
KendalltauResult(correlation=0.6444444444444444, pvalue=0.00949109605344092)
LinregressResult(slope=0.6115012914647241, intercept=0.10815378587805125, rvalue=0.7842873201538676, pvalue=0.007234358351729509, stderr=0.17102020517368527)


### Comparaison de K populations

In [16]:
#Sodium – données pour tests K échantillons indép.
dbeef = np.array([495,477,425,322,482,587,370,322,479,375,330,300,386,401,645,440,317,319,298,253])
dmeat = np.array([458,506,473,545,496,360,387,386,507,393,405,372,144,511,405,428,339])
dpoultry = np.array([430,375,396,383,387,542,359,357,528,513,426,513,358,581,588,522,545])

#test égalité des dispersions (K variances)
print(stat.levene(dbeef,dmeat,dpoultry)) # (stat. = 0.2494, p-value = 0.7802)
print

#ANOVA à 1 facteur - comparaison des moyennes - paramétrique
print(stat.f_oneway(dbeef,dmeat,dpoultry)) # (stat. F = 1.7778, p-value = 0.1793)
print

#Test de Kruskal-Wallis – non-paramétrique
print(stat.kruskal(dbeef,dmeat,dpoultry)) # (stat. K = 4.1728, p-value = 0.0947)

LeveneResult(statistic=0.24943197044211946, pvalue=0.7801883296333626)
F_onewayResult(statistic=1.7777910697271164, pvalue=0.17932466808492342)
KruskalResult(statistic=4.7128082695637135, pvalue=0.09476035676351431)
