In [4]:
# import numpy
import numpy as np

In [5]:
# Load the numeric table stored in populations.txt.
data = np.loadtxt('populations.txt')
# Transposing turns each column into a row, so the four columns can be
# unpacked straight into one variable apiece.
year, hares, lynxes, carrots = np.transpose(data)

## Descriptive Statistics

#### Tasks

- Compute the mean and std of the populations of each species for the years in the period.
- In which year did each species have its largest population?
- Which species' population varies the most?
- Compute correlations between each pair of variables. Which pair of populations is correlated the most?

In [6]:
# Mean and standard deviation of each species' population over all years.
# np.std is called with NumPy's default settings.
h_mean, h_std = np.mean(hares), np.std(hares)
l_mean, l_std = np.mean(lynxes), np.std(lynxes)
c_mean, c_std = np.mean(carrots), np.std(carrots)

# Report the statistics, one line per species.
print(f"For hares, the mean is {h_mean} and the std is {h_std}.")
print(f"For lynxes, the mean is {l_mean} and the std is {l_std}.")
print(f"For carrots, the mean is {c_mean} and the std is {c_std}.")


For hares, the mean is 34080.95238095238 and the std is 20897.906458089667.
For lynxes, the mean is 20166.666666666668 and the std is 16254.591536908763.
For carrots, the mean is 42400.0 and the std is 3322.5062255844787.


In [7]:
# argmax gives the position of each species' peak population; indexing `year`
# with that position maps it back to the corresponding year.
h_max_year = year[hares.argmax()]
l_max_year = year[lynxes.argmax()]
c_max_year = year[carrots.argmax()]

print(f"For hares, the year with the largest population is {h_max_year}.")
print(f"For lynxes, the year with the largest population is {l_max_year}.")
print(f"For carrots, the year with the largest population is {c_max_year}.")


For hares, the year with the largest population is 1903.0.
For lynxes, the year with the largest population is 1904.0.
For carrots, the year with the largest population is 1900.0.


In [8]:
# Peak-to-peak spread (maximum minus minimum) of each population.
h_range = np.ptp(hares)
l_range = np.ptp(lynxes)
c_range = np.ptp(carrots)

print(f"The population ranges for hares, lynxes, and carrots are resp. {h_range}, {l_range}, and {c_range}.")
# NOTE(review): this conclusion is a fixed sentence, not derived from the
# values above; it also leans on the std figures computed in an earlier cell.
print("The hares population varies the most because it has the maximum range and std.")

The population ranges for hares, lynxes, and carrots are resp. 69800.0, 55400.0, and 11600.0.
The hares population varies the most because it has the maximum range and std.


In [9]:
# Correlation coefficient for every pair of species. np.corrcoef returns a
# matrix for the two inputs; element [0, 1] is the coefficient between them.
corr_h_l = np.corrcoef(hares, lynxes)[0, 1]
print(f"The correlation between hares and lynxes is {corr_h_l}.")
corr_h_c = np.corrcoef(hares, carrots)[0, 1]
print(f"The correlation between hares and carrots is {corr_h_c}.")
corr_l_c = np.corrcoef(lynxes, carrots)[0, 1]
print(f"The correlation between lynxes and carrots is {corr_l_c}.")

# Derive the conclusion from the coefficients instead of hard-coding it, so the
# sentence cannot drift out of sync with the data. Strength is judged by the
# absolute value: a strongly negative correlation is still a strong one.
pair_correlations = {
    "Hares and lynxes": corr_h_l,
    "Hares and carrots": corr_h_c,
    "Lynxes and carrots": corr_l_c,
}
strongest_pair = max(pair_correlations, key=lambda pair: abs(pair_correlations[pair]))
print(f"{strongest_pair} are correlated the most because their correlation coefficient is the biggest in magnitude.")


THe correlation between hares and lynxes is 0.07189206073535571.
THe correlation between hares and carrots is -0.016603777709879402.
THe correlation between lynxes and carrots is -0.6805771698401617.
Lynxes and and carrots are correlated the most because their correlation coefficient is the biggest in magnitude.


## Hypothesis Tests

In [10]:
# People's weights (50 values), kept as a fixed literal.
# The commented-out generator below suggests how the sample was produced:
#   np.concatenate((np.random.normal(85, 20, 25), np.random.normal(65, 10, 25)))
weights = [
    94.93428306, 82.23471398, 97.95377076, 115.46059713, 80.31693251,
    80.31726086, 116.58425631, 100.34869458, 75.61051228, 95.85120087,
    75.73164614, 75.68540493, 89.83924543, 46.73439511, 50.50164335,
    73.75424942, 64.74337759, 91.28494665, 66.83951849, 56.75392597,
    114.31297538, 80.48447399, 86.35056409, 56.50503628, 74.11234551,
    66.1092259, 53.49006423, 68.75698018, 58.9936131, 62.0830625,
    58.98293388, 83.52278185, 64.86502775, 54.42289071, 73.22544912,
    52.7915635, 67.08863595, 45.40329876, 51.71813951, 66.96861236,
    72.3846658, 66.71368281, 63.84351718, 61.98896304, 50.2147801,
    57.80155792, 60.39361229, 75.57122226, 68.4361829, 47.36959845,
]

In [11]:
# Significance level (alpha) for the hypothesis tests: the threshold that
# evaluate_test compares each p-value against.
alpha = 0.05

In [12]:
"""
Function 'evaluate_test' which prints a conclusion of hypothesis test based on p-value and alpha

PARAMS:
    p (float) - p-value from test
    alpha - significance level
"""

def evaluate_test(p, alpha):
    print(f'p = {round(p,2)}, alpha = {alpha}')
    
    if p > alpha:
        print(f'We can not reject H0.')
    else:
        print(f'H0 is rejected.')

### Task
- Import the Shapiro-Wilk test to check whether the weights are normally distributed
- H0 = weights are normally distributed, HA = weights are not normally distributed
- https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html


In [13]:
from scipy.stats import shapiro

- Use the function 'evaluate_test' to conclude whether the weights are normally distributed


In [14]:
# Run the Shapiro-Wilk test on the weights; the bare name on the last line
# makes the notebook display the result object.
shapiro_test = shapiro(weights)
shapiro_test

ShapiroResult(statistic=0.9404902458190918, pvalue=0.014088480733335018)

In [15]:
# Pull the p-value out of the Shapiro-Wilk result and display it.
# NOTE(review): the name `p` is reassigned by the t-test cell further down,
# so the value seen by later cells depends on execution order.
p = shapiro_test.pvalue
p

0.014088480733335018

In [16]:
# Compare the Shapiro-Wilk p-value against alpha and print the verdict.
evaluate_test(p, alpha)

p = 0.01, alpha = 0.05
H0 is rejected.


In [17]:
# Salaries in the first company (30 values).
salaries_company_A = [
    62779.75930907, 67487.49834604, 78998.91885801, 92801.06354333, 94917.76195759,
    85409.43843246, 65536.36510309, 97608.88920408, 79613.1791369, 74035.25988438,
    72698.71057961, 57170.2204782, 96496.56571672, 78123.01652012, 69617.56847376,
    89109.14505065, 91809.98342107, 54010.91167324, 103259.7319888, 113319.79557154,
    81529.81385057, 83590.49251746, 115902.53443622, 63608.1666576, 72175.25765417,
    88719.32305603, 97215.1090373, 80570.98830349, 67796.25874935, 99321.80738101,
]

# Salaries in the second company (30 values).
salaries_company_B = [
    89845.96793876, 90027.93042629, 108596.08141043, 120113.67952031, 94794.04532001,
    99565.51332692, 110927.06162603, 85471.82457925, 79030.8553638, 82644.84718934,
    71592.66608011, 68244.23637394, 134420.97566401, 72106.76757987, 95429.7573215,
    88285.90615416, 110973.4078626, 92323.32822085, 117740.37152488, 87412.61048855,
    94906.53993793, 105017.39597368, 93983.46012639, 100538.051311, 95673.65143504,
    61727.33698247, 105311.27474286, 113551.6401474, 87408.82036567, 85895.00912077,
]

### Task
- Test the hypothesis that the mean salaries in the two companies are equal
- Use a t-test
- H0: the mean salaries are the same, HA: the mean salaries are not the same


In [18]:
import scipy.stats as st

In [19]:
# Independent two-sample t-test on the two salary samples, then display it.
# NOTE(review): called with SciPy's defaults, which (per the SciPy docs)
# assume equal population variances; pass equal_var=False for Welch's test
# if that assumption is not warranted.
t_test = st.ttest_ind(salaries_company_A, salaries_company_B)
t_test

Ttest_indResult(statistic=-2.960239205553994, pvalue=0.004446743273557563)

In [20]:
# Pull the p-value out of the t-test result and display it.
# NOTE(review): this overwrites the `p` assigned from the Shapiro-Wilk test.
p = t_test.pvalue
p

0.004446743273557563

In [21]:
# Compare the t-test p-value against alpha and print the verdict.
evaluate_test(p, alpha)

p = 0.0, alpha = 0.05
H0 is rejected.


In [22]:
# Fit a normal distribution to company A's salaries; the two fitted
# parameters are unpacked as location and standard deviation and displayed.
loc_A, std_A = st.norm.fit(salaries_company_A)
loc_A, std_A

(82507.78449639535, 15569.509119771908)

In [23]:
# Fit a normal distribution to company B's salaries; the two fitted
# parameters are unpacked as location and standard deviation and displayed.
loc_B, std_B = st.norm.fit(salaries_company_B)
loc_B, std_B

(94785.36713716066, 16013.761205101173)