# Statistics

In [2]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

### Types of Measures
- **Central tendency:** describes where the centre of the data lies, e.g. mean, median and mode
- **Variability:** tells you about the spread of data e.g. variance and standard deviation
- **Correlation or joint variability:** tells you about the relation between a pair of variables in a dataset e.g. correlation coefficient

#### Mean

In [9]:
# Complete sample: ten observations, none missing.
x = [23, 45, 12, 34, 56, 32, 44, 67, 39, 50]

# Arithmetic mean computed three equivalent ways.
mean = sum(x) / len(x)          # by hand: total divided by count
mean2 = statistics.mean(x)      # standard-library mean
mean3 = statistics.fmean(x)     # standard-library mean that works in floats (faster)
print(mean, mean2, mean3)


# Same sample with the second observation replaced by a missing value (NaN).
x_nan = x[:1] + [np.nan] + x[2:]

# A single NaN makes every ordinary mean NaN ...
mean4 = sum(x_nan) / len(x_nan)
mean5, mean6 = statistics.mean(x_nan), statistics.fmean(x_nan)
mean7 = np.mean(x_nan)
# ... while np.nanmean drops the NaN and averages the remaining nine values.
mean8 = np.nanmean(x_nan)
print(mean4, mean5, mean6, mean7, mean8)

40.2 40.2 40.2
nan nan nan nan 39.666666666666664


#### Weighted mean

In [15]:
# Observations and the weight attached to each one (paired by position).
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]

# By hand: add up the value * weight products, then divide by the total weight.
products = [value * weight for value, weight in zip(x, w)]
weighted_mean = sum(products) / sum(w)

# NumPy's built-in weighted average, for comparison with the manual result.
weighted_mean2 = np.average(x, weights=w)

print(weighted_mean, weighted_mean2)

6.95 6.95


#### Harmonic mean

In [20]:
# Sample values for the harmonic mean.
x = [8.0, 1, 2.5, 4, 28.0]

# By hand: the number of values divided by the sum of their reciprocals.
reciprocals = [1 / value for value in x]
hmean = len(x) / sum(reciprocals)

# The same statistic from the standard library and from SciPy.
hmean2 = statistics.harmonic_mean(x)
hmean3 = scipy.stats.hmean(x)

print(hmean, hmean2, hmean3)

2.7613412228796843 2.7613412228796843 2.7613412228796843


### Analysis Types

- **univariate:** one variable
- **bivariate:** two variables
- **multivariate:** three or more variables



#### Univariate

In [3]:
# Build a one-column DataFrame of sample ages.
data = {'ages': [23, 45, 12, 34, 56, 32, 44, 67, 39, 50]}
df = pd.DataFrame(data)
ages = df['ages']

# Descriptive statistics for the single variable.
mean, median = ages.mean(), ages.median()
std_dev = ages.std()  # pandas default is the sample standard deviation (ddof=1)

# mode() returns a Series holding every most-frequent value. Each age in this
# sample occurs exactly once, so all of them come back in ascending order and
# element 0 (the value printed below) is simply the smallest age.
mode = ages.mode()

# Report the results.
print('Mean:', mean)
print('Median:', median)
print('Mode:', mode[0])
print('Standard Deviation:', std_dev)

Mean: 40.2
Median: 41.5
Mode: 12
Standard Deviation: 15.984714921171134


### Population and samples 
- **Population:** set of all elements you're interested in 
- **Sample:** a subset of a population

### Outliers
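An outlier is a data point that differs significantly from most of the other values in the dataset. Common causes of outliers:

- Natural variation in data
- Change in the behavior of the observed system
- Errors in data collection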