# Descriptive Statistics (The Basics)
## 1. Measures of Central Tendency ("The Center")

In [1]:
# data generator
import random

def create(size, first, last):
    """Return a list of `size` random integers between `first` and `last`.

    Both bounds are inclusive, following ``random.randint``.
    """
    return [random.randint(first, last) for _ in range(size)]

In [2]:
# Mean (The Average): the total of all values divided by their count
data = create(10, 1, 1000)

data_sum = sum(data)  # integer total of the generated values
the_mean = data_sum / len(data)

print('Data: ', data)
print('Mean: ', the_mean)

Data:  [808, 270, 443, 445, 193, 618, 140, 686, 776, 869]
Mean:  524.8


In [3]:
# Median (The Middle): the central value once the data is sorted
data = create(11, 1, 1000)
print('Data:        ', data)
data.sort()

index = len(data) // 2
if len(data) % 2:
    # Odd number of values: exactly one element sits in the middle
    the_median = data[index]
else:
    # Even number of values: average the two elements around the middle
    the_median = (data[index - 1] + data[index]) / 2

print('Data sorted: ', data)
print('Median:      ', the_median)

Data:         [495, 279, 553, 602, 960, 359, 596, 814, 37, 312, 893]
Data sorted:  [37, 279, 312, 359, 495, 553, 596, 602, 814, 893, 960]
Median:       553


In [4]:
# Mode (The Most Frequent): the value(s) that occur most often
data = create(100, 1, 10)

# Tally how many times each value appears
mode_dict = {}
for num in data:
    mode_dict[num] = mode_dict.get(num, 0) + 1

max_frequency = max(mode_dict.values())
# Several values may tie for the highest count, so keep every one of them
mode = [key for key in mode_dict if mode_dict[key] == max_frequency]

print('Data:          ', data)
print('Max frequency: ', max_frequency)
print('Modes:         ', mode)

Data:           [6, 10, 5, 1, 5, 5, 7, 1, 8, 10, 10, 3, 3, 6, 7, 9, 7, 7, 7, 10, 8, 2, 3, 8, 2, 1, 2, 1, 7, 1, 4, 2, 4, 6, 2, 7, 1, 7, 9, 8, 6, 9, 6, 10, 9, 6, 2, 10, 5, 10, 3, 1, 9, 6, 9, 3, 6, 8, 4, 4, 1, 3, 3, 7, 10, 6, 5, 3, 6, 2, 7, 1, 3, 2, 4, 6, 9, 2, 1, 1, 10, 7, 10, 8, 3, 7, 3, 6, 4, 6, 9, 8, 4, 10, 5, 10, 5, 1, 2, 2]
Max frequency:  13
Modes:          [6]


## 2. Measures of Spread / Variability (The "Dispersion")

In [5]:
# Range: the distance between the largest and the smallest value
data = create(15, 1, 1000)
smallest, largest = min(data), max(data)
the_range = largest - smallest

print('Data:  ', data)
print('Range: ', the_range)

Data:   [11, 456, 574, 236, 442, 829, 101, 732, 224, 749, 228, 166, 74, 695, 594]
Range:  818


In [6]:
# Variance & Standard Deviation (std)
import math
data = create(15, 1, 1000)

# Arithmetic mean of the data
data_sum = sum(data)
mean = data_sum / len(data)

# Squared distance of every data point from the mean
differences = [(mean - num)**2 for num in data]

data_sum = sum(differences)
# Population variance: the average squared distance, i.e. divide by N
variance = data_sum / len(differences)
# Sample variance divides by N - 1 instead (note the parentheses):
#   data_sum / (len(differences) - 1)
# The standard deviation is the square root of the variance, which brings
# the measure back to the same unit as the data
std = math.sqrt(variance)

print('Data:                ', data)
print('Mean:                ', mean)
print('Differences squared: ', differences)
print('Variance:            ', variance)
print('Standart deviation:  ', std)
# For roughly bell-shaped data, most points fall within mean +- std

Data:                 [377, 547, 287, 427, 336, 397, 573, 789, 9, 779, 369, 457, 631, 285, 399]
Mean:                 444.1333333333333
Differences squared:  [4506.884444444443, 10581.551111111112, 24690.88444444444, 293.55111111111086, 11692.817777777776, 2221.5511111111105, 16606.61777777778, 118933.01777777779, 189341.01777777777, 112135.68444444444, 5645.017777777776, 165.5511111111113, 34919.15111111111, 25323.417777777777, 2037.0177777777772]
Variance:             37272.915555555555
Standart deviation:   193.06194745613533


## 3. Measures of Position (The "Distribution")

In [23]:
# Percentiles (a value below which a given percentage of observations falls)

def calculate_percentile(data, percentile):
    """Return the requested percentile of `data` using linear interpolation.

    The data is sorted and the percentile is mapped to a (possibly
    fractional) position between index 0 and index ``len(data) - 1``.
    A fractional position is resolved by interpolating linearly between
    the two neighbouring elements.

    Args:
        data: Non-empty iterable of numbers.
        percentile: Percentage in the closed range [0, 100].

    Returns:
        The element itself when the position lands exactly on an index,
        otherwise a float interpolated between the two surrounding elements.

    Raises:
        ValueError: If `data` is empty or `percentile` is outside [0, 100].
    """
    sorted_data = sorted(data)
    length = len(sorted_data)

    if length == 0:
        raise ValueError('data must contain at least one value')
    # Without this check a negative percentile would silently index from the
    # end of the list and return a wrong value.
    if not 0 <= percentile <= 100:
        raise ValueError('percentile must be between 0 and 100')

    # Multiply before dividing: percentile / 100 is usually not exactly
    # representable (7 / 100 * 100 evaluates to 7.000000000000001), which
    # would make an exact position look fractional.
    index = percentile * (length - 1) / 100

    if index.is_integer():
        return sorted_data[int(index)]

    lower_index = int(index)
    fraction = index - lower_index

    lower_value = sorted_data[lower_index]
    # The range check above guarantees lower_index + 1 is a valid index here.
    upper_value = sorted_data[lower_index + 1]

    return lower_value + fraction * (upper_value - lower_value)

# Generate a sample and report a few of its percentiles
data = create(25, 1, 1000)
print('Data:            ', data)
print('Sorted data:     ', sorted(data))
for label, pct in (
    ('20s percentile:  ', 20),
    ('33rd percentile: ', 33),
    ('80s percentile:  ', 80),
):
    print(label, calculate_percentile(data, pct))

Data:             [873, 709, 808, 672, 699, 586, 617, 61, 760, 288, 194, 812, 161, 858, 159, 138, 405, 761, 853, 784, 44, 320, 111, 123, 902]
Sorted data:      [44, 61, 111, 123, 138, 159, 161, 194, 288, 320, 405, 586, 617, 672, 699, 709, 760, 761, 784, 808, 812, 853, 858, 873, 902]
20s percentile:   154.8
33rd percentile:  280.48
80s percentile:   808.8


In [29]:
# Quartiles and Interquartile Range (IQR)

data = create(30, 1, 1000)

# Q1, Q2 (the median) and Q3 are the 25th, 50th and 75th percentiles
q1, q2, q3 = [calculate_percentile(data, pct) for pct in (25, 50, 75)]

# The IQR is the distance between the third and the first quartile
iqr = q3 - q1

print('Data:        ', data)
print('Sorted data: ', sorted(data))
print('Q1:          ', q1)
print('Q2:          ', q2)
print('Q3:          ', q3)
print('IQR:         ', iqr)

Data:         [674, 601, 277, 976, 196, 596, 114, 51, 824, 857, 33, 318, 873, 560, 557, 30, 230, 683, 740, 456, 691, 593, 336, 559, 925, 752, 866, 846, 793, 134]
Sorted data:  [30, 33, 51, 114, 134, 196, 230, 277, 318, 336, 456, 557, 559, 560, 593, 596, 601, 674, 683, 691, 740, 752, 793, 824, 846, 857, 866, 873, 925, 976]
Q1:           287.25
Q2:           594.5
Q3:           782.75
IQR:          495.5
