# Descriptive Statistics (The Basics)
## 1. Measures of Central Tendency ("The Center")

In [2]:
# data generator
import random

def create(size, first, last):
    """Return a list of `size` random integers.

    Every value is drawn with `random.randint(first, last)`, so both
    `first` and `last` are possible results.
    """
    return [random.randint(first, last) for _ in range(size)]

In [3]:
# Mean (The Average): the sum of all values divided by how many there are
data = create(10, 1, 1000)

data_sum = sum(data)
the_mean = data_sum / len(data)

print('Data: ', data)
print('Mean: ', the_mean)

Data:  [656, 766, 336, 176, 672, 800, 870, 113, 88, 498]
Mean:  497.5


In [4]:
# Median (The Middle): the central value of the sorted data
data = create(11, 1, 1000)
print('Data:        ', data)

# Sort in place so the middle position(s) can be read off by index.
data.sort()
middle = len(data) // 2

# Odd length: a single middle element.
# Even length: the average of the two elements around the middle.
if len(data) % 2:
    the_median = data[middle]
else:
    the_median = (data[middle] + data[middle - 1]) / 2

print('Data sorted: ', data)
print('Median:      ', the_median)

Data:         [505, 506, 744, 386, 575, 505, 365, 726, 170, 584, 20]
Data sorted:  [20, 170, 365, 386, 505, 505, 506, 575, 584, 726, 744]
Median:       505


In [5]:
# Mode (The Most Frequent): the value(s) that occur most often
data = create(100, 1, 10)

# Tally how many times each value appears.
mode_dict = {}
for value in data:
    mode_dict[value] = mode_dict.get(value, 0) + 1

# Several values can share the highest count, so collect all of them.
max_frequency = max(mode_dict.values())
mode = [value for value in mode_dict if mode_dict[value] == max_frequency]

print('Data:          ', data)
print('Max frequency: ', max_frequency)
print('Modes:         ', mode)

Data:           [9, 7, 3, 7, 1, 2, 1, 7, 1, 6, 6, 2, 6, 7, 7, 3, 8, 5, 10, 2, 2, 8, 6, 8, 6, 2, 9, 5, 2, 5, 6, 9, 9, 2, 1, 4, 5, 5, 3, 9, 5, 2, 10, 3, 8, 10, 2, 10, 6, 2, 2, 1, 3, 5, 8, 8, 10, 8, 6, 4, 4, 1, 7, 7, 2, 9, 6, 5, 1, 9, 5, 10, 10, 2, 6, 5, 7, 1, 5, 6, 6, 1, 2, 5, 10, 10, 7, 6, 2, 2, 5, 3, 9, 4, 10, 6, 3, 5, 7, 6]
Max frequency:  16
Modes:          [2]


## 2. Measures of Spread / Variability (The "Dispersion")

In [6]:
# Range: the distance between the largest and the smallest value
data = create(15, 1, 1000)

smallest, largest = min(data), max(data)
the_range = largest - smallest

print('Data:  ', data)
print('Range: ', the_range)

Data:   [910, 249, 770, 919, 723, 681, 735, 390, 612, 304, 377, 657, 473, 576, 17]
Range:  902


In [7]:
# Variance & Standard Deviation (std)
data = create(15, 1, 1000)

# Arithmetic mean of the data.
mean = sum(data) / len(data)

# Squared deviation of every value from the mean.
differences = [(mean - value) ** 2 for value in data]

# Add up the squared deviations one at a time, left to right.
data_sum = 0
for squared in differences:
    data_sum += squared

# Population variance divides by N.
# Sample variance would instead be data_sum / (len(differences) - 1).
variance = data_sum / len(differences)
# The standard deviation is the square root of the variance.
std = variance ** 0.5

print('Data:                ', data)
print('Mean:                ', mean)
print('Differences squared: ', differences)
print('Variance:            ', variance)
print('Standart deviation:  ', std)
# Typically, most data points fall between mean - std and mean + std.

Data:                 [747, 64, 681, 519, 317, 233, 956, 645, 165, 209, 575, 392, 237, 403, 467]
Mean:                 440.6666666666667
Differences squared:  [93840.1111111111, 141877.77777777778, 57760.1111111111, 6136.111111111109, 15293.444444444449, 43125.44444444445, 265568.4444444444, 41752.1111111111, 75992.11111111112, 53669.44444444445, 18045.444444444438, 2368.444444444446, 41480.11111111112, 1418.7777777777792, 693.4444444444434]
Variance:             57268.088888888895
Standart deviation:   239.3075194992604


## 3. Measures of Position (The "Distribution")

In [8]:
# Percentiles (a value below which a given percentage of observations fall)

def calculate_percentile(data, percentile):
    """Return the requested percentile of `data` using linear interpolation.

    The data is sorted into a new list and the percentile is mapped to the
    fractional rank ``percentile * (n - 1) / 100``, where ``n`` is the number
    of values. A whole-number rank selects that element directly; any other
    rank is interpolated between its two neighbouring elements.

    Args:
        data: Non-empty iterable of numbers. It is not modified.
        percentile: Number in the closed range [0, 100].

    Returns:
        The element at the rank when the rank is a whole number, otherwise
        the linearly interpolated value between the two surrounding elements.

    Raises:
        ValueError: If `data` is empty or `percentile` is outside [0, 100].
    """
    sorted_data = sorted(data)
    if not sorted_data:
        raise ValueError('cannot compute a percentile of empty data')
    # Reject out-of-range requests explicitly: a negative rank would otherwise
    # silently index from the end of the list and return a wrong value.
    if not 0 <= percentile <= 100:
        raise ValueError('percentile must be between 0 and 100')

    length = len(sorted_data)
    # Multiply before dividing so whole-number ranks stay exact; dividing
    # first can give e.g. 0.07 * 100 == 7.000000000000001.
    index = percentile * (length - 1) / 100

    lower_index = int(index)
    fraction = index - lower_index
    if fraction == 0:
        return sorted_data[lower_index]

    lower_value = sorted_data[lower_index]
    upper_value = sorted_data[lower_index + 1]
    return lower_value + fraction * (upper_value - lower_value)

# Demo: a few percentiles of 25 random values
data = create(25, 1, 1000)
print('Data:            ', data)
print('Sorted data:     ', sorted(data))

# Each row pairs the printed label with the percentile to compute.
for label, wanted in (
    ('20s percentile:  ', 20),
    ('33rd percentile: ', 33),
    ('80s percentile:  ', 80),
):
    print(label, calculate_percentile(data, wanted))

Data:             [109, 658, 465, 460, 250, 712, 106, 856, 842, 284, 941, 350, 692, 174, 20, 876, 985, 649, 894, 109, 41, 20, 916, 907, 302]
Sorted data:      [20, 20, 41, 106, 109, 109, 174, 250, 284, 302, 350, 460, 465, 649, 658, 692, 712, 842, 856, 876, 894, 907, 916, 941, 985]
20s percentile:   109.0
33rd percentile:  281.28
80s percentile:   879.6


In [9]:
# Quartiles and Interquartile Range (IQR)

data = create(30, 1, 1000)

# Q1, Q2 (the median) and Q3 are the 25th, 50th and 75th percentiles.
q1, q2, q3 = (calculate_percentile(data, p) for p in (25, 50, 75))

# The IQR is the distance between the third and the first quartile.
iqr = q3 - q1

print('Data:        ', data)
print('Sorted data: ', sorted(data))
print('Q1:          ', q1)
print('Q2:          ', q2)
print('Q3:          ', q3)
print('IQR:         ', iqr)

Data:         [345, 953, 183, 117, 899, 574, 148, 553, 641, 238, 438, 288, 828, 465, 886, 613, 790, 41, 267, 64, 529, 121, 482, 343, 970, 263, 868, 224, 38, 939]
Sorted data:  [38, 41, 64, 117, 121, 148, 183, 224, 238, 263, 267, 288, 343, 345, 438, 465, 482, 529, 553, 574, 613, 641, 790, 828, 868, 886, 899, 939, 953, 970]
Q1:           227.5
Q2:           451.5
Q3:           752.75
IQR:          525.25
