# Using Python 3.7.4 to calculate Dispersion

In [None]:
# Import some required libraries
import numpy as np
from scipy import stats
import statistics
import pandas as pd

# Define the Data

In [None]:
# Let us consider a simple series of 128 integer observations:
Series_X = np.array([
    366, 93, 694, 655, 530, 728, 456, 413, 298, 676, 478, 462, 553, 89, 194, 257, 589, 526, 552, 647,
    468, 218, 537, 505, 104, 82, 491, 474, 720, 594, 138, 127, 404, 142, 480, 523, 188, 514, 645, 281,
    187, 447, 128, 181, 501, 298, 756, 765, 450, 711, 663, 63, 628, 105, 430, 532, 737, 669, 773, 727,
    234, 126, 76, 538, 127, 100, 653, 785, 507, 324, 205, 741, 61, 299, 443, 275, 106, 770, 566, 325,
    305, 330, 345, 424, 333, 227, 102, 207, 489, 211, 648, 396, 428, 512, 591, 263, 167, 253, 214, 530,
    581, 753, 445, 169, 302, 518, 694, 548, 716, 365, 535, 734, 102, 762, 351, 393, 222, 506, 123, 402,
    355, 644, 170, 131, 373, 456, 754, 621,
])

# A Series has no columns: assigning `df.columns` only attaches a stray Python
# attribute and leaves the Series unnamed. Passing `name=` labels the data so
# that describe() and other outputs are tagged "Series_X".
df = pd.Series(Series_X, name='Series_X')

In [None]:
# describe() gathers count, mean, std, min, the quartiles and max in one call,
# which covers most of the basic Central Tendency and Dispersion statistics.
summary_stats = df.describe()
summary_stats

# Range

In [None]:
# The range is the simplest dispersion measure: largest value minus smallest.
# Both extremes are read from a single describe() summary.

summary = df.describe()
min_val, max_val = summary.loc['min'], summary.loc['max']
Range_val = max_val - min_val

print(
    "Minimum of Series_X = ", min_val,
    "\nMaximum of Series_X = ", max_val,
    "\nHence, Range of Series_X= ", Range_val,
)

# Quartile Deviation

In [None]:
# Quartile deviation (semi-interquartile range): half the distance between
# the first and third quartiles, both taken from one describe() summary.

summary = df.describe()
Q1 = summary.loc['25%']
Q3 = summary.loc['75%']

InterQuartile_Range = Q3 - Q1
Quartile_Deviation = InterQuartile_Range / 2

print(
    "3rd Quartile of Series_X = ", Q3,
    "\n1st Quartile of Series_X = ", Q1,
    "\nHence, InterQuartile Range of Series_X= Q3-Q1 = ", InterQuartile_Range,
    "\nAnd Quartile Deviation = (Q3-Q1)/2= ", Quartile_Deviation,
)

# Mean Absolute Deviation (from Mean)

In [None]:
# Calculate mean absolute deviation about mean: the average of |x - mean(x)|.

# Series.mean() is the same value describe() reports under 'mean', without
# computing the whole summary table just to read one entry.
Mean_Series_X = df.mean()

# Signed distance of every observation from the mean; the standard-deviation
# cell reuses this Series.
Deviation_from_Mean = df - Mean_Series_X
Absolute_Deviation_from_Mean = Deviation_from_Mean.abs()

# Vectorised mean instead of Python's builtin sum() divided by describe()'s
# count: it divides by the number of non-missing observations (the same count
# describe() reports) and skips NaN instead of propagating it.
Mean_Absolute_Deviation_from_Mean = Absolute_Deviation_from_Mean.mean()

print("Mean of Series_X = ",round(Mean_Series_X,2),
      "\nMean Absolute Deviation from Mean for Series_X = ",round(Mean_Absolute_Deviation_from_Mean,2))

# Standard Deviation and Variance

In [None]:
# Calculate Standard deviation (population form): the square root of the mean
# squared deviation from the mean. Dividing by N rather than N-1 treats
# Series_X as the whole population, i.e. the same quantity as df.std(ddof=0).

# Deviation_from_Mean and Mean_Series_X come from the mean-absolute-deviation cell.
Deviation_from_Mean_Squared = Deviation_from_Mean**2

# Series.mean() divides the vectorised sum by the observation count, replacing
# the element-by-element builtin sum() and the extra describe() call.
Std_Deviation = Deviation_from_Mean_Squared.mean() ** 0.5

print("Mean of Series_X = ",round(Mean_Series_X,2),
      "\nStandard Deviation for Series_X = ",round(Std_Deviation,2))

Note that the Standard Deviation calculated by our formula is slightly different from the one reported by the `describe` function in Pandas. The reason is that we divide sum(Deviation_from_Mean_Squared) by the total number of observations in Series_X, i.e., 128, whereas the internal formula in `describe` divides by one less, i.e., 127. Why the difference? We treat Series_X as our total population, whereas most tools treat the data provided as a sample. When we calculate the SD for a sample, we divide by N-1. This is called Bessel's correction, which approximately corrects the bias that is created when the population SD formula is applied to a sample. In this case, since we consider Series_X to be our full population, we are safe dividing by the population count. However, if we had selected a random sample of 50 observations out of the 128 observations, we would divide by 49 instead of 50.

In [None]:
# Variance is the square of the standard deviation held in Std_Deviation.

Variance_val = pow(Std_Deviation, 2)

print("Variance for Series_X = ", round(Variance_val, 2))