# Descriptive Statistics - Measures of Central Tendency and variability
* Perform the following operations on any open source dataset (e.g., data.csv)

Provide summary statistics (mean, median, minimum, maximum, standard deviation) for a dataset (age, income etc.) with numeric variables grouped by one of the qualitative (categorical) variable. For example, if your categorical variable is age groups and quantitative variable is income, then provide summary statistics of income grouped by the age groups. Create a list that contains a numeric value for each response to the categorical variable. 


In [1]:
import random
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

In [5]:
gen = ['Male', 'Female']
loc = ['Chennai','Kochi','Kolkata','Mumbai','Banglore','Delhi']

data={'Age' : [random.randint(18,45) for _ in range(500)],
'Monthly_Income' : [random.randint(25000, 100000) for _ in range(500)],
'Gender' : [random.choice(gen) for _ in range(500)],
'Location' : [random.choice(loc) for _ in range(500)]}

In [10]:
df = pd.DataFrame(data)
df

Unnamed: 0,Age,Monthly_Income,Gender,Location
0,32,63470,Female,Banglore
1,22,81088,Female,Delhi
2,31,61956,Female,Delhi
3,35,98865,Male,Delhi
4,37,73229,Male,Banglore
...,...,...,...,...
495,38,62248,Female,Banglore
496,19,58710,Male,Delhi
497,20,92344,Female,Kochi
498,21,96905,Female,Mumbai


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             500 non-null    int64 
 1   Monthly_Income  500 non-null    int64 
 2   Gender          500 non-null    object
 3   Location        500 non-null    object
dtypes: int64(2), object(2)
memory usage: 15.8+ KB


In [12]:
df.describe()

Unnamed: 0,Age,Monthly_Income
count,500.0,500.0
mean,31.174,61960.06
std,7.947203,21716.387614
min,18.0,25111.0
25%,24.0,44456.25
50%,31.0,61296.5
75%,38.0,80501.25
max,45.0,99999.0


In [13]:
df.shape

(500, 4)

In [17]:
df['Age'].value_counts()

23    25
35    25
38    24
32    24
20    23
27    23
28    21
26    20
22    19
21    19
42    19
25    19
41    19
36    18
44    18
40    16
30    16
29    16
34    15
19    15
18    15
43    15
37    15
39    13
31    13
45    12
33    12
24    11
Name: Age, dtype: int64

In [18]:
df['Gender'].value_counts()

Female    272
Male      228
Name: Gender, dtype: int64

In [19]:
df['Location'].value_counts()

Banglore    95
Delhi       86
Kochi       81
Kolkata     81
Mumbai      81
Chennai     76
Name: Location, dtype: int64

In [21]:
df.duplicated().sum()

0

In [22]:
df.nunique()

Age                28
Monthly_Income    500
Gender              2
Location            6
dtype: int64

In [24]:
df.isnull().sum()

Age               0
Monthly_Income    0
Gender            0
Location          0
dtype: int64

In [57]:
def calculate_all_state_values(df):
  statData = {}
  indexes = ['mean', 'median', 'variance', 'standard deviation', 'minimum', 'maximum', '25% percentile', '50% percentile', '75% percentile', "count"]
  for column in df.columns:
    try:
      columnConvertedValues = pd.to_numeric(df[column])
      mean = df[column].mean()
      median = df[column].median()
      var = df[column].var()
      std = df[column].std()
      min = df[column].min()
      max = df[column].max()
      percentile25 = np.percentile(df[column], 25)
      percentile50 = np.percentile(df[column], 50)
      percentile75 = np.percentile(df[column], 75)
      count = df[column].count()
      statData[column] = [mean, median, var, std, min, max, percentile25, percentile50, percentile75, count]
    except ValueError:
      pass
  stat = pd.DataFrame(statData, index = indexes);
  return stat;

In [58]:
calculate_all_state_values(df)

Unnamed: 0,Age,Monthly_Income
mean,31.174,61960.06
median,31.0,61296.5
variance,63.15804,471601500.0
standard deviation,7.947203,21716.39
minimum,18.0,25111.0
maximum,45.0,99999.0
25% percentile,24.0,44456.25
50% percentile,31.0,61296.5
75% percentile,38.0,80501.25
count,500.0,500.0


In [59]:
def calculate_all_state_values_using_formula(df):
  statData = {}
  indexes = ['mean', 'median', 'mode', 'variance', 'standard deviation', 'minimum', 'maximum', '25% percentile', '50% percentile', '75% percentile', 'count']
  for column in df.columns:
    try:
      columnConvertedValues = pd.to_numeric(df[column])
      mean = df[column].sum()/len(df[column])
      sorted_data = np.sort(df['Monthly_Income'])
      median = df[column].median()
      mode = df[column].mode()
      var = np.square(df[column] - mean).sum()/len(df[column])
      std = np.sqrt(np.square(df[column] - mean).sum()/len(df[column]))
      min = df[column].min()
      max = df[column].max()
      percentile25 = df.loc[int((len(df[column])+1)/4), column]
      percentile50 = np.percentile(df[column], 50)
      percentile75 = np.percentile(df[column], 75)
      count = df[column].count()
      statData[column] = [mean, median, mode, var, std, min, max, percentile25, percentile50, percentile75, count]
    except ValueError:
      pass
  stat = pd.DataFrame(statData,index=indexes) 
  return stat

In [60]:
calculate_all_state_values_using_formula(df)

Unnamed: 0,Age,Monthly_Income
mean,31.174,61960.06
median,31.0,61296.5
mode,"0 23 1 35 Name: Age, dtype: int64",0 25111 1 25188 2 25314 3 ...
variance,63.031724,470658288.0204
standard deviation,7.939252,21694.660357
minimum,18,25111
maximum,45,99999
25% percentile,32,66833
50% percentile,31.0,61296.5
75% percentile,38.0,80501.25


In [61]:
import statistics
def calculate_all_state_values_using_stat_module(df):
  statData = {}
  indexes = ['mean', 'median', 'variance', 'standard deviation', 'minimum', 'maximum', '25% percentile', '50% percentile', '75% percentile', "count", "hormonic mean"]
  for column in df.columns:
    try:
      columnConvertedValues = pd.to_numeric(df[column])
      mean = statistics.mean(df[column])
      median = statistics.median(df[column])
      var = statistics.variance(df[column])
      std = statistics.stdev(df[column])
      min = df[column].min()
      max = df[column].max()
      percentile25 = np.percentile(df[column], 25)
      percentile50 = np.percentile(df[column], 50)
      percentile75 = np.percentile(df[column], 75)
      count = df[column].count()
      hormonic_mean = statistics.harmonic_mean(df[column])
      statData[column] = [mean, median, var, std, min, max, percentile25, percentile50, percentile75, count, hormonic_mean]
    except ValueError:
      pass
  stat = pd.DataFrame(statData, index = indexes);
  return stat;


In [62]:
calculate_all_state_values_using_stat_module(df)

Unnamed: 0,Age,Monthly_Income
mean,31.174,61960.06
median,31.0,61296.5
variance,63.15804,471601500.0
standard deviation,7.947203,21716.39
minimum,18.0,25111.0
maximum,45.0,99999.0
25% percentile,24.0,44456.25
50% percentile,31.0,61296.5
75% percentile,38.0,80501.25
count,500.0,500.0
