# Chapter 2. Types of Data

## Structured versus unstructured data

### Example of data preprocessing

In [2]:
len("This Wednesday morn, are you early to rise? Then look East. The Crescent Moon joins Venus & Saturn. Afloat in the dawn skies.")
# get the length of this text (number of characters for a string)


125

## Quantitative versus qualitative data

### Example – world alcohol consumption data

In [3]:
import pandas as pd

# read in the CSV file from a URL
drinks = pd.read_csv('https://raw.githubusercontent.com/sinanuozdemir/principles_of_data_science/master/data/chapter_2/drinks.csv')

# examine the data's first five rows
drinks.head()           # print the first 5 rows

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF


In [4]:
drinks['continent'].describe()

count     170
unique      5
top        AF
freq       53
Name: continent, dtype: object

In [5]:
drinks['beer_servings'].describe()

count    193.000000
mean     106.160622
std      101.143103
min        0.000000
25%       20.000000
50%       76.000000
75%      188.000000
max      376.000000
Name: beer_servings, dtype: float64

## The four levels of data

### The nominal level

### The ordinal level

In [6]:
import numpy

results = [5, 4, 3, 4, 5, 3, 2, 5, 3, 2, 1, 4, 5, 3, 4, 4, 5, 4, 2, 1, 4, 5, 4, 3, 2, 4, 4, 5, 4, 3, 2, 1]

sorted_results = sorted(results)

print(sorted_results)
'''
[1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5]
'''

print(numpy.mean(results))    # == 3.4375

print(numpy.median(results))  # == 4.0

[1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5]
3.4375
4.0


### The interval level

In [7]:
import numpy

temps = [31, 32, 32, 31, 28, 29, 31, 38, 32, 31, 30, 29, 30, 31, 26]

print(numpy.mean(temps))    # == 30.73

print(numpy.median(temps))  # == 31.0

30.733333333333334
31.0


In [8]:
import numpy

temps = [31, 32, 32, 31, 28, 29, 31, 38, 32, 31, 30, 29, 30, 31, 26]

mean = numpy.mean(temps)    # == 30.73

squared_differences = []
# empty list o squared differences

for temperature in temps:
    difference = temperature - mean                
    # how far is the point from the mean

    squared_difference = difference**2             
    # square the difference

    squared_differences.append(squared_difference) 
    # add it to our list
    

average_squared_difference = numpy.mean(squared_differences)  
# This number is also called the "Variance"


standard_deviation = numpy.sqrt(average_squared_difference)   
# We did it!


print(standard_deviation)  # == 2.5157

2.5157283018817607


### The ratio level

In [9]:
import numpy

temps = [31, 32, 32, 31, 28, 29, 31, 38, 32, 31, 30, 29, 30, 31, 26]

num_items = len(temps)
product = 1.

for temperature in temps:
    product *= temperature
    
geometric_mean = product**(1./num_items)

print(geometric_mean)   # == 30.634

30.63473484374659
