In [9]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [None]:
s = Series([10,20,30,40,50])

## What kind of floats do we have?

- float16
- float32
- float64

In [3]:
#what about other things?

s = Series(['hello', 'out', 'there'])
s

0    hello
1      out
2    there
dtype: object

In [4]:
s = Series([10,20.5,'hello', [2,4,6]])
s

0           10
1         20.5
2        hello
3    [2, 4, 6]
dtype: object

In [5]:
# if you read in data that is dirty/bad/missing/corrupt, you might well
# get a dtype of "object" even though you really want/expect int or float

# Exercise: Ages

1. Create a series containing the ages of 5 people in your family
2. What dtype would be appropriate?
3. What if you want to measure the age more precisely, using fractional years? What would you use then?
4. Create a new series in which you measure the age (approximately) in seconds. Create that series using multiplication. Is int64 big enough for that?

In [11]:
ages = Series([1, 3, 36, 69])
ages

0     1
1     3
2    36
3    69
dtype: int64

In [12]:
# What is the largest value an int64 can hold?
# int64 is 8 bytes = 64 bits, so it has 2**64 distinct bit patterns (computed below).
# Because int64 is signed, those patterns cover -2**63 .. 2**63 - 1, so the
# maximum value is 2**63 - 1 (9223372036854775807), not 2**64.

2**64

18446744073709551616

In [16]:
# int8 holds -128..127, which is enough for human ages and uses 1 byte per value instead of 8
ages = Series([1, 3, 36, 69], dtype='int8')
ages


0     1
1     3
2    36
3    69
dtype: int8

In [17]:
ages = Series([0.5, 3.2, 36, 69])
ages

0     0.5
1     3.2
2    36.0
3    69.0
dtype: float64

In [8]:
# Approximate seconds per year: 365 days * 24 hours * 60 minutes * 60 seconds (leap days ignored).
# The largest result is about 2.2e9 seconds: too big for int32 (max about 2.1e9),
# but far below the int64 maximum of 2**63 - 1 (about 9.2e18), so int64 is big enough.
ageInSeconds = ages*365*24*60*60
ageInSeconds

0    1.576800e+07
1    1.009152e+08
2    1.135296e+09
3    2.175984e+09
dtype: float64

# Changing dtypes

It's very common for us to want to change the dtype of a series. We can't do that! Once a dtype is set, it is forever. However, we can get a new series back, based on the old one, with a different dtype.

In [23]:
s = Series('10 20 30'.split())
s

0    10
1    20
2    30
dtype: object

In [24]:
s*3

0    101010
1    202020
2    303030
dtype: object

In [25]:
s=s.astype('int64') # astype returns a new series, so rebind s to it; each numeric string is parsed into an integer
s

0    10
1    20
2    30
dtype: int64

In [26]:
s*3

0    30
1    60
2    90
dtype: int64

In [27]:
s = Series([10,20,30])
s.astype('int8')

0    10
1    20
2    30
dtype: int8

In [29]:
s = Series('10 20 hello 30'.split())
s.astype('int64')

ValueError: invalid literal for int() with base 10: 'hello'

In [32]:
# errors='ignore' suppresses the ValueError, but it does not skip only the bad value:
# when the conversion fails, astype returns the original series unchanged,
# so the dtype is still object and none of the strings were converted.
s.astype('int8', errors='ignore')

0       10
1       20
2    hello
3       30
dtype: object

In [33]:
s = Series([10,20,30])
s.astype('float16')

0    10.0
1    20.0
2    30.0
dtype: float16

In [37]:
s = Series([10.5,20.3,30.4])
s

0    10.5
1    20.3
2    30.4
dtype: float64

In [35]:
s.astype('int64') #this will cut off the digits behind the decimal

0    10
1    20
2    30
dtype: int64

In [43]:
s = Series([10.8, 20.3, 30.4])

# astype('int64') alone would truncate 10.8 down to 10.
# Rounding to the nearest whole number first gives 11, and only then do we convert to int64.
s.round().astype('int64')


0    11
1    20
2    30
dtype: int64

# Exercise: Rounding floats

1. Create a series of 10 floats between 0-1,000.
2. Get the mean of all floats whose integer portion is even.

In [57]:
np.random.seed(0) # fix the seed so the same 10 numbers are generated on every run
s = Series(np.random.rand(10)) * 1000 # rand returns floats in [0, 1); scaling by 1000 gives 10 floats in [0, 1000)
s

0    548.813504
1    715.189366
2    602.763376
3    544.883183
4    423.654799
5    645.894113
6    437.587211
7    891.773001
8    963.662761
9    383.441519
dtype: float64

In [61]:
s.loc[s.astype('int64') % 2 == 0]  #generates a True or False series that can be applied using .loc

0    548.813504
2    602.763376
3    544.883183
dtype: float64

In [64]:
(
    s
    .loc[s.astype('int64') % 2 == 0]  # keep only the floats whose integer portion is even
    .mean()                           # average those original float values (not the truncated integers)
)

565.4866876652885

# NaN -- not a number

When we have values in our series, they are all of the same dtype. What if there are missing values? They can't be 0. They need to be something that we can clearly understand is different.

The solution is 'NaN'. In NumPy, we write it as 'nan'. In Pandas, we call it 'NaN'. But they are precisely the same.

In [65]:
type(np.nan)

float

In [66]:
np.nan == 0

False

In [67]:
np.nan == np.nan

False

In [68]:
s = Series([10,20,30,np.nan,50])
s

0    10.0
1    20.0
2    30.0
3     NaN
4    50.0
dtype: float64

In [69]:
s.astype('int64')

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [71]:
s = Series([10,20,30])
s.loc[1] = np.nan
s

0    10.0
1     NaN
2    30.0
dtype: float64

In [72]:
# How can I find NaN values? Comparing against np.nan does NOT work:
# NaN is unequal to everything, including itself (np.nan == np.nan is False),
# so `s != np.nan` is True for every row and this filter removes nothing --
# the NaN at index 1 is still in the result. Use s.isna() / s.dropna() instead.

s.loc[s != np.nan]

0    10.0
1     NaN
2    30.0
dtype: float64

# Where does 'NaN' come from?

- Missing data
- Technical glitches
- Human error
- A legitimate non-answer, such as a survey question the respondent chose not to answer

In [10]:
s = Series([10,20,30,np.nan,50])
s

0    10.0
1    20.0
2    30.0
3     NaN
4    50.0
dtype: float64

In [11]:
s.isna()

0    False
1    False
2    False
3     True
4    False
dtype: bool

In [12]:
s.dropna()

0    10.0
1    20.0
2    30.0
4    50.0
dtype: float64

In [13]:
s.isna().sum()

1