# Agenda: Indexing and dtypes

- Indexes
    - Setting
    - Resetting
- `inplace=True`
- dtypes

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# five integers; since we pass no index, pandas numbers the rows 0 through 4
s = Series(data=[10, 20, 30, 40, 50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [3]:
# we can use any data we want as the index
# the easiest way to set the index (after the fact) is to assign to the "index" attribute
# before assigning anything, look at what the attribute currently holds

s.index

RangeIndex(start=0, stop=5, step=1)

In [4]:
# replace the default numeric index with one label per value
s.index = ['a', 'b', 'c', 'd', 'e']
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [5]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [6]:
# the index can also be supplied up front, when the series is created:
# hand the labels to the constructor's "index" keyword argument

s = Series(
    [10, 20, 30, 40, 50],
    index=['a', 'b', 'c', 'd', 'e'],
)
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [7]:
# deliberate error: 5 values, but only 4 index labels
# the constructor raises ValueError, so the assignment never happens and
# s keeps its previous value
s = Series([10, 20, 30, 40, 50],
           index=list('abcd'))    # only 4 elements in the index -- what happens?
s

ValueError: Length of values (5) does not match length of index (4)

In [8]:
# sometimes, we want to get rid of an index we've set
# we can use the "reset_index" method to do so -- this returns a new data frame
# in that data frame, the old index becomes a column named "index", and the
# values become a column named 0

s.reset_index()

Unnamed: 0,index,0
0,a,10
1,b,20
2,c,30
3,d,40
4,e,50


In [9]:
# have I changed s?
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

# `reset_index` and `inplace=True`

Many methods in Pandas don't modify a series (or data frame), but rather return a new one, based on the old one, in which our changes have taken place. For example, when we invoked `reset_index`, we got back a new data frame, but our series was unchanged.

This is on purpose! It allows you, for example, to use method chaining to create long and complex queries without assignment.

If you want, though, you can use the keyword argument `inplace=True` in `reset_index` and many other methods. This forces the method to modify the series/data frame itself. The method will then return `None`.

**PLEASE NEVER, EVER USE `inplace=True`**

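It is so tempting! But it breaks method chaining, gives no real performance benefit, and makes a cell behave differently when you run it a second time.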

# `set_index`

If you have a data frame with two or more columns, you can get back a new data frame in which one of the columns is used as the index. To do so, invoke `set_index` and pass the name of the column you want to use (the original data frame is not modified):

In [10]:
# keep the result of reset_index in df, so that we have a two-column data frame
# (columns "index" and 0) to experiment with
df = s.reset_index()
df

Unnamed: 0,index,0
0,a,10
1,b,20
2,c,30
3,d,40
4,e,50


In [11]:
df.set_index('index')  # here, I pass the name of the column that should be used as an index

Unnamed: 0_level_0,0
index,Unnamed: 1_level_1
a,10
b,20
c,30
d,40
e,50


In [12]:
# have I changed df?  No, I just got back a new data frame
df

Unnamed: 0,index,0
0,a,10
1,b,20
2,c,30
3,d,40
4,e,50


In [13]:
# can we set column 0 to be our index? Yes!
df.set_index(0)

Unnamed: 0_level_0,index
0,Unnamed: 1_level_1
10,a
20,b
30,c
40,d
50,e


In [14]:
df

Unnamed: 0,index,0
0,a,10
1,b,20
2,c,30
3,d,40
4,e,50


In [15]:
# what if I just want column 0?

df[0]   # on a data frame, [] returns a column

0    10
1    20
2    30
3    40
4    50
Name: 0, dtype: int64

In [17]:
# let's get the rows in which the "index" column is 'c'
# (this returns whole rows, not just the value from column 0)

df.loc[    df['index'] == 'c'  ]  # we hand a boolean series to df.loc, and get back matching rows

Unnamed: 0,index,0
2,c,30


In [19]:
# we can make this easier and nicer (to read and write) by setting the index!
# we can even use method chaining across lines! (the next cell splits this same chain over several lines)

df.set_index('index').loc['c']

0    30
Name: c, dtype: int64

In [20]:
(
    df
    .set_index('index')
    .loc['c']
)

0    30
Name: c, dtype: int64

# Exercise: Weather and indexes

1. Create a series in which the index contains dates in MMDD format (e.g., `'0531'` and `'0601'`), all as strings. The values should be the expected/forecast high temperatures for the next 10 days wherever you are.
2. Use `reset_index`. What do you see?
3. Use a mask index to retrieve the projected high temps for June 2 and June 5.
4. Use `set_index` and `.loc` to achieve the same goal (but, I hope, in a nicer/more compact way).

In [21]:
# forecast highs for ten days, labelled with MMDD strings (May 31 through June 9)
s = Series(
    [28, 31, 34, 36, 33, 34, 34, 34, 33, 31],
    index=['0531', '0601', '0602', '0603', '0604',
           '0605', '0606', '0607', '0608', '0609'],
)
s

0531    28
0601    31
0602    34
0603    36
0604    33
0605    34
0606    34
0607    34
0608    33
0609    31
dtype: int64

In [23]:
# reset_index on a series produces a two-column data frame, with the original index + original data
# reset_index on a data frame produces a data frame with one new column, the original index, added to the existing ones

s.reset_index()  

Unnamed: 0,index,0
0,531,28
1,601,31
2,602,34
3,603,36
4,604,33
5,605,34
6,606,34
7,607,34
8,608,33
9,609,31


In [24]:
# 3. Use a mask index to retrieve the projected high temps for June 2 and June 5.


(
    s
    .reset_index()
    # build the mask from the reset frame's own "index" column (via a callable),
    # rather than from the outer s.index -- that way the mask always lines up
    # with the rows it is filtering, instead of matching only by position
    .loc[lambda df_: (df_['index'] == '0602') | (df_['index'] == '0605')]  # rows where "index" is 0602 or 0605
)

Unnamed: 0,index,0
2,602,34
5,605,34


In [27]:
# "isin" tests each label for membership in a list,
# replacing a chain of == comparisons joined with |

(
    s
    .loc[s.index.isin(['0602', '0605'])]
    .reset_index()
)

Unnamed: 0,index,0
0,602,34
1,605,34


In [38]:
# Use set_index and .loc to achieve the same goal (but, I hope, in a nicer/more compact way).

# start again from the reset version of s: default index, plus the columns "index" and 0
df = s.reset_index()
df

Unnamed: 0,index,0
0,531,28
1,601,31
2,602,34
3,603,36
4,604,33
5,605,34
6,606,34
7,607,34
8,608,33
9,609,31


In [40]:
(
    df                          # we have our data frame with the default (range) index and two columns, "index" and 0
    .set_index('index')         # use the "index" column as the data frame's index
    .loc[ ['0602', '0605'] ]    # use fancy indexing and .loc to retrieve two rows from that data frame
)

Unnamed: 0_level_0,0
index,Unnamed: 1_level_1
602,34
605,34


In [34]:
df['index']   # show me the column named "index" in the data frame df

0    0531
1    0601
2    0602
3    0603
4    0604
5    0605
6    0606
7    0607
8    0608
9    0609
Name: index, dtype: object

In [35]:
# NOTE: this assignment rebinds df, so from here on df no longer has an "index" column
df = df.set_index('index')
df['index']   # show me the column named "index" in df -- which doesn't exist any more!

KeyError: 'index'

In [36]:
df

Unnamed: 0_level_0,0
index,Unnamed: 1_level_1
531,28
601,31
602,34
603,36
604,33
605,34
606,34
607,34
608,33
609,31


In [41]:
# when I have a data frame, loc retrieves rows (based on the index) and [] retrieve columns

# rebuild df from s first: an earlier cell rebound df to the version indexed by "index",
# and the cells below need the default 0-9 index plus the "index" column
df = s.reset_index()
df

Unnamed: 0,index,0
0,531,28
1,601,31
2,602,34
3,603,36
4,604,33
5,605,34
6,606,34
7,607,34
8,608,33
9,609,31


In [42]:
df.loc[2]

index    0602
0          34
Name: 2, dtype: object

In [43]:
df.loc[ [2,4,7] ]

Unnamed: 0,index,0
2,602,34
4,604,33
7,607,34


In [44]:
(
    df
    .set_index('index')
    # after set_index, the row labels are the strings '0531' ... '0609', so the
    # integers 2, 4, and 7 are not labels that loc can find -- this raises KeyError
    .loc[[2,4,7]]   # this won't work -- I can use iloc like this, but not loc
)

KeyError: "None of [Index([2, 4, 7], dtype='int64', name='index')] are in the [index]"

In [46]:
(
    df
    .set_index('index')
    .loc[['0602','0605', '0608']] 
)

Unnamed: 0_level_0,0
index,Unnamed: 1_level_1
602,34
605,34
608,33


In [47]:
# get a boolean series showing when df[0] (the max temp) will be higher than the mean
# for max temps 

df[0] > df[0].mean()

0    False
1    False
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9    False
Name: 0, dtype: bool

In [48]:
# Given this, I can now apply it (with .loc) to df, and get only those rows where df[0] > mean

df.loc[  df[0] > df[0].mean()   ]

Unnamed: 0,index,0
2,602,34
3,603,36
4,604,33
5,605,34
6,606,34
7,607,34
8,608,33


# Let's talk about `dtypes`

Remember that in Python, everything is an object. We have integer objects, float objects, and also string, list, tuple, etc. objects. These objects manage their own memory -- so if I have an integer, it's responsible for knowing how much memory to request from the system, and also for freeing up that memory when we don't need the value any more.

This means that an integer in Python can take up an unknown amount of RAM. 

This is completely counter to how things work in languages like C, where you know how big an integer is (say, 64 bits), and then you know that if you have 10 integers, you'll need 640 bits. That's because in C-like languages, a variable is an alias to a location in memory. The language needs to know how much memory to allocate to each value, by learning (from us) what type each value will be.

Pandas, using NumPy under the hood, is very similar to C in this case, because it's written in C and uses C data structures. A Pandas series is basically a bunch of C-language integers, floats, etc.

This means that we need to tell Pandas how much memory it should allocate for each value in a series. We need to balance out two things: (a) making sure that there is enough space to store the values we want, but (b) not taking up too much space unnecessarily.

In [49]:
# every value in a series shares a single dtype; ask the series which one it is using
s = Series(data=[10, 20, 30])
s.dtype

dtype('int64')

In [50]:
# an unsigned 64-bit integer can hold this many distinct values (0 through 2**64 - 1);
# the signed int64 dtype used above tops out at 2**63 - 1
2 ** 64

18446744073709551616

In [51]:
s = Series(data=[10, 20, 30], dtype=np.int32)  # ask for 32-bit integers instead of the default 64-bit ones


In [52]:
s

0    10
1    20
2    30
dtype: int32

# What options do we have?

- Signed integers
    - `int64` (default)
    - `int32`
    - `int16`
    - `int8`
- Unsigned integers, which are never negative (zero or positive)
    - `uint64`
    - `uint32`
    - `uint16`
    - `uint8`
- Floats
    - `float16`
    - `float32`
    - `float64`


In [53]:
# five values, stored with the default int64 dtype
s = Series(data=[10, 20, 30, 40, 50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [54]:
# if each integer is 64 bits (8 bytes), the five values alone take up 40 bytes
# memory_usage() reports more than that, because by default it also counts the index

s.memory_usage()

172

In [55]:
# one more value than before, to see how much memory a single int64 adds
s = Series(data=[10, 20, 30, 40, 50, 60])
s

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int64

In [56]:
s.memory_usage()

180

In [58]:
# All of these numbers are small.
# Can we get away with int8 for them?
# Yes! (Probably)

s = Series(data=[2, 10, 15, 20, 30, 35, 100], dtype=np.int8)
s

0      2
1     10
2     15
3     20
4     30
5     35
6    100
dtype: int8

In [59]:
s + 3

0      5
1     13
2     18
3     23
4     33
5     38
6    103
dtype: int8

In [60]:
s ** 3

0      8
1    -24
2     47
3     64
4    120
5    123
6     64
dtype: int8

In [61]:
# we used 8-bit signed integers, which means that we can go from -128 to +127
# anything above +127 or below -128 "wraps around"

# you have to defend yourself against such things -- make sure that you're using a dtype that is sufficiently large
# for your needs!



# Exercise: Setting dtypes

1. Create a series of 10 random integers (using `np.random.randint(LOW, HIGH, NUMBER)`) with `a-j` as the index. Multiply the first and final values by 1,000. Is the dtype a factor?
2. Create a series of random float values from 1-1,000, with the dtype of `np.float64`. Is that too big? Big enough?

In [63]:
np.random.seed(0)   # ensure that we are all getting the same "random" numbers
# 10 integers drawn from 0 (inclusive) up to 100 (exclusive)
# NOTE(review): the exercise asks for `a-j` as the index; this cell keeps the default 0-9 index
Series(np.random.randint(0, 100, 10))

0    51
1    89
2    13
3    89
4    11
5    19
6    19
7    58
8    71
9    60
dtype: int64