In [1]:
import pandas as pd
import numpy as np

# A Note on Column (Re)Assignment

* It is not always clear when Pandas returns a copy vs view.
* Assigning into the result of a selection (chained assignment) is dangerous -- don't do it!
* See Chapters 2.3 and 4.1 in the textbook.
* Caution: Tutorials lie to you!

### TL;DR: Make copies whenever possible, and use `assign` for adding columns

In [2]:
def create_rands(seed=42, n_rows=10, n_cols=5):
    """Build a reproducible DataFrame of small random integers.

    Called with no arguments, this returns the same 10 x 5 frame on every
    call, so each experiment in the notebook can start from identical data.

    Parameters
    ----------
    seed : int, default 42
        Value passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random state as a side effect.
    n_rows : int, default 10
        Number of rows to generate.
    n_cols : int, default 5
        Number of columns to generate.

    Returns
    -------
    pandas.DataFrame
        An ``n_rows`` x ``n_cols`` frame of integers drawn from ``[0, 5)``,
        with columns named ``col0``, ``col1``, ...
    """
    # Reseed on every call so repeated calls return identical data.
    np.random.seed(seed)
    data = np.random.randint(0, 5, size=(n_rows, n_cols))
    columns = ['col%d' % n for n in range(n_cols)]
    return pd.DataFrame(data, columns=columns)

# 1 -- slice rows, then change column

In [18]:
# Start experiment 1 from a freshly generated frame.
df = create_rands()
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


In [19]:
# Boolean-mask row selection: keep only the rows whose col0 is even.
evens = df[df['col0'] % 2 == 0]
# Alternative to try: evens = df.loc[9]
evens

Unnamed: 0,col0,col1,col2,col3,col4
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
9,0,1,4,1,3


In [20]:
# Deliberate anti-pattern: assign into the filtered frame. pandas emits the
# "set on a copy of a slice" warning shown below.
evens['col0'] = -1000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
# The filtered frame itself now shows -1000 in col0.
evens

Unnamed: 0,col0,col1,col2,col3,col4
5,-1000,2,1,3,3
6,-1000,3,3,0,2
7,-1000,2,4,0,1
9,-1000,1,4,1,3


In [22]:
# The original frame still has its old col0 values, so the write to `evens`
# landed on a copy rather than on df.
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


# 2 -- slice column, then change row

In [24]:
# Reset to a freshly generated frame for experiment 2.
df = create_rands()
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


In [25]:
# Select a single column, then overwrite its even entries through that
# selection (deliberate anti-pattern for the demonstration).
col = df.loc[:, 'col0']
col.loc[df['col0'] % 2 == 0] = -1000
col

0       3
1       1
2       3
3       1
4       1
5   -1000
6   -1000
7   -1000
8       3
9   -1000
Name: col0, dtype: int64

In [26]:
# Unlike experiment 1, the change made through `col` also appears in df
# (rows 5, 6, 7 and 9 of col0 are now -1000).
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,-1000,2,1,3,3
6,-1000,3,3,0,2
7,-1000,2,4,0,1
8,3,0,3,1,1
9,-1000,1,4,1,3


# 3 -- select row, then change entry

In [27]:
# Reset to a freshly generated frame for experiment 3.
df = create_rands()
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


In [28]:
# Select one row of the all-integer frame, then overwrite a single entry of
# that row (deliberate anti-pattern for the demonstration).
row = df.loc[0]
row['col0'] = -1000

In [29]:
# Row 0 of col0 now reads -1000: the write through `row` reached df.
df

Unnamed: 0,col0,col1,col2,col3,col4
0,-1000,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


# 4 -- select row, then change entry (w/string column)

In [30]:
# Same starting data, but col4 is replaced with the string 'a', so the frame
# now mixes integer columns with a string column.
df = create_rands().assign(col4='a')
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,a
1,1,2,2,2,a
2,3,2,4,1,a
3,1,3,4,0,a
4,1,4,3,0,a
5,2,2,1,3,a
6,2,3,3,0,a
7,4,2,4,0,a
8,3,0,3,1,a
9,0,1,4,1,a


In [31]:
# Identical statements to the all-integer case; on this mixed-dtype frame
# pandas emits the "set on a copy of a slice" warning shown below.
row = df.loc[0]
row['col0'] = -1000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [32]:
# Row 0 still has col0 == 3: this time the write through `row` did not reach df.
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,a
1,1,2,2,2,a
2,3,2,4,1,a
3,1,3,4,0,a
4,1,4,3,0,a
5,2,2,1,3,a
6,2,3,3,0,a
7,4,2,4,0,a
8,3,0,3,1,a
9,0,1,4,1,a


## References vs Copies
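* If the table is homogeneous (every column has the same dtype) -- pandas hands memory management over to NumPy, so a selection may be a reference (view) to the original data.
* If the table is heterogeneous (mixed dtypes) -- pandas makes copies.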

In [33]:
# Fresh all-integer frame for the `.values` experiment.
df = create_rands()
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


In [34]:
# Take `df.values` (a NumPy array) and overwrite every zero in that array.
arr = df.values
arr[arr == 0] = -100000

In [37]:
# The array shows -100000 wherever there used to be a zero.
arr

array([[      3,       4,       2,       4,       4],
       [      1,       2,       2,       2,       4],
       [      3,       2,       4,       1,       3],
       [      1,       3,       4, -100000,       3],
       [      1,       4,       3, -100000, -100000],
       [      2,       2,       1,       3,       3],
       [      2,       3,       3, -100000,       2],
       [      4,       2,       4, -100000,       1],
       [      3, -100000,       3,       1,       1],
       [-100000,       1,       4,       1,       3]])

In [38]:
# df shows the same -100000 entries: for this all-integer frame, writing to
# the array also changed the DataFrame.
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,-100000,3
4,1,4,3,-100000,-100000
5,2,2,1,3,3
6,2,3,3,-100000,2
7,4,2,4,-100000,1
8,3,-100000,3,1,1
9,-100000,1,4,1,3


In [39]:
# Add a string column (col5) so the frame no longer has a single dtype.
df = create_rands().assign(col5='a')
df

Unnamed: 0,col0,col1,col2,col3,col4,col5
0,3,4,2,4,4,a
1,1,2,2,2,4,a
2,3,2,4,1,3,a
3,1,3,4,0,3,a
4,1,4,3,0,0,a
5,2,2,1,3,3,a
6,2,3,3,0,2,a
7,4,2,4,0,1,a
8,3,0,3,1,1,a
9,0,1,4,1,3,a


In [40]:
# Same zero-replacement as before, now on the mixed-dtype frame.
arr = df.values
arr[arr == 0] = -100000

In [41]:
# The array has dtype=object and contains the replaced values.
arr

array([[3, 4, 2, 4, 4, 'a'],
       [1, 2, 2, 2, 4, 'a'],
       [3, 2, 4, 1, 3, 'a'],
       [1, 3, 4, -100000, 3, 'a'],
       [1, 4, 3, -100000, -100000, 'a'],
       [2, 2, 1, 3, 3, 'a'],
       [2, 3, 3, -100000, 2, 'a'],
       [4, 2, 4, -100000, 1, 'a'],
       [3, -100000, 3, 1, 1, 'a'],
       [-100000, 1, 4, 1, 3, 'a']], dtype=object)

In [42]:
# df still contains its zeros: on this mixed-dtype frame, writing to the
# array from `.values` did not change the DataFrame.
df

Unnamed: 0,col0,col1,col2,col3,col4,col5
0,3,4,2,4,4,a
1,1,2,2,2,4,a
2,3,2,4,1,3,a
3,1,3,4,0,3,a
4,1,4,3,0,0,a
5,2,2,1,3,3,a
6,2,3,3,0,2,a
7,4,2,4,0,1,a
8,3,0,3,1,1,a
9,0,1,4,1,3,a
