# Отбор подмножеств данных в pandas


## Часть 4. Как НЕ нужно отбирать подмножества данных


## Учимся, что не нужно делать


## Получение правильного ответа с помощью неправильного кода


## Индексирование по цепочке с использованием списков

In [1]:
a = [1, 5, 10, 3, 99, 5, 8, 20, 40]
a

[1, 5, 10, 3, 99, 5, 8, 20, 40]

In [2]:
a[2:6]

[10, 3, 99, 5]

In [3]:
a[2:6][0]

10

## Присваивание нового значения списку с помощью индексирования по цепочке

In [4]:
a[2:6][0] = 50
a

[1, 5, 10, 3, 99, 5, 8, 20, 40]

## Ничего не произошло???

In [5]:
a_temp = a[2:6]
a_temp[0] = 50
a_temp

[50, 3, 99, 5]

In [6]:
a

[1, 5, 10, 3, 99, 5, 8, 20, 40]

## Временный объект был единственным измененным объектом


## Но разве Python не изменяет «одинаковые» объекты?

In [7]:
a1 = [0, 1, 2, 3]
b1 = a1
b1[0] = 99
b1

[99, 1, 2, 3]

In [8]:
a1

[99, 1, 2, 3]

## Оба изменились!?!


## Докажем с помощью функции id, что это один и тот же объект


In [9]:
id(a1)

4566206088

In [10]:
id(b1)

4566206088

In [11]:
id(a1) == id(b1)

True

## Итак, почему наше присваивание с помощью индексирования по цепочке не удалось?


## Поверхностная и глубокая копия (для опытных)

In [12]:
a = [7, [1, 2], 5, 6, 10, 14, 19, 20]
a

[7, [1, 2], 5, 6, 10, 14, 19, 20]

In [13]:
a_slice = a[1:4]
a_slice

[[1, 2], 5, 6]

In [14]:
id(a[1]) == id(a_slice[0])

True

## Присвоим внутреннему списку значение

In [15]:
inner_list = a_slice[0]
inner_list

[1, 2]

In [16]:
inner_list[0] = 99

In [17]:
a_slice

[[99, 2], 5, 6]

In [18]:
a

[7, [99, 2], 5, 6, 10, 14, 19, 20]

## Присваивание с помощью индексирования по цепочке в один этап

In [19]:
# выводим наш список, содержащий внутренний список
a

[7, [99, 2], 5, 6, 10, 14, 19, 20]

In [20]:
a[1:5][0][0] = 1000
a

[7, [1000, 2], 5, 6, 10, 14, 19, 20]

## Использование модуля copy для создания глубокой копии

In [21]:
import copy

In [22]:
a = [7, [1, 2], 5, 6, 10, 14, 19, 20]
a_slice = copy.deepcopy(a[1:4])

In [23]:
id(a[1]) == id(a_slice[0])

False

## Индексирование по цепочке в библиотеке pandas

In [24]:
import pandas as pd
df = pd.read_csv('Data/sample_data.csv', index_col=0)
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,4.6
Niko,TX,green,Lamb,2,70,8.3
Aaron,FL,red,Mango,12,120,9.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,1.8
Christina,TX,black,Melon,33,172,9.5
Cornelia,TX,red,Beans,69,150,2.2


#### Индексирование по цепочке – пример 1

In [25]:
df[['food', 'age', 'color']]['age']

Jane         30
Niko          2
Aaron        12
Penelope      4
Dean         32
Christina    33
Cornelia     69
Name: age, dtype: int64

In [26]:
a = ['food', 'age', 'color']
b = 'age'
df[a][b]

Jane         30
Niko          2
Aaron        12
Penelope      4
Dean         32
Christina    33
Cornelia     69
Name: age, dtype: int64

#### Индексирование по цепочке – пример 2

In [27]:
df.loc[['Niko', 'Dean'], ['state', 'height', 'color']][['height', 'color']]

Unnamed: 0,height,color
Niko,70,green
Dean,180,gray


In [28]:
a = ['Niko', 'Dean'], ['state', 'height', 'color']
b = ['height', 'color']

df.loc[a][b]

Unnamed: 0,height,color
Niko,70,green
Dean,180,gray


#### Индексирование по цепочке – пример 3

In [29]:
df.iloc[2:5].iloc[:, -3:]

Unnamed: 0,age,height,score
Aaron,12,120,9.0
Penelope,4,80,3.3
Dean,32,180,1.8


#### Индексирование по цепочке – пример 4

In [30]:
df.loc[['Aaron', 'Dean', 'Christina']][['age', 'food']]

Unnamed: 0,age,food
Aaron,12,Mango
Dean,32,Cheese
Christina,33,Melon


#### Индексирование по цепочке – пример 5

In [31]:
df[df['age'] > 10]['score']

Jane         4.6
Aaron        9.0
Dean         1.8
Christina    9.5
Cornelia     2.2
Name: score, dtype: float64

## Выявление индексирования по цепочке


## Делаем программный код в примерах идиоматическим

#### Индексирование по цепочке – пример 1 (идиоматический программный код)

In [32]:
# df[['food', 'age', 'color']]['age'] - плохо
df['age']

Jane         30
Niko          2
Aaron        12
Penelope      4
Dean         32
Christina    33
Cornelia     69
Name: age, dtype: int64

#### Индексирование по цепочке – пример 2 (идиоматический программный код)

In [33]:
# df.loc[['Niko', 'Dean'], ['state', 'height', 'color']][['height', 'color']] - плохо
df.loc[['Niko', 'Dean'], ['height', 'color']]

Unnamed: 0,height,color
Niko,70,green
Dean,180,gray


#### Индексирование по цепочке – пример 3 (идиоматический программный код)

In [34]:
# df.iloc[2:5].iloc[:, -3:] - плохо
df.iloc[2:5, -3:]

Unnamed: 0,age,height,score
Aaron,12,120,9.0
Penelope,4,80,3.3
Dean,32,180,1.8


#### Индексирование по цепочке – пример 4 (идиоматический программный код)

In [35]:
# df.loc[['Aaron', 'Dean', 'Christina']][['age', 'food']] - плохо
df.loc[['Aaron', 'Dean', 'Christina'], ['age', 'food']]

Unnamed: 0,age,food
Aaron,12,Mango
Dean,32,Cheese
Christina,33,Melon


#### Индексирование по цепочке – пример 5 (идиоматический программный код)

In [36]:
# df[df['age'] > 10]['score'] - плохо
df.loc[df['age'] > 10, 'score']

Jane         4.6
Aaron        9.0
Dean         1.8
Christina    9.5
Cornelia     2.2
Name: score, dtype: float64

## Почему индексирование по цепочке – это плохо?


#### Две отдельные операции


#### Предупреждение SettingWithCopy при выполнении присваивания

In [37]:
df[df['age'] > 10]['score'] = 99

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,4.6
Niko,TX,green,Lamb,2,70,8.3
Aaron,FL,red,Mango,12,120,9.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,1.8
Christina,TX,black,Melon,33,172,9.5
Cornelia,TX,red,Beans,69,150,2.2


## Присваивание завершилось неудачей!

In [39]:
df_temp = df[df['age'] > 10]
df_temp['score'] = 99

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [40]:
df_temp

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,99
Aaron,FL,red,Mango,12,120,99
Dean,AK,gray,Cheese,32,180,99
Christina,TX,black,Melon,33,172,99
Cornelia,TX,red,Beans,69,150,99


## Как правильно выполнять операцию присваивания?

In [41]:
df.loc[df['age'] > 10, 'score'] = 99
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,99.0
Niko,TX,green,Lamb,2,70,8.3
Aaron,FL,red,Mango,12,120,99.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,99.0
Christina,TX,black,Melon,33,172,99.0
Cornelia,TX,red,Beans,69,150,99.0


## Пример срабатывания предупреждения SettingWithCopy, когда присваивание выполнено

In [42]:
df['score'][df['age'] > 10]

Jane         99.0
Aaron        99.0
Dean         99.0
Christina    99.0
Cornelia     99.0
Name: score, dtype: float64

In [43]:
df['score'][df['age'] > 10] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [44]:
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,0.0
Niko,TX,green,Lamb,2,70,8.3
Aaron,FL,red,Mango,12,120,0.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,0.0
Christina,TX,black,Melon,33,172,0.0
Cornelia,TX,red,Beans,69,150,0.0


## Что случилось на этот раз?


## Почему выдается предупреждение, когда наша операция завершена успешно?

In [45]:
df = pd.read_csv('Data/sample_data.csv', index_col=0)

In [46]:
s = df['score']
s

Jane         4.6
Niko         8.3
Aaron        9.0
Penelope     3.3
Dean         1.8
Christina    9.5
Cornelia     2.2
Name: score, dtype: float64

In [47]:
s[s > 5] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


#### Почему здесь сработало предупреждение?

In [48]:
s

Jane         4.6
Niko         0.0
Aaron        0.0
Penelope     3.3
Dean         1.8
Christina    0.0
Cornelia     2.2
Name: score, dtype: float64

In [49]:
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,4.6
Niko,TX,green,Lamb,2,70,0.0
Aaron,FL,red,Mango,12,120,0.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,1.8
Christina,TX,black,Melon,33,172,0.0
Cornelia,TX,red,Beans,69,150,2.2


## Почему предупреждение так бесполезно?


## Что на самом деле должно было сказать предупреждение


## Обобщим, когда срабатывает предупреждение SettingWithCopy

In [50]:
df1 = df[['color', 'age']]
df1

Unnamed: 0,color,age
Jane,blue,30
Niko,green,2
Aaron,red,12
Penelope,white,4
Dean,gray,32
Christina,black,33
Cornelia,red,69


In [51]:
df1['age']  # нет предупреждения, здесь нет присваивания

Jane         30
Niko          2
Aaron        12
Penelope      4
Dean         32
Christina    33
Cornelia     69
Name: age, dtype: int64

In [52]:
df1['weight'] = [150, 30, 120, 40, 200, 130, 144]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [53]:
df1

Unnamed: 0,color,age,weight
Jane,blue,30,150
Niko,green,2,30
Aaron,red,12,120
Penelope,white,4,40
Dean,gray,32,200
Christina,black,33,130
Cornelia,red,69,144


In [54]:
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,4.6
Niko,TX,green,Lamb,2,70,0.0
Aaron,FL,red,Mango,12,120,0.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,1.8
Christina,TX,black,Melon,33,172,0.0
Cornelia,TX,red,Beans,69,150,2.2


In [55]:
df1['age'] = 99

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [56]:
df1

Unnamed: 0,color,age,weight
Jane,blue,99,150
Niko,green,99,30
Aaron,red,99,120
Penelope,white,99,40
Dean,gray,99,200
Christina,black,99,130
Cornelia,red,99,144


In [57]:
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,4.6
Niko,TX,green,Lamb,2,70,0.0
Aaron,FL,red,Mango,12,120,0.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,1.8
Christina,TX,black,Melon,33,172,0.0
Cornelia,TX,red,Beans,69,150,2.2


## Как библиотека pandas узнает, что нужно вызвать предупреждение?

In [58]:
df1.is_copy()

  """Entry point for launching an IPython kernel.


Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,4.6
Niko,TX,green,Lamb,2,70,0.0
Aaron,FL,red,Mango,12,120,0.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,1.8
Christina,TX,black,Melon,33,172,0.0
Cornelia,TX,red,Beans,69,150,2.2


In [59]:
df1._is_view

False

In [60]:
df.is_copy is None

  """Entry point for launching an IPython kernel.


True

In [61]:
df._is_view

False

In [62]:
food = df['food']
food.is_copy is None # не копия

  


True

In [63]:
food._is_view

True

## Ложноотрицательный результат: SettingWithCopy не срабатывает при индексировании по цепочке с индексаторами .loc и .iloc

In [64]:
df = pd.read_csv('Data/sample_data.csv', index_col=0)

In [65]:
df.loc[['Niko','Dean']]['age'] = 99
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,4.6
Niko,TX,green,Lamb,2,70,8.3
Aaron,FL,red,Mango,12,120,9.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,1.8
Christina,TX,black,Melon,33,172,9.5
Cornelia,TX,red,Beans,69,150,2.2


In [66]:
df.loc['Niko':'Dean']['age'] = 99
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,4.6
Niko,TX,green,Lamb,99,70,8.3
Aaron,FL,red,Mango,99,120,9.0
Penelope,AL,white,Apple,99,80,3.3
Dean,AK,gray,Cheese,99,180,1.8
Christina,TX,black,Melon,33,172,9.5
Cornelia,TX,red,Beans,69,150,2.2


## Какого черта????


## Хорошая новость


## Два распространенных сценария


## Сценарий 1 – Работа со всем объектом DataFrame

In [67]:
df = pd.read_csv('Data/sample_data.csv', index_col=0)

df.loc[df['state'] == 'TX', 'color'] = 'maroon'
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,4.6
Niko,TX,maroon,Lamb,2,70,8.3
Aaron,FL,red,Mango,12,120,9.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,1.8
Christina,TX,maroon,Melon,33,172,9.5
Cornelia,TX,maroon,Beans,69,150,2.2


In [68]:
df.loc[['Jane', 'Dean', 'Cornelia'], 'score'] = df.loc[['Jane', 'Dean', 'Cornelia'], 'score'] + 5
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,9.6
Niko,TX,maroon,Lamb,2,70,8.3
Aaron,FL,red,Mango,12,120,9.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,6.8
Christina,TX,maroon,Melon,33,172,9.5
Cornelia,TX,maroon,Beans,69,150,7.2


In [69]:
df.loc['Aaron', 'age'] = 15
df

Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,9.6
Niko,TX,maroon,Lamb,2,70,8.3
Aaron,FL,red,Mango,15,120,9.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,6.8
Christina,TX,maroon,Melon,33,172,9.5
Cornelia,TX,maroon,Beans,69,150,7.2


## Сценарий 2 – Работа с подмножеством исходного объекта DataFrame

In [70]:
df1 = df[['food', 'height']]
df1

Unnamed: 0,food,height
Jane,Steak,165
Niko,Lamb,70
Aaron,Mango,120
Penelope,Apple,80
Dean,Cheese,180
Christina,Melon,172
Cornelia,Beans,150


In [71]:
df1.is_copy()

  """Entry point for launching an IPython kernel.


Unnamed: 0,state,color,food,age,height,score
Jane,NY,blue,Steak,30,165,9.6
Niko,TX,maroon,Lamb,2,70,8.3
Aaron,FL,red,Mango,15,120,9.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,6.8
Christina,TX,maroon,Melon,33,172,9.5
Cornelia,TX,maroon,Beans,69,150,7.2


## Используйте метод .copy()

In [72]:
df1 = df[['food', 'height']].copy()
df1.is_copy is None

  


True

In [73]:
df1['height'] = 100
df1

Unnamed: 0,food,height
Jane,Steak,100
Niko,Lamb,100
Aaron,Mango,100
Penelope,Apple,100
Dean,Cheese,100
Christina,Melon,100
Cornelia,Beans,100


## Избегайте двусмысленности и сложности

## Отбор строк только с помощью оператора индексирования

In [74]:
df = pd.read_csv('Data/sample_data.csv', index_col=0)

In [75]:
df[1::2]

Unnamed: 0,state,color,food,age,height,score
Niko,TX,green,Lamb,2,70,8.3
Penelope,AL,white,Apple,4,80,3.3
Christina,TX,black,Melon,33,172,9.5


In [76]:
df['Niko':'Dean']

Unnamed: 0,state,color,food,age,height,score
Niko,TX,green,Lamb,2,70,8.3
Aaron,FL,red,Mango,12,120,9.0
Penelope,AL,white,Apple,4,80,3.3
Dean,AK,gray,Cheese,32,180,1.8


## Еще более странный… частичный отбор подмножества строк

In [77]:
df_sort = df.sort_index()
df_sort

Unnamed: 0,state,color,food,age,height,score
Aaron,FL,red,Mango,12,120,9.0
Christina,TX,black,Melon,33,172,9.5
Cornelia,TX,red,Beans,69,150,2.2
Dean,AK,gray,Cheese,32,180,1.8
Jane,NY,blue,Steak,30,165,4.6
Niko,TX,green,Lamb,2,70,8.3
Penelope,AL,white,Apple,4,80,3.3


In [78]:
df_sort['C':'E']

Unnamed: 0,state,color,food,age,height,score
Christina,TX,black,Melon,33,172,9.5
Cornelia,TX,red,Beans,69,150,2.2
Dean,AK,gray,Cheese,32,180,1.8


## Я никогда так не делаю

## Если я хочу получить срез строк, я всегда использую .loc/.iloc

## Поиск скалярного значения с помощью .at/.iat

In [79]:
df.at['Dean', 'age']

32

In [80]:
df.iat[5, 2]

'Melon'

## Какое предназначение у индексаторов .at/.iat?

## Выводы