In [153]:
import numpy as np
import pandas as pd

## Немного ни о чем

Поговорим про разные представления для пропущенных значений

In [2]:
df_float = pd.DataFrame(
    {
        'column_none': [1., 2., 3., 4., 5., None],
        'column_nan': [1., 2., 3., 4., 5., np.nan],
    }
)

In [3]:
df_float['column_none'] == df_float['column_nan']

0     True
1     True
2     True
3     True
4     True
5    False
dtype: bool

In [4]:
df_float

Unnamed: 0,column_none,column_nan
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


In [5]:
df_float.loc[5]

column_none   NaN
column_nan    NaN
Name: 5, dtype: float64

In [6]:
df_float['column_none'].dtype, df_float['column_nan'].dtype

(dtype('float64'), dtype('float64'))

Видим, что None скастовался в np.nan и стал float. Что будет, если у нас изначально данные из целых чисел?

In [7]:
df_int = pd.DataFrame(
    {
        'column_none': [1, 2, 3, 4, 5, None],
        'column_nan': [1, 2, 3, 4, 5, np.nan],
    }
)

In [8]:
df_int['column_none'] == df_int['column_nan']

0     True
1     True
2     True
3     True
4     True
5    False
dtype: bool

In [9]:
df_int

Unnamed: 0,column_none,column_nan
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


In [10]:
df_int

Unnamed: 0,column_none,column_nan
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


Тоже получили конвертацию None в NaN

In [11]:
df_int.loc[5]

column_none   NaN
column_nan    NaN
Name: 5, dtype: float64

In [12]:
df_int['column_none'].dtype, df_int['column_nan'].dtype

(dtype('float64'), dtype('float64'))

Произошел каст, вероятно не очень желательный

In [13]:
df_int['column_nan'] = df_int['column_nan'].astype(np.int16)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

Просто так скастить в integer, не выбрасывая NaN нельзя

In [61]:
df_int['column_nan'] = df_int['column_nan'].astype("Int16")

О, что-то получилось?

In [62]:
df_int

Unnamed: 0,column_none,column_nan
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


In [63]:
df_int['column_none'].nbytes, df_int['column_nan'].nbytes

(48, 18)

In [64]:
df_int.column_none.loc[5]

nan

In [65]:
df_int.column_nan.loc[5]

<NA>

Получили еще одну версию для "ничего"?))0)

In [66]:
df_int.isna()

Unnamed: 0,column_none,column_nan
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,True,True


In [67]:
df_int['column_none'].loc[5].nbytes

8

In [145]:
df_int.memory_usage(index=False)

column_1    48
column_2    18
dtype: int64

In [69]:
df_int.columns = ['column_1', 'column_2']

In [70]:
df_int

Unnamed: 0,column_1,column_2
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


In [71]:
df_int.loc[1, 'column_1'] = None
df_int.loc[1, 'column_2'] = None

In [72]:
df_int

Unnamed: 0,column_1,column_2
0,1.0,1.0
1,,
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


Произошел еще один type cast

In [73]:
nas = df_int.column_2[df_int.column_2.isna()]

In [74]:
nas.iloc[0] == nas.iloc[1]

<NA>

In [75]:
nas.iloc[0] is nas.iloc[1]

True

!!!

In [76]:
id(nas.iloc[0]), id(nas.iloc[1])

(139672941813136, 139672941813136)

Это один и тот же объект!

In [77]:
nans = df_int.column_1[df_int.column_1.isna()]

In [78]:
nans

1   NaN
5   NaN
Name: column_1, dtype: float64

In [79]:
a, b = nans.values

In [80]:
a, b

(nan, nan)

In [81]:
a == b

False

In [82]:
a is b

False

In [83]:
id(a), id(b)

(139672314563696, 139672314563920)

И напоследок

In [84]:
set([float('nan'), float('nan')])

{nan, nan}

In [85]:
set([np.float64('nan'), np.float64('nan')])

{nan, nan}

In [86]:
set([pd.NA, pd.NA])

{<NA>}

In [87]:
set([np.nan, np.nan])

{nan}

In [88]:
np.nan is np.nan is np.NaN is np.NAN

True

In [89]:
pd.NA is pd.NA

True

In [90]:
type(1 + pd.NA)

pandas._libs.missing.NAType

In [91]:
id(1 + pd.NA), id(pd.NA)

(139672941813136, 139672941813136)

In [92]:
np.nan is np.NaN is np.NAN

True

In [93]:
type(1 + np.nan)

float

In [94]:
id(np.nan), id(np.nan + 1)

(139673343209232, 139672315328240)

## Задачи

###  Given series A and series B

In [95]:
series_a = pd.Series([1, 2, 4, 3])
series_b = pd.Series([3, 4, 5, 6])

- Items is series A not present in series B

In [96]:
~series_a.isin(series_b)

0     True
1     True
2    False
3    False
dtype: bool

In [97]:
series_a[~series_a.isin(series_b)]

0    1
1    2
dtype: int64

- Intersection of series

In [98]:
series_a = pd.Series([1, 2, 4, 3])
series_b = pd.Series([3, 4, 5, 6])

In [99]:
%%time
np.intersect1d(series_a, series_b)

CPU times: user 67 µs, sys: 3 µs, total: 70 µs
Wall time: 62.7 µs


array([3, 4])

In [100]:
%%time
set(series_a) & set(series_b)

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 9.54 µs


{3, 4}

- Items presented only in one of the series, not in both

In [101]:
np.setdiff1d(series_a, series_b), np.setdiff1d(series_b, series_a)

(array([1, 2]), array([5, 6]))

In [102]:
np.union1d(np.setdiff1d(series_a, series_b), np.setdiff1d(series_b, series_a))

array([1, 2, 5, 6])

или

In [103]:
np.setxor1d(series_a, series_b)

array([1, 2, 5, 6])

или

In [104]:
series_union = pd.Series(np.union1d(series_a, series_b))
series_intersect = pd.Series(np.intersect1d(series_a, series_b))

series_union, series_intersect

(0    1
 1    2
 2    3
 3    4
 4    5
 5    6
 dtype: int64,
 0    3
 1    4
 dtype: int64)

In [105]:
series_union[~series_union.isin(series_intersect)]

0    1
1    2
4    5
5    6
dtype: int64

### Merge by column pairs: fruit-pazham, weight-kilo

In [106]:
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'kilo': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})

In [107]:
df1.head()

Unnamed: 0,fruit,weight,price
0,apple,high,7
1,banana,medium,0
2,orange,low,12
3,apple,high,1
4,banana,medium,11


In [108]:
df2.head()

Unnamed: 0,pazham,kilo,price
0,apple,high,14
1,orange,low,4
2,pine,high,7
3,apple,low,13
4,orange,high,9


In [109]:
pd.merge(df1, df2, how='inner', left_on=['fruit', 'weight'], right_on = ['pazham', 'kilo'], suffixes=('_left', '_right'))

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,7,apple,high,14
1,apple,high,1,apple,high,14
2,apple,high,2,apple,high,14
3,orange,low,12,orange,low,4
4,orange,low,7,orange,low,4
5,orange,low,14,orange,low,4


In [110]:
df_merged = df1.merge(df2, how='inner', left_on=['fruit', 'weight'], right_on = ['pazham', 'kilo'], suffixes=('_left', '_right'))
df_merged

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,7,apple,high,14
1,apple,high,1,apple,high,14
2,apple,high,2,apple,high,14
3,orange,low,12,orange,low,4
4,orange,low,7,orange,low,4
5,orange,low,14,orange,low,4


Lets explore dropping duplicate rows

In [111]:
df_merged.drop_duplicates(keep='first')

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,7,apple,high,14
1,apple,high,1,apple,high,14
2,apple,high,2,apple,high,14
3,orange,low,12,orange,low,4
4,orange,low,7,orange,low,4
5,orange,low,14,orange,low,4


In [112]:
df_merged.drop_duplicates(keep='last', inplace=False, ignore_index=False, subset=['fruit', 'weight'])

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
2,apple,high,2,apple,high,14
5,orange,low,14,orange,low,4


- *keep* отвечает за то, какой индекс оставить для одинаковых элементов, первый встреченный или последний
- *ignore_index* -- делать ли реиндексацию или нет
- *subset* -- задать множество колонок, по которым будем считать дупликаты
- *inplace* -- (здесь и далее) делать ли изменения в исходном датафрейме (inplace=True) или вернуть копию

###  Reverse all rows (first row should become last etc.)

In [146]:
df = pd.DataFrame(np.arange(30).reshape(-1, 6), columns=map(lambda x: f'column_{x}', range(6)))

In [147]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29


In [148]:
df.iloc[::-1]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


In [149]:
df[::-1]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


In [150]:
df.iloc[::-1, :]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


In [151]:
df.reindex(index=df.index[::-1])

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


In [152]:
df.loc[df.index[::-1]]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


### Get column name with the highest number of row-wise maximum’s in dataframe

In [116]:
df = pd.DataFrame(np.random.randint(0, 100, 50).reshape(-1, 5), columns=map(lambda x: f'column_{x}', range(5)))

In [117]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,39,37,22,62,69
1,11,86,77,44,14
2,81,7,72,89,4
3,38,92,97,72,14
4,14,92,2,26,97
5,30,76,92,96,47
6,43,55,69,70,23
7,99,55,80,26,49
8,86,4,58,58,35
9,68,91,68,67,5


In [118]:
df.idxmax(axis=1)

0    column_4
1    column_1
2    column_3
3    column_2
4    column_4
5    column_3
6    column_3
7    column_0
8    column_0
9    column_1
dtype: object

То же самое через df.apply

In [119]:
df.apply(np.argmax, axis=1)

0    4
1    1
2    3
3    2
4    4
5    3
6    3
7    0
8    0
9    1
dtype: int64

In [120]:
df.idxmax(axis=1).value_counts()

column_3    3
column_4    2
column_1    2
column_0    2
column_2    1
dtype: int64

In [121]:
df.idxmax(axis=1).value_counts().index

Index(['column_3', 'column_4', 'column_1', 'column_0', 'column_2'], dtype='object')

Наш ответ

In [122]:
df.idxmax(axis=1).value_counts().index[0]

'column_3'

### Find the positions of numbers that are multiples of N

In [123]:
N = 5

In [124]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,39,37,22,62,69
1,11,86,77,44,14
2,81,7,72,89,4
3,38,92,97,72,14
4,14,92,2,26,97
5,30,76,92,96,47
6,43,55,69,70,23
7,99,55,80,26,49
8,86,4,58,58,35
9,68,91,68,67,5


For each column Series separately 

In [125]:
pos_map = df.apply(lambda x: x % N == 0)
for col in pos_map.columns:
    ser = pos_map.loc[:, col]
    print(f'{col}: {ser[ser].index.tolist()}')

column_0: [5]
column_1: [6, 7]
column_2: [7]
column_3: [6]
column_4: [8, 9]


```
column_0: [0, 2, 6, 7]
column_1: [0, 5]
column_2: [6, 9]
column_3: [7, 8]
column_4: [5, 8]
```

Now try to treat rows and columns as coordinates. Return list (or array) of pairs for such elements. (one-liner)

In [126]:
np.argwhere(df.values % N == 0).tolist()

[[5, 0], [6, 1], [6, 3], [7, 1], [7, 2], [8, 4], [9, 4]]

```[[0, 0],
 [0, 1],
 [2, 0],
 [5, 1],
 [5, 4],
 [6, 0],
 [6, 2],
 [7, 0],
 [7, 3],
 [8, 3],
 [8, 4],
 [9, 2]]
 ```

### Compute the minimum-by-maximum for every row of dataframe

In [127]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,39,37,22,62,69
1,11,86,77,44,14
2,81,7,72,89,4
3,38,92,97,72,14
4,14,92,2,26,97
5,30,76,92,96,47
6,43,55,69,70,23
7,99,55,80,26,49
8,86,4,58,58,35
9,68,91,68,67,5


In [128]:
# your code
# hint: use df.min, max or np.min, max

### Normalize all columns of df by subtracting the column mean and divide by standard deviation.

In [129]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,39,37,22,62,69
1,11,86,77,44,14
2,81,7,72,89,4
3,38,92,97,72,14
4,14,92,2,26,97
5,30,76,92,96,47
6,43,55,69,70,23
7,99,55,80,26,49
8,86,4,58,58,35
9,68,91,68,67,5


In [130]:
# your code
# hint: use apply (for what axis?), np.mean and np.std (or similar pandas methods)

### Range all columns of df such that the minimum value in each column is 0 and max is 1

In [131]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,39,37,22,62,69
1,11,86,77,44,14
2,81,7,72,89,4
3,38,92,97,72,14
4,14,92,2,26,97
5,30,76,92,96,47
6,43,55,69,70,23
7,99,55,80,26,49
8,86,4,58,58,35
9,68,91,68,67,5


In [132]:
# your code
# same as prev task. google for the minmax normalization formula 

### Create a column that contains the second-largest value in each row?

In [133]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,39,37,22,62,69
1,11,86,77,44,14
2,81,7,72,89,4
3,38,92,97,72,14
4,14,92,2,26,97
5,30,76,92,96,47
6,43,55,69,70,23
7,99,55,80,26,49
8,86,4,58,58,35
9,68,91,68,67,5


In [134]:
# your code
# hint: apply with sorting

### Split a text column into two separate columns?

In [135]:
df_text = pd.DataFrame(
    {
        'row':
        [
            'id\t Name, Surname',
            '2\t Nadal, Raphael',
            '5\t Djokovic,  Novak',
            '1\t Federer, Roger'
        ]
    }
)

In [136]:
df_text

Unnamed: 0,row
0,"id\t Name, Surname"
1,"2\t Nadal, Raphael"
2,"5\t Djokovic, Novak"
3,"1\t Federer, Roger"


In [137]:
splitted_df = df_text.row.str.split(r'\t|,', expand=True)

In [138]:
splitted_df

Unnamed: 0,0,1,2
0,id,Name,Surname
1,2,Nadal,Raphael
2,5,Djokovic,Novak
3,1,Federer,Roger


In [139]:
splitted_df.columns = splitted_df.iloc[0]

In [140]:
splitted_df

Unnamed: 0,id,Name,Surname
0,id,Name,Surname
1,2,Nadal,Raphael
2,5,Djokovic,Novak
3,1,Federer,Roger


In [141]:
splitted_df.drop(0, inplace=True)

In [142]:
splitted_df

Unnamed: 0,id,Name,Surname
1,2,Nadal,Raphael
2,5,Djokovic,Novak
3,1,Federer,Roger


In [143]:
splitted_df.set_index('id', inplace=True)

In [144]:
splitted_df

Unnamed: 0_level_0,Name,Surname
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,Nadal,Raphael
5,Djokovic,Novak
1,Federer,Roger


---

Больше [упражнений на Pandas с решениями](https://www.machinelearningplus.com/python/101-pandas-exercises-python/)

## PS

1. Для тех, кто сидит на Windows, советую поставить WSL2 и работать из-под этой подсистемы:

- [гайд](https://learn.microsoft.com/ru-ru/windows/wsl/install#manual-installation-steps) от Microsoft
- [базовые команды терминала](https://ubuntu.com/tutorials/command-line-for-beginners#1-overview)
- [установите Anaconda](https://www.digitalocean.com/community/tutorials/how-to-install-the-anaconda-python-distribution-on-ubuntu-22-04)

Запускать ноутбучки локально с помощью команды `jupyter notebook --no-browser`. Также можно работать с подсистемой с помощью VSCode

Сможете легко склонировать наш (и не только) [репозиторий](https://github.com/sanityseeker/lspy-2023) с помощью git clone :) 

2. Расширения для Jupyter Notebook -- [nbextensions](https://jupyter-contrib-nbextensions.readthedocs.io/en/latest/install.html) 