# Школа алготрейдеров. Блок торгового ПО и программирования
## Занятие 2. Стандартная библиотека языка Python. Обработка табличных данных с помощью библиотеки pandas

### [Тип `set`](https://docs.python.org/3/library/stdtypes.html#set) (неупорядоченные множества)

In [1]:
s = {1, 7, 'abc', 3.2}
s

{1, 3.2, 'abc', 7}

In [2]:
len(s), list(s)

(4, [1, 3.2, 'abc', 7])

In [3]:
for x in s:
    print(x * 2)

2
6.4
abcabc
14


In [4]:
[x * 2 for x in s]

[2, 6.4, 'abcabc', 14]

#### Получить все уникальные элементы списка

In [5]:
a = [x % 7 for x in range(10) if x % 3 in (0, 2)]
a

[0, 2, 3, 5, 6, 1, 2]

In [6]:
set(a)

{0, 1, 2, 3, 5, 6}

In [7]:
list(set(a))

[0, 1, 2, 3, 5, 6]

In [8]:
{x % 7 for x in range(10) if x % 3 in (0, 2)}

{0, 1, 2, 3, 5, 6}

#### Операции с множествами

In [9]:
a = set()  # пустое множество
b = {2, 3, 4}
a.add(1)
a.add(2)
b.remove(4)
a, b

({1, 2}, {2, 3})

In [10]:
a | b

{1, 2, 3}

In [11]:
a & b

{2}

In [12]:
a - b

{1}

In [13]:
a ^ b

{1, 3}

In [14]:
a ^= b
a

{1, 3}

In [15]:
1 in a, 2 in a

(True, False)

In [16]:
{3, 1, 2} > {3, 2}

True

In [17]:
{3, 1, 2} > {2, 1, 3}

False

#### Ограничения на элементы

In [18]:
s = set()
s.add(8)
s.add('text')
s.add(True)
s

{8, True, 'text'}

In [19]:
s.add([5, 6, 7])  # элементы изменяемых типов нельзя класть в set

TypeError: unhashable type: 'list'

In [20]:
s.add((5, 6, 7))
s

{8, True, (5, 6, 7), 'text'}

#### Неизменяемый `set` — `frozenset`

In [21]:
s.add({1})

TypeError: unhashable type: 'set'

In [22]:
s.update((frozenset({2, 3}), frozenset({3, 2})))
s

{8, True, (5, 6, 7), 'text', frozenset({2, 3})}

#### Особенности

In [23]:
{False, True, 0, 1, 0.0, 1.0}

{0.0, 1.0}

In [24]:
0 == False == 0.0 != 1 == True == 1.0

True

### [Тип `dict`](https://docs.python.org/3/library/stdtypes.html#mapping-types-dict) (словари)

In [25]:
d = {5: 'five', '7': 'seven', 3.14: '≈ pi', (9, 0): []}
d

{3.14: '≈ pi', (9, 0): [], '7': 'seven', 5: 'five'}

In [26]:
type({})  # пустой словарь

dict

In [27]:
d[9, 0]

[]

In [28]:
d[9, 0] = 49
d

{3.14: '≈ pi', (9, 0): 49, '7': 'seven', 5: 'five'}

In [29]:
d[frozenset({-3, 'ab'})] = {89}
d

{3.14: '≈ pi',
 (9, 0): 49,
 '7': 'seven',
 5: 'five',
 frozenset({-3, 'ab'}): {89}}

In [30]:
{x: x ** 2 for x in range(10)}

{0: 0, 1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 6: 36, 7: 49, 8: 64, 9: 81}

In [31]:
a = {1: 2}
b = a
b[1] = 89
a, b

({1: 89}, {1: 89})

In [34]:
from copy import deepcopy
a = {1: 2}
b = deepcopy(a)
b[1] = 89
a, b

({1: 2}, {1: 89})

In [35]:
d = {'One': 1, 'Two': 2, 'Three': 3}
for x in d:
    print(x)

One
Two
Three


In [36]:
for key in d.keys():
    print(key)

One
Two
Three


In [37]:
type(d.keys())

dict_keys

In [38]:
for value in d.values():
    print(value)

1
2
3


In [39]:
for item in d.items():
    print(item)

('One', 1)
('Two', 2)
('Three', 3)


In [40]:
for key, value in d.items():
    print('d[{}] = {}'.format(key, value))

d[One] = 1
d[Two] = 2
d[Three] = 3


In [41]:
list(d.items())

[('One', 1), ('Two', 2), ('Three', 3)]

#### Распаковка словарей

In [42]:
a = {'one': 1, 'two': 2}
{'three': 3, **a, 'four': 4}

{'four': 4, 'one': 1, 'three': 3, 'two': 2}

In [43]:
'one = {one}, two = {two}'.format(**a)

'one = 1, two = 2'

### Функции

In [44]:
def find_all_squares_in_range(max_value):
    """Return a list of the perfect squares 0, 1, 4, ... that do not exceed max_value.

    The squares are produced in increasing order. If max_value is negative,
    the result is an empty list.
    """
    found = []
    base = 0
    while True:
        candidate = base * base
        if candidate > max_value:
            # The first square above the limit ends the search.
            return found
        found.append(candidate)
        base += 1

In [45]:
find_all_squares_in_range(0)

[0]

In [46]:
find_all_squares_in_range(111)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100]

In [47]:
sum(find_all_squares_in_range(10))

14

In [48]:
def find_all_squares_in_range(max_value, skip_even=False):
    """Return a list of the perfect squares that do not exceed max_value.

    Args:
        max_value: inclusive upper bound for the squares.
        skip_even: when true, squares of even bases (0, 2, 4, ...) are left out.

    Returns:
        The selected squares in increasing order.
    """
    found = []
    base = 0
    while base * base <= max_value:
        base_is_odd = base % 2 != 0
        # Keep the square unless even bases are being skipped and this base is even.
        if base_is_odd or not skip_even:
            found.append(base * base)
        base += 1
    return found

In [49]:
find_all_squares_in_range(111, True)

[1, 9, 25, 49, 81]

In [50]:
find_all_squares_in_range(111, skip_even=True)

[1, 9, 25, 49, 81]

In [51]:
def strange_abs(x):
    """Return -x when x is negative; otherwise fall off the end and return None.

    The missing return for non-negative x is deliberate: it shows that a
    function without an executed return statement yields None.
    """
    is_negative = x < 0
    if is_negative:
        return -x
    # No return here on purpose: non-negative input implicitly produces None.

In [52]:
print(strange_abs(-9))

9


In [53]:
print(strange_abs(9))

None


#### [Лямбда-функции](https://docs.python.org/3.5/tutorial/controlflow.html#lambda-expressions)

In [54]:
average = lambda x, y: (x + y) / 2
average

<function __main__.<lambda>>

In [55]:
average(6, 9)

7.5

#### Сортировка по параметру

In [56]:
a = [(1, 'one'), (4, 'four'), (2, 'two'), (3, 'three')]
a.sort()
a

[(1, 'one'), (2, 'two'), (3, 'three'), (4, 'four')]

In [57]:
a.sort(key=lambda item: item[1])  # «ключ» для сортировки — 2-й элемент каждого кортежа
a

[(4, 'four'), (1, 'one'), (3, 'three'), (2, 'two')]

In [58]:
companies = ['Finam', 'MOEX', 'Yandex', 'Google']
sorted(companies)

['Finam', 'Google', 'MOEX', 'Yandex']

In [59]:
# сравниваем сначала по длине, затем по названию; обратный порядок
sorted(companies, key=lambda name: (len(name), name), reverse=True)

['Yandex', 'Google', 'Finam', 'MOEX']

#### Обработка произвольного числа аргументов

In [60]:
def print_all_args(*args, **kwargs):
    """Print the received positional arguments (a tuple) and keyword arguments (a dict)."""
    template = 'Positional arguments: {},\nkeyword arguments: {}'
    message = template.format(args, kwargs)
    print(message)
    
print_all_args(2, 'text', None, first=1, second=2, third=3)

Positional arguments: (2, 'text', None),
keyword arguments: {'third': 3, 'first': 1, 'second': 2}


#### Генераторы

In [61]:
def find_all_squares_in_range_better(max_value):
    """Lazily yield the perfect squares 0, 1, 4, ... that do not exceed max_value.

    This is a generator: squares are computed one at a time as the caller
    iterates, so no intermediate list is built.
    """
    base = 0
    square = 0
    while square <= max_value:
        yield square
        base += 1
        square = base * base

In [62]:
type(find_all_squares_in_range_better(10))

generator

In [63]:
sum(find_all_squares_in_range_better(10))

14

### [PEP8](https://www.python.org/dev/peps/pep-0008/): рекомендации по оформлению кода

### [`Fraction`](https://docs.python.org/3/library/fractions.html#module-fractions) — рациональные дроби

### [`Decimal`](https://docs.python.org/3/library/decimal.html#module-decimal) — вещественные числа произвольной точности

### [Модуль `datetime`](https://docs.python.org/3/library/datetime.html#module-datetime): работа с датой и временем

In [64]:
import datetime as dt
dt.time(21, 4, 39)

datetime.time(21, 4, 39)

In [65]:
from datetime import time
time(21, 4, 39)

datetime.time(21, 4, 39)

In [66]:
ts = dt.datetime(2016, 6, 3, 17, 24, 4)
ts

datetime.datetime(2016, 6, 3, 17, 24, 4)

In [67]:
ts.day

3

In [68]:
ts.minute

24

In [69]:
ts.strftime('%d.%m.%Y %H:%M:%S')

'03.06.2016 17:24:04'

In [70]:
dt.datetime.strptime('03.06.2016 17:24:04', '%d.%m.%Y %H:%M:%S')

datetime.datetime(2016, 6, 3, 17, 24, 4)

In [71]:
lecture_start = dt.datetime(2016, 9, 8, 20, 45)
lecture_end = dt.datetime(2016, 9, 8, 22, 15)
td = lecture_end - lecture_start
td

datetime.timedelta(0, 5400)

In [72]:
td.total_seconds()

5400.0

In [73]:
lecture_start + td

datetime.datetime(2016, 9, 8, 22, 15)

### Загрузка данных из файлов

#### [Загружаем котировки](http://www.finam.ru/profile/moex-akcii/pllc-yandex-n-v/export/?market=1&em=388383&code=YNDX&apply=0&df=7&mf=8&yf=2016&from=07.09.2016&dt=7&mt=8&yt=2016&to=07.09.2016&p=7&f=YNDX_160907_160907&e=.txt&cn=YNDX&dtf=1&tmf=1&MSOR=1&mstime=on&mstimever=1&sep=1&sep2=1&datf=1&at=1)

![](finam-2-yndx-params.png)

In [74]:
%ls

 Том в устройстве C не имеет метки.
 Серийный номер тома: B019-4922

 Содержимое папки C:\Users\User\Documents\IPython Notebooks\Финам\jupyter-notebooks

07.09.2016  23:42    <DIR>          .
07.09.2016  23:42    <DIR>          ..
07.09.2016  21:41    <DIR>          .ipynb_checkpoints
06.09.2016  22:14            68 675 finam-1.ipynb
07.09.2016  23:42            29 072 finam-2.ipynb
07.09.2016  22:42            26 248 finam-2-yndx-params.png
07.09.2016  22:09           162 023 YNDX_160901_160907.csv
               4 файлов        286 018 байт
               3 папок   2 293 297 152 байт свободно


In [75]:
filename = 'YNDX_160901_160907.csv'
file = open(filename)

In [76]:
file.read()[:100]

'<TICKER>,<PER>,<DATE>,<TIME>,<LAST>,<VOL>\nYNDX,0,20160901,100003,1430.500000000,7\nYNDX,0,20160901,10'

In [77]:
file = open(filename)

In [78]:
file.readline()

'<TICKER>,<PER>,<DATE>,<TIME>,<LAST>,<VOL>\n'

In [79]:
# Read all lines of the file named by `filename` (set in an earlier cell).
# The context manager closes the handle as soon as reading is done, instead
# of leaving an anonymous open file object behind.
with open(filename) as source:
    lines = source.readlines()
lines[:10]

['<TICKER>,<PER>,<DATE>,<TIME>,<LAST>,<VOL>\n',
 'YNDX,0,20160901,100003,1430.500000000,7\n',
 'YNDX,0,20160901,100010,1431.000000000,1\n',
 'YNDX,0,20160901,100010,1431.000000000,23\n',
 'YNDX,0,20160901,100010,1431.000000000,9\n',
 'YNDX,0,20160901,100010,1430.500000000,6\n',
 'YNDX,0,20160901,100014,1438.000000000,1\n',
 'YNDX,0,20160901,100018,1431.500000000,16\n',
 'YNDX,0,20160901,100018,1431.500000000,5\n',
 'YNDX,0,20160901,100018,1431.500000000,5\n']

In [80]:
# Iterating a file object yields its lines: the first one goes to `header`,
# the rest are collected into the list `lines`. The context manager makes
# sure the file is closed once unpacking has consumed it.
with open(filename) as source:
    header, *lines = source
header, lines[:10]

('<TICKER>,<PER>,<DATE>,<TIME>,<LAST>,<VOL>\n',
 ['YNDX,0,20160901,100003,1430.500000000,7\n',
  'YNDX,0,20160901,100010,1431.000000000,1\n',
  'YNDX,0,20160901,100010,1431.000000000,23\n',
  'YNDX,0,20160901,100010,1431.000000000,9\n',
  'YNDX,0,20160901,100010,1430.500000000,6\n',
  'YNDX,0,20160901,100014,1438.000000000,1\n',
  'YNDX,0,20160901,100018,1431.500000000,16\n',
  'YNDX,0,20160901,100018,1431.500000000,5\n',
  'YNDX,0,20160901,100018,1431.500000000,5\n',
  'YNDX,0,20160901,100018,1430.500000000,21\n'])

In [81]:
header = [name[1:-1].capitalize() for name in header.strip().split(',')]
header

['Ticker', 'Per', 'Date', 'Time', 'Last', 'Vol']

In [82]:
[
    line.strip().split(',')
    for line in lines
][:10]

[['YNDX', '0', '20160901', '100003', '1430.500000000', '7'],
 ['YNDX', '0', '20160901', '100010', '1431.000000000', '1'],
 ['YNDX', '0', '20160901', '100010', '1431.000000000', '23'],
 ['YNDX', '0', '20160901', '100010', '1431.000000000', '9'],
 ['YNDX', '0', '20160901', '100010', '1430.500000000', '6'],
 ['YNDX', '0', '20160901', '100014', '1438.000000000', '1'],
 ['YNDX', '0', '20160901', '100018', '1431.500000000', '16'],
 ['YNDX', '0', '20160901', '100018', '1431.500000000', '5'],
 ['YNDX', '0', '20160901', '100018', '1431.500000000', '5'],
 ['YNDX', '0', '20160901', '100018', '1430.500000000', '21']]

In [83]:
# zip позволяет одновременно пройтись по двум последовательностям одинаковой длины
[
    {
        name: value
        for name, value in zip(header, line.strip().split(','))
    }
    for line in lines
][:5]

[{'Date': '20160901',
  'Last': '1430.500000000',
  'Per': '0',
  'Ticker': 'YNDX',
  'Time': '100003',
  'Vol': '7'},
 {'Date': '20160901',
  'Last': '1431.000000000',
  'Per': '0',
  'Ticker': 'YNDX',
  'Time': '100010',
  'Vol': '1'},
 {'Date': '20160901',
  'Last': '1431.000000000',
  'Per': '0',
  'Ticker': 'YNDX',
  'Time': '100010',
  'Vol': '23'},
 {'Date': '20160901',
  'Last': '1431.000000000',
  'Per': '0',
  'Ticker': 'YNDX',
  'Time': '100010',
  'Vol': '9'},
 {'Date': '20160901',
  'Last': '1430.500000000',
  'Per': '0',
  'Ticker': 'YNDX',
  'Time': '100010',
  'Vol': '6'}]

### [pandas](http://pandas.pydata.org/pandas-docs/stable/)

In [84]:
import pandas as pd

In [85]:
data = pd.read_csv(filename)
data.head(5)

Unnamed: 0,<TICKER>,<PER>,<DATE>,<TIME>,<LAST>,<VOL>
0,YNDX,0,20160901,100003,1430.5,7
1,YNDX,0,20160901,100010,1431.0,1
2,YNDX,0,20160901,100010,1431.0,23
3,YNDX,0,20160901,100010,1431.0,9
4,YNDX,0,20160901,100010,1430.5,6


In [86]:
data = pd.read_csv(filename, parse_dates={'<DATETIME>': ['<DATE>', '<TIME>']}, index_col='<DATETIME>')
data.head(5)

Unnamed: 0_level_0,<TICKER>,<PER>,<LAST>,<VOL>
<DATETIME>,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-09-01 10:00:03,YNDX,0,1430.5,7
2016-09-01 10:00:10,YNDX,0,1431.0,1
2016-09-01 10:00:10,YNDX,0,1431.0,23
2016-09-01 10:00:10,YNDX,0,1431.0,9
2016-09-01 10:00:10,YNDX,0,1430.5,6


In [87]:
data.rename(columns=lambda name: name[1:-1].lower(), inplace=True)
data.rename(columns={'last': 'price'}, inplace=True)
data.head(5)

Unnamed: 0_level_0,ticker,per,price,vol
<DATETIME>,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-09-01 10:00:03,YNDX,0,1430.5,7
2016-09-01 10:00:10,YNDX,0,1431.0,1
2016-09-01 10:00:10,YNDX,0,1431.0,23
2016-09-01 10:00:10,YNDX,0,1431.0,9
2016-09-01 10:00:10,YNDX,0,1430.5,6


In [88]:
data.drop(['ticker', 'per'], axis=1, inplace=True)
data.head(5)

Unnamed: 0_level_0,price,vol
<DATETIME>,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-09-01 10:00:03,1430.5,7
2016-09-01 10:00:10,1431.0,1
2016-09-01 10:00:10,1431.0,23
2016-09-01 10:00:10,1431.0,9
2016-09-01 10:00:10,1430.5,6


In [89]:
data.describe()

Unnamed: 0,price,vol
count,3893.0,3893.0
mean,1446.438351,35.599538
std,18.306256,78.610474
min,1410.5,1.0
25%,1431.5,2.0
50%,1445.5,10.0
75%,1462.0,37.0
max,1482.5,1000.0


In [90]:
data['price'].head(5)

<DATETIME>
2016-09-01 10:00:03    1430.5
2016-09-01 10:00:10    1431.0
2016-09-01 10:00:10    1431.0
2016-09-01 10:00:10    1431.0
2016-09-01 10:00:10    1430.5
Name: price, dtype: float64

In [91]:
data.iloc[0]

price    1430.5
vol         7.0
Name: 2016-09-01 10:00:03, dtype: float64

In [92]:
for i, row in data.iterrows():
    print('Datetime {}, price {}, vol {}'.format(row.name, row.price, row.vol))
    break

Datetime 2016-09-01 10:00:03, price 1430.5, vol 7.0


In [93]:
data.loc[data.vol > 700]

Unnamed: 0_level_0,price,vol
<DATETIME>,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-09-06 17:13:13,1447.0,836
2016-09-06 17:39:28,1445.5,747
2016-09-06 17:42:53,1445.0,1000
2016-09-06 18:27:47,1440.0,956
2016-09-07 13:39:05,1434.0,945
2016-09-07 16:46:20,1420.5,959


In [94]:
data.vol > 700

<DATETIME>
2016-09-01 10:00:03    False
2016-09-01 10:00:10    False
2016-09-01 10:00:10    False
2016-09-01 10:00:10    False
2016-09-01 10:00:10    False
2016-09-01 10:00:14    False
2016-09-01 10:00:18    False
2016-09-01 10:00:18    False
2016-09-01 10:00:18    False
2016-09-01 10:00:18    False
2016-09-01 10:00:24    False
2016-09-01 10:00:24    False
2016-09-01 10:00:24    False
2016-09-01 10:00:51    False
2016-09-01 10:00:51    False
2016-09-01 10:00:51    False
2016-09-01 10:00:52    False
2016-09-01 10:00:52    False
2016-09-01 10:01:04    False
2016-09-01 10:01:04    False
2016-09-01 10:01:08    False
2016-09-01 10:01:09    False
2016-09-01 10:01:10    False
2016-09-01 10:02:09    False
2016-09-01 10:02:09    False
2016-09-01 10:02:09    False
2016-09-01 10:02:09    False
2016-09-01 10:02:12    False
2016-09-01 10:02:43    False
2016-09-01 10:02:58    False
                       ...  
2016-09-07 18:28:57    False
2016-09-07 18:28:57    False
2016-09-07 18:28:57    False
201

In [101]:
data.resample('2H').mean().head(15)

Unnamed: 0_level_0,price,vol
<DATETIME>,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-09-01 10:00:00,1434.65748,24.370079
2016-09-01 12:00:00,1436.693548,61.612903
2016-09-01 14:00:00,1436.811688,21.380952
2016-09-01 16:00:00,1457.742515,62.718563
2016-09-01 18:00:00,1461.669118,28.735294
2016-09-01 20:00:00,,
2016-09-01 22:00:00,,
2016-09-02 00:00:00,,
2016-09-02 02:00:00,,
2016-09-02 04:00:00,,
