# DICT
## 给字典排序

### 字典本身不会按键或值排序（Python 3.7 起只保证插入顺序），排序则需要对字典进行格式转换

In [1]:
# Pair every price with its name as (value, key) tuples: tuples compare
# element by element, so sorting/max on the pairs orders entries by price.
price = {'ALAPHA': 10, 'BETA': 100, 'SITA': 33, 'GAMA': 470}
price_name_pairs = list(zip(price.values(), price.keys()))
sorted_price = sorted(price_name_pairs)  # ascending by price
max_price = max(price_name_pairs)  # the pair with the highest price
'sorted price is {} and the max is {}'.format(sorted_price, max_price)

"sorted price is [(10, 'ALAPHA'), (33, 'SITA'), (100, 'BETA'), (470, 'GAMA')] and the max is (470, 'GAMA')"

### 传递参数来进行排序也很优雅

In [2]:
max(price, key=lambda k: price[k])  # 注意，如果不注明，则默认指字典的Key

'GAMA'

### 非常复杂的字典进行排序，要用到operator模块itemgetter函数

In [3]:
rows = [
    {'fname': 'Brian', 'lname': 'Chloe', 'uid': 1003},
    {'fname': 'Jack', 'lname': 'Chloe2', 'uid': 1002},
    {'fname': 'Ken', 'lname': 'Chloe3', 'uid': 1001},
    {'fname': 'Brian', 'lname': 'Chloe4', 'uid': 1004},
]

from operator import itemgetter

rows_by_fname = sorted(rows, key=itemgetter('fname'))
rows_by_uid = sorted(rows, key=itemgetter('uid'))
rows_by_lfname = sorted(rows, key=itemgetter('lname', 'fname'))

rows_by_lfname = sorted(rows, key=lambda r: (r['lname'], r['fname']))  # 或者继续传递参数，跟上面一行语句效果一样

rows_by_lfname

[{'fname': 'Brian', 'lname': 'Chloe', 'uid': 1003},
 {'fname': 'Jack', 'lname': 'Chloe2', 'uid': 1002},
 {'fname': 'Ken', 'lname': 'Chloe3', 'uid': 1001},
 {'fname': 'Brian', 'lname': 'Chloe4', 'uid': 1004}]

#### 根据某个特定的字段（比如日期）来分组迭代数据，可以用模块itertools的函数groupby

In [4]:
rows = [
    {'address': 'ekjrlks', 'date': '07/01/2012'},
    {'address': 'dewq3334 3', 'date': '04/21/2012'},
    {'address': 'asdfebbb', 'date': '08/08/2012'},
    {'address': 'rrrrrase 33', 'date': '08/08/2012'},
    {'address': 'bbbadfe', 'date': '09/24/2012'},
    {'address': 'qqqqwer3444 rdsaf', 'date': '11/01/2012'},
    {'address': 'dasrer we', 'date': '05/06/2012'},
    {'address': 'eewqqwefadf w3rdf', 'date': '12/03/2012'},
    {'address': 'jmjgyujhf', 'date': '04/21/2012'}
]

from operator import itemgetter
from itertools import groupby

rows.sort(key=itemgetter('date'))  # itertools.groupby只能发现紧挨着的相同项，所以先要sort一下

for date, items in groupby(rows, key=itemgetter('date')):  # groupby会生成一个值（在例子里面是date），和一个子迭代器（所有的项）
    print(date)
    for i in items:
        print(' ', i)

04/21/2012
(' ', {'date': '04/21/2012', 'address': 'dewq3334 3'})
(' ', {'date': '04/21/2012', 'address': 'jmjgyujhf'})
05/06/2012
(' ', {'date': '05/06/2012', 'address': 'dasrer we'})
07/01/2012
(' ', {'date': '07/01/2012', 'address': 'ekjrlks'})
08/08/2012
(' ', {'date': '08/08/2012', 'address': 'asdfebbb'})
(' ', {'date': '08/08/2012', 'address': 'rrrrrase 33'})
09/24/2012
(' ', {'date': '09/24/2012', 'address': 'bbbadfe'})
11/01/2012
(' ', {'date': '11/01/2012', 'address': 'qqqqwer3444 rdsaf'})
12/03/2012
(' ', {'date': '12/03/2012', 'address': 'eewqqwefadf w3rdf'})


## 字典序列里取最大或最小的几个值，使用堆序列模块heapq

In [5]:
# 序列举例
import heapq

nums = [1, 8, 2, 23, 7, -4, 18, 23, 42, 37, 2]
print(heapq.nlargest(3, nums))  # 最大三个
print(heapq.nsmallest(3, nums))  # 最小三个

print(nums)
heapq.heapify(nums)  # nums元素会以“堆”的顺序排列，堆的特点就是第一个元素总是最小的那个，其它不论顺序。
print(nums)
print(heapq.heappop(nums))
print(nums)
print(heapq.heappop(nums))
print(nums)
print(heapq.heappop(nums))
print(nums)
print(heapq.heappop(nums))
print(nums)

[42, 37, 23]
[-4, 1, 2]
[1, 8, 2, 23, 7, -4, 18, 23, 42, 37, 2]
[-4, 2, 1, 23, 7, 2, 18, 23, 42, 37, 8]
-4
[1, 2, 2, 23, 7, 8, 18, 23, 42, 37]
1
[2, 2, 8, 23, 7, 37, 18, 23, 42]
2
[2, 7, 8, 23, 42, 37, 18, 23]
2
[7, 23, 8, 23, 42, 37, 18]


In [6]:
# 字典序列时，可以像sort一样传入key，从而能处理复杂的数据
portfolio = [
    {'name': 'IBM', 'shares': 100, 'price': 91.1},
    {'name': 'IBM2', 'shares': 50, 'price': 520.1},
    {'name': 'IBM3', 'shares': 10, 'price': 31.75},
    {'name': 'IBM4', 'shares': 122, 'price': 16.2},
    {'name': 'IBM5', 'shares': 70, 'price': 115.65},
]

cheap = heapq.nsmallest(3, portfolio, key=lambda s: s['price'])
expensive = heapq.nlargest(3, portfolio, key=lambda s: s['price'])
print(cheap)
print(expensive)

[{'price': 16.2, 'name': 'IBM4', 'shares': 122}, {'price': 31.75, 'name': 'IBM3', 'shares': 10}, {'price': 91.1, 'name': 'IBM', 'shares': 100}]
[{'price': 520.1, 'name': 'IBM2', 'shares': 50}, {'price': 115.65, 'name': 'IBM5', 'shares': 70}, {'price': 91.1, 'name': 'IBM', 'shares': 100}]


### 从定义的时候就赋予它“有序”的属性，先加入的排在前面，使用模块collections里面的类OrderedDict

In [7]:
from collections import OrderedDict

d = OrderedDict()
d['foo'] = 1
d['bar'] = 2
d['spam'] = 3
d['grok'] = 4

for key in d:
    print(key, d[key])


import json

a = json.dumps(d)
a

('foo', 1)
('bar', 2)
('spam', 3)
('grok', 4)


'{"foo": 1, "bar": 2, "spam": 3, "grok": 4}'

## 合并两个字典 
### 最简单的方法就是update，但update会修改掉对象字典；不想改掉原字典，就得copy()出来一个新的

In [8]:
a = {'x': 1, 'z': 3}
b = {'y': 2, 'z': 4}
c = a.copy()
c.update(b)
c

{'x': 1, 'y': 2, 'z': 4}

### 可以使用collections模块的ChainMap类（Python3），将多个字典逻辑上合成一个

In [9]:
# ChainMap is only available in Python 3.3+; on a Python 2 kernel this import
# fails with ImportError. `a` and `b` are the dicts defined in the previous cell.
from collections import ChainMap

c = ChainMap(a, b) # not a real dict: a view that looks keys up in a first, then in b
print(c)  # printing works; it shows the underlying mappings, no merged copy is created
print(c['x'])
print(c['y'])
print(c['z'])  # for a duplicated key, the value from the first mapping wins

print(len(c))
print(list(c.keys()))
print(list(c.values()))

c['z'] = 10  # writes always go to the first mapping (a); b is left untouched
print(c)
print(b)

c['w'] = 40  # new keys are likewise only ever added to the first mapping
print(c)
print(a)

ImportError: cannot import name ChainMap

## 比较字典
### 字典的key支持常见的集合操作，但是value不支持

In [None]:
# Dictionary key views and item views support set operations (Python 3);
# value views do not, because values are not guaranteed to be unique/hashable.
a = {'x': 1, 'y': 2, 'z': 3}
b = {'w': 10, 'x': 11, 'y': 2}

print(a.keys() & b.keys())  # keys present in both dicts
print(a.keys() - b.keys())  # keys in a that are missing from b
print(a.items() & b.items())  # (key, value) pairs that are identical in both dicts

# Build a new dict from a with selected keys filtered out.
c = {key: a[key] for key in a.keys() - {'z', 'w'}}
print(c)

# LIST, TUPLE
## 快速定义使用变量 - 分解操作_*

In [None]:
# Any iterable can be unpacked into several variables in a single assignment.
data = ['ACME', 50, 91.1, (2012, 12, 21)]
name, shares, price, date = data
print(name, shares, price, date)

# Nested structures can be unpacked in the same statement.
name, shares, price, (year, mon, day) = data
print(year, mon, day)

# Values that are not needed can be discarded by binding them to _.
_, shares, prices, _ = data
print(shares, prices)

# Strings are iterable too, so they unpack character by character.
data2 = 'Hello'
a, b, c, d, e = data2
print(a, b, c, d, e)

# When the iterable is longer than the number of targets, a starred target
# collects the surplus items into a list, which avoids the
# "too many values to unpack" error (Python 3 syntax).
a, *b, c = data2
print(a, b, c)

record = ('Chloe', 'chloe@example.com', '773-555-1212', '847-555-1212')
name, email, *phone_numbers = record
print(name, email, phone_numbers)

# The unpacked pieces can be used in a computation straight away.
record2 = [10, 8, 7, 1, 9, 5, 10, 3]
*history, current = record2
print(sum(history)/len(history), current)

# Star unpacking also keeps string-splitting code short.
source = 'nobody:*:-2:-2:Unprivilileged User:/var/empty:/usr/bin/false'
uname, *fields, homedir, sh = source.split(":")
# Print the user name that was just unpacked (uname), not the unrelated
# `name` left over from the `record` example above.
print(uname, homedir, sh)



In [None]:
# A sequence of tagged records of varying length: the first item is the tag,
# the remaining items are the arguments for that tag's handler.
# This has to be a list of tuples: a set of sets raises
# "TypeError: unhashable type: 'set'" and would not keep the tag in first place.
source2 = [('foo', 1, 2), ('bar', 'hello'), ('foo', 3, 4)]

def do_foo(x, y):
    """Handle a 'foo' record, which carries two values."""
    print('foo', x, y)

def do_bar(s):
    """Handle a 'bar' record, which carries a single value."""
    print('bar', s)

# Star unpacking splits every record into its tag and its argument list.
for tag, *args in source2:
    if tag == 'foo':
        do_foo(*args)
    elif tag == 'bar':
        do_bar(*args)
    # Records with any other tag are silently ignored.

## 无限的数据，有限历史记录

In [None]:
from collections import deque

q = deque(maxlen=3)
for i in range(5):
    q.append(i)
    print('{} loop: {}'.format(i, list(q)))

In [53]:
from collections import deque

def search_content(lines, pattern, history=5):
    """Yield every line containing `pattern` together with the lines before it.

    Args:
        lines: iterable of strings (for example an open text file).
        pattern: substring to look for in each line.
        history: maximum number of preceding lines to remember.

    Yields:
        (line, previous_lines) pairs. previous_lines is the same deque object
        on every iteration and keeps changing as the scan continues, so
        consume or copy it before asking for the next match.
    """
    window = deque(maxlen=history)  # bounded buffer: the oldest line drops out automatically
    for current in lines:
        matched = pattern in current
        if matched:
            yield current, window
        # Every line, matching or not, becomes part of the history.
        window.append(current)

if __name__ == '__main__':
    # Forward slashes work on every platform. The previous literal
    # '.\data\example1.txt' relied on the invalid escape sequences \d and \e
    # and only resolved to the intended file on Windows.
    with open('./data/example1.txt') as f:
        for line, previlines in search_content(f, 'python', 5):
            # Lines read from a file keep their trailing newline, so every
            # print below is followed by a blank line; on Python 3,
            # print(..., end='') suppresses it.
            for pline in previlines:
                print('previous line: ' + pline)
            print('current line: ' + line)
            print(25*'*')

current line: I python

*************************
previous line: I python

current line: Really python

*************************
previous line: I python

previous line: Really python

current line: Like python

*************************
previous line: I python

previous line: Really python

previous line: Like python

current line: The python

*************************
previous line: I python

previous line: Really python

previous line: Like python

previous line: The python

current line: Python python

*************************
previous line: I python

previous line: Really python

previous line: Like python

previous line: The python

previous line: Python python

current line: But python

*************************
previous line: Really python

previous line: Like python

previous line: The python

previous line: Python python

previous line: But python

current line: Where python

*************************
previous line: Like python

previous line: The python

previous line: Pyth

## 一个序列中出现次数最多的元素，使用模块collections里面的类Counter

In [None]:
import re
from collections import Counter


# Read the whole file and split it into words on runs of whitespace.
# NOTE(review): an earlier cell reads example1.txt from a data subdirectory -
# confirm which location is the right one.
# NOTE(review): \s already matches \n, so the (?:\s|\n) alternation looks
# redundant, and leading/trailing whitespace presumably yields empty strings
# that get counted as words - verify.
with open('example1.txt') as f:
    words = re.split(r'(?:\s|\n)\s*', f.read())

word_counters = Counter(words)  # maps each word to its number of occurrences
top_three = word_counters.most_common(3)  # most_common is a Counter method: the n most frequent (word, count) pairs
print(top_three)
print(word_counters['python'])  # a Counter can be indexed like a dict

## 一个key对应多个值的字典，优雅的使用模块collections的类defaultdict

In [None]:
from collections import defaultdict

md = defaultdict(list)
md['a'].append(4)
md['a'].append(8)
md['b'].append(1)
print(md)

md = defaultdict(set)
md['a'].add(4)
md['a'].add(8)
md['b'].add(1)
md

## 切片命名增加代码可读性，使用内置函数slice

In [None]:
record = '...................100 ........513.25 ................'
SHARES = slice(19, 22)
PRICE = slice(31, 38)

cost = int(record[SHARES]) * float(record[PRICE])
print(cost)
SHARES.start, SHARES.stop, SHARES.step  # 这个切片对象的实例可以读取操作更多属性

In [None]:
items = [0, 1, 2, 3, 4, 5, 6]  # 处理列表中硬编码的索引值会更有效
a = slice(2, 4)
print(items[a])
items[a] = [10, 11]
print(items)
del items[a]
print(items)
print(a.start, a.stop, a.step)

## 序列去重

In [None]:
# 如果不考虑原序列的顺序的话，序列去重最快的方法是把它转成set
a = ["hello", "the", "world", "it", "is", "my", "world"]
print(a)
set(a)

### 如果需要去除复制序列中出现的重复元素，并保持原序列前后顺序不变

In [None]:
def dedupe(items, key=None):
    """Yield the items of `items` in their original order, skipping duplicates.

    Args:
        items: iterable to de-duplicate.
        key: optional function mapping an item to a hashable value used for
            the duplicate check; needed when the items themselves (dicts,
            for example) are not hashable. When omitted, the item itself is
            used.

    Yields:
        The first occurrence of every distinct item (or distinct key value).
    """
    seen = set()
    for item in items:
        if key is None:
            marker = item
        else:
            marker = key(item)
        if marker in seen:
            continue
        # Generator: results are produced lazily, one item at a time.
        yield item
        seen.add(marker)

# 下面是使用这个函数，可以像sorted()和max()一样使用
a = [{'x': 1, 'y': 2}, {'x': 1, 'y': 3}, {'x': 1, 'y': 2}, {'x': 2, 'y': 4}]
res = list(dedupe(a, key=lambda d: (d['x'], d['y'])))
print(res)
res = list(dedupe(a, key=lambda d: d['x']))
print(res)

# String and Text
## 排版文档或者很长的字符串,使用模块textwrap

In [None]:
import textwrap
import shutil
# shutil.get_terminal_size is part of the standard library from Python 3.3 on;
# only fall back to the third-party shutil_backports package on Python 2.
try:
    from shutil import get_terminal_size as gts
except ImportError:
    from shutil_backports import get_terminal_size as gts
print(gts().columns)

s = r'President Trump on Sunday slammed “the Fake News Media,” which he called “out of control,” after a string of major errors in reporting on his presidency emerged over the past week.“Very little discussion of all the purposely false and defamatory stories put out this week by the Fake News Media,” he tweeted. “They are out of control - correct reporting means nothing to them.”The president continued, “Major lies written, then forced to be withdrawn after they are exposed...a stain on America!”Very little discussion of all the purposely false and defamatory stories put out this week by the Fake News Media. They are out of control - correct reporting means nothing to them. Major lies written, then forced to be withdrawn after they are exposed...a stain on America!'

print(textwrap.fill(s, 140));print()  # 每行140个字符
print(textwrap.fill(s, 140, initial_indent='>>'));print()  # 每行140个字符，首行缩进
print(textwrap.fill(s, 140, subsequent_indent='<<'));print()  # 每行140个字符，非首行缩进


## 变量插值，使用.format或.format_map

In [None]:
s = '{name} has {n} messages.'
print(s.format(name='Chloe', n=16))

s = '{name} has {n} messages.'
name = 'Chloe'
n = 15
# print(s.format_map(vars()))  # Python3

### .format_map的高阶用法 Python3

In [None]:
# vars还可以用在实例上:vars(instance)
class Info:
    def __init__(self, name, n):
        self.name = name
        self.n = n
instance_a = Info('Chloe', 14)
print(s.format_map(vars(instance_a)))  # Python3 codes

# 如果少了一个参数呢，也不希望它抛异常出去
class safesub(dict):
    """dict subclass that renders a missing key as its own '{key}' placeholder.

    Passing an instance to str.format_map() leaves unknown fields in the text
    untouched instead of raising KeyError.
    """

    def __missing__(self, key):
        # Called by dict lookup when `key` is absent: hand back the
        # placeholder text rather than raising.
        return ''.join(('{', key, '}'))
del n
print(s.format_map(safesub(vars())))

# frame hack: 将替换变量的过程隐藏在一个小型的功能函数里面， 跟函数的栈帧打交道
import sys

def mysub(text):
    """Fill the '{name}' fields of `text` from the caller's variables.

    Fields that have no matching variable are left in the text unchanged.
    """
    # Frame hack: frame 1 is the frame of whoever called mysub, and its
    # f_locals mapping holds that caller's variables.
    caller_locals = sys._getframe(1).f_locals
    return text.format_map(safesub(caller_locals))

name = "Chloe"
n = 12

print(mysub('Hello {name}.'))
print(mysub('You have {n} messages.'))
print(mysub('Your favorite corlor is {color}.'))

## 替换字符串有几种思路：str.replace(), re.sub或re.subn, 回调函数

In [None]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'

import re
text2 = re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)  # 把年月日的顺序和格式调整（替换）掉
text2

In [None]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'

datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
text3 = datepat.sub(r'\3-\1-\2', text)
print(text3)

text4, n = datepat.subn(r'\3-\1-\2', text)  # 这里用subn方法得到替换了多少次
print(text4, n)

In [None]:
from calendar import month_abbr

def change_date(m):
    """Substitution callback: turn an 'M/D/YYYY' match into 'D Mon YYYY'.

    Args:
        m: match object whose groups 1, 2 and 3 hold the month, day and year
            as digit strings.

    Returns:
        The reformatted date with the month as its abbreviated name.
    """
    month, day, year = m.group(1, 2, 3)
    return ' '.join((day, month_abbr[int(month)], year))

text5 = datepat.sub(change_date, text)  # 这个用法很优雅，注意函数定义中的m.group()，对re.sub有深入了解才能写出这样的方法
text5

## 正则表达式re

### 不区分大小写: flags=re.IGNORECASE

In [None]:
import re

text = 'UPPER PYTHON, lower python, Mixed Python'
print(re.findall(r'python', text, flags=re.IGNORECASE))

text2 = re.sub('python', 'snake', text, flags=re.IGNORECASE)
print(text2)

def matchcase(word):
    """Build a re.sub callback that inserts `word` in the case of the matched text.

    Args:
        word: the replacement text.

    Returns:
        A function taking a match object and returning `word` upper-cased,
        lower-cased or capitalized to mirror the match; `word` is returned
        unchanged when none of those patterns applies.
    """
    def replace(m):
        matched = m.group()
        if matched.isupper():
            return word.upper()
        if matched.islower():
            return word.lower()
        if matched[0].isupper():
            return word.capitalize()
        return word

    return replace

text3 = re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)
print(text3)


### 跨行匹配使用：flags=re.DOTALL

In [None]:
text3 = '''/* this is a
                multiline comment */
'''

comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)  # 匹配两个*号之间的文字
comment.findall(text3)

### 分割字符串两种思路：split(), re.split()

In [None]:
# re.split()可以同时处理多个分隔符，或者分隔符周围有多个空格这样的情
line = 'asdf fjdk; afed, fjek,asdf,    foo'

import re

print(re.split(r'[;,\s]\s*', line))  # 使用了“任意”符号[]
print(re.split(r'(;|,|\s)\s*', line))  # 用()包起来的方式可以将括号的内容引入捕获组，所以分隔符也会被捕捉出来
print(re.split(r'(?:;|,|\s)\s*', line))  # 增加?:从而使用了非捕获组，这样分隔符就不会被捕捉出来

line2 = 'Computer says "no" Phone says "yes"'

strpat = re.compile(r'\"(.*)\"')  # ()表示捕获组
print(strpat.findall(line2))

strpat = re.compile(r'\"(.*?)\"')  # 加一个?来表示最短匹配
print(strpat.findall(line2))

strpat = re.compile(r'\".*\"')  # 如果没有()，则可以理解为整个都是捕获组
print(strpat.findall(line2))

### 字符串高阶用法：标记流stream of token

In [None]:
text = "foo = 23 + 42 * 10"

import re
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))
scanner = master_pat.scanner('foo = 42')
scanner.match()
#print(_.lastgroup)

from collections import namedtuple

# A token pairs the name of the pattern group that matched with the matched text.
Token = namedtuple('Token', 'type value')

def generate_tokens(pat, text):
    """Scan `text` from the start and yield one Token per consecutive match.

    Args:
        pat: compiled regular expression built from named groups.
        text: the string to tokenize.

    Yields:
        Token(type, value), where type is the name of the group that matched.
        Scanning stops at the first position where the pattern does not match.
    """
    scanner = pat.scanner(text)
    while True:
        m = scanner.match()
        if m is None:
            break
        yield Token(m.lastgroup, m.group())

for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)

print("-"*40)

tokens = (tok for tok in generate_tokens(master_pat, text) if tok.type != 'WS')
for tok in tokens:
    print(tok)


# Number

## 进制转换使用函数bin(), oct(), hex()和.format()

In [None]:
x = 1234

print(bin(x))
print(oct(x))
_x = hex(x)
print(_x, type(_x))


In [None]:
# 如果不希望出现0b, 0x, 0o这样的前缀，可以使用format()
print(format(x, 'b'))
print(format(x, 'o'))
_x = format(x, 'x')
print(_x, type(_x))

x = -1234
# print(format(x, 'b')) 
print(format(2**32 + x, 'b'))  # 一个负数，如果想要产生一个对应的无符号数值的话，则需要加上最大值来设置比特位的长度
print(format(2**32 + x, 'x'))

In [None]:
# 将字符串形式的整数转换为不同的进制，则使用int()函数再配合适当的进制就可以了
print(int('4d2', 16))
y = int('10011010010', 2)
print(y, type(y))

## 随机数使用模块random

### 随机选1个值 random.choice

In [10]:
import random
# random.choice([list])

values = range(10)
for _ in range(3):
        print(random.choice(values))

print(list(random.choice(values) for _ in range(30)))

3
8
1
[0, 2, 8, 8, 2, 2, 1, 2, 3, 2, 9, 6, 9, 7, 1, 5, 1, 7, 8, 3, 8, 5, 8, 9, 0, 0, 5, 8, 1, 4]


### 随机取几个值 random.sample

In [None]:
# random.sample([list], sample_number)
for _ in range(5) :
    print(random.sample(values, 2), random.sample(values, 3))

### 摇骰子 random.shuffle

In [None]:
# random.shuffle(sequence) reorders the sequence in place and returns None,
# so it needs a mutable sequence. A Python 3 range object is immutable and
# would raise TypeError, hence the explicit list.
values = list(range(10))
random.shuffle(values)
print(values)
random.shuffle(values)
print(values)

### 指定范围内生成随机整数（拆红包） random.randrange

In [None]:
# random.randrange(start, stop=None, step=1, _int=<type 'int'>, _maxwidth=9007199254740992L )

print(random.randrange(1, 10)) # equals random.randint(1,9)  # randrange不包含stop的值，这是python期望的行为
print(random.randrange(0,100, 7))  # 0-99内所有数是7的倍数的；step只能是整数

### 0到1之间随机数 random.random

In [None]:
random.random()

### 生成长随机数值 random.getrandbits

In [15]:
a = random.getrandbits(200)  # 生成200比特位的随机整数值
a, len(format(a, 'b'))  # 整数（只要是数字）是没有len()语法的，只有可迭代的sequence or collection才可以用

### 真随机种子 random.seed

In [None]:
print(random.randint(1,10))
random.seed()
print(random.randint(1, 10))

## 处理小数点使用函数round、函数format和模块decimal

In [None]:
round(1.23456, 3), round(1234.5678, -2)  # a negative ndigits rounds to positions left of the decimal point (here: to the nearest hundred)

In [None]:
# round() is a built-in function that rounds to the nearest value; on Python 3
# an exact tie goes to the nearest even value, e.g. round(1.5) and round(2.5)
# both give 2.
# NOTE(review): Python 2 rounds ties away from zero instead - confirm which
# kernel this notebook is meant for.
# When binary floating-point error is unacceptable and exact decimal
# arithmetic is needed, use the decimal module.
4.2 + 2.1  # inherent floating-point error: the result is not exactly 6.3

In [None]:
from decimal import Decimal
a, b = Decimal(4.2), Decimal(2.1)
print(Decimal(6.3) == (a+b))
a, b = Decimal('4.2'), Decimal('2.1')  # 注意：正确的Decimal是用字符串的形式来定义数字
print(Decimal('6.3') == (a+b))


from decimal import localcontext
a = Decimal('2.3')
b = Decimal('1.7')
print(a/b)

with localcontext() as ctx:
    ctx.prec = 3  # 定义数字的位数（如果是0.123类型的数值，不包括小数点前面的0）
    print(a/b)

with localcontext() as ctx:
    ctx.prec = 50
    print(a/b)

## 分数使用模块中的类fractions.Fraction

In [None]:
from fractions import Fraction
a, b = Fraction(5, 4), Fraction(7, 16)
c = a * b  # arithmetic on Fractions stays exact and yields a Fraction
print("分数c是{}, 数值类型是{}, 分子是{}, 分母是{}, 值是{}".format(c, type(c),  c.numerator, c.denominator, float(c)))

# limit_denominator(n) returns the closest fraction whose denominator is at most n
print(c.limit_denominator(1000) )
print(Fraction('3.141592653589793').limit_denominator(100))

# Convert a float to the Fraction that represents exactly the same value
x = 3.141592653589793
Fraction(*x.as_integer_ratio())

## 复数使用类complex，函数操作使用模块cmath和numpy

In [None]:
a = complex(2, 4)
b = 3 - 5j
print(a, b)

print("复数a的实部是{real}, 虚部是{imag}, 共轭值是{conjugate}".format(real=a.real, imag=a.imag, conjugate=a.conjugate()))

print(a+b, a-b, a*b, format(a/b, '^100.2f'))
format(abs(a), '=>30.1f')

In [None]:
import cmath
cmath.sin(a), cmath.cos(a), cmath.exp(a), cmath.sqrt(-1)

In [None]:
import numpy as np
c = np.array([2+3j, 4+5j, 6-7j, 8-9j])
c, c+2, np.sin(c), np.cos(c)

## 数据集grid网格计算使用库Numpy，注意跟序列计算是不一样的

In [None]:
a = [1,2,3,4]
# a*2 and a.extend(a) both produce the doubled contents; the difference is
# whether the original list is changed. extend() grows a in place and
# returns None, so c is None here.
c = a.extend(a)
print(c, a)

a = [1,2,3,4]
c = a*2  # builds a new, longer list and leaves a untouched
print(c, a)

# List repetition just repeats the items, whatever their types are.
a = [1, 2, 3, "hello"]
a*2

In [None]:
import numpy as np

a, b = np.array([1,2,3,4]), np.array([4,3,2,1])
# a*2对数组进行运算的时候是针对数组里面的每个元素的计算
print(a*2)

a+2, a+b, a*b, a**b

In [None]:
# numpy在底层是大块的连续内存，由同一类型的数据组成。比如创建一个10000 * 10000的二维浮点数组，用numpy很容易
grid = np.zeros(shape=(10000, 10000), dtype=float)
np.shape(grid), len(grid)

grid += 10
grid

In [None]:
a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
print(a)

 # [ ]做为分片操作，对于一个数据集来说就是查询其中的某些值
print(a[0, :])   # the first row, row 0
print(a[:, 1])  # column 1
print(a[1:3, 0:2]) # row 1,2 and column 0,1

a[1:3, 0:2] += 100
print(a)

b = a + [200, 201, 202, 202]  # 广播一个row向量列表操作；每一行都会加上相应的值
print(b)
# a + [1, 2, 3]  # 如果列表长度不对，则改语句会报错

c = np.where(a<100, a, 'N/A')  # if a < 100, a; not, equal "N/A"，但注意这会改变整个集合的dtype，后面不能再做数学运算了
print(a)
print(c)
# c +1  # 会报错
d = np.where(c=="N/A", 0, c)
d_formated = np.int8(d)  # 还要对整个集合做一个格式化
d_formated

In [None]:
# 矩阵和线性代数方面的操作例如矩阵乘法、求行列式，解线性方程等，用numpy库里面的matrix和linalg
e = np.matrix([[1, -2, 3], [0, 4, 5], [7, 8, -9]])
print("原矩阵是:\n{}\n转置之后是:\n{}\n逆矩阵(数值的倒数)是:\n{}\n".format(e, e.T, e.I))

print("行列式是:\n{}\n特征值是:\n{}\n".format(np.linalg.det(e), np.linalg.eigvals(e)))

f = np.matrix([[2], [3], [4]])  # 单列三行矩阵
print(e * f)# 矩阵的乘法
x = np.linalg.solve(e, f)  # 求解方程式 x : e*x = f
print(x)
print(e)
print(e*x)
print(f)

## 无限inf和没有数值nan

In [17]:
a = float('inf')
b = float('-inf')
c = float('nan')
print(a, b, c, type(c))

import math
print(math.isinf(a))

print(a+45)
print(a*10)  # 无限加有限的值还是无限
print(10 / a)  # 无限在分母上就是0


print(a / a)  # 无限/无限 = nan
print(a + b) # 无限-无限 = nan
print(c + 23)
print(c / 2)
print(c * 2)
print(math.sqrt(c))  # nan的所有计算结果都是nan

print(c == c)  # nan在比较的时候永远不会被判定为相等
print(math.isnan(c))  # 唯一安全检测NaN的方法

(inf, -inf, nan, <type 'float'>)
True
inf
inf
0.0
nan
nan
nan
nan
nan
nan
False
True


## 字节串

In [4]:
data = b'\x00\x124v\x00x\x90\xab\x00\xcd\xef\x01\x00#\x004'
len(data)
# print(int.from_bytes(data, 'big'))  # 'big'指明最高位在字节流首部，Python3 codes
# print(int.from_bytes(data, 'little'))  # 'little'指明最高位在字节流尾部，Python3 codes

import sys
sys.byteorder  # 系统默认字节流的高位在尾部


'little'

In [6]:
x = 94525377821947740945920721189797940
# print(x.to_bytes(16, 'big'))  # 指定字节数和字节序， Python3 codes
# print(x.to_bytes(32, 'little'))  # Python3 codes
x = 0x01020304
# print(x.to_bytes(4, 'big'))
# print(x.to_bytes(4, 'little'))

AttributeError: 'int' object has no attribute 'to_bytes'

In [9]:
x = 523 ** 23
print(x)
# print(x.to_bytes(16, 'little'))
print(x.bit_length())
nbytes, rem = divmod(x.bit_length(), 8)
if rem:
    nbytes += 1
# print(x.to_bytes(nbytes, 'little'))  # Python3 codes

335381300113661875107536852714019056160355655333978849017944067
208


# 格式输出使用内置函数format

## 传统方式%

In [41]:
x =1234.5678
print('%0.2f' % x)
print('%10.1f' % x)
print('%-10.1f' % x)

1234.57
    1234.6
1234.6    


## format模式

In [47]:
x = 1234.56789
print(format(x, '0.2f'))  # 顶首位输出，保留小数点后两位
print(format(x, '>100.1f'))  # 共一百个字符位，右对齐，小数点后1位
print(format(x, '.<100.1f'))  # 共一百个字符位，左对齐，小数点后1位，填充小数点
print(format(x, '^100.3f'))  # 居中
print(format(x, ','))  # 千位的逗号，inclusion of thousands separator
print(format(x, '0,.1f'))  # 千位的逗号的另外一种表示方法
print(format(-x, '0.2f'))
print(format(x, 'e'))  # 科学计数法
print(format(x, '0.2e'))

1234.57
                                                                                              1234.6
1234.6..............................................................................................
                                              1234.568                                              
1,234.56789
1,234.6
-1234.57
1.234568e+03
1.23e+03


# Date Time

## 时间的单位转换使用模块datetime，更复杂的时间处理比如时区，模糊时间，节日等使用第三方库dateutil，专门的时区处理使用模块pytz

### 特定的字符串转化为日期 datetime.strptime

In [36]:
from datetime import datetime

text = '2012-09-23'
y = datetime.strptime(text, '%Y-%m-%d')

z = datetime.now()
print(z)
diff = z - y
print(diff, type(diff))
# timedelta([days[, seconds[, microseconds[, milliseconds[, minutes[, hours[, weeks]]]]]]])

2017-12-14 15:00:33.171000
(datetime.timedelta(1908, 54033, 171000), <type 'datetime.timedelta'>)


### 相对时间计算datetime.timedelta

In [13]:
from datetime import timedelta
a = timedelta(days=2, hours=6)
b = timedelta(hours=34.5)  # timedeltas can be added together and expose matching attributes
c = a + b
print(c.days)  # whole days in the duration
print(c.seconds, c.seconds / 3600)  # seconds left over after the whole days, not the full duration (the captured output shows Python 2 integer division)
print(c.total_seconds() / 3600)  # the entire duration expressed in hours
dir(c)  # list every attribute and method available on a timedelta

3
(59400, 16)
88.5


['__abs__',
 '__add__',
 '__class__',
 '__delattr__',
 '__div__',
 '__doc__',
 '__eq__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__pos__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmul__',
 '__rsub__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 'days',
 'max',
 'microseconds',
 'min',
 'resolution',
 'seconds',
 'total_seconds']

### 直接日期定义datetime.datetime

In [17]:
# datetime可以正确处理闰年
a = datetime(2012, 3, 1)
b = datetime(2012, 2, 28)
print(a - b)
a = datetime(2017, 3, 1)
b = datetime(2017, 2, 28)
print(a-b)

2 days, 0:00:00
1 day, 0:00:00


### 时间运算

In [29]:
from datetime import datetime, timedelta


a = datetime(2017, 9, 23)  # 指定时间
print(a)
print(a + timedelta(days=10))  # 指定时间前后

b = datetime(2017, 12, 21)
d = b - a  # 差几天
print(d.days)

now = datetime.today()  # 当前时间
print(now)
print(now + timedelta(minutes=10)) 

2017-09-23 00:00:00
2017-10-03 00:00:00
89
2017-12-14 14:39:51.357000
2017-12-14 14:49:51.357000
3
(59400, 16)
88.5
2 days, 0:00:00
1 day, 0:00:00


### 更复杂的相对时间使用dateutil.relativedelta.relativedelta

In [23]:
from dateutil.relativedelta import relativedelta


a = datetime(2012, 9, 23)
# a + timedelta(months=1)  # datetime最长能处理到天数，周数。TypeError: 'months' is an invalid keyword argument for this function

print(a + relativedelta(months=+1))
print(a + relativedelta(months=+4))

b = datetime(2012, 12, 21)
d = b - a
print(d, type(d))
d = relativedelta(months=+2, days=+28)
print(d, type(d))
print(d.months, d.days)
%pdoc relativedelta

2012-10-23 00:00:00
2013-01-23 00:00:00
(datetime.timedelta(89), <type 'datetime.timedelta'>)
(relativedelta(months=+2, days=+28), <class 'dateutil.relativedelta.relativedelta'>)
(2, 28)


In [None]:
from datetime import datetime, timedelta
# FR is dateutil's Friday weekday constant; it has to be imported explicitly,
# otherwise the relativedelta(weekday=FR) calls below raise NameError.
from dateutil.relativedelta import relativedelta, FR

# Work out the dates of the Fridays around today.
d = datetime.today()
print(d)
print(d + relativedelta(weekday=FR))  # the next Friday (today itself when today is a Friday)
print(d + relativedelta(weekday=FR(-1)))  # the previous Friday (today itself when today is a Friday)

# The simplest way to list the dates/times in a given range is a small generator.
def date_range(start, stop, step):
    """Yield values from `start` up to, but not including, `stop`.

    Args:
        start: first value to yield, e.g. a datetime.
        stop: exclusive upper bound.
        step: increment added after every yielded value, e.g. a timedelta.
    """
    current = start
    while current < stop:
        yield current
        current += step

for d in date_range(datetime(2017,12, 11), datetime(2017, 12, 31), timedelta(hours=12)):
    print(d)

### 时区pytz

In [40]:
from datetime import datetime, timedelta
import pytz

d = datetime(2012, 12, 21, 9, 30, 0)  # naive time: 没有tzinfo的datetime.datetime
print(d)

central = pytz.timezone('US/Central')  # 生成特定时区的对象
loc_d = central.localize(d)  # 通过系统数据，计算特定时区的当地时间并附加上时区信息tzinfo，变成带有tzinfo的datetime.datetime
print(loc_d)

bang_d = loc_d.astimezone(pytz.timezone('Asia/Kolkata'))  # 计算新时区内的时间。注意： astimezone() cannot be applied to a naive datetime
print(bang_d)

china_d = loc_d.astimezone(pytz.timezone('Asia/Shanghai'))
print(china_d)

utc_d = loc_d.astimezone(pytz.utc)
print(utc_d)

later_utc_d = utc_d + timedelta(minutes=30)  # 因为还是datetime，所以可以和timedelta进行运算
print(later_utc_d)

print("A naive datetime is like {} and a local time is like {}, but they are both datetime.datetime object.".format(d, loc_d))

# 想知道一个时区的具体名称，可以使用ISO3166国家代码作为key来查询

print(pytz.country_timezones['IN'])
print(pytz.country_timezones['CN'])


2012-12-21 09:30:00
2012-12-21 09:30:00-06:00
2012-12-21 21:00:00+05:30
2012-12-21 23:30:00+08:00
2012-12-21 15:30:00+00:00
2012-12-21 16:00:00+00:00
A naive datetime is like 2012-12-21 09:30:00 and a local time is like 2012-12-21 09:30:00-06:00, but they are both datetime.datetime object.
[u'Asia/Kolkata']
[u'Asia/Shanghai', u'Asia/Urumqi']


# HTML
## 有些时候需要对&entity或者&#code这样的实体替换为相应的文本，或者需要生成文本，但对特定的字符比如<>&做转义处理

In [None]:
import html
s = 'Elements are written as "<tag>text</tag>".'
print(s)
print(html.escape(s))  # escapes &, < and > and, by default, quote characters too
print(html.escape(s, quote=False))  # leaves quote characters as they are

# html.unescape() resolves named and numeric character references.
# The HTMLParser().unescape() method used previously was removed in Python 3.9.
from html.parser import HTMLParser  # no longer needed here; kept in case later cells rely on the name
s = "Spicy &quot;Jalape&#241;o&quot."
s_unescaped = html.unescape(s)
print(s_unescaped)

# For XML text, xml.sax.saxutils.unescape handles the basic entities.
from xml.sax.saxutils import unescape
t = 'The prompt is &gt;&gt;&gt;'
print(unescape(t))

# from xml.etree.ElementTree import parse  # Python3 codes

# 文件读写

## 文件位置目录和路径，使用模块os

### 文件路径组合分解