# 输入输出基础


In [5]:
name = input('your name:')
gender = input('you are a boy?(y/n)') # 输入的类型永远是字符串型（str）

your name: Jack
you are a boy?(y/n) y


In [4]:
welcome_str = 'Welcome to the matrix {prefix} {name}.'
welcome_dic = {
    'prefix': 'Mr.' if gender == 'y' else 'Mrs',
    'name': name
}

print('authorizing...')
print(welcome_str.format(**welcome_dic))

authorizing...
Welcome to the matrix Mr. Jack.


**输入的类型永远是字符串**

In [10]:
a = input()
b = input()

print('a + b = {}'.format(a + b))
print('type of a is {}, type of b is {}'.format(type(a), type(b)))
print('a + b = {}'.format(int(a) + int(b)))

 1
 2


a + b = 12
type of a is <class 'str'>, type of b is <class 'str'>
a + b = 3


# 文件输入输出

NLP 任务的基本步骤，也就是下面的四步：

1. 读取文件；

2. 去除所有标点符号和换行符，并把所有大写变成小写；

3. 合并相同的词，统计每个词出现的频率，并按照词频从大到小排序；

4. 将结果按行输出到文件 out.txt。

In [16]:
import re

# 你不用太关心这个函数
def parse(text):
    # 使用正则表达式去除标点符号和换行符
    text = re.sub(r'[^\w ]', ' ', text)

    # 转为小写
    text = text.lower()
    
    # 生成所有单词的列表
    word_list = text.split(' ')
    
    # 去除空白单词
    word_list = filter(None, word_list)
    
    # 生成单词和词频的字典
    word_cnt = {}
    for word in word_list:
        if word not in word_cnt:
            word_cnt[word] = 0
        word_cnt[word] += 1
    
    # 按照词频排序
    sorted_word_cnt = sorted(word_cnt.items(), key=lambda kv: kv[1], reverse=True)
    
    return sorted_word_cnt

with open('in.txt', 'r') as fin:
    text = fin.read()

word_and_freq = parse(text)

with open('out.txt', 'w') as fout: # 输出到文件
    for word, freq in word_and_freq:
        fout.write('{} {}\n'.format(word, freq)) 

print(word_and_freq)


[('and', 15), ('be', 13), ('will', 11), ('to', 11), ('the', 10), ('of', 10), ('a', 8), ('we', 8), ('day', 6), ('able', 6), ('every', 6), ('together', 6), ('i', 5), ('have', 5), ('dream', 5), ('that', 5), ('one', 5), ('with', 5), ('this', 5), ('in', 4), ('shall', 4), ('free', 4), ('when', 4), ('little', 3), ('black', 3), ('white', 3), ('made', 3), ('faith', 3), ('at', 3), ('last', 3), ('children', 2), ('nation', 2), ('by', 2), ('their', 2), ('today', 2), ('alabama', 2), ('boys', 2), ('girls', 2), ('join', 2), ('hands', 2), ('mountain', 2), ('places', 2), ('all', 2), ('it', 2), ('our', 2), ('hope', 2), ('up', 2), ('freedom', 2), ('ring', 2), ('from', 2), ('god', 2), ('men', 2), ('my', 1), ('four', 1), ('live', 1), ('where', 1), ('they', 1), ('not', 1), ('judged', 1), ('color', 1), ('skin', 1), ('but', 1), ('content', 1), ('character', 1), ('down', 1), ('its', 1), ('vicious', 1), ('racists', 1), ('right', 1), ('there', 1), ('as', 1), ('sisters', 1), ('brothers', 1), ('valley', 1), ('exalt

# JSON 序列化实战

In [17]:
import json

params = {
    'symbol': '123456',
    'type': 'limit',
    'price': 123.4,
    'amount': 23
}

params_str = json.dumps(params)

print('after json serialization')
print('type of params_str = {}, params_str = {}'.format(type(params_str), params))

original_params = json.loads(params_str)

print('after json deserialization')
print('type of original_params = {}, original_params = {}'.format(type(original_params), original_params))

after json serialization
type of params_str = <class 'str'>, params_str = {'symbol': '123456', 'type': 'limit', 'price': 123.4, 'amount': 23}
after json deserialization
type of original_params = <class 'dict'>, original_params = {'symbol': '123456', 'type': 'limit', 'price': 123.4, 'amount': 23}


- json.dumps() 这个函数，接受 Python 的基本数据类型，然后将其序列化为 string；
- json.loads() 这个函数，接受一个合法字符串，然后将其反序列化为 Python 的基本数据类型。

In [18]:
import json

params = {
    'symbol': '123456',
    'type': 'limit',
    'price': 123.4,
    'amount': 23
}

with open('params.json', 'w') as fout:
    params_str = json.dump(params, fout)

with open('params.json', 'r') as fin:
    original_params = json.load(fin)

print('after json deserialization')
print('type of original_params = {}, original_params = {}'.format(type(original_params), original_params))


after json deserialization
type of original_params = <class 'dict'>, original_params = {'symbol': '123456', 'type': 'limit', 'price': 123.4, 'amount': 23}


# 注意点

I/O 操作需谨慎，一定要进行充分的错误处理，并细心编码，防止出现编码漏洞；

编码时，对内存占用和磁盘占用要有充分的估计，这样在出错时可以更容易找到原因；

JSON 序列化是很方便的工具，要结合实战多多练习；

代码尽量简洁、清晰，哪怕是初学阶段，也要有一颗当元帅的心。

# 思考题

In [19]:
import re

# 你不用太关心这个函数
def parse(text):
    # 使用正则表达式去除标点符号和换行符
    text = re.sub(r'[^\w ]', ' ', text)

    # 转为小写
    text = text.lower()
    
    # 生成所有单词的列表
    word_list = text.split(' ')
    
    # 去除空白单词
    word_list = filter(None, word_list)
    
    # 生成单词和词频的字典
    word_cnt = {}
    for word in word_list:
        if word not in word_cnt:
            word_cnt[word] = 0
        word_cnt[word] += 1
    
    # 按照词频排序
    sorted_word_cnt = sorted(word_cnt.items(), key=lambda kv: kv[1], reverse=True)
    
    return sorted_word_cnt

word_and_freq = {}
with open('in.txt', 'r') as fin:
    lineList = fin.readlines(2)
    for line in lineList:
        word_and_freq_tmp = parse(line)
        word_and_freq.update(word_and_freq_tmp)

with open('out.txt', 'w') as fout: # 输出到文件
    for word, freq in word_and_freq:
        fout.write('{} {}\n'.format(word, freq)) 

print(word_and_freq)

TypeError: expected string or bytes-like object, got 'list'