# 数据加载，存储与文件格式

In [1]:
import numpy as np
import pandas as pd

## 1. 读写文本文件格式

### 1.1 读
- 读取csv文件 read_csv (file_path or buf,usecols,encoding): 
    - file_path ：文件路径, 
    - usecols :指定读取的列名， 
    - encoding :编码

In [2]:
# Read ex1.csv with default options and keep the frame for display.
df = pd.read_csv('ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,python


In [3]:
# read_table with an explicit comma separator; the displayed frame matches
# what read_csv produced for the same file.
pd.read_table(r'ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,python


In [4]:
# ex2.csv has no header line: with default options its first record
# (1,2,3,4,hello) is consumed as the column names.
pd.read_csv('ex2.csv')

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,python


In [5]:
# header selects the row used for the column names (row 0 by default here).
# header=None uses no row as a header, so the columns get integer labels
# and every line of the file is kept as data.
pd.read_csv(r'ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,python


In [6]:
# Use custom column names.
# names is the list of column labels for the result. When names is given there
# is no need to also pass header=None: as the output shows, the first line of
# the file is kept as data.
pd.read_csv(r'ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,python


In [7]:
# index_col promotes the named column ('message') to the row index.
df1 = pd.read_csv(r'ex2.csv', names=['a', 'b', 'c', 'd', 'message'], index_col='message')
df1

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
python,9,10,11,12


In [8]:
# Load csv_mindex.csv as-is: key1 and key2 come back as ordinary columns.
pd.read_csv(r'csv_mindex.csv')

Unnamed: 0,key1,key2,value1,value2
0,one,a,1,2
1,one,b,3,4
2,one,c,5,6
3,one,d,7,8
4,two,a,9,10
5,two,b,11,12
6,two,c,13,14
7,two,d,15,16


In [9]:
# Passing several columns to index_col builds a hierarchical (multi-level)
# row index from them.
pa = pd.read_csv(r'csv_mindex.csv', index_col=['key1', 'key2'])
pa

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [10]:
# The separator is a regular expression matching runs of whitespace.
# It is written as a raw string because '\s' inside a plain string literal is
# an invalid escape sequence, which newer Python versions warn about.
df2 = pd.read_table('ex3.txt', sep=r'\s+')
df2

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


### 1.2 写
- **DataFrame**: to_csv (file_path or buf,sep,columns,header,index,na_rep,mode)： 
    - file_path ：保存文件路径,默认None, 
    - sep :分隔符,默认',' , 
    - columns :指定要写出的列,默认None(写出全部列), 
    - header ：是否保留列名,默认True, 
    - index :是否保留行索引,默认True, 
    - na_rep :指定字符串来代替空值,默认是空字符, 
    - mode :写入模式,默认'w'(覆盖),追加用'a'
- **Series**: Series.to_csv
    - `(path=None, index=True, sep=',', na_rep='', header=False, mode='w', encoding=None)`


In [11]:
# Reload ex1.csv; this rebinds df2 (previously the frame read from ex3.txt).
df2 = pd.read_csv(r'ex1.csv')
df2

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,python


In [12]:
# Write df2 to out_ex1.csv as comma-separated values (to_csv's default separator).
df2.to_csv('out_ex1.csv')

### 分块读取大文件

In [13]:
# For files that are too large, pass chunksize to read them piece by piece:
# read_csv then returns an iterable reader object instead of a DataFrame
# (see the printed repr and type), from which parts of the data can be pulled.
agg1 = pd.read_csv(r'agg_match_stats_1.csv', chunksize=10)
print(agg1)
print(type(agg1))

<pandas.io.parsers.TextFileReader object at 0x7f35af07fc18>
<class 'pandas.io.parsers.TextFileReader'>


In [14]:
# Pull the next rows from the reader as a DataFrame; repeated calls keep advancing.
print(agg1.get_chunk(5))# read 5 rows; chunksize above was 10, but the size can be overridden here
# print(type(agg1.get_chunk()))

                       date  game_size  \
0  2017-11-26T01:47:01+0000         97   
1  2017-11-26T01:47:01+0000         97   
2  2017-11-26T01:47:01+0000         97   
3  2017-11-26T01:47:01+0000         97   
4  2017-11-26T01:47:01+0000         97   

                                            match_id match_mode  party_size  \
0  2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...        tpp           1   
1  2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...        tpp           1   
2  2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...        tpp           1   
3  2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...        tpp           1   
4  2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...        tpp           1   

   player_assists  player_dbno  player_dist_ride  player_dist_walk  \
0               0            0           0.00000       2082.823000   
1               0            0           0.00000       1118.815000   
2               0            0           0.00000         25.53

In [15]:
# With no argument get_chunk returns the next chunksize (10) rows, continuing
# after the rows already consumed (rows 5-14 in the output).
print(agg1.get_chunk())

                        date  game_size  \
5   2017-11-26T01:47:01+0000         97   
6   2017-11-26T01:47:01+0000         97   
7   2017-11-26T01:47:01+0000         97   
8   2017-11-26T01:47:01+0000         97   
9   2017-11-26T01:47:01+0000         97   
10  2017-11-26T01:47:01+0000         97   
11  2017-11-26T01:47:01+0000         97   
12  2017-11-26T01:47:01+0000         97   
13  2017-11-26T01:47:01+0000         97   
14  2017-11-26T01:47:01+0000         97   

                                             match_id match_mode  party_size  \
5   2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...        tpp           1   
6   2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...        tpp           1   
7   2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...        tpp           1   
8   2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...        tpp           1   
9   2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...        tpp           1   
10  2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwi

In [16]:
# iterator=True also makes read_csv return a reader object rather than a DataFrame.
# NOTE(review): this rebinds agg1 without closing the reader created earlier
# with chunksize — confirm whether that handle should be closed first.
agg1 = pd.read_csv(r'agg_match_stats_1.csv', iterator=True)

In [17]:
# Display the reader object's repr.
agg1

<pandas.io.parsers.TextFileReader at 0x7f35af07f5f8>

In [18]:
# Note: reads are sequential — each get_chunk call continues from where the
# previous one stopped. This is the first call on the new reader, so it
# returns rows 0-9.
agg1.get_chunk(10)

Unnamed: 0,date,game_size,match_id,match_mode,party_size,player_assists,player_dbno,player_dist_ride,player_dist_walk,player_dmg,player_kills,player_name,player_survive_time,team_id,team_placement
0,2017-11-26T01:47:01+0000,97,2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...,tpp,1,0,0,0.0,2082.823,0,0,fuckeddup,661.491,100001,50
1,2017-11-26T01:47:01+0000,97,2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...,tpp,1,0,0,0.0,1118.815,0,0,nojiongegg,741.359,100002,47
2,2017-11-26T01:47:01+0000,97,2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...,tpp,1,0,0,0.0,25.533026,73,0,Darthmoca,83.255,100006,97
3,2017-11-26T01:47:01+0000,97,2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...,tpp,1,0,0,997.51,1032.66785,345,3,gk1715,1144.816,100007,30
4,2017-11-26T01:47:01+0000,97,2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...,tpp,1,0,0,4296.35938,2300.32349,449,4,Angeliaboby,1112.843,100021,32
5,2017-11-26T01:47:01+0000,97,2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...,tpp,1,0,0,3094.9873,1396.51355,0,0,SwagChimp,788.667,100031,46
6,2017-11-26T01:47:01+0000,97,2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...,tpp,1,1,0,8064.371,948.5888,104,1,ZzxcZC,1274.036,100032,22
7,2017-11-26T01:47:01+0000,97,2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...,tpp,1,1,0,0.0,38.11086,68,0,likesthtodrink,110.313,100037,92
8,2017-11-26T01:47:01+0000,97,2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...,tpp,1,0,0,0.0,0.0,0,0,BDress,1401.912,100043,97
9,2017-11-26T01:47:01+0000,97,2U4GBNA0Yml_MDIpsXDjMltu0-r8UqS8skCECcelwiP7iu...,tpp,1,0,0,0.0,1021.32025,55,0,Zimple3cBackz,701.062,100044,49


### JSON数据

In [19]:
import json

In [20]:
# Sample JSON document kept as a Python string (note the JSON null for "pet").
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38,
               "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

In [21]:
# json.loads converts a JSON string into Python objects
# (here a dict; the JSON null becomes None).
res = json.loads(obj)
res

{'name': 'Wes',
 'pet': None,
 'places_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 30, 'name': 'Scott', 'pets': ['Zeus', 'Zuko']},
  {'age': 38, 'name': 'Katie', 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [22]:
# json.dumps converts the Python object back into a JSON-formatted string.
res1 = json.dumps(res)
res1

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]}'

In [23]:
# Build a DataFrame from the list of sibling dicts; columns fixes which keys
# become columns and in what order.
sib = pd.DataFrame(res['siblings'], columns=['name', 'age', 'pets'])
sib

Unnamed: 0,name,age,pets
0,Scott,30,"[Zeus, Zuko]"
1,Katie,38,"[Sixes, Stache, Cisco]"


## 2. 二进制数据格式

In [24]:
# Load ex1.csv into `frame`, the object pickled in the next cells.
frame = pd.read_csv('ex1.csv')
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,python


In [25]:
# The simplest way to store data efficiently in a binary format is Python's
# built-in pickle serialization.
# pandas objects all have a to_pickle method that writes the data to disk in
# pickle format.
frame.to_pickle('frame_pickle')

In [26]:
# Load the pickled frame back from disk.
# NOTE: unpickling can execute arbitrary code, so only read pickle files from
# a trusted source (this one was written by the cell above).
pd.read_pickle('frame_pickle')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,python


### HDF5格式
- 一般用于存储大规模的科学数组数据

In [27]:
# 100 standard-normal samples in a single column 'a'.
# A seeded, local RandomState keeps the values identical on every run (so the
# displayed output and the data written to the HDF5 store are reproducible)
# without touching NumPy's global random state.
frame = pd.DataFrame({'a': np.random.RandomState(42).randn(100)})
frame

Unnamed: 0,a
0,-0.953183
1,1.435637
2,0.172020
3,0.952253
4,0.542208
5,-0.936377
6,1.853504
7,-0.275103
8,0.891624
9,0.775763


In [28]:
# Open the HDF5 store backed by the file mydata.h5; `s` is the handle used by
# the following cells. Keys already in the file are kept (the store listing
# below shows obj2/obj3 before this notebook writes them).
s = pd.HDFStore('mydata.h5')

In [29]:
# Store the whole DataFrame under the key 'obj1' (dict-style assignment).
s['obj1'] = frame

In [30]:
# Store a single column (a Series) under the key 'obj1_col'.
s['obj1_col'] = frame['a']

In [31]:
# Show the store summary: its file path and the keys it holds.
s

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
/obj1                frame        (shape->[100,1])                                       
/obj1_col            series       (shape->[100])                                         
/obj2                frame_table  (typ->appendable,nrows->100,ncols->1,indexers->[index])
/obj3                frame_table  (typ->appendable,nrows->100,ncols->1,indexers->[index])

In [32]:
# Read the object stored under 'obj1' back, dict-style.
s['obj1']

Unnamed: 0,a
0,-0.953183
1,1.435637
2,0.172020
3,0.952253
4,0.542208
5,-0.936377
6,1.853504
7,-0.275103
8,0.891624
9,0.775763


In [33]:
# HDFStore has two storage formats: fixed and table.
# put() is the method form of s[key] = value, but format='table' selects the
# table format: the store listing shows such keys as an appendable frame_table,
# whereas the dict-style assignments appear as plain frame/series. The table
# format is the one that supports where-queries via select().
s.put('obj2', frame, format='table') # method form of s['obj2'] = frame, here stored in table format

In [34]:
# Query rows 10..15 (inclusive) of the table-format key by index label.
subset = s.select('obj2', where=['index>=10 and index<=15'])
# This is the last use of the store in the notebook: close it so the HDF5 file
# handle is released and pending writes are flushed to mydata.h5.
s.close()
subset

Unnamed: 0,a
10,-0.456686
11,-0.307624
12,-0.918829
13,0.307364
14,1.422742
15,-1.256347


In [35]:
# Disabled example: read_hdf queries a key straight from the file path instead
# of going through the `s` handle.
# NOTE(review): 'obj3' is not written by any visible cell (it only appears in
# the store listing), so this needs that key to already exist in mydata.h5.
# pd.read_hdf('mydata.h5', 'obj3', where=['index<5'])

## 3. Web APIs交互

In [36]:
import requests

# GitHub REST endpoint listing the issues of the pandas repository.
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
# requests applies no timeout by default, so a stalled connection would block
# the notebook forever; bound the wait explicitly.
resp = requests.get(url, timeout=10)
# Fail here on an HTTP error status (e.g. rate limiting) rather than later,
# when the following cells index into the parsed payload.
resp.raise_for_status()
resp


<Response [200]>

In [37]:
# The response object's json() method parses the JSON body and returns it as
# a Python object.
data = resp.json()

In [38]:
# Title of the first issue in the parsed payload.
data[0]['title']

'Fix EX03 errors in generic.py'

In [39]:
# Build a DataFrame from the payload, keeping only the listed fields as columns.
issues = pd.DataFrame(data, columns=['number', 'title', 'labels', 'state'])
issues

Unnamed: 0,number,title,labels,state
0,30448,Fix EX03 errors in generic.py,[],open
1,30447,TST: Add tests for if_exists keyword argument ...,[],open
2,30446,BUG: Fix wrong error in df drop with non uniqu...,[],open
3,30445,Check that dtype is preserved when appending c...,[],open
4,30444,TST: Add tests for fixed issues,"[{'id': 127685, 'node_id': 'MDU6TGFiZWwxMjc2OD...",open
5,30441,TST: clean up skips and xfails,[],open
6,30440,Replaced .format{}/% with f-strings in core/to...,"[{'id': 211029535, 'node_id': 'MDU6TGFiZWwyMTE...",open
7,30439,DataFrame.loc multiple columns replace,[],open
8,30436,PERF: avoid casting to float in IntegerArray r...,"[{'id': 849023693, 'node_id': 'MDU6TGFiZWw4NDk...",open
9,30435,ENH/PERF: allow mask to be None in our masked ...,"[{'id': 849023693, 'node_id': 'MDU6TGFiZWw4NDk...",open


## 4. 数据库交互
- pandas 
- sqlalchemy 
- pymysql

In [9]:
# Required modules (numpy/pandas are re-imported so this cell is self-contained).
import os
from getpass import getpass
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# Credentials come from the environment, with an interactive prompt as the
# fallback, so that no password is stored in the notebook.
db_user = os.environ.get('MYSQL_USER', 'root')
db_password = os.environ.get('MYSQL_PASSWORD') or getpass('MySQL password: ')

# Initialise the database connection.
# URL layout: dialect+driver://user:password@host:port/database
# quote_plus escapes characters such as '@' or '/' that would otherwise break
# the URL when they occur in the user name or password.
engine = create_engine(
    'mysql+pymysql://{}:{}@localhost:3306/tlxy1'.format(
        quote_plus(db_user), quote_plus(db_password)
    )
)

# Query: every row of the teacher table.
sql = """
    select * from teacher;
"""

df = pd.read_sql(sql, engine)
df

Unnamed: 0,id,name,note,sal,dname
0,1,zhangsan,zheng yi ge yi,7500.0,人力部
1,2,小__,zheng liang ge yi,5000.0,教育部
2,3,小露露,zheng wu ge yi,5000.0,教育部
3,4,dana,zheng wu bai yi,20000.0,教育部
4,5,吴燕青,zheng ge pi,5000.0,教育部
5,6,小si,zheng bu zhao,5000.0,人力部


In [10]:
# Build a small frame to store.
df = pd.DataFrame({'id': [1, 2, 3, 4], 'num': [34, 56, 78, 90]})
# Write it to the 'shuju' table (index=False: the row index is not written).
# to_sql's default if_exists='fail' raises ValueError once the table exists,
# so a second run of this cell would crash. 'replace' drops and recreates the
# table, which makes the cell safe to re-run.
df.to_sql('shuju', engine, index=False, if_exists='replace')
print('ok')

ok
