In [2]:
# Pandas有两个主要也是最重要的数据结构：Series和DataFame

In [3]:
import pandas as pd

### Series

#### 1. 通过list构建Series

In [7]:
ser_obj = pd.Series(range(10, 20))
print(ser_obj.head(3))

print(ser_obj)

print(type(ser_obj))

0    10
1    11
2    12
dtype: int64
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64
<class 'pandas.core.series.Series'>


#### 2. 获取数据和索引

In [8]:
# 获取数据
print(ser_obj.values)

# 获取索引
print(ser_obj.index)

[10 11 12 13 14 15 16 17 18 19]
RangeIndex(start=0, stop=10, step=1)


#### 3. 通过索引获取数据

In [9]:
print(ser_obj[0])

print(ser_obj[8])

10
18


#### 4. 索引与数据的对应关系不被运算结果影响

In [10]:
print(ser_obj * 2)

print(ser_obj > 15)

0    20
1    22
2    24
3    26
4    28
5    30
6    32
7    34
8    36
9    38
dtype: int64
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool


#### 5. 通过dict构建Series

In [11]:
year_data = {2001: 17.8, 2002: 20.1, 2003: 16.5}
ser_obj2 = pd.Series(year_data)

print(ser_obj2.head())

print(ser_obj2.index)

2001    17.8
2002    20.1
2003    16.5
dtype: float64
Int64Index([2001, 2002, 2003], dtype='int64')


#### name属性

In [16]:
ser_obj2.name = 'temp'
ser_obj2.index.name = 'year'
print(ser_obj2.head())

year
2001    17.8
2002    20.1
2003    16.5
Name: temp, dtype: float64


### DataFrame

#### 1. 通过ndarray构建DataFrame

In [19]:
import numpy as np

In [20]:
array = np.random.randn(5, 4)
print(array)

df_obj = pd.DataFrame(array)
print(df_obj.head())

[[-0.97869668 -0.63668702 -0.71247899 -0.73857081]
 [-0.4971664  -0.38767146 -0.10778661  1.39391516]
 [ 1.27641069  0.31529898  1.71674759 -0.29316255]
 [-0.17057332 -1.04612857 -1.36819239  0.74509776]
 [-0.08002971  0.80512509  0.26630918 -1.19601822]]
          0         1         2         3
0 -0.978697 -0.636687 -0.712479 -0.738571
1 -0.497166 -0.387671 -0.107787  1.393915
2  1.276411  0.315299  1.716748 -0.293163
3 -0.170573 -1.046129 -1.368192  0.745098
4 -0.080030  0.805125  0.266309 -1.196018


#### 2. 通过dict构建DataFame

In [21]:
dict_data = {'A': 1, 
             'B': pd.Timestamp('20170426'),
             'C': pd.Series(1, index=list(range(4)),dtype='float32'),
             'D': np.array([3] * 4,dtype='int32'),
             'E': ["Python","Java","C++","C"],
             'F': 'ITCast' }

df_obj2 = pd.DataFrame(dict_data)
print(df_obj2)

   A          B    C  D       E       F
0  1 2017-04-26  1.0  3  Python  ITCast
1  1 2017-04-26  1.0  3    Java  ITCast
2  1 2017-04-26  1.0  3     C++  ITCast
3  1 2017-04-26  1.0  3       C  ITCast


#### 3. 通过列索引获取列数据(Series)

In [24]:
print(df_obj2['A'])

print(type(df_obj2['A']))

print(df_obj2.E)

0    1
1    1
2    1
3    1
Name: A, dtype: int64
<class 'pandas.core.series.Series'>
0    Python
1      Java
2       C++
3         C
Name: E, dtype: object


#### 4. 增加列数据

In [25]:
df_obj2['G'] = df_obj2['D'] + 4
print(df_obj2.head())

   A          B    C  D       E       F  G
0  1 2017-04-26  1.0  3  Python  ITCast  7
1  1 2017-04-26  1.0  3    Java  ITCast  7
2  1 2017-04-26  1.0  3     C++  ITCast  7
3  1 2017-04-26  1.0  3       C  ITCast  7


In [26]:
df_obj2['H'] = pd.Series(range(4)) + 6
print(df_obj2.head())

   A          B    C  D       E       F  G  H
0  1 2017-04-26  1.0  3  Python  ITCast  7  6
1  1 2017-04-26  1.0  3    Java  ITCast  7  7
2  1 2017-04-26  1.0  3     C++  ITCast  7  8
3  1 2017-04-26  1.0  3       C  ITCast  7  9


#### 5. 删除列

In [28]:
del(df_obj2['G'])

print(df_obj2.head())

   A          B    C  D       E       F
0  1 2017-04-26  1.0  3  Python  ITCast
1  1 2017-04-26  1.0  3    Java  ITCast
2  1 2017-04-26  1.0  3     C++  ITCast
3  1 2017-04-26  1.0  3       C  ITCast
