In [1]:
import numpy as np
import pandas as pd

# From dict of Series or dicts （从 `Series的字典` 或者 `字典的字典`）

In [2]:
# Dict keys must be hashable, so any nesting can only happen on the value side.

# A dict whose values are Series: each key becomes a column label.
d1 = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list("abc")),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")),
)

# A dict whose values are dicts: outer keys become column labels,
# inner keys become row labels.
d2 = {
    66: dict(a=1.0, b=2.0, c=3.0),
    88: dict(a=1.0, b=2.0, c=3.0, d=4.0),
}

# Build both frames first, then show them; labels missing from the shorter
# input show up as NaN.
df1 = pd.DataFrame(data=d1)
df2 = pd.DataFrame(data=d2)

print(df1, '\n')
print(df2)

   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0 

    66   88
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0


In [3]:
# The `index` attribute holds the row labels, `columns` the column labels.
print(df1.index)
print(df1.columns)

Index(['a', 'b', 'c', 'd'], dtype='object')
Index(['one', 'two'], dtype='object')


In [4]:
# Supplying `index` keeps only the listed row labels, in the listed order.
pd.DataFrame(
    data=d1,
    index=["d", "b", "a"],
)

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [5]:
# A requested column that is not a key of the dict ("three") comes out all-NaN.
pd.DataFrame(
    data=d1,
    index=["d", "b", "a"],
    columns=["one", "two", "three"],
)

Unnamed: 0,one,two,three
d,,4.0,
b,2.0,2.0,
a,1.0,1.0,


In [6]:
# Dict keys not listed in `columns` ("one") are left out of the result.
pd.DataFrame(
    data=d1,
    index=["d", "b", "a"],
    columns=["two", "three"],
)

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


# From dict of ndarrays / lists （从 `多维数组的字典` 或者 `列表的字典`）

In [7]:
# Build a DataFrame from a dict of ndarrays; the keys (an int and a str here)
# become the column labels.
d1 = {
    22: np.linspace(start=2, stop=8, num=4),
    '44': np.linspace(start=6, stop=9, num=4),
}
print(d1)
print(d1.keys())
print(d1.values())

# One row label is supplied per array element.
df1 = pd.DataFrame(data=d1, index=['you', 'and', 'me', 'together'])
df1

{22: array([2., 4., 6., 8.]), '44': array([6., 7., 8., 9.])}
dict_keys([22, '44'])
dict_values([array([2., 4., 6., 8.]), array([6., 7., 8., 9.])])


Unnamed: 0,22,44
you,2.0,6.0
and,4.0,7.0
me,6.0,8.0
together,8.0,9.0


In [8]:
# Build a DataFrame from a dict of equal-length lists.
d2 = dict(
    one=[1.0, 2.0, 3.0, 4.0],
    two=[4.0, 3.0, 2.0, 1.0],
)
print(d2)
print(d2.keys())
print(d2.values())

# One row label per list element; a row label ('two') may coincide with a
# column label.
df2 = pd.DataFrame(data=d2, index=['you', 'and', 'me', 'two'])
df2

{'one': [1.0, 2.0, 3.0, 4.0], 'two': [4.0, 3.0, 2.0, 1.0]}
dict_keys(['one', 'two'])
dict_values([[1.0, 2.0, 3.0, 4.0], [4.0, 3.0, 2.0, 1.0]])


Unnamed: 0,one,two
you,1.0,4.0
and,2.0,3.0
me,3.0,2.0
two,4.0,1.0


# From a list of dicts （从字典列表）

In [9]:
# Each dict is one row; keys missing from a row ("c" in the first) become NaN.
data1 = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]

pd.DataFrame(data=data1)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [10]:
# Custom row labels may mix types (an int and a str here).
df = pd.DataFrame(
    data=data1,
    index=[66, "second"],
)
df

Unnamed: 0,a,b,c
66,1,2,
second,5,10,20.0


In [11]:
# `columns` restricts the result to the listed keys; "c" is left out.
pd.DataFrame(
    data=data1,
    columns=["a", "b"],
)

Unnamed: 0,a,b
0,1,2
1,5,10


# Row / column selection, addition, deletion （行 / 列的选择，添加，删除）

In [12]:
# Source columns for df3. 'you' has no entry for row 'd', and 'me' mixes ints,
# a str and NaN, so that column is stored with object dtype.
# The mapping is deliberately not named `dict`: that would shadow the builtin
# `dict` type for every later cell run in the same kernel.
series_by_column = {
    "you": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "and": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
    "me": pd.Series([3, 6, 'hh', np.nan], index=["a", "b", "c", "d"])
}
df3 = pd.DataFrame(series_by_column)
df3

Unnamed: 0,you,and,me
a,1.0,1.0,3
b,2.0,2.0,6
c,3.0,3.0,hh
d,,4.0,


In [13]:
# Assigning to a new label (an int here) appends a new column to df3 in place;
# the new column holds the values of the existing 'you' column.
df3[88] = df3['you']
df3

Unnamed: 0,you,and,me,88
a,1.0,1.0,3,1.0
b,2.0,2.0,6,2.0
c,3.0,3.0,hh,3.0
d,,4.0,,


In [14]:
# Element-wise "greater than 3" test stored as a new column; the NaN in row
# 'd' compares as False.
df3['bool'] = df3['you'].gt(3)
df3

Unnamed: 0,you,and,me,88,bool
a,1.0,1.0,3,1.0,False
b,2.0,2.0,6,2.0,False
c,3.0,3.0,hh,3.0,False
d,,4.0,,,False


In [15]:
# Deleting columns.
# `del` removes the column from df3 in place.
del df3['bool']
print(df3)
print('-'*40)

# `pop` removes the column from df3 and returns it as a Series.
three = df3.pop(88)
print(three)
df3

   you  and   me   88
a  1.0  1.0    3  1.0
b  2.0  2.0    6  2.0
c  3.0  3.0   hh  3.0
d  NaN  4.0  NaN  NaN
----------------------------------------
a    1.0
b    2.0
c    3.0
d    NaN
Name: 88, dtype: float64


Unnamed: 0,you,and,me
a,1.0,1.0,3
b,2.0,2.0,6
c,3.0,3.0,hh
d,,4.0,


In [16]:
# When a scalar is assigned to a new column, it is propagated to every row of
# that column.
df3["foo"] = "bar"
df3

Unnamed: 0,you,and,me,foo
a,1.0,1.0,3,bar
b,2.0,2.0,6,bar
c,3.0,3.0,hh,bar
d,,4.0,,bar


In [17]:
# Display the current df3 as a reference for the label-based selections below.
df3

Unnamed: 0,you,and,me,foo
a,1.0,1.0,3,bar
b,2.0,2.0,6,bar
c,3.0,3.0,hh,bar
d,,4.0,,bar


In [18]:
# A single scalar label returns that row as a Series.
print(df3.loc['a'])
print(type(df3.loc['a']))

print('-' * 50)

# A one-element list of labels returns a DataFrame instead.
print(df3.loc[['a']])
print(type(df3.loc[['a']]))

# A longer label list selects several rows at once.
df3.loc[['a', 'b']]


you    1.0
and    1.0
me       3
foo    bar
Name: a, dtype: object
<class 'pandas.core.series.Series'>
--------------------------------------------------
   you  and me  foo
a  1.0  1.0  3  bar
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,you,and,me,foo
a,1.0,1.0,3,bar
b,2.0,2.0,6,bar


In [19]:
df3.loc[['a', 'b'], ['you', 'me']]  # Select specific rows and specific columns by label


Unnamed: 0,you,me
a,1.0,3
b,2.0,6


In [20]:
df3[:2]  # Slice of the first two rows; here this gives the same result as df3.loc[['a', 'b']]

Unnamed: 0,you,and,me,foo
a,1.0,1.0,3,bar
b,2.0,2.0,6,bar


### pandas provides a suite of methods in order to get purely integer based indexing. 
*`The semantics closely follow Python and NumPy slicing.`* Indexing is 0-based. When slicing, the start bound is included, while the upper bound is excluded. 

In [43]:
# Display df3 as a reference for the integer-position (iloc) selections below.
df3

Unnamed: 0,you,and,me,foo
a,1.0,1.0,3,bar
b,2.0,2.0,6,bar
c,3.0,3.0,hh,bar
d,,4.0,,bar


In [21]:
# iloc stands for "integer location": selection by integer position.
print(type(df3.iloc[1]))
df3.iloc[1]  # A single integer selects one row, returned as a Series

<class 'pandas.core.series.Series'>


you    2.0
and    2.0
me       6
foo    bar
Name: b, dtype: object

In [22]:
df3.iloc[1, 3]  # Second row, fourth column (0-based positions 1 and 3) -> 'bar'

'bar'

In [23]:
df3.iloc[[0, 1], [2, 3]]  # Lists of positions select arbitrary rows and arbitrary columns

Unnamed: 0,me,foo
a,3,bar
b,6,bar


In [24]:
df3.iloc[[0], [2, 3]]  # A one-element row list still yields a DataFrame (one row, two columns)

Unnamed: 0,me,foo
a,3,bar


In [44]:
# Display df3 again as a reference for the column and slice selections below.
df3

Unnamed: 0,you,and,me,foo
a,1.0,1.0,3,bar
b,2.0,2.0,6,bar
c,3.0,3.0,hh,bar
d,,4.0,,bar


In [28]:
df3.iloc[:, 3]  # All rows of the column at position 3 ('foo'), returned as a Series

a    bar
b    bar
c    bar
d    bar
Name: foo, dtype: object

In [26]:
# Selecting several rows with an iloc slice returns a DataFrame; the result
# matches df3[1:3].
df_iloc = df3.iloc[1:3]

print(type(df_iloc))

print('-'*60)

df_iloc

<class 'pandas.core.frame.DataFrame'>
------------------------------------------------------------


Unnamed: 0,you,and,me,foo
b,2.0,2.0,6,bar
c,3.0,3.0,hh,bar


In [46]:
df3[1:3]  # Plain integer slicing selects rows; same rows as df3.iloc[1:3]

Unnamed: 0,you,and,me,foo
b,2.0,2.0,6,bar
c,3.0,3.0,hh,bar


### Columns （列）

In [42]:
# Display df3 as a reference for the column selections below.
df3

Unnamed: 0,you,and,me,foo
a,1.0,1.0,3,bar
b,2.0,2.0,6,bar
c,3.0,3.0,hh,bar
d,,4.0,,bar


In [29]:
print(type(df3['me']))

df3['me']  # Selecting one column by its label returns a Series


<class 'pandas.core.series.Series'>


a      3
b      6
c     hh
d    NaN
Name: me, dtype: object

In [30]:
print(type(df3[['me']]))

df3[['me']]  # Selecting one column with a one-element list returns a DataFrame


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,me
a,3
b,6
c,hh
d,


In [31]:
print(type(df3[['you', 'me']]))

df3[['you', 'me']]  # A list of labels selects several columns

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,you,me
a,1.0,3
b,2.0,6
c,3.0,hh
d,,


# Data alignment and arithmetic （数据对齐和算术）

In [32]:
# Two random frames of different shapes: `df` is 10x4 (columns A-D) and
# `df2` is 7x3 (columns A-C). No seed is set, so values differ on every run.
df = pd.DataFrame(data=np.random.randn(10, 4), columns=list("ABCD"))
df2 = pd.DataFrame(data=np.random.randn(7, 3), columns=list("ABC"))

# Addition lines the operands up by row and column label; cells that exist in
# only one of the two frames come out as NaN.
df + df2

Unnamed: 0,A,B,C,D
0,-1.840248,-0.80802,0.127729,
1,-1.740809,-1.435237,0.0695,
2,1.874142,0.591251,0.367313,
3,-0.540752,-1.636702,-0.102473,
4,-0.379926,1.398344,0.208331,
5,1.626567,-2.314872,3.020013,
6,-1.052804,-0.182034,3.156073,
7,,,,
8,,,,
9,,,,


In [33]:
# Subtract the first row (a Series) from every row of df; row 0 of the result
# is therefore all zeros.
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,-0.227103,0.348671,-1.29876,-2.062686
2,0.708601,2.059956,-0.434785,-2.927806
3,1.365791,0.267408,-2.240659,-1.430358
4,1.075857,3.397386,-0.47115,-3.4509
5,2.392228,-0.870314,1.388926,-3.189594
6,0.868761,0.816783,1.765262,-2.829707
7,0.245225,0.765242,-0.593027,-2.215263
8,1.491898,1.032383,-0.194406,-2.808095
9,2.044728,2.675458,-2.421344,-2.209052


In [34]:
# Fresh 10x4 random frame (unseeded, so values differ on every run).
df = pd.DataFrame(data=np.random.randn(10, 4), columns=list("ABCD"))
print(df)

# Multiplying by a scalar applies to every element.
df * 5

          A         B         C         D
0  0.158332 -0.506229 -0.582605  0.501990
1 -0.131064 -0.015205 -0.186167 -0.417183
2 -2.430449  0.921527 -1.074045 -1.613487
3 -0.908592 -0.489643 -0.500819 -0.742402
4  1.680143  1.417204 -0.590750  1.109605
5 -1.901218  0.901120 -1.444614 -0.675246
6  0.495250  0.692578  2.600896  0.348107
7 -1.121249 -0.005437  0.551466 -0.876129
8 -0.873369 -0.436904  1.614463 -0.384311
9  0.072594 -1.239359 -0.095603  1.026004


Unnamed: 0,A,B,C,D
0,0.791662,-2.531144,-2.913024,2.509952
1,-0.655321,-0.076025,-0.930835,-2.085916
2,-12.152247,4.607636,-5.370225,-8.067434
3,-4.54296,-2.448215,-2.504096,-3.712011
4,8.400715,7.086022,-2.953749,5.548025
5,-9.506088,4.505598,-7.223071,-3.376232
6,2.47625,3.462892,13.004481,1.740534
7,-5.606247,-0.027183,2.757331,-4.380643
8,-4.366847,-2.184519,8.072314,-1.921557
9,0.362972,-6.196793,-0.478013,5.130021


In [35]:
# Boolean frames: the 0/1 inputs are converted to False/True by dtype=bool.
df1 = pd.DataFrame(dict(a=[1, 0, 1], b=[0, 1, 1]), dtype=bool)
df2 = pd.DataFrame(dict(a=[0, 1, 1], b=[1, 1, 0]), dtype=bool)

print(df1, '\n')
print(df2)

       a      b
0   True  False
1  False   True
2   True   True 

       a      b
0  False   True
1   True   True
2   True  False


In [36]:
# Logical NOT of a boolean frame: every True becomes False and vice versa.
~df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


# Transposing （转置）

In [37]:
# Keep only the first three rows so the transposed output stays small.
# Note: this rebinds `df`, so the full 10-row frame is no longer reachable.
df = df.iloc[:3]
df

Unnamed: 0,A,B,C,D
0,0.158332,-0.506229,-0.582605,0.50199
1,-0.131064,-0.015205,-0.186167,-0.417183
2,-2.430449,0.921527,-1.074045,-1.613487


In [38]:
# `.T` swaps rows and columns: the column labels A-D become the row labels.
df.T

Unnamed: 0,0,1,2
A,0.158332,-0.131064,-2.430449
B,-0.506229,-0.015205,0.921527
C,-0.582605,-0.186167,-1.074045
D,0.50199,-0.417183,-1.613487


In [39]:
# `transpose()` is the method form; it displays the same result as `df.T`.
df.transpose()

Unnamed: 0,0,1,2
A,0.158332,-0.131064,-2.430449
B,-0.506229,-0.015205,0.921527
C,-0.582605,-0.186167,-1.074045
D,0.50199,-0.417183,-1.613487
