In [1]:
import numpy as np
import pandas as pd

# From dict of Series or dicts （从 `Series的字典` 或者 `字典的字典`）

In [2]:
# Dictionary keys must be immutable (hashable) objects, so nesting can only
# happen on the value side.
# Dict of Series: each key becomes a column, each Series index a set of row labels.
d1 = {
    "one": pd.Series({"a": 1.0, "b": 2.0, "c": 3.0}),
    "two": pd.Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0}),
}

# Dict of dicts: outer keys (here integers) become columns, inner keys become row labels.
d2 = {
    66: dict(a=1.0, b=2.0, c=3.0),
    88: dict(a=1.0, b=2.0, c=3.0, d=4.0),
}

# Build both frames first, then print them; row 'd' is missing from the
# shorter input and therefore shows up as NaN in the first column.
df1 = pd.DataFrame(d1)
df2 = pd.DataFrame(d2)

print(df1, '\n')
print(df2)

   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0 

    66   88
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0


In [3]:
# The index and columns attributes hold the row labels and the column labels.
print(df1.index)
print(df1.columns)

Index(['a', 'b', 'c', 'd'], dtype='object')
Index(['one', 'two'], dtype='object')


In [4]:
# Passing index= keeps only the listed row labels, in the order given
# (row 'c' is dropped; 'd' has no value in column "one").
pd.DataFrame(d1, index=["d", "b", "a"])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [5]:
# A requested column with no matching key in the dict ("three") is created
# as an all-missing column.
pd.DataFrame(d1, index=["d", "b", "a"], columns=["one", "two", "three"])

Unnamed: 0,one,two,three
d,,4.0,
b,2.0,2.0,
a,1.0,1.0,


In [6]:
# columns= also filters: "one" is left out because it is not listed.
pd.DataFrame(d1, index=["d", "b", "a"], columns=["two", "three"])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


# From dict of ndarrays / lists （从 `多维数组的字典` 或者 `列表的字典`）

In [7]:
# From a dict of ndarrays: two length-4 arrays, keyed by an int and a str.
# Note: d1 and df1 are rebound here and no longer refer to the earlier objects.
d1 = {
    22: np.linspace(start=2, stop=8, num=4),
    '44': np.linspace(start=6, stop=9, num=4),
}

# Show the dict itself, then its keys, then its values, one per line.
print(d1)
print(d1.keys())
print(d1.values())

# The explicit index supplies one row label per array element.
df1 = pd.DataFrame(data=d1, index=['you', 'and', 'me', 'together'])
df1

{22: array([2., 4., 6., 8.]), '44': array([6., 7., 8., 9.])}
dict_keys([22, '44'])
dict_values([array([2., 4., 6., 8.]), array([6., 7., 8., 9.])])


Unnamed: 0,22,44
you,2.0,6.0
and,4.0,7.0
me,6.0,8.0
together,8.0,9.0


In [8]:
# From a dict of lists: each list becomes one column.
# Note: d2 and df2 are rebound here and no longer refer to the earlier objects.
d2 = dict(one=[1.0, 2.0, 3.0, 4.0], two=[4.0, 3.0, 2.0, 1.0])

# Show the dict itself, then its keys, then its values, one per line.
print(d2)
print(d2.keys())
print(d2.values())

# A row label ('two') may coincide with a column label without conflict.
df2 = pd.DataFrame(data=d2, index=['you', 'and', 'me', 'two'])
df2

{'one': [1.0, 2.0, 3.0, 4.0], 'two': [4.0, 3.0, 2.0, 1.0]}
dict_keys(['one', 'two'])
dict_values([[1.0, 2.0, 3.0, 4.0], [4.0, 3.0, 2.0, 1.0]])


Unnamed: 0,one,two
you,1.0,4.0
and,2.0,3.0
me,3.0,2.0
two,4.0,1.0


# From a list of dicts （从字典列表）

In [9]:
# A list of dicts: every dict is one row; the key missing from the first
# dict ("c") is shown as a missing value in that row.
data1 = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]

pd.DataFrame(data=data1)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [10]:
# Give the two rows explicit labels; index labels may mix types (an int and a str here).
df = pd.DataFrame(data1, index=[66, "second"])
df

Unnamed: 0,a,b,c
66,1,2,
second,5,10,20.0


In [11]:
# Restrict the result to columns "a" and "b"; key "c" of the second dict is ignored.
pd.DataFrame(data1, columns=["a", "b"])

Unnamed: 0,a,b
0,1,2
1,5,10


# Row / column selection, addition, deletion （行 / 列的选择，添加，删除）

In [12]:
# Working frame for the selection / addition / deletion examples.
# "you" has no entry for row 'd'; "me" mixes ints, a string and NaN, so it is
# stored as an object column.
d = {
    "you": pd.Series({"a": 1.0, "b": 2.0, "c": 3.0}),
    "and": pd.Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0}),
    "me": pd.Series({"a": 3, "b": 6, "c": 'hh', "d": np.nan}),
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,you,and,me
a,1.0,1.0,3
b,2.0,2.0,6
c,3.0,3.0,hh
d,,4.0,


In [13]:
# Assigning to a new key adds a column in place; the label does not have to
# be a string (here the integer 88), and the values are taken from column 'you'.
df[88] = df['you']
df

Unnamed: 0,you,and,me,88
a,1.0,1.0,3,1.0
b,2.0,2.0,6,2.0
c,3.0,3.0,hh,3.0
d,,4.0,,


In [14]:
# Add a boolean column from an element-wise comparison; the missing value in
# row 'd' compares as False.
df['bool'] = df['you'].gt(3)
df

Unnamed: 0,you,and,me,88,bool
a,1.0,1.0,3,1.0,False
b,2.0,2.0,6,2.0,False
c,3.0,3.0,hh,3.0,False
d,,4.0,,,False


In [15]:
# Deleting columns.
# `del` removes the column in place and returns nothing.
# NOTE: this cell mutates df, so it cannot be re-run without first
# re-creating the 'bool' and 88 columns.
del df['bool']
print(df)
print('-'*40)

# pop() removes the column in place as well, but hands it back as a Series;
# `three` therefore holds the former column 88.
three = df.pop(88)
print(three)
df

   you  and   me   88
a  1.0  1.0    3  1.0
b  2.0  2.0    6  2.0
c  3.0  3.0   hh  3.0
d  NaN  4.0  NaN  NaN
----------------------------------------
a    1.0
b    2.0
c    3.0
d    NaN
Name: 88, dtype: float64


Unnamed: 0,you,and,me
a,1.0,1.0,3
b,2.0,2.0,6
c,3.0,3.0,hh
d,,4.0,


In [16]:
# Assigning a scalar to a new column broadcasts it: every row of "foo"
# receives the same value "bar".
df["foo"] = "bar"
df

Unnamed: 0,you,and,me,foo
a,1.0,1.0,3,bar
b,2.0,2.0,6,bar
c,3.0,3.0,hh,bar
d,,4.0,,bar


In [17]:
# Show the current frame as the starting point for the row-selection examples that follow.
df

Unnamed: 0,you,and,me,foo
a,1.0,1.0,3,bar
b,2.0,2.0,6,bar
c,3.0,3.0,hh,bar
d,,4.0,,bar


In [18]:
# A single label selects one row and returns it as a Series.
print(df.loc['a'])
print(type(df.loc['a']))

print('-' * 50)

# A list containing one label keeps the result two-dimensional: a DataFrame.
print(df.loc[['a']])
print(type(df.loc[['a']]))

# A list of several labels selects several rows at once.
df.loc[['a', 'b']]


you    1.0
and    1.0
me       3
foo    bar
Name: a, dtype: object
<class 'pandas.core.series.Series'>
--------------------------------------------------
   you  and me  foo
a  1.0  1.0  3  bar
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,you,and,me,foo
a,1.0,1.0,3,bar
b,2.0,2.0,6,bar


In [19]:
# Slicing the frame directly selects rows; 'a' and 'b' are the first two rows,
# so the result matches the label-based selection.
df[:2]  # same result here as df.loc[['a', 'b']]

Unnamed: 0,you,and,me,foo
a,1.0,1.0,3,bar
b,2.0,2.0,6,bar


In [20]:
# iloc stands for "integer location": selection is done by integer position.
print(type(df.iloc[1]))
df.iloc[1]  # selects a single row (the second one), returned as a Series

<class 'pandas.core.series.Series'>


you    2.0
and    2.0
me       6
foo    bar
Name: b, dtype: object

In [21]:
df.iloc[1, 3]  # picks the 'bar' at the second row, fourth column

'bar'

In [22]:
# Selecting several rows with iloc returns a DataFrame; for this frame the
# result is the same as df[1:3].
df_iloc = df.iloc[1:3]

# Check the type of the slice itself. Slicing it again with [1:3] would
# inspect a different, one-row frame rather than the object shown below.
print(type(df_iloc))

print('-'*60)

df_iloc

<class 'pandas.core.frame.DataFrame'>
------------------------------------------------------------


Unnamed: 0,you,and,me,foo
b,2.0,2.0,6,bar
c,3.0,3.0,hh,bar


### 列

In [23]:
# Indexing with a single column label returns that column as a Series.
print(type(df['me']))

df['me']  # one column selected; the result is a Series


<class 'pandas.core.series.Series'>


a      3
b      6
c     hh
d    NaN
Name: me, dtype: object

In [24]:
# Indexing with a list of labels returns a DataFrame, even for a single column.
print(type(df[['me']]))

df[['me']]  # one column selected; the result is a DataFrame


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,me
a,3
b,6
c,hh
d,


In [25]:
# A list with several labels selects several columns, in the order listed.
print(type(df[['you', 'me']]))

df[['you', 'me']]  # multiple columns selected

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,you,me
a,1.0,3
b,2.0,6
c,3.0,hh
d,,


# Data alignment and arithmetic （数据对齐和算术）

In [26]:
# Use a seeded generator so the random example data is identical on every
# run (the seed value itself is arbitrary). Re-run the cell to refresh any
# previously saved output.
rng = np.random.default_rng(0)

# Two frames of different shapes: 10x4 and 7x3.
# Note: df and df2 are rebound here and no longer refer to the earlier frames.
df = pd.DataFrame(rng.standard_normal((10, 4)), columns=["A", "B", "C", "D"])

df2 = pd.DataFrame(rng.standard_normal((7, 3)), columns=["A", "B", "C"])

# Addition aligns on both row and column labels; labels present in only one
# operand (rows 7-9, column D) come out as NaN.
df + df2

Unnamed: 0,A,B,C,D
0,0.044883,-1.189861,1.260412,
1,2.016788,-0.899623,-1.284193,
2,-0.084905,-0.094317,-0.134294,
3,-2.328626,2.24841,0.164236,
4,-0.766228,0.96464,0.518112,
5,-0.127745,1.419026,0.00372,
6,-1.331292,0.009874,0.454196,
7,,,,
8,,,,
9,,,,


In [27]:
# Subtract the first row from every row: the Series' labels are matched
# against the column labels and the operation is broadcast down the rows,
# so row 0 of the result is all zeros.
df.sub(df.iloc[0], axis="columns")

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,1.609468,1.125569,-1.519171,-0.348604
2,-0.085081,1.177218,-0.381342,-0.937959
3,-0.791915,2.093386,-2.131925,-1.478554
4,0.817578,1.047994,-0.163338,1.544089
5,0.048098,0.543786,-0.351269,1.000679
6,-1.319096,0.816745,-0.2232,0.989203
7,0.650491,2.633897,-1.251432,-0.306079
8,-0.477049,0.397278,-1.875945,0.584436
9,1.069858,-0.725488,0.259813,1.39886


In [28]:
# Use a seeded generator so the random example data is identical on every
# run (the seed value itself is arbitrary). Re-run the cell to refresh any
# previously saved output.
rng = np.random.default_rng(1)

# Fresh 10x4 frame (rebinds df); print it so the original values can be
# compared with the scaled result displayed underneath.
df = pd.DataFrame(rng.standard_normal((10, 4)), columns=["A", "B", "C", "D"])
print(df)

# Arithmetic with a scalar is applied element-wise.
df * 5

          A         B         C         D
0 -0.374194  1.504160  0.845490  0.545119
1  1.086572 -0.593695 -0.887943 -1.212587
2 -1.821853  0.056103 -0.025210 -1.350573
3 -1.535750 -0.149855  1.469039 -0.016558
4  0.885229  0.130293 -0.265494  0.259021
5 -0.540190  0.144446  0.033388 -0.166332
6 -0.792132 -0.841507 -0.223448  1.659438
7  0.772919  0.223515  0.135309 -0.168520
8 -0.408729 -1.032775 -0.970329  0.710332
9 -0.070368 -1.838735 -1.125842  0.736696


Unnamed: 0,A,B,C,D
0,-1.870971,7.520799,4.22745,2.725596
1,5.43286,-2.968475,-4.439717,-6.062936
2,-9.109267,0.280513,-0.126051,-6.752866
3,-7.678749,-0.749275,7.345193,-0.082792
4,4.426146,0.651463,-1.32747,1.295105
5,-2.700951,0.722232,0.166941,-0.831659
6,-3.960662,-4.207534,-1.117241,8.29719
7,3.864595,1.117576,0.676546,-0.8426
8,-2.043647,-5.163875,-4.851647,3.55166
9,-0.351842,-9.193677,-5.629209,3.683478


In [29]:
# Boolean frames: the 0/1 inputs are converted to False/True by dtype=bool.
# Note: df1 and df2 are rebound here and no longer refer to the earlier frames.
df1 = pd.DataFrame(dict(a=[1, 0, 1], b=[0, 1, 1]), dtype=bool)
df2 = pd.DataFrame(dict(a=[0, 1, 1], b=[1, 1, 0]), dtype=bool)

print(df1, '\n')
print(df2)

       a      b
0   True  False
1  False   True
2   True   True 

       a      b
0  False   True
1   True   True
2   True  False


In [30]:
# Element-wise logical NOT of the boolean frame: every True/False is flipped.
~df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


# Transposing （转置）

In [31]:
# Keep only the first three rows for the transpose examples (this rebinds df).
df = df.iloc[:3]
df

Unnamed: 0,A,B,C,D
0,-0.374194,1.50416,0.84549,0.545119
1,1.086572,-0.593695,-0.887943,-1.212587
2,-1.821853,0.056103,-0.02521,-1.350573


In [32]:
# .T gives the transposed frame: the column labels A-D become the row labels
# and the row labels 0-2 become the column labels.
df.T

Unnamed: 0,0,1,2
A,-0.374194,1.086572,-1.821853
B,1.50416,-0.593695,0.056103
C,0.84549,-0.887943,-0.02521
D,0.545119,-1.212587,-1.350573


In [33]:
# The transpose() method produces the same result as the .T attribute.
df.transpose()

Unnamed: 0,0,1,2
A,-0.374194,1.086572,-1.821853
B,1.50416,-0.593695,0.056103
C,0.84549,-0.887943,-0.02521
D,0.545119,-1.212587,-1.350573
