In [25]:
import pandas as pd
import numpy as np

# pandas Data Structures

## 1. Series

A Series is a **one-dimensional array-like** object containing a sequence of values and an associated array of data labels, called its **index**.

In [2]:
# A Series built from a bare list gets a default integer index (0, 1, 2, ...).
obj = pd.Series(data=[4, 7, -5, -3])
obj

0    4
1    7
2   -5
3   -3
dtype: int64

In [3]:
# The underlying data as a NumPy array, without the index labels.
obj.values

array([ 4,  7, -5, -3], dtype=int64)

In [4]:
# The default index pandas created: a RangeIndex counting from 0.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Replace the default integer index with string labels by assigning to
# `.index`; this modifies `obj` itself.
obj.index = ['a', 'b', 'c', 'd']
obj

a    4
b    7
c   -5
d   -3
dtype: int64

In [7]:
# A single label yields a scalar; a list of labels yields a sub-Series in
# the order the labels were requested.
picked = [
    obj['a'],
    obj[['d', 'b', 'c']],
]
print(*picked, sep='\n-------\n')

4
-------
d   -3
b    7
c   -5
dtype: int64


In [8]:
# Boolean-mask filtering: keep only the entries greater than 2 (labels are preserved).
obj[obj>2]

a    4
b    7
dtype: int64

In [9]:
# Scalar multiplication is applied element-wise and keeps the index.
obj*2

a     8
b    14
c   -10
d    -6
dtype: int64

In [10]:
# Element-wise missing-value check; every entry is present here, so all False.
obj.isnull()

a    False
b    False
c    False
d    False
dtype: bool

## 2. DataFrame

A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.).

![在这里插入图片描述](https://img-blog.csdnimg.cn/20190808151414474.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzMwNzQ4ODYz,size_16,color_FFFFFF,t_70)

In [11]:
# Column-oriented input: each key becomes a column, each list its values.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [13]:
# head() with no argument shows only the first five rows.
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [14]:
# `columns=` fixes the column order of the resulting frame.
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [15]:
# A requested column that is absent from `data` ('debt') is created and
# filled with missing values.
pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'])

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [19]:
# Same data with explicit row labels; 'debt' is not a key of `data`, so the
# column starts out entirely missing.
df = pd.DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four', 'five', 'six'],
    columns=['year', 'state', 'pop', 'debt'],
)
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [20]:
# Selecting one column by name returns it as a Series named after the column.
df['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [21]:
# `.loc` with a row label returns that row as a Series indexed by column name.
df.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [23]:
# Assigning a scalar broadcasts it to every row of the column.
df['debt'] = 16.5
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [27]:
# Assign through bracket notation. Attribute-style assignment (`df.debt = ...`)
# only updates a column that already exists; for a new name it silently sets a
# plain Python attribute instead of creating a column.
# The array length must match the number of rows (6).
df['debt'] = np.arange(6.)
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [28]:
# Assigning a Series aligns on index labels: rows whose label is not in
# `val` ('one', 'three', 'six') end up as NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
df['debt'] = val
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [30]:
# Assigning to a new column name creates the column; here a boolean flag
# that is True for the Ohio rows.
df['eastern'] = df['state'] == 'Ohio'
df

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [31]:
# `del` removes the column from `df` in place.
del df['eastern']
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [32]:
# Nested dict input: outer keys become columns, inner keys become row labels.
# A (row, column) pair with no entry, such as Nevada in 2000, is left missing.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
df = pd.DataFrame(data=pop)
df

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [33]:
# Transpose: swap rows and columns.
df.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [34]:
# `.T` returned a new object; `df` itself is unchanged.
df

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


## 3. Index Objects

Any array or other sequence of labels you use when constructing a Series or DataFrame is internally converted to an Index.

In [35]:
# Three consecutive integers labelled 'a', 'b', 'c'.
obj = pd.Series(data=range(3), index=list('abc'))
obj

a    0
b    1
c    2
dtype: int64

In [36]:
# The labels passed as `index=` are stored as an Index object.
obj.index

Index(['a', 'b', 'c'], dtype='object')

![在这里插入图片描述](https://img-blog.csdnimg.cn/20190808152922315.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzMwNzQ4ODYz,size_16,color_FFFFFF,t_70)

# Essential Functionality

## 1. Reindexing

In [37]:
# The labels are out of alphabetical order on purpose, so that reindexing
# has something to rearrange.
obj = pd.Series(
    data=[4.5, 7.2, -5.3, 3.6],
    index=list('dbac'),
)
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [39]:
# reindex() returns a new Series arranged by the requested labels; 'e' has
# no existing value, so it is filled with NaN.
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [40]:
# 3x3 frame of 0..8; row label 'b' is intentionally absent.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [41]:
# Reindexing rows: label 'b' does not exist in `frame`, so its row is all
# NaN (and the integer columns are shown as floats as a result).
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [44]:
# `columns=` reindexes the columns instead of the rows.
frame.reindex(columns=['Texas', 'California', 'Ohio'])

Unnamed: 0,Texas,California,Ohio
a,1,2,0
c,4,5,3
d,7,8,6


## 2. Dropping Entries from an Axis

In [45]:
# Floats 0.0 through 4.0 labelled 'a' through 'e'.
obj = pd.Series(data=np.arange(5, dtype=float), index=list('abcde'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [46]:
# drop() returns a new Series without the given label; `obj` is not modified.
obj.drop('c')

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [47]:
# A list drops several labels at once.
obj.drop(['c', 'd'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [48]:
# 4x4 frame of 0..15 with state names as row labels.
df = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [49]:
# Without an axis argument, drop() removes rows by index label.
df.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [50]:
# Remove a column by name; `columns=` is the keyword form of `axis=1`.
df.drop(columns=['two'])

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


## 3. Indexing, Selection, and Filtering

In [72]:
# Floats 0.0 through 3.0 labelled 'a' through 'd'.
obj = pd.Series(data=np.arange(4, dtype=float), index=list('abcd'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [75]:
# Different ways of selecting from a string-labelled Series, printed in turn.
# Integer positions go through `.iloc`: plain `obj[1]` / `obj[[1, 3]]` on a
# non-integer index relies on a positional fallback that pandas 2.1 deprecated
# (integer keys will be treated as labels in a future version).
selections = [
    obj['b'],              # single label
    obj.iloc[1],           # single position
    obj[2:4],              # integer slice: positional, end-exclusive
    obj['b':'c'],          # label slice: the end label is included
    obj[['b', 'a', 'd']],  # list of labels, in the requested order
    obj.iloc[[1, 3]],      # list of positions
    obj[obj < 2],          # boolean mask
]
print(*selections, sep='\n-----------\n')

1.0
-----------
1.0
-----------
c    2.0
d    3.0
dtype: float64
-----------
b    1.0
c    2.0
dtype: float64
-----------
b    1.0
a    0.0
d    3.0
dtype: float64
-----------
b    1.0
d    3.0
dtype: float64
-----------
a    0.0
b    1.0
dtype: float64


In [76]:
# Rebuild the 4x4 state frame (values 0..15) so the selection examples
# below start from a known state.
df = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [79]:
# Bracket indexing on a DataFrame: a name or list of names selects columns,
# while a slice or a boolean mask selects rows.
views = [
    df['two'],
    df[['three', 'one']],
    df[:2],
    df[df['three'] > 5],
]
print(*views, sep='\n-------------------------\n')

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
-------------------------
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12
-------------------------
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
-------------------------
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [81]:
# Label-based (.loc) and position-based (.iloc) selection of rows and columns.
picks = [
    df.loc['Colorado', ['two', 'three']],   # one row label, two column labels
    df.iloc[2, [3, 0, 1]],                  # one row position, three column positions
    df.iloc[[1, 2], [3, 0, 1]],             # row positions x column positions
    df.loc[:'Utah', 'two'],                 # label slice: 'Utah' is included
    # Row mask and column subset in a single .loc call instead of chaining
    # two indexers (`df.iloc[:, :3][mask]`).
    df.loc[df['three'] > 5, df.columns[:3]],
]
print(*picks, sep='\n-------------------------\n')

two      5
three    6
Name: Colorado, dtype: int32
-------------------------
four    11
one      8
two      9
Name: Utah, dtype: int32
-------------------------
          four  one  two
Colorado     7    4    5
Utah        11    8    9
-------------------------
Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32
-------------------------
          one  two  three
Colorado    4    5      6
Utah        8    9     10
New York   12   13     14


![在这里插入图片描述](https://img-blog.csdnimg.cn/20190808161330163.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzMwNzQ4ODYz,size_16,color_FFFFFF,t_70)

## 4. Function Application and Mapping

In [82]:
# Draw the sample from a locally seeded generator so that Restart & Run All
# always produces the same frame; an unseeded `np.random.randn` gives different
# numbers on every run. (Re-execute the notebook once to refresh the outputs
# rendered below, which were produced by an unseeded run.)
df = pd.DataFrame(
    np.random.RandomState(42).randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
df

Unnamed: 0,b,d,e
Utah,-0.194653,1.149793,-0.186418
Ohio,-0.216835,-2.568416,-0.457348
Texas,-1.759585,0.780678,-0.514356
Oregon,-1.007272,2.244954,-1.104277


In [83]:
# NumPy ufuncs apply element-wise to a DataFrame and keep its labels.
np.abs(df)

Unnamed: 0,b,d,e
Utah,0.194653,1.149793,0.186418
Ohio,0.216835,2.568416,0.457348
Texas,1.759585,0.780678,0.514356
Oregon,1.007272,2.244954,1.104277


In [85]:
def func(x):
    """Return the spread of `x`: its maximum minus its minimum."""
    return x.max() - x.min()


# With the default axis, `func` is called once per column.
df.apply(func)

b    1.564932
d    4.813369
e    0.917859
dtype: float64

In [86]:
# axis=1: `func` is called once per row, giving one value per row label.
df.apply(func, axis=1)

Utah      1.344446
Ohio      2.351581
Texas     2.540263
Oregon    3.349230
dtype: float64

In [88]:
# Two-decimal string formatter applied to every element.
# NOTE: the name `format` shadows the built-in of the same name; it is kept
# because the following cell passes it to `Series.map`.
format = lambda x: f'{x:.2f}'
# `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
# `DataFrame.map`; fall back to `applymap` on older pandas versions.
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
elementwise(format)

Unnamed: 0,b,d,e
Utah,-0.19,1.15,-0.19
Ohio,-0.22,-2.57,-0.46
Texas,-1.76,0.78,-0.51
Oregon,-1.01,2.24,-1.1


In [89]:
# Series.map applies the function to each element; `format` here is the
# notebook-defined two-decimal formatter, not the built-in.
df['e'].map(format)

Utah      -0.19
Ohio      -0.46
Texas     -0.51
Oregon    -1.10
Name: e, dtype: object

## 5. Sorting and Ranking

In [90]:
# Integers 0..3 under labels that are not in alphabetical order.
obj = pd.Series(data=range(4), index=list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [91]:
# Sort by index label (ascending), not by value.
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [92]:
# 2x4 frame of 0..7 whose row and column labels are both unsorted.
df = pd.DataFrame(
    data=np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)
df

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [93]:
# Default axis: sort the rows by their index labels.
df.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [94]:
# axis=1: sort the columns by name.
df.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [95]:
# Columns in descending name order.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [96]:
# Unsorted integers under the default integer index.
obj = pd.Series(data=[4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [97]:
# Sort by value (ascending); each value keeps its original index label.
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [98]:
# Two integer columns; 'a' contains repeated values so that multi-key
# sorting has ties to break.
df = pd.DataFrame(data=dict(b=[4, 7, -3, 2], a=[0, 1, 0, 1]))
df

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [99]:
# Sort the rows by the values in column 'b'.
df.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [100]:
# Sort by 'a' first, then by 'b' to order rows that share the same 'a'.
df.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [101]:
# Rank the values from smallest (1) upward; tied values share the mean of
# the rank positions they occupy (e.g. the two 7s both get 6.5).
obj = pd.Series(data=[7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

## 6. Summarizing and Computing Descriptive Statistics

In [102]:
# Small frame with missing values, including one all-NaN row ('c'), used to
# show how the reductions below treat NaN.
df = pd.DataFrame(
    data=[[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=list('abcd'),
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [103]:
# Column sums; NaN entries are skipped.
df.sum()

one    9.25
two   -5.80
dtype: float64

In [104]:
# Row sums; the all-NaN row 'c' sums to 0.
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [106]:
# skipna=False: any NaN in a row makes that row's mean NaN.
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [107]:
# Summary statistics per column, computed over the non-missing values.
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


![在这里插入图片描述](https://img-blog.csdnimg.cn/20190808164220780.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzMwNzQ4ODYz,size_16,color_FFFFFF,t_70)

## 7. Unique Values, Value Counts, and Membership

In [108]:
# Nine single-letter strings with repeats, used for the unique/count/isin examples.
obj = pd.Series(data=list('cadaabbcc'))
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [109]:
# Distinct values, in order of first appearance.
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [110]:
# Frequency of each distinct value, most frequent first.
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [111]:
# Boolean mask: True where the value is one of the given candidates.
obj.isin(['c', 'd'])

0     True
1    False
2     True
3    False
4    False
5    False
6    False
7     True
8     True
dtype: bool

In [112]:
# Use the membership mask to keep only the 'a' and 'c' entries.
obj[obj.isin(['a', 'c'])]

0    c
1    a
3    a
4    a
7    c
8    c
dtype: object

![在这里插入图片描述](https://img-blog.csdnimg.cn/20190808164753505.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzMwNzQ4ODYz,size_16,color_FFFFFF,t_70)