# Pandas

In [1]:
from pandas.io.parsers import read_csv

# Parse the WHO indicators CSV (the path is relative to the working
# directory) and print the resulting frame after a 'DataFrame' label.
df = read_csv(filepath_or_buffer='WHO_first9cols.csv')
print('DataFrame', df)

DataFrame                       Country  CountryID  Continent  \
0                 Afghanistan          1          1   
1                     Albania          2          2   
2                     Algeria          3          3   
3                     Andorra          4          2   
4                      Angola          5          3   
5         Antigua and Barbuda          6          4   
6                   Argentina          7          5   
7                     Armenia          8          2   
8                   Australia          9          6   
9                     Austria         10          2   
10                 Azerbaijan         11          2   
11                    Bahamas         12          4   
12                    Bahrain         13          1   
13                 Bangladesh         14          7   
14                   Barbados         15          4   
15                    Belarus         16          2   
16                    Belgium         17          2   


In [2]:
# shape is a (rows, columns) tuple; len() of a DataFrame is its row count.
print('Shape', df.shape)
print('Length', len(df))

Shape (202, 9)
Length 202


In [7]:
# Column labels first, then the dtype pandas inferred for each column.
print('Column headers:\n', df.columns)
print('Data types:\n', df.dtypes)

Column headers:
 Index(['Country', 'CountryID', 'Continent', 'Adolescent fertility rate (%)',
       'Adult literacy rate (%)',
       'Gross national income per capita (PPP international $)',
       'Net primary school enrolment ratio female (%)',
       'Net primary school enrolment ratio male (%)',
       'Population (in thousands) total'],
      dtype='object')
Data types:
 Country                                                    object
CountryID                                                   int64
Continent                                                   int64
Adolescent fertility rate (%)                             float64
Adult literacy rate (%)                                   float64
Gross national income per capita (PPP international $)    float64
Net primary school enrolment ratio female (%)             float64
Net primary school enrolment ratio male (%)               float64
Population (in thousands) total                           float64
dtype: object


## 索引

In [8]:
# Row index of the frame; read_csv was called without index_col, so the
# recorded output is the default RangeIndex.
print("Index:\n", df.index)

Index:
 RangeIndex(start=0, stop=202, step=1)


## 获取DataFrame中的数据：`df.values` 返回的是NumPy的ndarray

In [13]:
# DataFrame.values exposes the frame's data as a NumPy array.
values = df.values
print("Values:\n", values)
# Last expression of the cell, so the notebook displays the array's type.
type(values)

Values:
: [['Afghanistan' 1 1 ... nan nan 26088.0]
 ['Albania' 2 2 ... 93.0 94.0 3172.0]
 ['Algeria' 3 3 ... 94.0 96.0 33351.0]
 ...
 ['Yemen' 200 1 ... 65.0 85.0 21732.0]
 ['Zambia' 201 3 ... 94.0 90.0 11696.0]
 ['Zimbabwe' 202 3 ... 88.0 87.0 13228.0]]


numpy.ndarray

## Series
- 带标签的一维数据结构
- 对DataFrame的某一列索引，会得到一个Series对象

In [16]:
# Selecting a single column by its label returns a Series.
country_col = df['Country']
print("Type of country_col:\n", type(country_col))

Type of country_col:
 <class 'pandas.core.series.Series'>


In [17]:
# A Series carries its own shape, index, underlying values and name; the
# recorded name is 'Country', the label of the column it was taken from.
print("Shape:\n", country_col.shape)
print("Index:\n", country_col.index)
print("Values:\n", country_col.values)
print("Name:\n", country_col.name)

Shape:
 (202,)
Index:
 RangeIndex(start=0, stop=202, step=1)
Values:
 ['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei Darussalam'
 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon' 'Canada'
 'Cape Verde' 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia'
 'Comoros' 'Congo, Dem. Rep.' 'Congo, Rep.' 'Cook Islands' 'Costa Rica'
 "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus' 'Czech Republic' 'Denmark'
 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji'
 'Finland' 'France' 'French Polynesia' 'Gabon' 'Gambia' 'Georgia'
 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau'
 'Guyana' 'Haiti' 'Honduras' 'Hong Kong, China' 'Hungary'

## Series的切片仍然是Series

In [18]:
# Positional slice of the final two entries; the slice is itself a Series.
print("Last 2 countries:\n", country_col.iloc[-2:])
print("Last 2 countries type:\n", type(country_col.iloc[-2:]))

Last 2 countries:
 200      Zambia
201    Zimbabwe
Name: Country, dtype: object
Last 2 countries type:
 <class 'pandas.core.series.Series'>


- NumPy的函数也可以对DataFrame和Series对象使用
- DataFrame、Series和NumPy数组ndarray之间可以进行算术运算

In [24]:
import numpy as np
last_col = df.columns[-1]  # label of the right-most column
# A NumPy reduction applied directly to a Series. The recorded result is a
# number even though the column contains NaN, so missing values are skipped.
print("Mean of last col:\n", np.mean(df[last_col]))
# Element-wise arithmetic between a Series and a plain ndarray: rows whose
# value is missing give NaN (NaN - NaN); every other row gives 0.0.
print("Series minus ndarray:\n", df[last_col]-df[last_col].values)

Mean of last col:
 34099.64021164021
Series minus ndarray:
 0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.0
8      0.0
9      0.0
10     0.0
11     0.0
12     0.0
13     0.0
14     0.0
15     0.0
16     0.0
17     0.0
18     0.0
19     NaN
20     0.0
21     0.0
22     0.0
23     0.0
24     0.0
25     0.0
26     0.0
27     0.0
28     0.0
29     0.0
      ... 
172    0.0
173    0.0
174    0.0
175    0.0
176    NaN
177    0.0
178    0.0
179    0.0
180    0.0
181    0.0
182    0.0
183    0.0
184    0.0
185    0.0
186    0.0
187    0.0
188    0.0
189    0.0
190    0.0
191    0.0
192    0.0
193    0.0
194    0.0
195    0.0
196    0.0
197    0.0
198    NaN
199    0.0
200    0.0
201    0.0
Name: Population (in thousands) total, Length: 202, dtype: float64


# Query data in Pandas

In [32]:
# Load the yearly sunspot table, using the first CSV column as the row index,
# then preview the first and last two rows.
sunspots = read_csv(filepath_or_buffer='SN_y_tot_V2.0.csv', index_col=0)
print('head 2:\n', sunspots.head(n=2))
print('tail 2:\n', sunspots.tail(n=2))

head 2:
                           Yearly mean total sunspot number  \
Gregorian calendar year                                      
1700.5                                                 8.3   
1701.5                                                18.3   

                          Yearly mean standard deviation  \
Gregorian calendar year                                    
1700.5                                              -1.0   
1701.5                                              -1.0   

                          Number of observations used  \
Gregorian calendar year                                 
1700.5                                             -1   
1701.5                                             -1   

                          Definitive/provisional marker  
Gregorian calendar year                                  
1700.5                                                1  
1701.5                                                1  
tail 2:
                           Yearl

In [38]:
# Index label of the final row (a label value, not a position).
last_data = sunspots.index[-1]
# .loc looks the row up by that label and returns it as a Series.
print("Last value: ", sunspots.loc[last_data])

Last value:   Yearly mean total sunspot number       21.7
 Yearly mean standard deviation          2.5
 Number of observations used         11444.0
 Definitive/provisional marker           1.0
Name: 2017.5, dtype: float64


In [39]:
# Show the row index; it was built from the first CSV column (index_col=0).
sunspots.index

Float64Index([1700.5, 1701.5, 1702.5, 1703.5, 1704.5, 1705.5, 1706.5, 1707.5,
              1708.5, 1709.5,
              ...
              2008.5, 2009.5, 2010.5, 2011.5, 2012.5, 2013.5, 2014.5, 2015.5,
              2016.5, 2017.5],
             dtype='float64', name='Gregorian calendar year', length=318)

### 可以按索引范围查询

In [37]:
# Label-based range query on the index; an upper bound beyond the last
# label simply ends the slice at the final row.
print("Values slicing by index:\n", sunspots.loc[2000.5:2018.5])

Values slicing by index:
                           Yearly mean total sunspot number  \
Gregorian calendar year                                      
2000.5                                               173.9   
2001.5                                               170.4   
2002.5                                               163.6   
2003.5                                                99.3   
2004.5                                                65.3   
2005.5                                                45.8   
2006.5                                                24.7   
2007.5                                                12.6   
2008.5                                                 4.2   
2009.5                                                 4.8   
2010.5                                                24.9   
2011.5                                                80.8   
2012.5                                                84.5   
2013.5                                      

### 也可以用一组下标索引

In [40]:
# iloc takes integer positions; negative positions count from the end, and
# rows come back in the order the positions are listed.
print("Slicing from a list of indices:\n", sunspots.iloc[[2, 4, -1, -2]])

Slicing from a list of indices:
                           Yearly mean total sunspot number  \
Gregorian calendar year                                      
1702.5                                                26.7   
1704.5                                                60.0   
2017.5                                                21.7   
2016.5                                                39.8   

                          Yearly mean standard deviation  \
Gregorian calendar year                                    
1702.5                                              -1.0   
1704.5                                              -1.0   
2017.5                                               2.5   
2016.5                                               3.9   

                          Number of observations used  \
Gregorian calendar year                                 
1702.5                                             -1   
1704.5                                             -1   
2017.

### 有2种方式选取特定数据：iloc和iat，iat更快
传2个参数，分别是行序号和列序号

In [43]:
# Both accessors take (row position, column position); here that is the
# last row and the first column, so both print the same scalar.
print("Scalar with iloc:\n", sunspots.iloc[-1,0])
print("Scalar with iat:\n", sunspots.iat[-1,0])

Scalar with iloc:
 21.7
Scalar with iat:
 21.7


### loc和iloc
- loc的参数是标签名，iloc的参数是从0开始的编号

### 使用布尔值选取数据

In [53]:
first_col = sunspots.columns[0]
# Keep only the rows whose first-column value exceeds that column's mean.
# The same column can equally be addressed by position with
# sunspots.iloc[:, 0]; the result is identical.
sunspots.loc[sunspots[first_col] > np.mean(sunspots[first_col])]

Unnamed: 0_level_0,Yearly mean total sunspot number,Yearly mean standard deviation,Number of observations used,Definitive/provisional marker
Gregorian calendar year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1705.5,96.7,-1.0,-1,1
1717.5,105.0,-1.0,-1,1
1718.5,100.0,-1.0,-1,1
1726.5,130.0,-1.0,-1,1
1727.5,203.3,-1.0,-1,1
1728.5,171.7,-1.0,-1,1
1729.5,121.7,-1.0,-1,1
1736.5,116.7,-1.0,-1,1
1737.5,135.0,-1.0,-1,1
1738.5,185.0,-1.0,-1,1


# Statistics with Pandas DataFrame

In [55]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
sunspots.describe()

Unnamed: 0,Yearly mean total sunspot number,Yearly mean standard deviation,Number of observations used,Definitive/provisional marker
count,318.0,318.0,318.0,318.0
mean,79.196855,4.649057,954.069182,1.0
std,61.985539,5.295702,2155.041512,0.0
min,0.0,-1.0,-1.0,1.0
25%,24.95,-1.0,-1.0,1.0
50%,66.25,4.2,365.0,1.0
75%,116.025,8.9,365.0,1.0
max,269.3,19.1,11444.0,1.0


# Data aggregation with Pandas

In [60]:
import pandas as pd
from numpy.random import seed
from numpy.random import rand
from numpy.random import randint
import numpy as np

# Fix the global NumPy seed so the random columns below are reproducible.
seed(40)

# Seven sample records. Price is drawn before Number, so the order of these
# two entries decides which random values each column receives.
df = pd.DataFrame(
    {
        'Weather': ['cold', 'hot', 'cold', 'hot', 'cold', 'hot', 'cold'],
        'Food': ['soup', 'icecream', 'chocolate', 'icecream', 'soup', 'soup', 'icecream'],
        'Price': 10 * rand(7),            # floats in [0, 10)
        'Number': randint(1, 9, size=7),  # integers 1..8 (upper bound exclusive)
    }
)
print(df)

        Food  Number     Price Weather
0       soup       8  4.076870    cold
1   icecream       4  0.553660     hot
2  chocolate       4  7.885349    cold
3   icecream       4  2.873052     hot
4       soup       2  4.503506    cold
5       soup       1  3.039123     hot
6   icecream       2  5.263995    cold


In [63]:
# Group rows by the 'Weather' column; iterating the grouped object yields
# (group key, sub-frame) pairs.
weather_group = df.groupby('Weather')
print("Type of weather_group:\n", type(weather_group))
i = 0  # remains 0 if there is nothing to iterate
for i, (name, group) in enumerate(weather_group, start=1):
    print("Group", i, name)
    print(group)

Type of weather_group:
 <class 'pandas.core.groupby.DataFrameGroupBy'>
Group 1 cold
        Food  Number     Price Weather
0       soup       8  4.076870    cold
2  chocolate       4  7.885349    cold
4       soup       2  4.503506    cold
6   icecream       2  5.263995    cold
Group 2 hot
       Food  Number     Price Weather
1  icecream       4  0.553660     hot
3  icecream       4  2.873052     hot
5      soup       1  3.039123     hot


In [65]:
# Group on two keys at once; .groups maps each (Weather, Food) pair to the
# row labels that belong to it.
wf_group = df.groupby(by=['Weather', 'Food'])
print("WF groups", wf_group.groups)

WF groups {('cold', 'chocolate'): Int64Index([2], dtype='int64'), ('cold', 'icecream'): Int64Index([6], dtype='int64'), ('cold', 'soup'): Int64Index([0, 4], dtype='int64'), ('hot', 'icecream'): Int64Index([1, 3], dtype='int64'), ('hot', 'soup'): Int64Index([5], dtype='int64')}


In [67]:
# Apply two reductions to every non-key column of each (Weather, Food) group.
# The reductions are named by string instead of being passed as NumPy
# callables: newer pandas releases emit a FutureWarning for np.mean/np.median
# here, while the string names keep working and yield the same column labels.
print("WF aggregated\n", wf_group.agg(['mean', 'median']))

WF aggregated
                   Number            Price          
                    mean median      mean    median
Weather Food                                       
cold    chocolate      4      4  7.885349  7.885349
        icecream       2      2  5.263995  5.263995
        soup           5      5  4.290188  4.290188
hot     icecream       4      4  1.713356  1.713356
        soup           1      1  3.039123  3.039123


## Concatenating and appending DataFrames

In [68]:
# Positional row slice: the first three rows of the frame.
print("df: 3\n", df.iloc[:3])

df: 3
         Food  Number     Price Weather
0       soup       8  4.076870    cold
1   icecream       4  0.553660     hot
2  chocolate       4  7.885349    cold


In [75]:
# Stack the first three rows on top of the rows from position 5 onward,
# keeping the original row labels. DataFrame.append was removed in pandas 2.0,
# so pd.concat is used; it produces the same frame.
print(pd.concat([df[:3], df[5:]]))

        Food  Number     Price Weather
0       soup       8  4.076870    cold
1   icecream       4  0.553660     hot
2  chocolate       4  7.885349    cold
5       soup       1  3.039123     hot
6   icecream       2  5.263995    cold


In [82]:
# Concatenate the first three rows with the rows from position 6 onward;
# original row labels are preserved.
print(pd.concat(objs=[df.iloc[:3], df.iloc[6:]]))

        Food  Number     Price Weather
0       soup       8  4.076870    cold
1   icecream       4  0.553660     hot
2  chocolate       4  7.885349    cold
6   icecream       2  5.263995    cold


# Joining DataFrames

In [97]:
# Two small tables keyed by "Employ ID". IDs 5 and 9 appear in both tables,
# 3 only in dest and 7 only in tips.
dest = pd.DataFrame(
    data={
        "Employ ID": [5, 3, 9],
        "Destination": ['Hankou', 'Hanyang', 'Wuchang'],
    }
)
tips = pd.DataFrame(
    data={
        "Employ ID": [5, 9, 7],
        "Tips": [10, 5, 2.5],
    }
)
print(dest)
print(tips)

  Destination  Employ ID
0      Hankou          5
1     Hanyang          3
2     Wuchang          9
   Employ ID  Tips
0          5  10.0
1          9   5.0
2          7   2.5


In [99]:
# Join the two tables on the shared key column; only IDs present in both
# tables appear in the recorded result.
print("Merge on Employ ID\n", dest.merge(tips, on="Employ ID"))

Merge on Employ ID
   Destination  Employ ID  Tips
0      Hankou          5  10.0
1     Wuchang          9   5.0


In [100]:
# Explicit inner join; no key is named, so the column the two frames share
# is used as the join key.
print("Inner join with merge\n", dest.merge(tips, how="inner"))

Inner join with merge
   Destination  Employ ID  Tips
0      Hankou          5  10.0
1     Wuchang          9   5.0


In [101]:
# Outer join: keeps IDs found in either table and fills the missing side
# with NaN. The label now names the join that is actually performed.
print("Outer join with merge\n", pd.merge(dest, tips, how="outer"))

Inner join with merge
   Destination  Employ ID  Tips
0      Hankou          5  10.0
1     Hanyang          3   NaN
2     Wuchang          9   5.0
3         NaN          7   2.5


# Handling missing values

In [145]:
from pandas.io.parsers import read_csv

# Reload the WHO table and keep the country name plus the second-to-last
# column, first two rows only.
df = read_csv("WHO_first9cols.csv")
# .copy() makes the subset an independent frame: later cells assign into it,
# which on a bare slice can raise SettingWithCopyWarning.
df = df[['Country', df.columns[-2]]][:2].copy()
print(df)

       Country  Net primary school enrolment ratio male (%)
0  Afghanistan                                          NaN
1      Albania                                         94.0


In [132]:
# Boolean mask with the frame's shape: True where a value is missing.
print(df.isnull())

   Country  Net primary school enrolment ratio male (%)
0    False                                         True
1    False                                        False


In [146]:
# Overwrite the last column with a version of itself in which every NaN has
# been replaced by 0.
df[df.columns[-1]] = df[df.columns[-1]].replace(np.nan, 0)
print(df)

       Country  Net primary school enrolment ratio male (%)
0  Afghanistan                                          0.0
1      Albania                                         94.0


## 替换指定列

In [144]:
# Per-column fill values: column label -> replacement for missing entries.
values = {df.columns[-1]: -1}
# Rebind instead of using inplace=True, so the cell never mutates a frame that
# may be a slice of another one and stays chainable.
# NOTE(review): when the cells run top to bottom, the preceding cell has
# already replaced NaN with 0 in this column, so nothing is left to fill; the
# recorded output shows 0.0 rather than -1.
df = df.fillna(value=values)
print(df)

       Country  Net primary school enrolment ratio male (%)
0  Afghanistan                                          0.0
1      Albania                                         94.0


## Dealing with Dates

In [153]:
# 55 weekly timestamps starting from the given date. The recorded output
# begins at 2017-02-12, i.e. the date string is read month-first and each
# entry falls on the weekly anchor day following the start date.
print(pd.date_range(start='2/8/2017', periods=55, freq='W'))

DatetimeIndex(['2017-02-12', '2017-02-19', '2017-02-26', '2017-03-05',
               '2017-03-12', '2017-03-19', '2017-03-26', '2017-04-02',
               '2017-04-09', '2017-04-16', '2017-04-23', '2017-04-30',
               '2017-05-07', '2017-05-14', '2017-05-21', '2017-05-28',
               '2017-06-04', '2017-06-11', '2017-06-18', '2017-06-25',
               '2017-07-02', '2017-07-09', '2017-07-16', '2017-07-23',
               '2017-07-30', '2017-08-06', '2017-08-13', '2017-08-20',
               '2017-08-27', '2017-09-03', '2017-09-10', '2017-09-17',
               '2017-09-24', '2017-10-01', '2017-10-08', '2017-10-15',
               '2017-10-22', '2017-10-29', '2017-11-05', '2017-11-12',
               '2017-11-19', '2017-11-26', '2017-12-03', '2017-12-10',
               '2017-12-17', '2017-12-24', '2017-12-31', '2018-01-07',
               '2018-01-14', '2018-01-21', '2018-01-28', '2018-02-04',
               '2018-02-11', '2018-02-18', '2018-02-25'],
              dtype

In [155]:
# Parse compact YYYYMMDD strings with an explicit format.
print(
    pd.to_datetime(
        arg=['20071021', '20180921'],
        format='%Y%m%d',
    )
)

DatetimeIndex(['2007-10-21', '2018-09-21'], dtype='datetime64[ns]', freq=None)


## Pivot table

In [156]:
import pandas as pd
from numpy.random import seed
from numpy.random import rand
from numpy.random import randint
import numpy as np

# Re-seed so this frame matches the one built for the aggregation example.
seed(40)

# Seven sample records. Price is drawn before Number, so the order of these
# two entries decides which random values each column receives.
df = pd.DataFrame(
    {
        'Weather': ['cold', 'hot', 'cold', 'hot', 'cold', 'hot', 'cold'],
        'Food': ['soup', 'icecream', 'chocolate', 'icecream', 'soup', 'soup', 'icecream'],
        'Price': 10 * rand(7),            # floats in [0, 10)
        'Number': randint(1, 9, size=7),  # integers 1..8 (upper bound exclusive)
    }
)
print(df)

        Food  Number     Price Weather
0       soup       8  4.076870    cold
1   icecream       4  0.553660     hot
2  chocolate       4  7.885349    cold
3   icecream       4  2.873052     hot
4       soup       2  4.503506    cold
5       soup       1  3.039123     hot
6   icecream       2  5.263995    cold


In [163]:
# Mean Price for every Weather (rows) x Food (columns) combination; pairs
# with no data show NaN. The reduction is named by string because passing
# np.mean as aggfunc triggers a FutureWarning in newer pandas releases.
print(pd.pivot_table(df, values='Price', index='Weather', columns='Food', aggfunc='mean'))

Food     chocolate  icecream      soup
Weather                               
cold      7.885349  5.263995  4.290188
hot            NaN  1.713356  3.039123
