In [ ]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Build a Series from a plain Python list; pandas supplies the default integer index.
raw_values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(raw_values)
s

0     1
1     3
2     5
3   NaN
4     6
5     8
dtype: float64

In [3]:
# Look up each element individually by its index label; every lookup yields a single value.
tuple(s[label] for label in range(6))

(1.0, 3.0, 5.0, nan, 6.0, 8.0)

In [4]:
# Slicing returns a Series rather than a scalar.
first_two = s[0:2]
first_two, type(first_two)

(0    1
 1    3
 dtype: float64, pandas.core.series.Series)

In [5]:
# Keep only the entries that are not NA, using a boolean mask.
has_value = s.notnull()
s[has_value]

0    1
1    3
2    5
4    6
5    8
dtype: float64

In [6]:
# Select only the NA entries, using a boolean mask.
is_missing = s.isnull()
s[is_missing]

3   NaN
dtype: float64

In [7]:
# Keep the values that are smaller than 5.
below_five = s < 5
s[below_five]

0    1
1    3
dtype: float64

In [8]:
# Keep the values that are greater than 3 and smaller than 8.
below_eight = s < 8
above_three = s > 3
s[below_eight & above_three]

2    5
4    6
dtype: float64

In [9]:
# Keep the values that are smaller than 3 or greater than 6.
below_three = s < 3
above_six = s > 6
s[below_three | above_six]

0    1
5    8
dtype: float64

In [10]:
# Extract the underlying values of the Series, as an array and as a plain list.
value_array = s.values
value_array, list(value_array)

(array([  1.,   3.,   5.,  nan,   6.,   8.]), [1.0, 3.0, 5.0, nan, 6.0, 8.0])

In [11]:
# Inspect the index of the Series: the object itself, its type, and its labels as a list.
series_index = s.index
series_index, type(series_index), list(series_index)

(Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'),
 pandas.core.index.Int64Index,
 [0, 1, 2, 3, 4, 5])

In [12]:
# Arithmetic is applied element by element; here every value is doubled.
doubled = s * 2
doubled

0     2
1     6
2    10
3   NaN
4    12
5    16
dtype: float64

In [13]:
# Build a Series whose index labels are given explicitly instead of defaulted.
index_labels = ['a', 'v', 'i', 'm', 'g']
s_index = pd.Series(data=[1, 2, 3, 4, 5], index=index_labels)
s_index

a    1
v    2
i    3
m    4
g    5
dtype: int64

In [14]:
# Build a DataFrame from two parallel lists (one row per name/births pair).
names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
births = [968, 155, 77, 578, 973]
# Materialize the pairs into a list: on Python 3 zip() returns a one-shot
# iterator, which older pandas constructors reject and which would be empty
# if babydataset were used a second time.
babydataset = list(zip(names, births))
df = pd.DataFrame(data=babydataset, columns=['Names', 'Births'])
df

Unnamed: 0,Names,Births
0,Bob,968
1,Jessica,155
2,Mary,77
3,John,578
4,Mel,973


In [15]:
# Save the frame to births.csv, leaving out both the index and the header row.
csv_name = 'births.csv'
df.to_csv(csv_name, header=False, index=False)

In [16]:
# IPython automagic (not Python): list the working directory to check that births.csv was written.
ls

 驱动器 C 中的卷是 OS
 卷的序列号是 86BB-ED90

 c:\gitGitgit\Note\MYSTUDY\PriticeWithR\Assignment1 的目录

2015/04/15  13:56    <DIR>          .
2015/04/15  13:56    <DIR>          ..
2015/04/15  12:00    <DIR>          .ipynb_checkpoints
2015/04/15  13:56             8,373 assignment1note.ipynb
2015/04/15  13:56                50 births.csv
2015/04/15  11:45             2,259 pollutantmean.py
               3 个文件         10,682 字节
               3 个目录 401,753,018,368 可用字节


In [17]:
# Read the CSV file back in.
# A relative path is used so the notebook runs on any machine: it resolves
# against the working directory, which is where the file was saved under the
# name 'births.csv'.
location = "births.csv"
# No header/names argument is given, so pandas consumes the first line of the
# file (Bob, 968) as the column labels and only four data rows remain.
df = pd.read_csv(location)
df

Unnamed: 0,Bob,968
0,Jessica,155
1,Mary,77
2,John,578
3,Mel,973


In [18]:
# Read the file again, this time supplying the column names ourselves so that
# the first line is treated as data instead of a header.
column_names = ["names", "Births"]
df = pd.read_csv(location, header=None, names=column_names)
df

Unnamed: 0,names,Births
0,Bob,968
1,Jessica,155
2,Mary,77
3,John,578
4,Mel,973


In [19]:
# Delete the CSV file now that it has been read.
# `os` is imported with the other modules at the top of the notebook.
# The existence check lets this cell be re-run without raising
# FileNotFoundError once the file is already gone.
if os.path.exists(location):
    os.remove(location)

In [20]:
# IPython automagic (not Python): list the working directory again to check that births.csv was removed.
ls

 驱动器 C 中的卷是 OS
 卷的序列号是 86BB-ED90

 c:\gitGitgit\Note\MYSTUDY\PriticeWithR\Assignment1 的目录

2015/04/15  13:56    <DIR>          .
2015/04/15  13:56    <DIR>          ..
2015/04/15  12:00    <DIR>          .ipynb_checkpoints
2015/04/15  13:56             8,373 assignment1note.ipynb
2015/04/15  11:45             2,259 pollutantmean.py
               2 个文件         10,632 字节
               3 个目录 401,753,018,368 可用字节


In [21]:
# Display the frame again: it is still held in memory even though the CSV file was deleted.
df

Unnamed: 0,names,Births
0,Bob,968
1,Jessica,155
2,Mary,77
3,John,578
4,Mel,973


In [22]:
# Selecting one column by name pulls a Series out of the DataFrame.
names_column = df['names']
names_column, type(names_column)

(0        Bob
 1    Jessica
 2       Mary
 3       John
 4        Mel
 Name: names, dtype: object, pandas.core.series.Series)

In [23]:
# Indexing with a list of column names (the [[...]] form) yields a DataFrame, even for one column.
births_frame = df[['Births']]
births_frame, type(births_frame)

(   Births
 0     968
 1     155
 2      77
 3     578
 4     973, pandas.core.frame.DataFrame)

In [24]:
# Select several columns at once; the result is a sub-DataFrame.
wanted_columns = ['names', 'Births']
df[wanted_columns]

Unnamed: 0,names,Births
0,Bob,968
1,Jessica,155
2,Mary,77
3,John,578
4,Mel,973


In [25]:
# Use .loc to pick rows and a column by label; a single column label gives back a Series.
row_labels = slice(0, 2)
df.loc[row_labels, "names"]

0        Bob
1    Jessica
2       Mary
Name: names, dtype: object

In [26]:
# Same row selection, but passing the column inside a list gives back a DataFrame.
row_labels = slice(0, 2)
df.loc[row_labels, ["names"]]

Unnamed: 0,names
0,Bob
1,Jessica
2,Mary


In [27]:
# Task 1: find the name with the fewest births.
births_column = df['Births']
births_column.plot()
# Smallest value found in the Births column.
MinValue = births_column.min()
MinValue

77

In [28]:
# Look up the name(s) on the row(s) whose Births equals the minimum,
# using a single .loc lookup (row mask, column label).
is_minimum = df['Births'] == MinValue
MinName = df.loc[is_minimum, 'names'].values
MinName

array(['Mary'], dtype=object)

In [29]:
# Do the whole lookup in one step: keep the full row(s) where Births is at its minimum.
births_column = df['Births']
df[births_column == births_column.min()]

Unnamed: 0,names,Births
2,Mary,77


In [30]:
# Join several sequences end to end into one flat array.
pieces = [
    [1, 2, 3, 4],
    [2, 3, 4, 5],
    [7, 8, 9, 0, 1, 3, 42],
]
hs = np.hstack(pieces)
hs

array([ 1,  2,  3,  4,  2,  3,  4,  5,  7,  8,  9,  0,  1,  3, 42])

In [31]:
# Average of all elements in the combined array.
hs.mean()

6.2666666666666666