In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

### Series 
- Series 由data和index 两部分组成， 如果不指定index，就默认从0开始的数字。
- Series的values是一个numpy的array类型
- Series很像一个字典，但不是字典，字典可以转换成Series类型
- Series中数据的类型可以不相同
- Time-Series 以时间为索引的Series

In [156]:
num_list = [4,7,-5,3]

In [219]:
num_series = pd.Series(num_list)

In [220]:
print(num_series)

0    4
1    7
2   -5
3    3
dtype: int64


In [221]:
num_series[0]

4

In [170]:
num_series.astype(float)

0    4.0
1    7.0
2   -5.0
3    3.0
dtype: float64

In [160]:
list(num_series.index)

[0, 1, 2, 3]

In [29]:
num_series.values

array([ 4,  7, -5,  3])

In [30]:
#series 的 in  看的是 index
4 in num_series , 3 in num_series

(False, True)

In [120]:
# 定义series时指定index
num_index = ['a','b','c','d']
num_series_2 = pd.Series(num_list,index = num_index)

In [121]:
num_series_2

a    4
b    7
c   -5
d    3
dtype: int64

In [118]:
# 字典转换为series
num_keys = {'a':1,'b':2,'c':3}
pd.Series(num_keys)

a    1
b    2
c    3
dtype: int64

In [54]:
# unique 去重
num_series_3 = pd.Series([4,2,3,4,4])
num_series_3.unique()

array([4, 2, 3])

### DataFrame
series 为一个index对应一个数据， dataframe为一个index对应一个数组


总的数据同series一样，像一个大字典.

没有指定index时，默认从0开始的数, 没有指定columns时，默认也是从0开始的数

表格型数据结构，每个series构成一列

二维数组，包括行标，和列标，
- index 为行标
- columns 为列标

In [164]:
# 直接创建
df = pd.DataFrame({'name':['jack','rose'],'age':[21,20],'sex':['m','f']})
df

Unnamed: 0,name,age,sex
0,jack,21,m
1,rose,20,f


In [142]:
# 取部分指定列
data[['name','age']]

Unnamed: 0,name,age
0,jack,21
1,rose,20


In [150]:
# 多个字典有相同的key时， 可以组成list然后变成dataframe
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
df

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [211]:
#  从列表创建, 二维数组，每个item包含一组信息
data = [['Google',10],['Runoob',12],['Wiki',13]]

df = pd.DataFrame(data,columns=['Site','Age'],dtype=float)
df

Unnamed: 0,Site,Age
0,Google,10.0
1,Runoob,12.0
2,Wiki,13.0


In [215]:
# 从列标创建， feature分别以list存储时， 可以使用zip，
id_list = [1,2,3,4,5]
sex_list = ['m','m','f','m','f']
age_list = [19,20,22,19,20]
data = list(zip(id_list,sex_list,age_list))

In [216]:
df = pd.DataFrame(data,columns=['id','sex','age'])
df

Unnamed: 0,id,sex,age
0,1,m,19
1,2,m,20
2,3,f,22
3,4,m,19
4,5,f,20


In [147]:
#将一个numpy array转换成dataframe
df = pd.DataFrame(np.random.rand(3,4),index=list('123'), columns=list('abcd'))  
df

Unnamed: 0,a,b,c,d
1,0.100119,0.158655,0.028392,0.989578
2,0.152391,0.170247,0.710682,0.639555
3,0.735634,0.957967,0.781353,0.570966


### 数据类型

In [223]:
df = pd.DataFrame({'name':['jack','rose'],'age':[21,20],'sex':['m','f']})
df

Unnamed: 0,name,age,sex
0,jack,21,m
1,rose,20,f


In [224]:
df.dtypes

name    object
age      int64
sex     object
dtype: object

In [175]:
df2 = df.astype(str)
df2.dtypes

name    object
age     object
sex     object
dtype: object

### dataframe不能像数组一样直接切片，需要加上iloc 和 loc

In [229]:
df.index=  ['a','b']

In [231]:
# 使用loc 返回指定行
df.loc['a']

name    jack
age       21
sex        m
Name: a, dtype: object

In [226]:
df.loc[[0,1]]

Unnamed: 0,name,age,sex
0,jack,21,m
1,rose,20,f


In [228]:
df.loc[0][0]

'jack'

In [232]:
df.iloc[1]

name    rose
age       20
sex        f
Name: b, dtype: object

In [191]:
df.iloc[0][0]

'jack'

In [192]:
df.iloc[1,:]

name    rose
age       20
sex        f
Name: 1, dtype: object

### 常用函数

In [None]:
# cumsum  累计求和
a = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})
a.cumsum() 

In [209]:
# group by  groupby默认是在axis=0上进行分组的
df = pd.DataFrame({'name':['LI','ZHANG','ZHANG','LI','WANG'], 
                   'score_a' : [1,1,1,2,2], 
                   'score_b' : [1,-1,0,1,2], 
                   'score_c' : [3,4,5,6,7]})

In [200]:
df_grouped = df.groupby('name')

In [201]:
list(df_grouped)

[('LI',   name  score_a  score_b  score_c
  0   LI        1        1        3
  3   LI        2        1        6),
 ('WANG',    name  score_a  score_b  score_c
  4  WANG        2        2        7),
 ('ZHANG',     name  score_a  score_b  score_c
  1  ZHANG        1       -1        4
  2  ZHANG        1        0        5)]

In [208]:
for group_name , group_data in df_grouped:
    print(group_name)
    print(group_data)

LI
  name  score_a  score_b  score_c
0   LI        1        1        3
3   LI        2        1        6
WANG
   name  score_a  score_b  score_c
4  WANG        2        2        7
ZHANG
    name  score_a  score_b  score_c
1  ZHANG        1       -1        4
2  ZHANG        1        0        5


### iris 示例分析

In [202]:
iris_data = pd.read_csv('Iris.txt',header = None  ,index_col = None ,encoding = 'utf-8')
iris_data.head(5)
#header表示将第几行的数据作为列名，上面的数据舍弃
#index_col 表示将第几列的数据作为行名。
#sep=' '表示数据间使用空格作为分隔符，如果分隔符是逗号，只需换成 ‘，’即可, 或 sep = '\t'

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


列名缺少数据的具体业务含义，所以要更改列名。

In [204]:
iris_data.columns = ['sepal_length','sepal_width','petal_length','petal_width','class']
iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [205]:
s = iris_data['petal_length']
s.values 

array([1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4,
       1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5, 1.7, 1.5, 1. , 1.7, 1.9, 1.6,
       1.6, 1.5, 1.4, 1.6, 1.6, 1.5, 1.5, 1.4, 1.5, 1.2, 1.3, 1.5, 1.3,
       1.5, 1.3, 1.3, 1.3, 1.6, 1.9, 1.4, 1.6, 1.4, 1.5, 1.4, 4.7, 4.5,
       4.9, 4. , 4.6, 4.5, 4.7, 3.3, 4.6, 3.9, 3.5, 4.2, 4. , 4.7, 3.6,
       4.4, 4.5, 4.1, 4.5, 3.9, 4.8, 4. , 4.9, 4.7, 4.3, 4.4, 4.8, 5. ,
       4.5, 3.5, 3.8, 3.7, 3.9, 5.1, 4.5, 4.5, 4.7, 4.4, 4.1, 4. , 4.4,
       4.6, 4. , 3.3, 4.2, 4.2, 4.2, 4.3, 3. , 4.1, 6. , 5.1, 5.9, 5.6,
       5.8, 6.6, 4.5, 6.3, 5.8, 6.1, 5.1, 5.3, 5.5, 5. , 5.1, 5.3, 5.5,
       6.7, 6.9, 5. , 5.7, 4.9, 6.7, 4.9, 5.7, 6. , 4.8, 4.9, 5.6, 5.8,
       6.1, 6.4, 5.6, 5.1, 5.6, 6.1, 5.6, 5.5, 4.8, 5.4, 5.6, 5.1, 5.1,
       5.9, 5.7, 5.2, 5. , 5.2, 5.4, 5.1])

In [206]:
s[:10].sort_values(ascending=True) # 排序 ， 默认升序

2    1.3
0    1.4
1    1.4
4    1.4
6    1.4
8    1.4
3    1.5
7    1.5
9    1.5
5    1.7
Name: petal_length, dtype: float64

#### 取petal_width的最大值的数据

In [207]:
pmw = iris_data.petal_width.max()
iris_data[iris_data.petal_width == pmw]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
100,6.3,3.3,6.0,2.5,Iris-virginica
109,7.2,3.6,6.1,2.5,Iris-virginica
144,6.7,3.3,5.7,2.5,Iris-virginica


### 求和 ， axis = 0 所有行加和， axis=1 所有列加和

In [59]:
iris_data[:10].sum(axis = 1) 

0    10.2
1     9.5
2     9.4
3     9.4
4    10.2
5    11.4
6     9.7
7    10.1
8     8.9
9     9.6
dtype: float64

Unnamed: 0,a,b
0,1,4
1,3,9
2,6,15


### 统计

In [62]:
print(iris_data.describe() )
print(iris_data.shape)
iris_data['class'].value_counts()

       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
(150, 5)


Iris-virginica     50
Iris-versicolor    50
Iris-setosa        50
Name: class, dtype: int64

### groupby


In [9]:
import pandas as pd
df = pd.DataFrame({'name':['LI','ZHANG','ZHANG','LI','WANG'], 'score_a' : [1,1,1,2,2], 'score_b' : [1,-1,0,1,2], 'score_c' : [3,4,5,6,7]})

### 文件操作  

In [None]:
# 读excel
df = pd.read_excel()

In [64]:
df.to_csv('test_2.csv')
#将DataFrame文件转换成cvs文件

df.to_csv("./test.csv", encoding="utf_8_sig")

In [None]:
### 导出excel
writer = pd.ExcelWriter('make_excel.xlsx')
df.to_excel(writer)
writer.save()

In [50]:
df1 = pd.DataFrame({'activity_id': ['2','2','5','2','2','2'],
                    'leader_id': ['56', '56', '56', '66', '66', '66'],
                       'member_id': [1, 2, 3, 4, 5, 6]})
df2 = pd.DataFrame({'activity_id': ['2','2','5','2','2','2'],
                    'leader_id': ['55', '55', '56', '56', '56', '66'],
                       'member_id': [1, 2, 3, 4, 5, 6]})

In [51]:
df1

Unnamed: 0,activity_id,leader_id,member_id
0,2,56,1
1,2,56,2
2,5,56,3
3,2,66,4
4,2,66,5
5,2,66,6


In [87]:
print(df1.leader_id =='56') #得到一个Bool类型的 series,bool型数据用于数据过滤
print(df1.leader_id == '66')
print((df1.leader_id == '56') & (df1.leader_id =='66'))

#两个 Series 类型可以用 & 做 与操作  ，得到的还是一个series


0     True
1     True
2     True
3    False
4    False
5    False
Name: leader_id, dtype: bool
0    False
1    False
2    False
3     True
4     True
5     True
Name: leader_id, dtype: bool
0    False
1    False
2    False
3    False
4    False
5    False
Name: leader_id, dtype: bool


## isin( )

In [69]:
df1.isin(df2)   #两个Dataframe取交集 ， 返回的是bool类型的Dataframe , shape为df1的

Unnamed: 0,activity_id,leader_id,member_id
0,True,False,True
1,True,False,True
2,True,True,True
3,True,False,True
4,True,False,True
5,True,True,True


In [110]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
df2 = pd.DataFrame({'A': [1, 3, 3, 3,4], 'B': ['e', 'f', 'f', 'e','f']})

df['A'].isin(df2['A']) #也可以只选取某些列

0     True
1    False
2     True
Name: A, dtype: bool