In [1]:
import pandas as pd
import numpy as np

# 2.DataFrame类型的创建

#### 等长度列表组成的字典：

In [2]:
# Column-oriented sample data: three equal-length lists keyed by column name.
data = dict(
    city=['India', 'Canada', 'France', 'Italy', 'USA', 'Spain'],
    continent=['Asia', 'North America', 'Europe', 'Europe', 'North America', 'Europe'],
    score=[90, 80, 50, 70, 60, 40],
)

In [3]:
# Each dict key becomes a column; no index is given, so rows are labelled 0, 1, 2, ...
df = pd.DataFrame(data=data)
df

Unnamed: 0,city,continent,score
0,India,Asia,90
1,Canada,North America,80
2,France,Europe,50
3,Italy,Europe,70
4,USA,North America,60
5,Spain,Europe,40


#### 等长度ndarray数组组成的字典：

In [4]:
# Two equal-length integer ndarrays keyed by column name: 0..4 and 5..9.
data = {
    'one': np.arange(5),
    'two': np.arange(5, 10),
}

In [5]:
# A dict of ndarrays is handled like a dict of lists: one column per key.
df = pd.DataFrame(data=data)
df

Unnamed: 0,one,two
0,0,5
1,1,6
2,2,7
3,3,8
4,4,9


#### 包含字典的字典类型：

In [6]:
# Nested dict: outer keys are country names, inner keys are years.
# Only 'Italy' has an entry for 2002.
data = dict(
    Canada={2000: 80, 2001: 90},
    France={2000: 70, 2001: 80},
    Italy={2000: 60, 2001: 85, 2002: 100},
)

In [7]:
# Outer dict keys become the columns; inner dict keys become the row index.
# Index labels missing from an inner dict show up as missing values.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Canada,France,Italy
2000,80.0,70.0,60
2001,90.0,80.0,85
2002,,,100


#### 包含Series的字典：

In [8]:
# Dict of Series that share the same year index.
data = dict(
    Canada=pd.Series(data=[80, 90], index=[2000, 2001]),
    France=pd.Series(data=[70, 80], index=[2000, 2001]),
)

In [9]:
# Each Series becomes a column; the Series index supplies the row labels.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Canada,France
2000,80,70
2001,90,80


#### 二维ndarray对象：

In [10]:
# Integers 0..5 laid out as 2 rows x 3 columns.
data = np.arange(6).reshape((2, 3))
data

array([[0, 1, 2],
       [3, 4, 5]])

In [11]:
# A 2-D ndarray with no labels supplied: rows and columns both get integer labels.
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5


# 3.指定列和索引

In [12]:
# Column-oriented sample data: three equal-length lists keyed by column name.
data = dict(
    city=['India', 'Canada', 'France', 'Italy', 'USA', 'Spain'],
    continent=['Asia', 'North America', 'Europe', 'Europe', 'North America', 'Europe'],
    score=[90, 80, 50, 70, 60, 40],
)

In [13]:
# `columns` fixes the column order; `index` supplies the row labels.
df = pd.DataFrame(
    data,
    columns=['continent', 'city', 'score'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
df

Unnamed: 0,continent,city,score
one,Asia,India,90
two,North America,Canada,80
three,Europe,France,50
four,Europe,Italy,70
five,North America,USA,60
six,Europe,Spain,40


In [14]:
# A name listed in `columns` that is not a key of the data ('year' here)
# produces a column filled with missing values.
df = pd.DataFrame(
    data,
    columns=['continent', 'city', 'score', 'year'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
df

Unnamed: 0,continent,city,score,year
one,Asia,India,90,
two,North America,Canada,80,
three,Europe,France,50,
four,Europe,Italy,70,
five,North America,USA,60,
six,Europe,Spain,40,


In [15]:
# 索引和列也拥有name属性，如果有的话会被显示出来

In [16]:
# Name the row index; the name is shown when the frame is displayed.
df.index.name = '索引名'

In [17]:
# Name the column index; the name is shown when the frame is displayed.
df.columns.name = '列名'

In [18]:
# Display the frame again: the index and column names now appear in the header.
df

列名,continent,city,score,year
索引名,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Asia,India,90,
two,North America,Canada,80,
three,Europe,France,50,
four,Europe,Italy,70,
five,North America,USA,60,
six,Europe,Spain,40,


In [19]:
# 和Series类似，DataFrame的values属性会将包含在DataFrame中的数据以二维ndarray的形式返回
# 如果DataFrame中的列具有不同的数据类型dtype，会自动选择适合所有列的类型

In [20]:
# The frame's data as a 2-D ndarray; the columns here hold different types,
# so the resulting array has dtype=object.
df.values

array([['Asia', 'India', 90, nan],
       ['North America', 'Canada', 80, nan],
       ['Europe', 'France', 50, nan],
       ['Europe', 'Italy', 70, nan],
       ['North America', 'USA', 60, nan],
       ['Europe', 'Spain', 40, nan]], dtype=object)

# 4.行和列的获取：

## 4.1 列的获取

In [21]:
# Column-oriented sample data: three equal-length lists keyed by column name.
data = dict(
    city=['India', 'Canada', 'France', 'Italy', 'USA', 'Spain'],
    continent=['Asia', 'North America', 'Europe', 'Europe', 'North America', 'Europe'],
    score=[90, 80, 50, 70, 60, 40],
)

In [22]:
# Rebuild the frame with explicit column order and string row labels.
df = pd.DataFrame(
    data,
    columns=['continent', 'city', 'score'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
df

Unnamed: 0,continent,city,score
one,Asia,India,90
two,North America,Canada,80
three,Europe,France,50
four,Europe,Italy,70
five,North America,USA,60
six,Europe,Spain,40


#### DataFrame中的一列，可以通过字典型标记或点属性的方式检索为Series对象

In [23]:
# Dictionary-style access: the 'continent' column comes back as a Series.
df['continent']

one               Asia
two      North America
three           Europe
four            Europe
five     North America
six             Europe
Name: continent, dtype: object

In [24]:
# Attribute-style access: the 'city' column comes back as a Series.
df.city

one       India
two      Canada
three    France
four      Italy
five        USA
six       Spain
Name: city, dtype: object

#### 列的引用是可以修改的

In [25]:
# Assigning a scalar sets every row of the column to that value.
df['score'] = 100
df

Unnamed: 0,continent,city,score
one,Asia,India,100
two,North America,Canada,100
three,Europe,France,100
four,Europe,Italy,100
five,North America,USA,100
six,Europe,Spain,100


In [26]:
# Assigning an array with one value per row fills the column in row order.
df['score'] = np.array([100,90,80,70,60,50])
df

Unnamed: 0,continent,city,score
one,Asia,India,100
two,North America,Canada,90
three,Europe,France,80
four,Europe,Italy,70
five,North America,USA,60
six,Europe,Spain,50


#### 将Series赋值给一列时，Series的索引将会按照DataFrame的索引重新排序，并在空缺的地方填充缺失值

In [27]:
# The Series carries labels for only three of the six rows; on assignment it is
# aligned to the frame's index and the unmatched rows become missing values.
value = pd.Series(data=[90, 80, 70], index=['one', 'three', 'six'])

df['score'] = value
df

Unnamed: 0,continent,city,score
one,Asia,India,90.0
two,North America,Canada,
three,Europe,France,80.0
four,Europe,Italy,
five,North America,USA,
six,Europe,Spain,70.0


#### 如果被赋值的列并不存在，则会生成一个新的列

In [28]:
# Element-wise equality test (same result as the `==` operator): a boolean Series
# that is True where the continent is 'Europe'.
value = df['continent'].eq('Europe')
value

one      False
two      False
three     True
four      True
five     False
six       True
Name: continent, dtype: bool

In [29]:
# 'labels' is not an existing column, so this assignment creates it.
df['labels'] = value
df

Unnamed: 0,continent,city,score,labels
one,Asia,India,90.0,False
two,North America,Canada,,False
three,Europe,France,80.0,True
four,Europe,Italy,,True
five,North America,USA,,False
six,Europe,Spain,70.0,True


#### 可以通过 del 关键字删除指定列

In [30]:
# Remove the 'labels' column from the frame.
del df['labels']

In [31]:
# Display the frame after the deletion: the 'labels' column is gone.
df

Unnamed: 0,continent,city,score
one,Asia,India,90.0
two,North America,Canada,
three,Europe,France,80.0
four,Europe,Italy,
five,North America,USA,
six,Europe,Spain,70.0


## 4.2 行的获取

#### 行可以通过iloc或loc属性按位置或名称的方式进行选取

In [32]:
# Select the first row by integer position; the row comes back as a Series.
df.iloc[0]

continent     Asia
city         India
score         90.0
Name: one, dtype: object

In [33]:
# Select the same row by its index label.
df.loc['one']

continent     Asia
city         India
score         90.0
Name: one, dtype: object

# 5.head()和tail()：

In [34]:
# Column-oriented sample data: three equal-length lists keyed by column name.
data = dict(
    city=['India', 'Canada', 'France', 'Italy', 'USA', 'Spain'],
    continent=['Asia', 'North America', 'Europe', 'Europe', 'North America', 'Europe'],
    score=[90, 80, 50, 70, 60, 40],
)

In [35]:
# Six-row frame with the default integer index, used for the head()/tail() demo.
df = pd.DataFrame(data=data)
df

Unnamed: 0,city,continent,score
0,India,Asia,90
1,Canada,North America,80
2,France,Europe,50
3,Italy,Europe,70
4,USA,North America,60
5,Spain,Europe,40


In [36]:
# With no argument, head() shows the first five rows (labels 0-4 of this six-row frame).
df.head()

Unnamed: 0,city,continent,score
0,India,Asia,90
1,Canada,North America,80
2,France,Europe,50
3,Italy,Europe,70
4,USA,North America,60


In [37]:
# With no argument, tail() shows the last five rows (labels 1-5 of this six-row frame).
df.tail()

Unnamed: 0,city,continent,score
1,Canada,North America,80
2,France,Europe,50
3,Italy,Europe,70
4,USA,North America,60
5,Spain,Europe,40
