# Mod08 Introducing DataFrame Object

In [2]:
import numpy as np
import pandas as pd

In [3]:
pd.__version__

'1.0.5'

In [4]:
np.__version__

'1.19.1'

## The Pandas DataFrame Object

### DataFrame as a generalized NumPy array

In [5]:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [6]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [7]:
states = pd.DataFrame({'population': population,
                       'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


Like the ``Series`` object, the ``DataFrame`` has an ``index`` attribute

In [10]:
states.index   #index 為列

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

``DataFrame`` has a ``columns`` attribute

In [9]:
states.columns   #標題為 欄

Index(['population', 'area'], dtype='object')

In [11]:
states.values

array([[38332521,   423967],
       [26448193,   695662],
       [19651127,   141297],
       [19552860,   170312],
       [12882135,   149995]])

### DataFrame as specialized dictionary

In [12]:
states['area']   #series 資料型態要相同

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [13]:
states[['area']]  #dataframe 異質性, 欄位的資料型態可以不同

Unnamed: 0,area
California,423967
Texas,695662
New York,141297
Florida,170312
Illinois,149995


### Constructing DataFrame objects

#### From a single Series object

In [14]:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [15]:
pd.DataFrame(population, columns=['population'])  

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


#### From a list of dicts

In [16]:
data = [{'a': i, 'b': 2 * i}      #欄位名稱a, b  index 沒給, 自動編號  list comperhancn???
        for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


#### From a dictionary of Series objects

In [17]:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [None]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

In [None]:
pd.DataFrame({'population': population,
              'area': area})

#### From a two-dimensional NumPy array

In [20]:
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.04599,0.10079
b,0.521365,0.040007
c,0.59917,0.376166


#### From files

In [24]:
# pd.read_csv?

In [25]:
mb = pd.read_csv("./data/microbiome.csv")    # 也可以寫成, "data/microbiome.csv" 因為是目前目錄
mb

Unnamed: 0,Taxon,Patient,Group,Tissue,Stool
0,Firmicutes,1,0,136,4182
1,Firmicutes,2,1,1174,703
2,Firmicutes,3,0,408,3946
3,Firmicutes,4,1,831,8605
4,Firmicutes,5,0,693,50
...,...,...,...,...,...
65,Other,10,1,203,6
66,Other,11,0,392,6
67,Other,12,1,28,25
68,Other,13,0,12,22


'''
### 拿到資料的處理步驟 

1.先取得筆數 len(mb)
2.查到資料型態 mb.dtype
3.查資訊  mb.info
4.如果info有發現NaN再考慮處理空值
5.看內容mb.head()  mb.tail()
'''

In [31]:
%timeit mb.shape   # 得到筆數

334 ns ± 4.35 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [32]:
%timeit len(mb)   # 得到筆數

196 ns ± 2.62 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [35]:
mb.dtypes

Taxon      object
Patient     int64
Group       int64
Tissue      int64
Stool       int64
dtype: object

In [38]:
mb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Taxon    70 non-null     object
 1   Patient  70 non-null     int64 
 2   Group    70 non-null     int64 
 3   Tissue   70 non-null     int64 
 4   Stool    70 non-null     int64 
dtypes: int64(4), object(1)
memory usage: 2.9+ KB


In [39]:
mb.head()

Unnamed: 0,Taxon,Patient,Group,Tissue,Stool
0,Firmicutes,1,0,136,4182
1,Firmicutes,2,1,1174,703
2,Firmicutes,3,0,408,3946
3,Firmicutes,4,1,831,8605
4,Firmicutes,5,0,693,50


In [54]:
mb.hasnans   

AttributeError: 'DataFrame' object has no attribute 'hasnans'

In [40]:
mb.tail()

Unnamed: 0,Taxon,Patient,Group,Tissue,Stool
65,Other,10,1,203,6
66,Other,11,0,392,6
67,Other,12,1,28,25
68,Other,13,0,12,22
69,Other,14,1,305,32


In [45]:
pd.read_csv("./data/microbiome.csv", header=None).head()

Unnamed: 0,0,1,2,3,4
0,Taxon,Patient,Group,Tissue,Stool
1,Firmicutes,1,0,136,4182
2,Firmicutes,2,1,1174,703
3,Firmicutes,3,0,408,3946
4,Firmicutes,4,1,831,8605


In [46]:
mb = pd.read_table("./data/microbiome.csv", sep=',')    

In [48]:
dx = pd.read_excel('./data/MID2.xls', sheet_name='Sheet 1', header=None)    #如果沒有欄位的標題,
dx.head()

Unnamed: 0,0,1
0,"Archaea ""Crenarchaeota"" Thermoprotei Acidiloba...",2
1,"Archaea ""Crenarchaeota"" Thermoprotei Acidiloba...",14
2,"Archaea ""Crenarchaeota"" Thermoprotei Desulfuro...",23
3,"Archaea ""Crenarchaeota"" Thermoprotei Desulfuro...",1
4,"Archaea ""Crenarchaeota"" Thermoprotei Desulfuro...",2


In [49]:
# Load the first sheet of the JSON file into a data frame
dj = pd.read_json('data/data.json', orient='columns')

# View the first ten rows
dj.head(10)

Unnamed: 0,integer,datetime,category
0,5,2015-01-01 00:00:00,0
1,5,2015-01-01 00:00:01,0
2,9,2015-01-01 00:00:02,0
3,6,2015-01-01 00:00:03,0
4,6,2015-01-01 00:00:04,0
5,9,2015-01-01 00:00:05,0
6,7,2015-01-01 00:00:06,0
7,1,2015-01-01 00:00:07,0
8,6,2015-01-01 00:00:08,0
9,9,2015-01-01 00:00:09,0


## Lab

<b>有一個 Dictionary 如下，透過該 Dictionary 建立一個 DataFrame，並顯示 columns 屬性</b>

In [None]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

<b>匯入 data/tips.csv 的檔案到 DataFrame，顯示 DataFrame 前 10 筆資料，並顯示 dtypes 屬性</b>

In [None]:
df = pd.read_csv('data/tips.csv')