In [139]:
import numpy as np
import pandas as pd

In [140]:
data = pd.Series([0.25,0.5,0.75,1.0],index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [141]:
data['b']

0.5

In [142]:
'a' in data

True

In [143]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [144]:
data.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [145]:
data['a':'c'] ##slicing

a    0.25
b    0.50
c    0.75
dtype: float64

In [146]:
data[0:2] ##implicit slicing

a    0.25
b    0.50
dtype: float64

In [147]:
data[(data>0.3) & (data<0.8)]

b    0.50
c    0.75
dtype: float64

In [148]:
data[['a','d']] ## fancy indexing

a    0.25
d    1.00
dtype: float64

In [149]:
data = pd.Series(['a','b','c'], index=[1,3,5])
data

1    a
3    b
5    c
dtype: object

In [150]:
## 명시적 인덱스 사용
data[1]

'a'

In [151]:
## 슬라이싱 할 때 암묵적 인덱스 사용
data[1:3]

3    b
5    c
dtype: object

In [152]:
data.loc[1]

'a'

In [153]:
data.loc[1:3]

1    a
3    b
dtype: object

In [154]:
data.iloc[1]

'b'

In [155]:
data.iloc[1:3]

3    b
5    c
dtype: object

In [156]:
pop = pd.Series({'Cali' : 38332521,'Tex' : 26448193,'NY' : 19651127,'Flo' : 19552860,'Penn' : 13002700})
area = pd.Series({'Cali':423967,'Tex':695662,'NY':141297,'Flo':170312,'Penn':119280})
data = pd.DataFrame({'area':area,'pop':pop})
data

Unnamed: 0,area,pop
Cali,423967,38332521
Tex,695662,26448193
NY,141297,19651127
Flo,170312,19552860
Penn,119280,13002700


In [157]:
data['area']

Cali    423967
Tex     695662
NY      141297
Flo     170312
Penn    119280
Name: area, dtype: int64

In [158]:
data.area

Cali    423967
Tex     695662
NY      141297
Flo     170312
Penn    119280
Name: area, dtype: int64

In [159]:
data.pop

<bound method DataFrame.pop of         area       pop
Cali  423967  38332521
Tex   695662  26448193
NY    141297  19651127
Flo   170312  19552860
Penn  119280  13002700>

In [160]:
data['pop']

Cali    38332521
Tex     26448193
NY      19651127
Flo     19552860
Penn    13002700
Name: pop, dtype: int64

In [161]:
data['density'] = data['pop']/data['area']

In [162]:
data

Unnamed: 0,area,pop,density
Cali,423967,38332521,90.413926
Tex,695662,26448193,38.01874
NY,141297,19651127,139.076746
Flo,170312,19552860,114.806121
Penn,119280,13002700,109.009893


In [163]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.19280000e+05, 1.30027000e+07, 1.09009893e+02]])

In [164]:
data.index

Index(['Cali', 'Tex', 'NY', 'Flo', 'Penn'], dtype='object')

In [165]:
data.T

Unnamed: 0,Cali,Tex,NY,Flo,Penn
area,423967.0,695662.0,141297.0,170312.0,119280.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,13002700.0
density,90.41393,38.01874,139.0767,114.8061,109.0099


In [166]:
data

Unnamed: 0,area,pop,density
Cali,423967,38332521,90.413926
Tex,695662,26448193,38.01874
NY,141297,19651127,139.076746
Flo,170312,19552860,114.806121
Penn,119280,13002700,109.009893


In [167]:
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [168]:
data['area']

Cali    423967
Tex     695662
NY      141297
Flo     170312
Penn    119280
Name: area, dtype: int64

In [169]:
data.iloc[:3,:2]

Unnamed: 0,area,pop
Cali,423967,38332521
Tex,695662,26448193
NY,141297,19651127


In [170]:
data.loc[:'Flo',:'pop']

Unnamed: 0,area,pop
Cali,423967,38332521
Tex,695662,26448193
NY,141297,19651127
Flo,170312,19552860


In [171]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [172]:
df = pd.DataFrame(rng.randint(0,10,(3,4)),columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [173]:
np.exp(ser) ## type(ser) = pandas.core.series.Series, ser는 array

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [174]:
np.sin(df*np.pi / 4) ## 각각의 원소에 대한 sin 함수 적용 및 결과 도출

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


## 유니버셜 함수: 인덱스 정렬

In [175]:
area = pd.Series({'Alaska':1723337,'Texas':695662,'California':423967},name='area')
population = pd.Series({'California':38332521,'Texas':26448193,'New York':19651127},name='population')

In [176]:
population/area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [177]:
area.index | population.index

  """Entry point for launching an IPython kernel.


Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [178]:
area.index.union(population)

Index([19651127, 26448193, 38332521, 'Alaska', 'California', 'Texas'], dtype='object')

In [179]:
A= pd.Series([2,4,6],index=[0,1,2])
B= pd.Series([1,3,5],index=[1,2,3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [180]:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [181]:
A = pd.DataFrame(rng.randint(0,20,(2,2)),columns=list('AB'))
A

Unnamed: 0,A,B
0,1,11
1,5,1


## 유니버셜 함수: DataFrame과 Series 간의 연산

In [182]:
A = rng.randint(10, size=(3,4))
A

array([[4, 0, 9, 5],
       [8, 0, 9, 2],
       [6, 3, 8, 2]])

In [183]:
A - A[0] ## 2차원 DataFrame - 1차원 Series : broadcasting --> element-wise 연산

array([[ 0,  0,  0,  0],
       [ 4,  0,  0, -3],
       [ 2,  3, -1, -3]])

In [184]:
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,4,0,0,-3
2,2,3,-1,-3


In [185]:
df.subtract(df['R'],axis=0)

Unnamed: 0,Q,R,S,T
0,4,0,9,5
1,8,0,9,2
2,3,0,5,-1


In [186]:
df

Unnamed: 0,Q,R,S,T
0,4,0,9,5
1,8,0,9,2
2,6,3,8,2


# 데이터 누락 처리하기

In [187]:
vals1 = np.array([1,None,2,3])
vals1

array([1, None, 2, 3], dtype=object)

In [188]:
vals1.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [189]:
vals2 = np.array([1,np.nan,3,4])
vals2.dtype

dtype('float64')

In [190]:
1 + np.nan

nan

In [191]:
0*np.nan

nan

In [192]:
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [193]:
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

In [194]:
pd.Series([1,np.nan,2,None,pd.NA],dtype='Int32') ## dtype='Int32'가 nullable

0       1
1    <NA>
2       2
3    <NA>
4    <NA>
dtype: Int32

In [195]:
data=pd.Series([1,np.nan,'hello',None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [196]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [197]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [198]:
data.dropna()

0        1
2    hello
dtype: object

## 다중 인덱스된 Series

In [199]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [200]:
pop[('California',2010):('Texas',2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [201]:
%xmode plain

Exception reporting mode: Plain


In [202]:
pop[[i for i in pop.index if i[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [203]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [204]:
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [205]:
pop[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [206]:
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [207]:
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [208]:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [209]:
fDF = f_u18.unstack()
fDF

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [210]:
fDF.stack()

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [211]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]], ## 다중 인덱스
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.918232,0.72423
a,2,0.262233,0.494051
b,1,0.530195,0.428895
b,2,0.731867,0.436319


In [212]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [213]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [214]:
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [215]:
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [216]:
pop.index.names=['state','year']

In [217]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [218]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37

# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,47.0,38.2,39.0,38.3,40.0,35.7
2013,2,38.0,37.6,43.0,37.4,42.0,36.7
2014,1,33.0,36.9,43.0,35.1,23.0,38.2
2014,2,53.0,37.3,33.0,37.9,39.0,36.6


In [219]:
columns

MultiIndex([(  'Bob',   'HR'),
            (  'Bob', 'Temp'),
            ('Guido',   'HR'),
            ('Guido', 'Temp'),
            (  'Sue',   'HR'),
            (  'Sue', 'Temp')],
           names=['subject', 'type'])

In [220]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,39.0,38.3
2013,2,43.0,37.4
2014,1,43.0,35.1
2014,2,33.0,37.9


In [221]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [222]:
pop['California',2000]

33871648

In [223]:
pop['California']

year
2000    33871648
2010    37253956
dtype: int64

In [224]:
pop.loc['California':'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [225]:
pop[:,2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [226]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,47.0,38.2,39.0,38.3,40.0,35.7
2013,2,38.0,37.6,43.0,37.4,42.0,36.7
2014,1,33.0,36.9,43.0,35.1,23.0,38.2
2014,2,53.0,37.3,33.0,37.9,39.0,36.6


In [227]:
health_data['Guido','HR']

year  visit
2013  1        39.0
      2        43.0
2014  1        43.0
      2        33.0
Name: (Guido, HR), dtype: float64

In [228]:
## health_data[2013,1] : error
health_data.iloc[:2,:2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,47.0,38.2
2013,2,38.0,37.6


In [229]:
health_data.loc[:,('Bob','HR')]

year  visit
2013  1        47.0
      2        38.0
2014  1        33.0
      2        53.0
Name: (Bob, HR), dtype: float64

In [230]:
idx = pd.IndexSlice
health_data.loc[idx[:,1],idx[:,'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,47.0,39.0,40.0
2014,1,33.0,43.0,23.0


In [231]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.553021
      2      0.263539
c     1      0.363461
      2      0.979328
b     1      0.637948
      2      0.789410
dtype: float64

In [232]:
data = data.sort_index()
data

char  int
a     1      0.553021
      2      0.263539
b     1      0.637948
      2      0.789410
c     1      0.363461
      2      0.979328
dtype: float64

In [233]:
data['a':'b']

char  int
a     1      0.553021
      2      0.263539
b     1      0.637948
      2      0.789410
dtype: float64

In [234]:
data['b':'d']

char  int
b     1      0.637948
      2      0.789410
c     1      0.363461
      2      0.979328
dtype: float64

In [235]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [236]:
pop.unstack()

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [237]:
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [238]:
pop.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [239]:
pop.T

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [240]:
pop1 = pop.unstack()
pop1.T

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [241]:
pop.unstack()

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [242]:
pop_flat= pop.reset_index(name='population')
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [243]:
pop_flat.set_index(['state','year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [244]:
def make_df(cols, ind):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind]
            for c in cols}
    return pd.DataFrame(data, ind)

# example DataFrame
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [245]:
x = [[1, 2],
     [3, 4]]
np.concatenate([x, x], axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [246]:
x = [[1, 2],
     [3, 4]]
np.concatenate([x, x], axis=0)

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])