In [4]:
import numpy as np
import pandas as pd

import datetime
from datetime import datetime, date


# Notebook display settings: plain-text (non-HTML) reprs and compact limits
# on how many columns, rows and characters per line pandas prints.
pd.options.display.notebook_repr_html = False
pd.options.display.max_columns = 8
pd.options.display.max_rows = 10
pd.options.display.width = 60


import matplotlib.pyplot as plt
%matplotlib inline

# Read only the columns at positions 0, 2, 3 and 7 of the CSV and use the
# 'Symbol' column as the row index.
sp500 = pd.read_csv(
    "../data/sp500.csv",
    usecols=[0, 2, 3, 7],
    index_col='Symbol',
)

In [5]:
# Seed NumPy's global generator so the random 'foo' column is reproducible.
np.random.seed(123456)
# 10,000 rows: a uniform random 'foo' value per row, keyed by the integers 100..10099.
df = pd.DataFrame({
    'foo': np.random.random(10000),
    'key': range(100, 10100),
})
df.head()

        foo  key
0  0.126970  100
1  0.966718  101
2  0.260476  102
3  0.897237  103
4  0.376750  104

In [6]:
# Boolean-mask lookup: keep the rows whose 'key' column equals 10099.
df[df.key==10099]

           foo    key
9999  0.272283  10099

In [7]:
# Time the boolean-mask lookup of the row whose 'key' column equals 10099.
%timeit df[df.key==10099]

144 µs ± 553 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [8]:
# Build a copy of the frame whose row index is the 'key' column.
df_with_index = df.set_index('key')
df_with_index.head()

          foo
key          
100  0.126970
101  0.966718
102  0.260476
103  0.897237
104  0.376750

In [9]:
# Look up the row labelled 10099 through the index.
df_with_index.loc[10099]

foo    0.272283
Name: 10099, dtype: float64

In [10]:
# Time the label-based lookup of 10099 via .loc on the frame indexed by 'key'.
%timeit df_with_index.loc[10099]

33 µs ± 236 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [11]:
# Two-row frame with a text column and an integer column.
temps = pd.DataFrame({
    "City": ["Missoula", "Philadelphia"],
    "Temperature": [70, 80],
})
temps

           City  Temperature
0      Missoula           70
1  Philadelphia           80

In [12]:
# The column labels of a DataFrame are themselves held in an Index object.
temps.columns

Index(['City', 'Temperature'], dtype='object')

In [13]:
# Integer-labelled index: values 10..19 with an explicit integer index 0..9.
df_i64 = pd.DataFrame(data=np.arange(10, 20), index=np.arange(10))
df_i64.head()

    0
0  10
1  11
2  12
3  13
4  14

In [14]:
# Inspect the index that was built from an explicit integer array.
df_i64.index

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [15]:
# No index is passed here, so pandas supplies its default one.
df_range = pd.DataFrame(data=np.arange(10, 15))
df_range.head()

    0
0  10
1  11
2  12
3  13
4  14

In [16]:
# Inspect the default index pandas created when none was supplied.
df_range.index

RangeIndex(start=0, stop=5, step=1)

In [17]:
# Floating-point index: labels 0.0, 0.5, ..., 99.5 paired with the values 0, 5, ..., 995.
df_f64 = pd.DataFrame(data=np.arange(0, 1000, 5),
                      index=np.arange(0.0, 100.0, 0.5))
df_f64.head()

      0
0.0   0
0.5   5
1.0  10
1.5  15
2.0  20

In [18]:
# Inspect the float-valued index.
df_f64.index

Float64Index([ 0.0,  0.5,  1.0,  1.5,  2.0,  2.5,  3.0,
               3.5,  4.0,  4.5,
              ...
              95.0, 95.5, 96.0, 96.5, 97.0, 97.5, 98.0,
              98.5, 99.0, 99.5],
             dtype='float64', length=200)

In [19]:
# Interval index: four bins cut at the break points 0, 0.5, 1.0, 1.5 and 2.0.
df_interval = pd.DataFrame(
    data={"A": [1, 2, 3, 4]},
    index=pd.IntervalIndex.from_breaks(breaks=[0, 0.5, 1.0, 1.5, 2.0]),
)
df_interval

            A
(0.0, 0.5]  1
(0.5, 1.0]  2
(1.0, 1.5]  3
(1.5, 2.0]  4

In [20]:
# Inspect the interval-valued index.
df_interval.index

IntervalIndex([(0.0, 0.5], (0.5, 1.0], (1.0, 1.5], (1.5, 2.0]], dtype='interval[float64, right]')

In [23]:
# Build the frame and convert the string column 'B' to the categorical dtype in one expression.
df_categorical = pd.DataFrame({
    'A': np.arange(6),
    'B': list('aabbca'),
}).astype({'B': 'category'})

df_categorical

   A  B
0  0  a
1  1  a
2  2  b
3  3  b
4  4  c
5  5  a

In [20]:
# Promote the categorical column 'B' to the row index.
# The name is rebound to the result, so 'B' stops being a column after the first
# run; the guard makes re-running this cell a no-op instead of a KeyError.
if 'B' in df_categorical.columns:
    df_categorical = df_categorical.set_index('B')
df_categorical.index

CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, dtype='category', name='B')

In [21]:
# Select every row whose index label is the category 'a'.
df_categorical.loc['a']

   A
B   
a  0
a  1
a  5

In [22]:
# Datetime-based index type: five hourly timestamps starting 2017-05-01 00:00,
# each paired with a standard-normal random value.
rng = pd.date_range(start='5/1/2017', periods=5, freq='H')
ts = pd.Series(data=np.random.randn(rng.size), index=rng)
ts

2017-05-01 00:00:00    1.239792
2017-05-01 01:00:00   -0.400611
2017-05-01 02:00:00    0.718247
2017-05-01 03:00:00    0.430499
2017-05-01 04:00:00    1.155432
Freq: H, dtype: float64

In [23]:
# Inspect the index of the time series.
ts.index

DatetimeIndex(['2017-05-01 00:00:00',
               '2017-05-01 01:00:00',
               '2017-05-01 02:00:00',
               '2017-05-01 03:00:00',
               '2017-05-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [24]:
# Month-granularity periods parsed from year-month strings.
periods = pd.PeriodIndex(data=['2017-1', '2017-2', '2017-3'], freq='M')
periods

PeriodIndex(['2017-01', '2017-02', '2017-03'], dtype='period[M]')

In [25]:
# One standard-normal random value per period, indexed by the periods themselves.
period_series = pd.Series(data=np.random.randn(periods.size), index=periods)
period_series

2017-01   -0.449276
2017-02    2.472977
2017-03   -0.716023
Freq: M, dtype: float64

In [26]:
# Five hourly timestamps starting 2017-05-01 00:00, wrapped in a DatetimeIndex.
date_times = pd.DatetimeIndex(
    data=pd.date_range(start='5/1/2017', periods=5, freq='H'))
date_times

DatetimeIndex(['2017-05-01 00:00:00',
               '2017-05-01 01:00:00',
               '2017-05-01 02:00:00',
               '2017-05-01 03:00:00',
               '2017-05-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [27]:
# One integer per timestamp (0, 1, 2, ...), indexed by the timestamps.
df_date_times = pd.DataFrame(data=np.arange(date_times.size),
                             index=date_times)
df_date_times

                     0
2017-05-01 00:00:00  0
2017-05-01 01:00:00  1
2017-05-01 02:00:00  2
2017-05-01 03:00:00  3
2017-05-01 04:00:00  4

In [28]:
# Assign a new index to the existing frame: five hourly timestamps starting
# 2017-06-01 00:00. The data values are left as they were.
df_date_times.index = pd.DatetimeIndex(
    data=pd.date_range(start='6/1/2017', periods=5, freq='H'))
df_date_times

                     0
2017-06-01 00:00:00  0
2017-06-01 01:00:00  1
2017-06-01 02:00:00  2
2017-06-01 03:00:00  3
2017-06-01 04:00:00  4

In [29]:
# Small series of the integers 0..4 labelled 'a'..'e', used for the lookup examples.
s = pd.Series(data=np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
s

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [30]:
# Look up a single value by its label with the [] operator.
s['b']

1

In [31]:
# Look up a single value by its label, written explicitly with .loc.
s.loc['b']

1

In [32]:
# 2x2 frame holding 10..13 row by row, with columns 'a', 'b' and rows 'v', 'w'.
df = pd.DataFrame(np.arange(10, 14).reshape(2, 2),
                  index=['v', 'w'],
                  columns=['a', 'b'])
df

    a   b
v  10  11
w  12  13

In [33]:
# [] on a DataFrame selects a column by its label.
df['a']

v    10
w    12
Name: a, dtype: int64

In [34]:
# .loc on a DataFrame selects a row by its index label.
df.loc['w']

a    12
b    13
Name: w, dtype: int64

In [35]:
# Label-based slice with []; the end label 'd' is included in the result.
s['b':'d']

b    1
c    2
d    3
dtype: int64

In [36]:
# Label-based slice with .loc; the end label 'd' is included in the result.
s.loc['b':'d']

b    1
c    2
d    3
dtype: int64

In [37]:
# Pick several values at once by passing a list of labels to .loc.
s.loc[['a', 'c', 'e']]

a    0
c    2
e    4
dtype: int64

In [43]:
# Preview the first five rows of the frame indexed by 'Symbol'.
sp500.head()

                        Sector   Price  Book Value
Symbol                                            
MMM                Industrials  141.14      26.668
ABT                Health Care   39.60      15.573
ABBV               Health Care   53.95       2.954
ACN     Information Technology   79.79       8.326
ACE                 Financials  102.91      86.897

In [44]:
# Reset the index: 'Symbol' becomes an ordinary column again and the rows
# get a default integer index.
index_moved_to_col = sp500.reset_index()
index_moved_to_col.head()

  Symbol                  Sector   Price  Book Value
0    MMM             Industrials  141.14      26.668
1    ABT             Health Care   39.60      15.573
2   ABBV             Health Care   53.95       2.954
3    ACN  Information Technology   79.79       8.326
4    ACE              Financials  102.91      86.897

In [45]:
# Use the 'Sector' column as the row index instead (returns a new frame) and preview it.
index_moved_to_col.set_index('Sector').head()

                       Symbol   Price  Book Value
Sector                                           
Industrials               MMM  141.14      26.668
Health Care               ABT   39.60      15.573
Health Care              ABBV   53.95       2.954
Information Technology    ACN   79.79       8.326
Financials                ACE  102.91      86.897

In [46]:
# Conform the frame to exactly these row labels, in this order. A label with
# no matching row in sp500 (here 'FOO') yields a row of NaN.
reindexed = sp500.reindex(
    index=['MMM', 'ABBV', 'FOO'],
)
reindexed

             Sector   Price  Book Value
Symbol                                 
MMM     Industrials  141.14      26.668
ABBV    Health Care   53.95       2.954
FOO             NaN     NaN         NaN

In [47]:
# Reindex along the columns: keep 'Price' and 'Book Value' and add 'NewCol',
# which does not exist in sp500 and is therefore filled with NaN.
sp500.reindex(columns=['Price', 'Book Value', 'NewCol']).head()

         Price  Book Value  NewCol
Symbol                            
MMM     141.14      26.668     NaN
ABT      39.60      15.573     NaN
ABBV     53.95       2.954     NaN
ACN      79.79       8.326     NaN
ACE     102.91      86.897     NaN