In [1]:
import pandas as pd
import numpy as np

# DataFrame
* Like a table in `SQL` (a relation)
* Like a mapping from a column label to a `Series`

# Creation

In [6]:
# From dict of lists: every key becomes a column label and every list
# supplies that column's values, one entry per row.
quote_columns = {
    'ticker': ['AAPL', 'AAPL', 'MSFT', 'IBM', 'YHOO'],
    'date': ['2015-12-30', '2015-12-31', '2015-12-30', '2015-12-30', '2015-12-30'],
    'open': [426.23, 427.81, 42.3, 101.65, 35.53],
}
df = pd.DataFrame(quote_columns)
df

Unnamed: 0,ticker,date,open
0,AAPL,2015-12-30,426.23
1,AAPL,2015-12-31,427.81
2,MSFT,2015-12-30,42.3
3,IBM,2015-12-30,101.65
4,YHOO,2015-12-30,35.53


In [20]:
# From list of dicts (one dict per row), i.e. the shape of parsed JSON records.
# Key order inside a record does not matter: the third record lists 'open'
# before 'date' and its values still land in the matching columns.
quote_records = [
    {'ticker': 'AAPL', 'date': '2015-12-30', 'open': 426.23},
    {'ticker': 'AAPL', 'date': '2015-12-31', 'open': 427.23},
    {'ticker': 'AAPL', 'open': 42.23, 'date': '2015-12-30'},
]
df2 = pd.DataFrame(quote_records)
df2

Unnamed: 0,ticker,date,open
0,AAPL,2015-12-30,426.23
1,AAPL,2015-12-31,427.23
2,AAPL,2015-12-30,42.23


# Accessors
* Prefer `loc` (label-based) and `iloc` (position-based) for explicit indexing

In [21]:
# By column: selecting a single column label returns a Series
df['ticker'] # or df.ticker

0    AAPL
1    AAPL
2    MSFT
3     IBM
4    YHOO
Name: ticker, dtype: object

In [22]:
# Multiple columns: pass a list of column labels; the result is a DataFrame
df[['ticker', 'date']] # multi-columns

Unnamed: 0,ticker,date
0,AAPL,2015-12-30
1,AAPL,2015-12-31
2,MSFT,2015-12-30
3,IBM,2015-12-30
4,YHOO,2015-12-30


In [24]:
# By row filter: a boolean condition keeps only the rows where it holds
df[df['open'] > 400]

Unnamed: 0,ticker,date,open
0,AAPL,2015-12-30,426.23
1,AAPL,2015-12-31,427.81


In [25]:
# .loc[rows, columns]: a boolean row filter combined with a list of column labels
df.loc[df['open'] > 400, ['ticker', 'open']]

Unnamed: 0,ticker,open
0,AAPL,426.23
1,AAPL,427.81


In [36]:
# .iloc[rows, columns]: a list of row positions combined with a slice of
# column positions (the slice end is exclusive, so the first two columns)
df.iloc[[0, 1, 2], :2]

Unnamed: 0,ticker,date
0,AAPL,2015-12-30
1,AAPL,2015-12-31
2,MSFT,2015-12-30


# Modify a DataFrame
* Modify subsets without triggering warnings
* Standardize on `np.nan` instead of `None` to represent `NULL` (missing values)

In [41]:
# Work on a new frame that carries a 'close' column mirroring 'open',
# leaving `df` untouched.
df1a = df.assign(close=df['open'])
# In-place modification is preferred: one .loc[rows, columns] call on the
# frame itself overwrites 'close' only for the rows dated 2015-12-31.
is_year_end = df1a['date'] == '2015-12-31'
df1a.loc[is_year_end, ['close']] = 5000
df1a

Unnamed: 0,ticker,date,open,close
0,AAPL,2015-12-30,426.23,426.23
1,AAPL,2015-12-31,427.81,5000.0
2,MSFT,2015-12-30,42.3,42.3
3,IBM,2015-12-30,101.65,101.65
4,YHOO,2015-12-30,35.53,35.53


## Selective modification!

In [61]:
# Selective modification: only the rows that pass the filter receive a value
df1a_ex2 = df.copy()
df1a_ex2['close'] = np.nan  # every close starts out missing
opens_above_400 = df1a_ex2['open'] > 400
# Note the asymmetry: the target is selected with a column list (['close'],
# a DataFrame-shaped selection) while the source is selected with a single
# column label ('open', a Series).
df1a_ex2.loc[opens_above_400, ['close']] = df1a_ex2.loc[opens_above_400, 'open']
df1a_ex2

Unnamed: 0,ticker,date,open,close
0,AAPL,2015-12-30,426.23,426.23
1,AAPL,2015-12-31,427.81,427.81
2,MSFT,2015-12-30,42.3,
3,IBM,2015-12-30,101.65,
4,YHOO,2015-12-30,35.53,


## Don't do this!

In [44]:
# Don't do this!
# Anti-pattern demo: select rows into a new variable, then assign to it.
# Everything here is derived from df1b so the cell depends only on `df`.
df1b = df.copy()
df1b['close'] = df1b['open']
# Boolean-mask selection hands back a separate object derived from df1b,
# not a reference into it.
df1b_view = df1b[df1b.date == '2015-12-31']
# Triggers the "set on a copy of a slice" warning shown below: pandas cannot
# promise that this write reaches df1b.
df1b_view['close'] = 5000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [48]:
# Safe alternative: take an explicit .copy() of the filtered rows and modify
# that independent frame - no warning is raised.
not_year_end = df1a['date'] != '2015-12-31'
df1c_copy = df1a.loc[not_year_end].copy()
df1c_copy['close'] = 21
df1c_copy

Unnamed: 0,ticker,date,open,close
0,AAPL,2015-12-30,426.23,21
2,MSFT,2015-12-30,42.3,21
3,IBM,2015-12-30,101.65,21
4,YHOO,2015-12-30,35.53,21


In [57]:
# Assigning a Series aligns on index labels, like an implicit left join:
# a frame label absent from `close` (4) ends up missing, and a Series label
# absent from the frame (5) is not added.
close = pd.Series([430.0, 430.0, 43.5, 43.5, 1200], index=[0, 1, 2, 3, 5])
df1d = df.copy()
df1d['close'] = close
# For explicit element-by-element assignment instead, use:
#     df1d['close'] = close.array
df1d

Unnamed: 0,ticker,date,open,close
0,AAPL,2015-12-30,426.23,430.0
1,AAPL,2015-12-31,427.81,430.0
2,MSFT,2015-12-30,42.3,43.5
3,IBM,2015-12-30,101.65,43.5
4,YHOO,2015-12-30,35.53,
