"""
DataFrame
---------
An efficient 2D container for potentially mixed-type time series or other
labeled data series.
Similar to its R counterpart, data.frame, except providing automatic data
alignment and a host of useful data manipulation methods having to do with the
labeling information
"""
from __future__ import division
# pylint: disable=E1101,E1103
# pylint: disable=W0212,W0231,W0703,W0622
import functools
import collections
import itertools
import sys
import types
import warnings
from numpy import nan as NA
import numpy as np
import numpy.ma as ma
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
_default_index, _maybe_upcast, is_sequence,
_infer_dtype_from_scalar, _values_from_object,
is_list_like, _maybe_box_datetimelike,
is_categorical_dtype, is_object_dtype,
_possibly_infer_to_datetimelike)
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import (maybe_droplevels,
convert_to_index_sliceable,
check_bool_indexer)
from pandas.core.internals import (BlockManager,
create_block_manager_from_arrays,
create_block_manager_from_blocks)
from pandas.core.series import Series
from pandas.core.categorical import Categorical
import pandas.computation.expressions as expressions
from pandas.computation.eval import eval as _eval
from numpy import percentile as _quantile
from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u,
OrderedDict, raise_with_traceback)
from pandas import compat
from pandas.sparse.array import SparseArray
from pandas.util.decorators import deprecate, Appender, Substitution, \
deprecate_kwarg
from pandas.tseries.period import PeriodIndex
from pandas.tseries.index import DatetimeIndex
import pandas.core.algorithms as algos
import pandas.core.common as com
import pandas.core.format as fmt
import pandas.core.nanops as nanops
import pandas.core.ops as ops
import pandas.lib as lib
import pandas.algos as _algos
from pandas.core.config import get_option
#----------------------------------------------------------------------
# Docstring templates
_shared_doc_kwargs = dict(axes='index, columns', klass='DataFrame',
axes_single_arg="{0, 1, 'index', 'columns'}")
_numeric_only_doc = """numeric_only : boolean, default None
Include only float, int, boolean data. If None, will attempt to use
everything, then use only numeric data
"""
_merge_doc = """
Merge DataFrame objects by performing a database-style join operation by
columns or indexes.
If joining columns on columns, the DataFrame indexes *will be
ignored*. Otherwise if joining indexes on indexes or indexes on a column or
columns, the index will be passed on.
Parameters
----------%s
right : DataFrame
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
* left: use only keys from left frame (SQL: left outer join)
* right: use only keys from right frame (SQL: right outer join)
* outer: use union of keys from both frames (SQL: full outer join)
* inner: use intersection of keys from both frames (SQL: inner join)
on : label or list
Field names to join on. Must be found in both DataFrames. If on is
None and not merging on indexes, then it merges on the intersection of
the columns by default.
left_on : label or list, or array-like
Field names to join on in left DataFrame. Can be a vector or list of
vectors of the length of the DataFrame to use a particular vector as
the join key instead of columns
right_on : label or list, or array-like
Field names to join on in right DataFrame or vector/list of vectors per
left_on docs
left_index : boolean, default False
Use the index from the left DataFrame as the join key(s). If it is a
MultiIndex, the number of keys in the other DataFrame (either the index
or a number of columns) must match the number of levels
right_index : boolean, default False
Use the index from the right DataFrame as the join key. Same caveats as
left_index
sort : boolean, default False
Sort the join keys lexicographically in the result DataFrame
suffixes : 2-length sequence (tuple, list, ...)
Suffix to apply to overlapping column names in the left and right
side, respectively
copy : boolean, default True
If False, do not copy data unnecessarily
Examples
--------
>>> A >>> B
lkey value rkey value
0 foo 1 0 foo 5
1 bar 2 1 bar 6
2 baz 3 2 qux 7
3 foo 4 3 bar 8
>>> merge(A, B, left_on='lkey', right_on='rkey', how='outer')
lkey value_x rkey value_y
0 foo 1 foo 5
1 foo 4 foo 5
2 bar 2 bar 6
3 bar 2 bar 8
4 baz 3 NaN NaN
5 NaN NaN qux 7
Returns
-------
merged : DataFrame
The output type will be the same as 'left', if it is a subclass
of DataFrame.
"""
#----------------------------------------------------------------------
# DataFrame class
class DataFrame(NDFrame):
""" Two-dimensional size-mutable, potentially heterogeneous tabular data
structure with labeled axes (rows and columns). Arithmetic operations
align on both row and column labels. Can be thought of as a dict-like
container for Series objects. The primary pandas data structure
Parameters
----------
data : numpy ndarray (structured or homogeneous), dict, or DataFrame
Dict can contain Series, arrays, constants, or list-like objects
index : Index or array-like
Index to use for resulting frame. Will default to np.arange(n) if
no indexing information part of input data and no index provided
columns : Index or array-like
Column labels to use for resulting frame. Will default to
np.arange(n) if no column labels are provided
dtype : dtype, default None
Data type to force, otherwise infer
copy : boolean, default False
Copy data from inputs. Only affects DataFrame / 2d ndarray input
Examples
--------
>>> d = {'col1': ts1, 'col2': ts2}
>>> df = DataFrame(data=d, index=index)
>>> df2 = DataFrame(np.random.randn(10, 5))
>>> df3 = DataFrame(np.random.randn(10, 5),
... columns=['a', 'b', 'c', 'd', 'e'])
See also
--------
DataFrame.from_records : constructor from tuples, also record arrays
DataFrame.from_dict : from dicts of Series, arrays, or dicts
DataFrame.from_csv : from CSV files
DataFrame.from_items : from sequence of (key, value) pairs
pandas.read_csv, pandas.read_table, pandas.read_clipboard
"""
_auto_consolidate = True
@property
def _constructor(self):
return DataFrame
_constructor_sliced = Series
@property
def _constructor_expanddim(self):
from pandas.core.panel import Panel
return Panel
def __init__(self, data=None, index=None, columns=None, dtype=None,
copy=False):
if data is None:
data = {}
if dtype is not None:
dtype = self._validate_dtype(dtype)
if isinstance(data, DataFrame):
data = data._data
if isinstance(data, BlockManager):
mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
dtype=dtype, copy=copy)
elif isinstance(data, dict):
mgr = self._init_dict(data, index, columns, dtype=dtype)
elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords
# masked recarray
if isinstance(data, mrecords.MaskedRecords):
mgr = _masked_rec_array_to_mgr(data, index, columns, dtype,
copy)
# a masked array
else:
mask = ma.getmaskarray(data)
if mask.any():
data, fill_value = _maybe_upcast(data, copy=True)
data[mask] = fill_value
else:
data = data.copy()
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
copy=copy)
elif isinstance(data, (np.ndarray, Series, Index)):
if data.dtype.names:
data_columns = list(data.dtype.names)
data = dict((k, data[k]) for k in data_columns)
if columns is None:
columns = data_columns
mgr = self._init_dict(data, index, columns, dtype=dtype)
elif getattr(data, 'name', None):
mgr = self._init_dict({data.name: data}, index, columns,
dtype=dtype)
else:
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
copy=copy)
elif isinstance(data, (list, types.GeneratorType)):
if isinstance(data, types.GeneratorType):
data = list(data)
if len(data) > 0:
if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
arrays, columns = _to_arrays(data, columns, dtype=dtype)
columns = _ensure_index(columns)
# set the index
if index is None:
if isinstance(data[0], Series):
index = _get_names_from_index(data)
elif isinstance(data[0], Categorical):
index = _default_index(len(data[0]))
else:
index = _default_index(len(data))
mgr = _arrays_to_mgr(arrays, columns, index, columns,
dtype=dtype)
else:
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
copy=copy)
else:
mgr = self._init_dict({}, index, columns, dtype=dtype)
elif isinstance(data, collections.Iterator):
raise TypeError("data argument can't be an iterator")
else:
try:
arr = np.array(data, dtype=dtype, copy=copy)
except (ValueError, TypeError) as e:
exc = TypeError('DataFrame constructor called with '
'incompatible data and dtype: %s' % e)
raise_with_traceback(exc)
if arr.ndim == 0 and index is not None and columns is not None:
if isinstance(data, compat.string_types) and dtype is None:
dtype = np.object_
if dtype is None:
dtype, data = _infer_dtype_from_scalar(data)
values = np.empty((len(index), len(columns)), dtype=dtype)
values.fill(data)
mgr = self._init_ndarray(values, index, columns, dtype=dtype,
copy=False)
else:
raise PandasError('DataFrame constructor not properly called!')
NDFrame.__init__(self, mgr, fastpath=True)
def _init_dict(self, data, index, columns, dtype=None):
"""
Segregate Series based on type and coerce into matrices.
Needs to handle a lot of exceptional cases.
"""
if columns is not None:
columns = _ensure_index(columns)
# prefilter if columns passed
data = dict((k, v) for k, v in compat.iteritems(data)
if k in columns)
if index is None:
index = extract_index(list(data.values()))
else:
index = _ensure_index(index)
arrays = []
data_names = []
for k in columns:
if k not in data:
# no obvious "empty" int column
if dtype is not None and issubclass(dtype.type,
np.integer):
continue
if dtype is None:
# 1783
v = np.empty(len(index), dtype=object)
else:
v = np.empty(len(index), dtype=dtype)
v.fill(NA)
else:
v = data[k]
data_names.append(k)
arrays.append(v)
else:
keys = list(data.keys())
if not isinstance(data, OrderedDict):
keys = _try_sort(keys)
columns = data_names = Index(keys)
arrays = [data[k] for k in keys]
return _arrays_to_mgr(arrays, data_names, index, columns,
dtype=dtype)
def _init_ndarray(self, values, index, columns, dtype=None,
copy=False):
# input must be a ndarray, list, Series, index
if isinstance(values, Series):
if columns is None:
if values.name is not None:
columns = [values.name]
if index is None:
index = values.index
else:
values = values.reindex(index)
# zero len case (GH #2234)
if not len(values) and columns is not None and len(columns):
values = np.empty((0, 1), dtype=object)
# helper to create the axes as indexes
def _get_axes(N, K, index=index, columns=columns):
# return axes or defaults
if index is None:
index = _default_index(N)
else:
index = _ensure_index(index)
if columns is None:
columns = _default_index(K)
else:
columns = _ensure_index(columns)
return index, columns
# we could have a categorical type passed or coerced to 'category'
# recast this to an _arrays_to_mgr
if is_categorical_dtype(getattr(values, 'dtype', None)) or is_categorical_dtype(dtype):
if not hasattr(values, 'dtype'):
values = _prep_ndarray(values, copy=copy)
values = values.ravel()
elif copy:
values = values.copy()
index, columns = _get_axes(len(values), 1)
return _arrays_to_mgr([values], columns, index, columns,
dtype=dtype)
# by definition an array here
# the dtypes will be coerced to a single dtype
values = _prep_ndarray(values, copy=copy)
if dtype is not None:
if values.dtype != dtype:
try:
values = values.astype(dtype)
except Exception as orig:
e = ValueError("failed to cast to '%s' (Exception was: %s)"
% (dtype, orig))
raise_with_traceback(e)
index, columns = _get_axes(*values.shape)
values = values.T
# if we don't have a dtype specified, then try to convert objects
# on the entire block; this is to convert if we have datetimelike's
# embedded in an object type
if dtype is None and is_object_dtype(values):
values = _possibly_infer_to_datetimelike(values)
return create_block_manager_from_blocks([values], [columns, index])
@property
def axes(self):
return [self.index, self.columns]
@property
def shape(self):
return (len(self.index), len(self.columns))
def _repr_fits_vertical_(self):
"""
Check length against max_rows.
"""
max_rows = get_option("display.max_rows")
return len(self) <= max_rows
def _repr_fits_horizontal_(self, ignore_width=False):
"""
Check if full repr fits in horizontal boundaries imposed by the display
options width and max_columns. In case of a non-interactive session, no
boundaries apply.
ignore_width is here so ipynb+HTML output can behave the way
users expect. display.max_columns remains in effect.
GH3541, GH3573
"""
width, height = fmt.get_console_size()
max_columns = get_option("display.max_columns")
nb_columns = len(self.columns)
# exceed max columns
if ((max_columns and nb_columns > max_columns) or
((not ignore_width) and width and nb_columns > (width // 2))):
return False
if (ignore_width # used by repr_html under IPython notebook
# scripts ignore terminal dims
or not com.in_interactive_session()):
return True
if (get_option('display.width') is not None or
com.in_ipython_frontend()):
# check at least the column row for excessive width
max_rows = 1
else:
max_rows = get_option("display.max_rows")
# when auto-detecting, so width=None and not in ipython front end
# check whether repr fits horizontal by actually checking
# the width of the rendered repr
buf = StringIO()
# only care about the stuff we'll actually print out
# and to_string on entire frame may be expensive
d = self
if max_rows is not None: # rows are limited, so render only the visible slice
# min of two, where one may be None
d = d.iloc[:min(max_rows, len(d))]
else:
return True
d.to_string(buf=buf)
value = buf.getvalue()
repr_width = max([len(l) for l in value.split('\n')])
return repr_width < width
def _info_repr(self):
"""True if the repr should show the info view."""
info_repr_option = (get_option("display.large_repr") == "info")
return info_repr_option and not (
self._repr_fits_horizontal_() and self._repr_fits_vertical_()
)
def __unicode__(self):
"""
Return a string representation for a particular DataFrame
Invoked by unicode(df) in py2 only. Yields a Unicode String in both
py2/py3.
"""
buf = StringIO(u(""))
if self._info_repr():
self.info(buf=buf)
return buf.getvalue()
max_rows = get_option("display.max_rows")
max_cols = get_option("display.max_columns")
show_dimensions = get_option("display.show_dimensions")
if get_option("display.expand_frame_repr"):
width, _ = fmt.get_console_size()
else:
width = None
self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
line_width=width, show_dimensions=show_dimensions)
return buf.getvalue()
def _repr_html_(self):
"""
Return a html representation for a particular DataFrame.
Mainly for IPython notebook.
"""
# qtconsole doesn't report its line width, and also
# behaves badly when outputting an HTML table
# that doesn't fit the window, so disable it.
# XXX: In IPython 3.x and above, the Qt console will not attempt to
# display HTML, so this check can be removed when support for IPython 2.x
# is no longer needed.
if com.in_qtconsole():
# 'HTML output is disabled in QtConsole'
return None
if self._info_repr():
buf = StringIO(u(""))
self.info(buf=buf)
# need to escape the <class>, should be the first line.
val = buf.getvalue().replace('<', r'&lt;', 1).replace('>',
r'&gt;', 1)
return '<pre>' + val + '</pre>'
if get_option("display.notebook_repr_html"):
max_rows = get_option("display.max_rows")
max_cols = get_option("display.max_columns")
show_dimensions = get_option("display.show_dimensions")
return ('<div style="max-height:1000px;'
'max-width:1500px;overflow:auto;">\n' +
self.to_html(max_rows=max_rows, max_cols=max_cols,
show_dimensions=show_dimensions) + '\n</div>')
else:
return None
def iteritems(self):
"""Iterator over (column, series) pairs"""
if self.columns.is_unique and hasattr(self, '_item_cache'):
for k in self.columns:
yield k, self._get_item_cache(k)
else:
for i, k in enumerate(self.columns):
yield k, self.icol(i)
def iterrows(self):
"""
Iterate over rows of DataFrame as (index, Series) pairs.
Notes
-----
* ``iterrows`` does **not** preserve dtypes across the rows (dtypes
are preserved across columns for DataFrames). For example,
>>> df = DataFrame([[1, 1.0]], columns=['x', 'y'])
>>> row = next(df.iterrows())[1]
>>> print(row['x'].dtype)
float64
>>> print(df['x'].dtype)
int64
Returns
-------
it : generator
A generator that iterates over the rows of the frame.
"""
columns = self.columns
for k, v in zip(self.index, self.values):
s = Series(v, index=columns, name=k)
yield k, s
def itertuples(self, index=True):
"""
Iterate over rows of DataFrame as tuples, with index value
as first element of the tuple
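Examples
--------
A minimal sketch with a made-up frame:
>>> df = DataFrame({'a': [1, 2], 'b': ['x', 'y']})
>>> for t in df.itertuples():
...     print(t)                      # (0, 1, 'x') then (1, 2, 'y')
>>> list(df.itertuples(index=False))  # [(1, 'x'), (2, 'y')]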
"""
arrays = []
if index:
arrays.append(self.index)
# use integer indexing because of possible duplicate column names
arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
return zip(*arrays)
if compat.PY3: # pragma: no cover
items = iteritems
def __len__(self):
"""Returns length of info axis, but here we use the index """
return len(self.index)
def dot(self, other):
"""
Matrix multiplication with DataFrame or Series objects
Parameters
----------
other : DataFrame or Series
Returns
-------
dot_product : DataFrame or Series
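Examples
--------
A minimal sketch; the small frames below are made up for illustration:
>>> a = DataFrame([[1, 2], [3, 4]], columns=['x', 'y'])
>>> b = DataFrame([[5, 6], [7, 8]], index=['x', 'y'])
>>> a.dot(b)    # 2x2 DataFrame, aligned on a's columns and b's index
>>> s = Series([1, 1], index=['x', 'y'])
>>> a.dot(s)    # Series indexed like a's rows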
"""
if isinstance(other, (Series, DataFrame)):
common = self.columns.union(other.index)
if (len(common) > len(self.columns) or
len(common) > len(other.index)):
raise ValueError('matrices are not aligned')
left = self.reindex(columns=common, copy=False)
right = other.reindex(index=common, copy=False)
lvals = left.values
rvals = right.values
else:
left = self
lvals = self.values
rvals = np.asarray(other)
if lvals.shape[1] != rvals.shape[0]:
raise ValueError('Dot product shape mismatch, %s vs %s' %
(lvals.shape, rvals.shape))
if isinstance(other, DataFrame):
return self._constructor(np.dot(lvals, rvals),
index=left.index,
columns=other.columns)
elif isinstance(other, Series):
return Series(np.dot(lvals, rvals), index=left.index)
elif isinstance(rvals, (np.ndarray, Index)):
result = np.dot(lvals, rvals)
if result.ndim == 2:
return self._constructor(result, index=left.index)
else:
return Series(result, index=left.index)
else: # pragma: no cover
raise TypeError('unsupported type: %s' % type(other))
#----------------------------------------------------------------------
# IO methods (to / from other formats)
@classmethod
def from_dict(cls, data, orient='columns', dtype=None):
"""
Construct DataFrame from dict of array-like or dicts
Parameters
----------
data : dict
{field : array-like} or {field : dict}
orient : {'columns', 'index'}, default 'columns'
The "orientation" of the data. If the keys of the passed dict
should be the columns of the resulting DataFrame, pass 'columns'
(default). Otherwise if the keys should be rows, pass 'index'.
dtype : dtype, default None
Data type to force, otherwise infer
Returns
-------
DataFrame
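Examples
--------
An illustrative sketch; the dict below is made up for demonstration:
>>> data = {'row1': {'a': 1, 'b': 2}, 'row2': {'a': 3, 'b': 4}}
>>> DataFrame.from_dict(data, orient='index')    # index 'row1'/'row2', columns 'a'/'b'
>>> DataFrame.from_dict(data, orient='columns')  # columns 'row1'/'row2', index 'a'/'b'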
"""
index, columns = None, None
orient = orient.lower()
if orient == 'index':
if len(data) > 0:
# TODO speed up Series case
if isinstance(list(data.values())[0], (Series, dict)):
data = _from_nested_dict(data)
else:
data, index = list(data.values()), list(data.keys())
elif orient != 'columns': # pragma: no cover
raise ValueError('only recognize index or columns for orient')
return cls(data, index=index, columns=columns, dtype=dtype)
@deprecate_kwarg(old_arg_name='outtype', new_arg_name='orient')
def to_dict(self, orient='dict'):
"""Convert DataFrame to dictionary.
Parameters
----------
orient : str {'dict', 'list', 'series', 'split', 'records'}
Determines the type of the values of the dictionary.
- dict (default) : dict like {column -> {index -> value}}
- list : dict like {column -> [values]}
- series : dict like {column -> Series(values)}
- split : dict like
{index -> [index], columns -> [columns], data -> [values]}
- records : list like
[{column -> value}, ... , {column -> value}]
Abbreviations are allowed. `s` indicates `series` and `sp`
indicates `split`.
Returns
-------
result : dict like {column -> {index -> value}}
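Examples
--------
A short sketch using a made-up frame; the comments describe the shape of
the result:
>>> df = DataFrame({'a': [1, 2], 'b': [0.5, 0.75]})
>>> df.to_dict()                   # {column -> {index -> value}}
>>> df.to_dict(orient='records')   # [{column -> value}, ...], one dict per row
>>> df.to_dict(orient='split')     # {'index': [...], 'columns': [...], 'data': [...]}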
"""
if not self.columns.is_unique:
warnings.warn("DataFrame columns are not unique, some "
"columns will be omitted.", UserWarning)
if orient.lower().startswith('d'):
return dict((k, v.to_dict()) for k, v in compat.iteritems(self))
elif orient.lower().startswith('l'):
return dict((k, v.tolist()) for k, v in compat.iteritems(self))
elif orient.lower().startswith('sp'):
return {'index': self.index.tolist(),
'columns': self.columns.tolist(),
'data': self.values.tolist()}
elif orient.lower().startswith('s'):
return dict((k, v) for k, v in compat.iteritems(self))
elif orient.lower().startswith('r'):
return [dict((k, v) for k, v in zip(self.columns, row))
for row in self.values]
else:
raise ValueError("orient '%s' not understood" % orient)
def to_gbq(self, destination_table, project_id=None, chunksize=10000,
verbose=True, reauth=False):
"""Write a DataFrame to a Google BigQuery table.
THIS IS AN EXPERIMENTAL LIBRARY
If the table exists, the dataframe will be written to the table using
the defined table schema and column types. For simplicity, this method
uses the Google BigQuery streaming API. The to_gbq method chunks data
into a default chunk size of 10,000. Failures return the complete error
response which can be quite long depending on the size of the insert.
There are several important limitations of the Google streaming API
which are detailed at:
https://developers.google.com/bigquery/streaming-data-into-bigquery.
Parameters
----------
dataframe : DataFrame
DataFrame to be written
destination_table : string
Name of table to be written, in the form 'dataset.tablename'
project_id : str
Google BigQuery Account project ID.
chunksize : int (default 10000)
Number of rows to be inserted in each chunk from the dataframe.
verbose : boolean (default True)
Show percentage complete
reauth : boolean (default False)
Force Google BigQuery to reauthenticate the user. This is useful
if multiple accounts are used.
"""
from pandas.io import gbq
return gbq.to_gbq(self, destination_table, project_id=project_id,
chunksize=chunksize, verbose=verbose,
reauth=reauth)
@classmethod
def from_records(cls, data, index=None, exclude=None, columns=None,
coerce_float=False, nrows=None):
"""
Convert structured or record ndarray to DataFrame
Parameters
----------
data : ndarray (structured dtype), list of tuples, dict, or DataFrame
index : string, list of fields, array-like
Field of array to use as the index, alternately a specific set of
input labels to use
exclude : sequence, default None
Columns or fields to exclude
columns : sequence, default None
Column names to use. If the passed data do not have names
associated with them, this argument provides names for the
columns. Otherwise this argument indicates the order of the columns
in the result (any names not found in the data will become all-NA
columns)
coerce_float : boolean, default False
Attempt to convert values of non-string, non-numeric objects (like
decimal.Decimal) to floating point, useful for SQL result sets
Returns
-------
df : DataFrame
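Examples
--------
A minimal sketch; the records below are made up for demonstration:
>>> recs = [(1, 'a'), (2, 'b'), (3, 'c')]
>>> DataFrame.from_records(recs, columns=['num', 'letter'])
>>> arr = np.array([(1, 'a'), (2, 'b')],
...                dtype=[('num', 'i4'), ('letter', 'S1')])
>>> DataFrame.from_records(arr, index='num')   # 'num' becomes the index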
"""
# Make a copy of the input columns so we can modify it
if columns is not None:
columns = _ensure_index(columns)
if com.is_iterator(data):
if nrows == 0:
return cls()
try:
first_row = next(data)
except StopIteration:
return cls(index=index, columns=columns)
dtype = None
if hasattr(first_row, 'dtype') and first_row.dtype.names:
dtype = first_row.dtype
values = [first_row]
if nrows is None:
values += data
else:
values.extend(itertools.islice(data, nrows - 1))
if dtype is not None:
data = np.array(values, dtype=dtype)
else:
data = values
if isinstance(data, dict):
if columns is None:
columns = arr_columns = _ensure_index(sorted(data))
arrays = [data[k] for k in columns]
else:
arrays = []
arr_columns = []
for k, v in compat.iteritems(data):
if k in columns:
arr_columns.append(k)
arrays.append(v)
arrays, arr_columns = _reorder_arrays(arrays, arr_columns,
columns)
elif isinstance(data, (np.ndarray, DataFrame)):
arrays, columns = _to_arrays(data, columns)
if columns is not None:
columns = _ensure_index(columns)
arr_columns = columns
else:
arrays, arr_columns = _to_arrays(data, columns,
coerce_float=coerce_float)
arr_columns = _ensure_index(arr_columns)
if columns is not None:
columns = _ensure_index(columns)
else:
columns = arr_columns
if exclude is None:
exclude = set()
else:
exclude = set(exclude)
result_index = None
if index is not None:
if (isinstance(index, compat.string_types) or
not hasattr(index, "__iter__")):
i = columns.get_loc(index)
exclude.add(index)
if len(arrays) > 0:
result_index = Index(arrays[i], name=index)
else:
result_index = Index([], name=index)
else:
try:
to_remove = [arr_columns.get_loc(field) for field in index]
result_index = MultiIndex.from_arrays(
[arrays[i] for i in to_remove], names=index)
exclude.update(index)
except Exception:
result_index = index
if any(exclude):
arr_exclude = [x for x in exclude if x in arr_columns]
to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
arr_columns = arr_columns.drop(arr_exclude)
columns = columns.drop(exclude)
mgr = _arrays_to_mgr(arrays, arr_columns, result_index,
columns)
return cls(mgr)
def to_records(self, index=True, convert_datetime64=True):
"""
Convert DataFrame to record array. Index will be put in the
'index' field of the record array if requested
Parameters
----------
index : boolean, default True
Include index in resulting record array, stored in 'index' field
convert_datetime64 : boolean, default True
Whether to convert the index to datetime.datetime if it is a
DatetimeIndex
Returns
-------
y : recarray
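Examples
--------
A small sketch with a made-up frame:
>>> df = DataFrame({'a': [1, 2]}, index=['x', 'y'])
>>> df.to_records()            # recarray with 'index' and 'a' fields
>>> df.to_records(index=False) # recarray with only the 'a' field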
"""
if index:
if com.is_datetime64_dtype(self.index) and convert_datetime64:
ix_vals = [self.index.to_pydatetime()]
else:
if isinstance(self.index, MultiIndex):
# array of tuples to numpy cols. copy copy copy
ix_vals = lmap(np.array, zip(*self.index.values))
else:
ix_vals = [self.index.values]
arrays = ix_vals + [self[c].get_values() for c in self.columns]
count = 0
index_names = list(self.index.names)
if isinstance(self.index, MultiIndex):
for i, n in enumerate(index_names):
if n is None:
index_names[i] = 'level_%d' % count
count += 1
elif index_names[0] is None:
index_names = ['index']
names = index_names + lmap(str, self.columns)
else:
arrays = [self[c].get_values() for c in self.columns]
names = lmap(str, self.columns)
dtype = np.dtype([(x, v.dtype) for x, v in zip(names, arrays)])
return np.rec.fromarrays(arrays, dtype=dtype, names=names)
@classmethod
def from_items(cls, items, columns=None, orient='columns'):
"""
Convert (key, value) pairs to DataFrame. The keys will be the axis
index (usually the columns, but depends on the specified
orientation). The values should be arrays or Series.
Parameters
----------
items : sequence of (key, value) pairs
Values should be arrays or Series.
columns : sequence of column labels, optional
Must be passed if orient='index'.
orient : {'columns', 'index'}, default 'columns'
The "orientation" of the data. If the keys of the
input correspond to column labels, pass 'columns'
(default). Otherwise if the keys correspond to the index,
pass 'index'.
Returns
-------
frame : DataFrame
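Examples
--------
An illustrative sketch; the pairs below are made up:
>>> items = [('a', [1, 2, 3]), ('b', [4, 5, 6])]
>>> DataFrame.from_items(items)    # 'a' and 'b' become the columns
>>> DataFrame.from_items(items, columns=['x', 'y', 'z'],
...                      orient='index')    # keys become the index labels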
"""
keys, values = lzip(*items)
if orient == 'columns':
if columns is not None:
columns = _ensure_index(columns)
idict = dict(items)
if len(idict) < len(items):
if not columns.equals(_ensure_index(keys)):
raise ValueError('With non-unique item names, passed '
'columns must be identical')
arrays = values
else:
arrays = [idict[k] for k in columns if k in idict]
else:
columns = _ensure_index(keys)
arrays = values
return cls._from_arrays(arrays, columns, None)
elif orient == 'index':
if columns is None:
raise TypeError("Must pass columns with orient='index'")
keys = _ensure_index(keys)
arr = np.array(values, dtype=object).T
data = [lib.maybe_convert_objects(v) for v in arr]
return cls._from_arrays(data, columns, keys)
else: # pragma: no cover
raise ValueError("'orient' must be either 'columns' or 'index'")
@classmethod
def _from_arrays(cls, arrays, columns, index, dtype=None):
mgr = _arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
return cls(mgr)
@classmethod
def from_csv(cls, path, header=0, sep=',', index_col=0,
parse_dates=True, encoding=None, tupleize_cols=False,
infer_datetime_format=False):
"""
Read delimited file into DataFrame
Parameters
----------
path : string file path or file handle / StringIO
header : int, default 0
Row to use as header (skip prior rows)
sep : string, default ','
Field delimiter
index_col : int or sequence, default 0
Column to use for index. If a sequence is given, a MultiIndex
is used. Different default from read_table
parse_dates : boolean, default True
Parse dates. Different default from read_table
tupleize_cols : boolean, default False
write MultiIndex columns as a list of tuples (if True) or in the new,
expanded format (if False)
infer_datetime_format: boolean, default False
If True and `parse_dates` is True for a column, try to infer the
datetime format based on the first datetime string. If the format
can be inferred, there often will be a large parsing speed-up.
Notes
-----
It is preferable to use read_table for most general purposes, but from_csv
makes for an easy roundtrip to and from a file, especially with a
DataFrame of time series data
Returns
-------
y : DataFrame
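Examples
--------
A round-trip sketch; 'data.csv' is a hypothetical path:
>>> df = DataFrame({'value': [1, 2]}, index=['2000-01-01', '2000-01-02'])
>>> df.to_csv('data.csv')
>>> DataFrame.from_csv('data.csv')   # index and dates are parsed back by default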
"""
from pandas.io.parsers import read_table
return read_table(path, header=header, sep=sep,
parse_dates=parse_dates, index_col=index_col,
encoding=encoding, tupleize_cols=tupleize_cols,
infer_datetime_format=infer_datetime_format)
def to_sparse(self, fill_value=None, kind='block'):
"""
Convert to SparseDataFrame
Parameters
----------
fill_value : float, default NaN
kind : {'block', 'integer'}
Returns
-------
y : SparseDataFrame
"""
from pandas.core.sparse import SparseDataFrame
return SparseDataFrame(self._series, index=self.index,
default_kind=kind,
default_fill_value=fill_value)
def to_panel(self):
"""
Transform long (stacked) format (DataFrame) into wide (3D, Panel)
format.
Currently the index of the DataFrame must be a 2-level MultiIndex. This
may be generalized later
Returns
-------
panel : Panel
"""
# only support this kind for now
if (not isinstance(self.index, MultiIndex) or # pragma: no cover
len(self.index.levels) != 2):
raise NotImplementedError('Only 2-level MultiIndex are supported.')
if not self.index.is_unique:
raise ValueError("Can't convert non-uniquely indexed "
"DataFrame to Panel")
self._consolidate_inplace()
# minor axis must be sorted
if self.index.lexsort_depth < 2:
selfsorted = self.sortlevel(0)
else:
selfsorted = self
major_axis, minor_axis = selfsorted.index.levels
major_labels, minor_labels = selfsorted.index.labels
shape = len(major_axis), len(minor_axis)
# preserve names, if any
major_axis = major_axis.copy()
major_axis.name = self.index.names[0]
minor_axis = minor_axis.copy()
minor_axis.name = self.index.names[1]
# create new axes
new_axes = [selfsorted.columns, major_axis, minor_axis]
# create new manager
new_mgr = selfsorted._data.reshape_nd(axes=new_axes,
labels=[major_labels, minor_labels],
shape=shape,
ref_items=selfsorted.columns)
return self._constructor_expanddim(new_mgr)
to_wide = deprecate('to_wide', to_panel)
def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
columns=None, header=True, index=True, index_label=None,
mode='w', encoding=None, quoting=None,
quotechar='"', line_terminator='\n', chunksize=None,
tupleize_cols=False, date_format=None, doublequote=True,
escapechar=None, decimal='.', **kwds):
r"""Write DataFrame to a comma-separated values (csv) file
Parameters
----------
path_or_buf : string or file handle, default None
File path or object, if None is provided the result is returned as
a string.
sep : character, default ","
Field delimiter for the output file.
na_rep : string, default ''
Missing data representation
float_format : string, default None
Format string for floating point numbers
columns : sequence, optional
Columns to write
header : boolean or list of string, default True
Write out column names. If a list of string is given it is assumed
to be aliases for the column names
index : boolean, default True
Write row names (index)
index_label : string or sequence, or False, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the DataFrame uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R
nanRep : None
deprecated, use na_rep
mode : str
Python write mode, default 'w'
encoding : string, optional
A string representing the encoding to use in the output file,
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
line_terminator : string, default '\\n'
The newline character or character sequence to use in the output
file
quoting : optional constant from csv module
defaults to csv.QUOTE_MINIMAL
quotechar : string (length 1), default '"'
character used to quote fields
doublequote : boolean, default True
Control quoting of `quotechar` inside a field
escapechar : string (length 1), default None
character used to escape `sep` and `quotechar` when appropriate
chunksize : int or None
rows to write at a time
tupleize_cols : boolean, default False
write MultiIndex columns as a list of tuples (if True) or in the new,
expanded format (if False)
date_format : string, default None
Format string for datetime objects
decimal : string, default '.'
Character recognized as decimal separator. E.g. use ',' for European data
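Examples
--------
A brief sketch; 'out.csv' is a hypothetical path and the frame is made up:
>>> df = DataFrame({'a': [1, 2], 'b': ['x', 'y']})
>>> csv_text = df.to_csv()    # no path given, so the CSV is returned as a string
>>> df.to_csv('out.csv', sep=';', index=False, na_rep='NA')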
"""
formatter = fmt.CSVFormatter(self, path_or_buf,
line_terminator=line_terminator,
sep=sep, encoding=encoding,
quoting=quoting, na_rep=na_rep,
float_format=float_format, cols=columns,
header=header, index=index,
index_label=index_label, mode=mode,
chunksize=chunksize, quotechar=quotechar,
engine=kwds.get("engine"),
tupleize_cols=tupleize_cols,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar,
decimal=decimal)
formatter.save()
if path_or_buf is None:
return formatter.path_or_buf.getvalue()
def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
float_format=None, columns=None, header=True, index=True,
index_label=None, startrow=0, startcol=0, engine=None,
merge_cells=True, encoding=None, inf_rep='inf'):
"""
Write DataFrame to an Excel sheet
Parameters
----------
excel_writer : string or ExcelWriter object
File path or existing ExcelWriter
sheet_name : string, default 'Sheet1'
Name of sheet which will contain DataFrame
na_rep : string, default ''
Missing data representation
float_format : string, default None
Format string for floating point numbers
columns : sequence, optional
Columns to write
header : boolean or list of string, default True
Write out column names. If a list of string is given it is
assumed to be aliases for the column names
index : boolean, default True
Write row names (index)
index_label : string or sequence, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the DataFrame uses MultiIndex.
startrow : int, default 0
upper left cell row to dump data frame
startcol : int, default 0
upper left cell column to dump data frame
engine : string, default None
write engine to use - you can also set this via the options
``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
``io.excel.xlsm.writer``.
merge_cells : boolean, default True
Write MultiIndex and Hierarchical Rows as merged cells.
encoding : string, default None
encoding of the resulting excel file. Only necessary for xlwt,
other writers support unicode natively.
inf_rep : string, default 'inf'
Representation for infinity (there is no native representation for
infinity in Excel)
Notes
-----
If passing an existing ExcelWriter object, then the sheet will be added
to the existing workbook. This can be used to save different
DataFrames to one workbook:
>>> writer = ExcelWriter('output.xlsx')
>>> df1.to_excel(writer,'Sheet1')
>>> df2.to_excel(writer,'Sheet2')
>>> writer.save()
"""
from pandas.io.excel import ExcelWriter
if self.columns.nlevels > 1:
raise NotImplementedError("Writing as Excel with a MultiIndex is "
"not yet implemented.")
need_save = False
if encoding is None:
encoding = 'ascii'
if isinstance(excel_writer, compat.string_types):
excel_writer = ExcelWriter(excel_writer, engine=engine)
need_save = True
formatter = fmt.ExcelFormatter(self,
na_rep=na_rep,
cols=columns,
header=header,
float_format=float_format,
index=index,
index_label=index_label,
merge_cells=merge_cells,
inf_rep=inf_rep)
formatted_cells = formatter.get_formatted_cells()
excel_writer.write_cells(formatted_cells, sheet_name,
startrow=startrow, startcol=startcol)
if need_save:
excel_writer.save()
def to_stata(
self, fname, convert_dates=None, write_index=True, encoding="latin-1",
byteorder=None, time_stamp=None, data_label=None):
"""
Write the DataFrame to a Stata binary dta file
Parameters
----------
fname : file path or buffer
Where to save the dta file.
convert_dates : dict
Dictionary mapping column of datetime types to the stata internal
format that you want to use for the dates. Options are
'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
number or a name.
encoding : str
Default is latin-1. Note that Stata does not support unicode.
byteorder : str
Can be ">", "<", "little", or "big". The default is None which uses
`sys.byteorder`
Examples
--------
>>> writer = StataWriter('./data_file.dta', data)
>>> writer.write_file()
Or with dates
>>> writer = StataWriter('./date_data_file.dta', data, {2 : 'tw'})
>>> writer.write_file()
"""
from pandas.io.stata import StataWriter
writer = StataWriter(fname, self, convert_dates=convert_dates,
encoding=encoding, byteorder=byteorder,
time_stamp=time_stamp, data_label=data_label,
write_index=write_index)
writer.write_file()
@Appender(fmt.docstring_to_string, indents=1)
def to_string(self, buf=None, columns=None, col_space=None, colSpace=None,
header=True, index=True, na_rep='NaN', formatters=None,
float_format=None, sparsify=None, index_names=True,
justify=None, line_width=None, max_rows=None, max_cols=None,
show_dimensions=False):
"""
Render a DataFrame to a console-friendly tabular output.
"""
if colSpace is not None: # pragma: no cover
warnings.warn("colSpace is deprecated, use col_space",
FutureWarning)
col_space = colSpace
formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
col_space=col_space, na_rep=na_rep,
formatters=formatters,
float_format=float_format,
sparsify=sparsify,
justify=justify,
index_names=index_names,
header=header, index=index,
line_width=line_width,
max_rows=max_rows,
max_cols=max_cols,
show_dimensions=show_dimensions)
formatter.to_string()
if buf is None:
result = formatter.buf.getvalue()
return result
@Appender(fmt.docstring_to_string, indents=1)
def to_html(self, buf=None, columns=None, col_space=None, colSpace=None,
header=True, index=True, na_rep='NaN', formatters=None,
float_format=None, sparsify=None, index_names=True,
justify=None, bold_rows=True, classes=None, escape=True,
max_rows=None, max_cols=None, show_dimensions=False):
"""
Render a DataFrame as an HTML table.
`to_html`-specific options:
bold_rows : boolean, default True
Make the row labels bold in the output
classes : str or list or tuple, default None
CSS class(es) to apply to the resulting html table
escape : boolean, default True
Convert the characters <, >, and & to HTML-safe sequences.
max_rows : int, optional
Maximum number of rows to show before truncating. If None, show
all.
max_cols : int, optional
Maximum number of columns to show before truncating. If None, show
all.
"""
if colSpace is not None: # pragma: no cover
warnings.warn("colSpace is deprecated, use col_space",
FutureWarning)
col_space = colSpace
formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
col_space=col_space, na_rep=na_rep,
formatters=formatters,
float_format=float_format,
sparsify=sparsify,
justify=justify,
index_names=index_names,
header=header, index=index,
bold_rows=bold_rows,
escape=escape,
max_rows=max_rows,
max_cols=max_cols,
show_dimensions=show_dimensions)
formatter.to_html(classes=classes)
if buf is None:
return formatter.buf.getvalue()
@Appender(fmt.docstring_to_string, indents=1)
def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
header=True, index=True, na_rep='NaN', formatters=None,
float_format=None, sparsify=None, index_names=True,
bold_rows=True, longtable=False, escape=True):
"""
Render a DataFrame to a tabular environment table. You can splice
this into a LaTeX document. Requires \\usepackage{booktabs}.
`to_latex`-specific options:
bold_rows : boolean, default True
Make the row labels bold in the output
longtable : boolean, default False
Use a longtable environment instead of tabular. Requires adding
a \\usepackage{longtable} to your LaTeX preamble.
escape : boolean, default True
When set to False, prevents escaping of LaTeX special
characters in column names.
"""
if colSpace is not None: # pragma: no cover
warnings.warn("colSpace is deprecated, use col_space",
FutureWarning)
col_space = colSpace
formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
col_space=col_space, na_rep=na_rep,
header=header, index=index,
formatters=formatters,
float_format=float_format,
bold_rows=bold_rows,
sparsify=sparsify,
index_names=index_names,
escape=escape)
formatter.to_latex(longtable=longtable)
if buf is None:
return formatter.buf.getvalue()
def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None):
"""
Concise summary of a DataFrame.
Parameters
----------
verbose : {None, True, False}, optional
Whether to print the full summary.
None follows the `display.max_info_columns` setting.
True or False overrides the `display.max_info_columns` setting.
buf : writable buffer, defaults to sys.stdout
max_cols : int, default None
Determines whether full summary or short summary is printed.
None follows the `display.max_info_columns` setting.
memory_usage : boolean, default None
Specifies whether total memory usage of the DataFrame
elements (including index) should be displayed. None follows
the `display.memory_usage` setting. True or False overrides
the `display.memory_usage` setting. Memory usage is shown in
human-readable units (base-2 representation).
null_counts : boolean, default None
Whether to show the non-null counts
If None, then only show if the frame is smaller than max_info_rows and max_info_columns.
If True, always show counts.
If False, never show counts.
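Examples
--------
A short sketch with a made-up frame; output goes to stdout (or to `buf`
if one is given):
>>> df = DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
>>> df.info()                # per-column dtypes, non-null counts, memory usage
>>> df.info(verbose=False)   # condensed summary of the columns only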
"""
from pandas.core.format import _put_lines
if buf is None: # pragma: no cover
buf = sys.stdout
lines = []
lines.append(str(type(self)))
lines.append(self.index.summary())
if len(self.columns) == 0:
lines.append('Empty %s' % type(self).__name__)
_put_lines(buf, lines)
return
cols = self.columns
# hack
if max_cols is None:
max_cols = get_option(
'display.max_info_columns', len(self.columns) + 1)
max_rows = get_option('display.max_info_rows', len(self) + 1)
if null_counts is None:
show_counts = ((len(self.columns) <= max_cols) and
(len(self) < max_rows))
else:
show_counts = null_counts
exceeds_info_cols = len(self.columns) > max_cols
def _verbose_repr():
lines.append('Data columns (total %d columns):' %
len(self.columns))
space = max([len(com.pprint_thing(k)) for k in self.columns]) + 4
counts = None
tmpl = "%s%s"
if show_counts:
counts = self.count()
if len(cols) != len(counts): # pragma: no cover
raise AssertionError('Columns must equal counts (%d != %d)' %
(len(cols), len(counts)))
tmpl = "%s non-null %s"
dtypes = self.dtypes
for i, col in enumerate(self.columns):
dtype = dtypes[col]
col = com.pprint_thing(col)
count = ""
if show_counts:
count = counts.iloc[i]
lines.append(_put_str(col, space) +
tmpl % (count, dtype))
def _non_verbose_repr():
lines.append(self.columns.summary(name='Columns'))
def _sizeof_fmt(num, size_qualifier):
# returns size in human readable format
for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
if num < 1024.0:
return "%3.1f%s %s" % (num, size_qualifier, x)
num /= 1024.0
return "%3.1f%s %s" % (num, size_qualifier, 'PB')
if verbose:
_verbose_repr()
elif verbose is False: # specifically set to False, not just None
_non_verbose_repr()
else:
if exceeds_info_cols:
_non_verbose_repr()
else:
_verbose_repr()
counts = self.get_dtype_counts()
dtypes = ['%s(%d)' % k for k in sorted(compat.iteritems(counts))]
lines.append('dtypes: %s' % ', '.join(dtypes))
if memory_usage is None:
memory_usage = get_option('display.memory_usage')
if memory_usage: # append memory usage of df to display
# size_qualifier is just a best effort; not guaranteed to catch all
# cases (e.g., it misses categorical data even with object
# categories)
size_qualifier = ('+' if 'object' in counts
or is_object_dtype(self.index) else '')
mem_usage = self.memory_usage(index=True).sum()
lines.append("memory usage: %s\n" %
_sizeof_fmt(mem_usage, size_qualifier))
_put_lines(buf, lines)
def memory_usage(self, index=False):
"""Memory usage of DataFrame columns.
Parameters
----------
index : bool
Specifies whether to include memory usage of the DataFrame's
index in the returned Series. If `index=True` (default is False),
the first entry of the returned Series is labelled `Index` and holds
the memory usage of the index.
Returns
-------
sizes : Series
A series with column names as index and memory usage of
columns with units of bytes.
Notes
-----
Memory usage does not include memory consumed by elements that
are not components of the array.
See Also
--------
numpy.ndarray.nbytes
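Examples
--------
A minimal sketch with a made-up frame:
>>> df = DataFrame({'a': np.arange(3), 'b': np.ones(3)})
>>> df.memory_usage()             # bytes per column, indexed by column name
>>> df.memory_usage(index=True)   # prepends an 'Index' entry for the index itself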
"""
result = Series([ c.values.nbytes for col, c in self.iteritems() ],
index=self.columns)
if index:
result = Series(self.index.nbytes,
index=['Index']).append(result)
return result
def transpose(self):
"""Transpose index and columns"""
return super(DataFrame, self).transpose(1, 0)
T = property(transpose)
#----------------------------------------------------------------------
# Picklability
# legacy pickle formats
def _unpickle_frame_compat(self, state): # pragma: no cover
from pandas.core.common import _unpickle_array
if len(state) == 2: # pragma: no cover
series, idx = state
columns = sorted(series)
else:
series, cols, idx = state
columns = _unpickle_array(cols)
index = _unpickle_array(idx)
self._data = self._init_dict(series, index, columns, None)
def _unpickle_matrix_compat(self, state): # pragma: no cover
from pandas.core.common import _unpickle_array
# old unpickling
(vals, idx, cols), object_state = state
index = _unpickle_array(idx)
dm = DataFrame(vals, index=index, columns=_unpickle_array(cols),
copy=False)
if object_state is not None:
ovals, _, ocols = object_state
objects = DataFrame(ovals, index=index,
columns=_unpickle_array(ocols),
copy=False)
dm = dm.join(objects)
self._data = dm._data
#----------------------------------------------------------------------
# Getting and setting elements
def get_value(self, index, col, takeable=False):
"""
Quickly retrieve single value at passed column and index
Parameters
----------
index : row label
col : column label
takeable : interpret the index/col as indexers, default False
Returns
-------
value : scalar value
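Examples
--------
A minimal sketch with a made-up frame:
>>> df = DataFrame({'a': [1, 2]}, index=['x', 'y'])
>>> df.get_value('y', 'a')               # 2
>>> df.get_value(1, 0, takeable=True)    # same element, by position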
"""
if takeable:
series = self._iget_item_cache(col)
return _maybe_box_datetimelike(series.values[index])
series = self._get_item_cache(col)
engine = self.index._engine
return engine.get_value(series.get_values(), index)
def set_value(self, index, col, value, takeable=False):
"""
Put single value at passed column and index
Parameters
----------
index : row label
col : column label
value : scalar value
takeable : interpret the index/col as indexers, default False
Returns
-------
frame : DataFrame
If label pair is contained, will be reference to calling DataFrame,
otherwise a new object
"""
try:
if takeable is True:
series = self._iget_item_cache(col)
return series.set_value(index, value, takeable=True)
series = self._get_item_cache(col)
engine = self.index._engine
engine.set_value(series.values, index, value)
return self
except (KeyError, TypeError):
# set using a non-recursive method & reset the cache
self.loc[index, col] = value
self._item_cache.pop(col, None)
return self
def irow(self, i, copy=False):
return self._ixs(i, axis=0)
def icol(self, i):
return self._ixs(i, axis=1)
def _ixs(self, i, axis=0):
"""
i : int, slice, or sequence of integers
axis : int
"""
# irow
if axis == 0:
"""
Notes
-----
If slice passed, the resulting data will be a view
"""
if isinstance(i, slice):
return self[i]
else:
label = self.index[i]
if isinstance(label, Index):
# a location index by definition
result = self.take(i, axis=axis)
copy=True
else:
new_values = self._data.fast_xs(i)
# if we are a copy, mark as such
copy = isinstance(new_values, np.ndarray) and new_values.base is None
result = Series(new_values, index=self.columns,
name=self.index[i], dtype=new_values.dtype)
result._set_is_copy(self, copy=copy)
return result
# icol
else:
"""
Notes
-----
If slice passed, the resulting data will be a view
"""
label = self.columns[i]
if isinstance(i, slice):
# need to return view
lab_slice = slice(label[0], label[-1])
return self.ix[:, lab_slice]
else:
if isinstance(label, Index):
return self.take(i, axis=1, convert=True)
index_len = len(self.index)
# if the values returned are not the same length
# as the index (iow a not found value), iget returns
# a 0-len ndarray. This is effectively catching
# a numpy error (as numpy should really raise)
values = self._data.iget(i)
if index_len and not len(values):
values = np.array([np.nan] * index_len, dtype=object)
result = self._constructor_sliced.from_array(
values, index=self.index,
name=label, fastpath=True)
# this is a cached value, mark it so
result._set_as_cached(label, self)
return result
def iget_value(self, i, j):
return self.iat[i, j]
def __getitem__(self, key):
# shortcut if we are an actual column
is_mi_columns = isinstance(self.columns, MultiIndex)
try:
if key in self.columns and not is_mi_columns:
return self._getitem_column(key)
except:
pass
# see if we can slice the rows
indexer = convert_to_index_sliceable(self, key)
if indexer is not None:
return self._getitem_slice(indexer)
if isinstance(key, (Series, np.ndarray, Index, list)):
# either boolean or fancy integer index
return self._getitem_array(key)
elif isinstance(key, DataFrame):
return self._getitem_frame(key)
elif is_mi_columns:
return self._getitem_multilevel(key)
else:
return self._getitem_column(key)
def _getitem_column(self, key):
""" return the actual column """
# get column
if self.columns.is_unique:
return self._get_item_cache(key)
# duplicate columns & possibly reduce dimensionality
result = self._constructor(self._data.get(key))
if result.columns.is_unique:
result = result[key]
return result
def _getitem_slice(self, key):
return self._slice(key, axis=0)
def _getitem_array(self, key):
# also raises Exception if object array with NA values
if com.is_bool_indexer(key):
# warning here just in case -- previously __setitem__ was
# reindexing but __getitem__ was not; it seems more reasonable to
# go with the __setitem__ behavior since that is more consistent
# with all other indexing behavior
if isinstance(key, Series) and not key.index.equals(self.index):
warnings.warn("Boolean Series key will be reindexed to match "
"DataFrame index.", UserWarning)
elif len(key) != len(self.index):
raise ValueError('Item wrong length %d instead of %d.' %
(len(key), len(self.index)))
# check_bool_indexer will throw exception if Series key cannot
# be reindexed to match DataFrame rows
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
return self.take(indexer, axis=0, convert=False)
else:
indexer = self.ix._convert_to_indexer(key, axis=1)
return self.take(indexer, axis=1, convert=True)
def _getitem_multilevel(self, key):
loc = self.columns.get_loc(key)
if isinstance(loc, (slice, Series, np.ndarray, Index)):
new_columns = self.columns[loc]
result_columns = maybe_droplevels(new_columns, key)
if self._is_mixed_type:
result = self.reindex(columns=new_columns)
result.columns = result_columns
else:
new_values = self.values[:, loc]
result = self._constructor(new_values, index=self.index,
columns=result_columns).__finalize__(self)
if len(result.columns) == 1:
top = result.columns[0]
if ((type(top) == str and top == '') or
(type(top) == tuple and top[0] == '')):
result = result['']
if isinstance(result, Series):
result = self._constructor_sliced(result, index=self.index, name=key)
result._set_is_copy(self)
return result
else:
return self._get_item_cache(key)
def _getitem_frame(self, key):
if key.values.dtype != np.bool_:
raise ValueError('Must pass DataFrame with boolean values only')
return self.where(key)
def query(self, expr, **kwargs):
"""Query the columns of a frame with a boolean expression.
.. versionadded:: 0.13
Parameters
----------
expr : string
The query string to evaluate. You can refer to variables
in the environment by prefixing them with an '@' character like
``@a + b``.
kwargs : dict
See the documentation for :func:`pandas.eval` for complete details
on the keyword arguments accepted by :meth:`DataFrame.query`.
Returns
-------
q : DataFrame
Notes
-----
The result of the evaluation of this expression is first passed to
:attr:`DataFrame.loc` and if that fails because of a
multidimensional key (e.g., a DataFrame) then the result will be passed
to :meth:`DataFrame.__getitem__`.
This method uses the top-level :func:`pandas.eval` function to
evaluate the passed query.
The :meth:`~pandas.DataFrame.query` method uses a slightly
modified Python syntax by default. For example, the ``&`` and ``|``
(bitwise) operators have the precedence of their boolean cousins,
:keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
however the semantics are different.
You can change the semantics of the expression by passing the keyword
argument ``parser='python'``. This enforces the same semantics as
evaluation in Python space. Likewise, you can pass ``engine='python'``
to evaluate an expression using Python itself as a backend. This is not
recommended as it is inefficient compared to using ``numexpr`` as the
engine.
The :attr:`DataFrame.index` and
:attr:`DataFrame.columns` attributes of the
:class:`~pandas.DataFrame` instance are placed in the query namespace
by default, which allows you to treat both the index and columns of the
frame as a column in the frame.
The identifier ``index`` is used for the frame index; you can also
use the name of the index to identify it in a query.
For further details and examples see the ``query`` documentation in
:ref:`indexing <indexing.query>`.
See Also
--------
pandas.eval
DataFrame.eval
Examples
--------
>>> from numpy.random import randn
>>> from pandas import DataFrame
>>> df = DataFrame(randn(10, 2), columns=list('ab'))
>>> df.query('a > b')
>>> df[df.a > df.b] # same result as the previous expression
"""
kwargs['level'] = kwargs.pop('level', 0) + 1
res = self.eval(expr, **kwargs)
try:
return self.loc[res]
except ValueError:
# when res is multi-dimensional loc raises, but this is sometimes a
# valid query
return self[res]
def eval(self, expr, **kwargs):
"""Evaluate an expression in the context of the calling DataFrame
instance.
Parameters
----------
expr : string
The expression string to evaluate.
kwargs : dict
See the documentation for :func:`~pandas.eval` for complete details
on the keyword arguments accepted by
:meth:`~pandas.DataFrame.query`.
Returns
-------
ret : ndarray, scalar, or pandas object
See Also
--------
pandas.DataFrame.query
pandas.eval
Notes
-----
For more details see the API documentation for :func:`~pandas.eval`.
For detailed examples see :ref:`enhancing performance with eval
<enhancingperf.eval>`.
Examples
--------
>>> from numpy.random import randn
>>> from pandas import DataFrame
>>> df = DataFrame(randn(10, 2), columns=list('ab'))
>>> df.eval('a + b')
>>> df.eval('c = a + b')
"""
resolvers = kwargs.pop('resolvers', None)
kwargs['level'] = kwargs.pop('level', 0) + 1
if resolvers is None:
index_resolvers = self._get_index_resolvers()
resolvers = dict(self.iteritems()), index_resolvers
kwargs['target'] = self
kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers
return _eval(expr, **kwargs)
def select_dtypes(self, include=None, exclude=None):
"""Return a subset of a DataFrame including/excluding columns based on
their ``dtype``.
Parameters
----------
include, exclude : list-like
A list of dtypes or strings to be included/excluded. You must pass
in a non-empty sequence for at least one of these.
Raises
------
ValueError
* If both of ``include`` and ``exclude`` are empty
* If ``include`` and ``exclude`` have overlapping elements
* If any kind of string dtype is passed in.
TypeError
* If either of ``include`` or ``exclude`` is not a sequence
Returns
-------
subset : DataFrame
The subset of the frame including the dtypes in ``include`` and
excluding the dtypes in ``exclude``.
Notes
-----
* To select all *numeric* types use the numpy dtype ``numpy.number``
* To select strings you must use the ``object`` dtype, but note that
this will return *all* object dtype columns
* See the `numpy dtype hierarchy
<http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
* To select Pandas categorical dtypes, use 'category'
Examples
--------
>>> df = pd.DataFrame({'a': np.random.randn(6).astype('f4'),
... 'b': [True, False] * 3,
... 'c': [1.0, 2.0] * 3})
>>> df
a b c
0 0.3962 True 1
1 0.1459 False 2
2 0.2623 True 1
3 0.0764 False 2
4 -0.9703 True 1
5 -1.2094 False 2
>>> df.select_dtypes(include=['float64'])
c
0 1
1 2
2 1
3 2
4 1
5 2
>>> df.select_dtypes(exclude=['floating'])
b
0 True
1 False
2 True
3 False
4 True
5 False
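As a further illustrative call on the same frame (a sketch, output omitted),
boolean columns can be selected with the ``'bool'`` string dtype:
>>> df.select_dtypes(include=['bool'])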
"""
include, exclude = include or (), exclude or ()
if not (com.is_list_like(include) and com.is_list_like(exclude)):
raise TypeError('include and exclude must both be non-string'
' sequences')
selection = tuple(map(frozenset, (include, exclude)))
if not any(selection):
raise ValueError('at least one of include or exclude must be '
'nonempty')
# convert the myriad valid dtypes object to a single representation
include, exclude = map(lambda x:
frozenset(map(com._get_dtype_from_object, x)),
selection)
for dtypes in (include, exclude):
com._invalidate_string_dtypes(dtypes)
# can't both include AND exclude!
if not include.isdisjoint(exclude):
raise ValueError('include and exclude overlap on %s'
% (include & exclude))
# empty include/exclude -> defaults to True
# three cases (we've already raised if both are empty)
# case 1: empty include, nonempty exclude
# we have True, True, ... True for include, same for exclude
# in the loop below we get the excluded
# and when we call '&' below we get only the excluded
# case 2: nonempty include, empty exclude
# same as case 1, but with include
# case 3: both nonempty
# the "union" of the logic of case 1 and case 2:
# we get the included and excluded, and return their logical and
include_these = Series(not bool(include), index=self.columns)
exclude_these = Series(not bool(exclude), index=self.columns)
def is_dtype_instance_mapper(column, dtype):
return column, functools.partial(issubclass, dtype.type)
for column, f in itertools.starmap(is_dtype_instance_mapper,
self.dtypes.iteritems()):
if include: # checks for the case of empty include or exclude
include_these[column] = any(map(f, include))
if exclude:
exclude_these[column] = not any(map(f, exclude))
dtype_indexer = include_these & exclude_these
return self.loc[com._get_info_slice(self, dtype_indexer)]
def _box_item_values(self, key, values):
items = self.columns[self.columns.get_loc(key)]
if values.ndim == 2:
return self._constructor(values.T, columns=items, index=self.index)
else:
return self._box_col_values(values, items)
def _box_col_values(self, values, items):
""" provide boxed values for a column """
return self._constructor_sliced.from_array(values, index=self.index,
name=items, fastpath=True)
def __setitem__(self, key, value):
# see if we can slice the rows
indexer = convert_to_index_sliceable(self, key)
if indexer is not None:
return self._setitem_slice(indexer, value)
if isinstance(key, (Series, np.ndarray, list, Index)):
self._setitem_array(key, value)
elif isinstance(key, DataFrame):
self._setitem_frame(key, value)
else:
# set column
self._set_item(key, value)
def _setitem_slice(self, key, value):
self._check_setitem_copy()
self.ix._setitem_with_indexer(key, value)
def _setitem_array(self, key, value):
# also raises Exception if object array with NA values
if com.is_bool_indexer(key):
if len(key) != len(self.index):
raise ValueError('Item wrong length %d instead of %d!' %
(len(key), len(self.index)))
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
self._check_setitem_copy()
self.ix._setitem_with_indexer(indexer, value)
else:
if isinstance(value, DataFrame):
if len(value.columns) != len(key):
raise ValueError('Columns must be same length as key')
for k1, k2 in zip(key, value.columns):
self[k1] = value[k2]
else:
indexer = self.ix._convert_to_indexer(key, axis=1)
self._check_setitem_copy()
self.ix._setitem_with_indexer((slice(None), indexer), value)
def _setitem_frame(self, key, value):
# support boolean setting with DataFrame input, e.g.
# df[df > df2] = 0
if key.values.dtype != np.bool_:
raise TypeError('Must pass DataFrame with boolean values only')
self._check_inplace_setting(value)
self._check_setitem_copy()
self.where(-key, value, inplace=True)
def _ensure_valid_index(self, value):
"""
ensure that if we don't have an index, we can create one from the
passed value
"""
if not len(self.index):
# GH5632, make sure that we are a Series convertible
if is_list_like(value):
try:
value = Series(value)
except:
pass
if not isinstance(value, Series):
raise ValueError('Cannot set a frame with no defined index '
'and a value that cannot be converted to a '
'Series')
self._data = self._data.reindex_axis(value.index.copy(), axis=1,
fill_value=np.nan)
# we are a scalar
# noop
else:
pass
def _set_item(self, key, value):
"""
Add series to DataFrame in specified column.
If series is a numpy-array (not a Series/TimeSeries), it must be the
same length as the DataFrame's index or an error will be thrown.
Series/TimeSeries will be conformed to the DataFrame's index to
ensure homogeneity.
"""
self._ensure_valid_index(value)
value = self._sanitize_column(key, value)
NDFrame._set_item(self, key, value)
# check if we are modifying a copy
# try to set first as we want an invalid
# value exception to occur first
if len(self):
self._check_setitem_copy()
def insert(self, loc, column, value, allow_duplicates=False):
"""
Insert column into DataFrame at specified location.
If `allow_duplicates` is False, raises Exception if column
is already contained in the DataFrame.
Parameters
----------
loc : int
Must have 0 <= loc <= len(columns)
column : object
value : int, Series, or array-like
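Examples
--------
A minimal sketch (``df`` is assumed to be a small frame that already has
columns 'a' and 'b'):
>>> df.insert(1, 'c', df['a'] * 2)  # place new column 'c' between 'a' and 'b'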
"""
self._ensure_valid_index(value)
value = self._sanitize_column(column, value)
self._data.insert(
loc, column, value, allow_duplicates=allow_duplicates)
def assign(self, **kwargs):
"""
Assign new columns to a DataFrame, returning a new object
(a copy) with all the original columns in addition to the new ones.
.. versionadded:: 0.16.0
Parameters
----------
kwargs : keyword, value pairs
keywords are the column names. If the values are
callable, they are computed on the DataFrame and
assigned to the new columns. If the values are
not callable, (e.g. a Series, scalar, or array),
they are simply assigned.
Returns
-------
df : DataFrame
A new DataFrame with the new columns in addition to
all the existing columns.
Notes
-----
Since ``kwargs`` is a dictionary, the order of your
arguments may not be preserved. To make things predictable,
the columns are inserted in alphabetical order, at the end of
your DataFrame. Assigning multiple columns within the same
``assign`` is possible, but you cannot reference other columns
created within the same ``assign`` call.
Examples
--------
>>> df = DataFrame({'A': range(1, 11), 'B': np.random.randn(10)})
Where the value is a callable, evaluated on `df`:
>>> df.assign(ln_A = lambda x: np.log(x.A))
A B ln_A
0 1 0.426905 0.000000
1 2 -0.780949 0.693147
2 3 -0.418711 1.098612
3 4 -0.269708 1.386294
4 5 -0.274002 1.609438
5 6 -0.500792 1.791759
6 7 1.649697 1.945910
7 8 -1.495604 2.079442
8 9 0.549296 2.197225
9 10 -0.758542 2.302585
Where the value already exists and is inserted:
>>> newcol = np.log(df['A'])
>>> df.assign(ln_A=newcol)
A B ln_A
0 1 0.426905 0.000000
1 2 -0.780949 0.693147
2 3 -0.418711 1.098612
3 4 -0.269708 1.386294
4 5 -0.274002 1.609438
5 6 -0.500792 1.791759
6 7 1.649697 1.945910
7 8 -1.495604 2.079442
8 9 0.549296 2.197225
9 10 -0.758542 2.302585
"""
data = self.copy()
# do all calculations first...
results = {}
for k, v in kwargs.items():
if callable(v):
results[k] = v(data)
else:
results[k] = v
# ... and then assign
for k, v in sorted(results.items()):
data[k] = v
return data
def _sanitize_column(self, key, value):
# Need to make sure new columns (which go into the BlockManager as new
# blocks) are always copied
def reindexer(value):
# reindex if necessary
if value.index.equals(self.index) or not len(self.index):
value = value.values.copy()
else:
# GH 4107
try:
value = value.reindex(self.index).values
except Exception as e:
# duplicate axis
if not value.index.is_unique:
raise e
# other
raise TypeError('incompatible index of inserted column '
'with frame index')
return value
if isinstance(value, Series):
value = reindexer(value)
elif isinstance(value, DataFrame):
# align right-hand-side columns if self.columns
# is multi-index and self[key] is a sub-frame
if isinstance(self.columns, MultiIndex) and key in self.columns:
loc = self.columns.get_loc(key)
if isinstance(loc, (slice, Series, np.ndarray, Index)):
cols = maybe_droplevels(self.columns[loc], key)
if len(cols) and not cols.equals(value.columns):
value = value.reindex_axis(cols, axis=1)
# now align rows
value = reindexer(value).T
elif isinstance(value, Categorical):
value = value.copy()
elif (isinstance(value, Index) or is_sequence(value)):
from pandas.core.series import _sanitize_index
# turn me into an ndarray
value = _sanitize_index(value, self.index, copy=False)
if not isinstance(value, (np.ndarray, Index)):
if isinstance(value, list) and len(value) > 0:
value = com._possibly_convert_platform(value)
else:
value = com._asarray_tuplesafe(value)
elif value.ndim == 2:
value = value.copy().T
else:
value = value.copy()
# possibly infer to datetimelike
if is_object_dtype(value.dtype):
value = _possibly_infer_to_datetimelike(value.ravel()).reshape(value.shape)
else:
# upcast the scalar
dtype, value = _infer_dtype_from_scalar(value)
value = np.repeat(value, len(self.index)).astype(dtype)
value = com._possibly_cast_to_datetime(value, dtype)
# return unconsolidatables directly
if isinstance(value, (Categorical, SparseArray)):
return value
# broadcast across multiple columns if necessary
if key in self.columns and value.ndim == 1:
if not self.columns.is_unique or isinstance(self.columns,
MultiIndex):
existing_piece = self[key]
if isinstance(existing_piece, DataFrame):
value = np.tile(value, (len(existing_piece.columns), 1))
return np.atleast_2d(np.asarray(value))
@property
def _series(self):
result = {}
for idx, item in enumerate(self.columns):
result[item] = Series(self._data.iget(idx), index=self.index,
name=item)
return result
def lookup(self, row_labels, col_labels):
"""Label-based "fancy indexing" function for DataFrame.
Given equal-length arrays of row and column labels, return an
array of the values corresponding to each (row, col) pair.
Parameters
----------
row_labels : sequence
The row labels to use for lookup
col_labels : sequence
The column labels to use for lookup
Notes
-----
Akin to::
result = []
for row, col in zip(row_labels, col_labels):
result.append(df.get_value(row, col))
Returns
-------
values : ndarray
The found values
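Examples
--------
A minimal illustrative sketch (the small frame below is hypothetical):
>>> df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=['x', 'y', 'z'])
>>> df.lookup(['x', 'z'], ['a', 'b'])
array([1, 6])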
"""
n = len(row_labels)
if n != len(col_labels):
raise ValueError('Row labels must have same size as column labels')
thresh = 1000
if not self._is_mixed_type or n > thresh:
values = self.values
ridx = self.index.get_indexer(row_labels)
cidx = self.columns.get_indexer(col_labels)
if (ridx == -1).any():
raise KeyError('One or more row labels was not found')
if (cidx == -1).any():
raise KeyError('One or more column labels was not found')
flat_index = ridx * len(self.columns) + cidx
result = values.flat[flat_index]
else:
result = np.empty(n, dtype='O')
for i, (r, c) in enumerate(zip(row_labels, col_labels)):
result[i] = self.get_value(r, c)
if is_object_dtype(result):
result = lib.maybe_convert_objects(result)
return result
#----------------------------------------------------------------------
# Reindexing and alignment
def _reindex_axes(self, axes, level, limit, method, fill_value, copy):
frame = self
columns = axes['columns']
if columns is not None:
frame = frame._reindex_columns(columns, copy, level, fill_value,
limit)
index = axes['index']
if index is not None:
frame = frame._reindex_index(index, method, copy, level,
fill_value, limit)
return frame
def _reindex_index(self, new_index, method, copy, level, fill_value=NA,
limit=None):
new_index, indexer = self.index.reindex(new_index, method, level,
limit=limit)
return self._reindex_with_indexers({0: [new_index, indexer]},
copy=copy, fill_value=fill_value,
allow_dups=False)
def _reindex_columns(self, new_columns, copy, level, fill_value=NA,
limit=None):
new_columns, indexer = self.columns.reindex(new_columns, level=level,
limit=limit)
return self._reindex_with_indexers({1: [new_columns, indexer]},
copy=copy, fill_value=fill_value,
allow_dups=False)
def _reindex_multi(self, axes, copy, fill_value):
""" we are guaranteed non-Nones in the axes! """
new_index, row_indexer = self.index.reindex(axes['index'])
new_columns, col_indexer = self.columns.reindex(axes['columns'])
if row_indexer is not None and col_indexer is not None:
indexer = row_indexer, col_indexer
new_values = com.take_2d_multi(self.values, indexer,
fill_value=fill_value)
return self._constructor(new_values, index=new_index,
columns=new_columns)
else:
return self._reindex_with_indexers({0: [new_index, row_indexer],
1: [new_columns, col_indexer]},
copy=copy,
fill_value=fill_value)
@Appender(_shared_docs['reindex'] % _shared_doc_kwargs)
def reindex(self, index=None, columns=None, **kwargs):
return super(DataFrame, self).reindex(index=index, columns=columns,
**kwargs)
@Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
limit=None, fill_value=np.nan):
return super(DataFrame, self).reindex_axis(labels=labels, axis=axis,
method=method, level=level,
copy=copy, limit=limit,
fill_value=fill_value)
@Appender(_shared_docs['rename'] % _shared_doc_kwargs)
def rename(self, index=None, columns=None, **kwargs):
return super(DataFrame, self).rename(index=index, columns=columns,
**kwargs)
@Appender(_shared_docs['fillna'] % _shared_doc_kwargs)
def fillna(self, value=None, method=None, axis=None, inplace=False,
limit=None, downcast=None, **kwargs):
return super(DataFrame, self).fillna(value=value, method=method,
axis=axis, inplace=inplace,
limit=limit, downcast=downcast,
**kwargs)
@Appender(_shared_docs['shift'] % _shared_doc_kwargs)
def shift(self, periods=1, freq=None, axis=0, **kwargs):
return super(DataFrame, self).shift(periods=periods, freq=freq,
axis=axis, **kwargs)
def set_index(self, keys, drop=True, append=False, inplace=False,
verify_integrity=False):
"""
Set the DataFrame index (row labels) using one or more existing
columns. By default yields a new object.
Parameters
----------
keys : column label or list of column labels / arrays
drop : boolean, default True
Delete columns to be used as the new index
append : boolean, default False
Whether to append columns to existing index
inplace : boolean, default False
Modify the DataFrame in place (do not create a new object)
verify_integrity : boolean, default False
Check the new index for duplicates. Otherwise defer the check until
necessary. Setting to False will improve the performance of this
method
Examples
--------
>>> indexed_df = df.set_index(['A', 'B'])
>>> indexed_df2 = df.set_index(['A', [0, 1, 2, 0, 1, 2]])
>>> indexed_df3 = df.set_index([[0, 1, 2, 0, 1, 2]])
Returns
-------
dataframe : DataFrame
"""
if not isinstance(keys, list):
keys = [keys]
if inplace:
frame = self
else:
frame = self.copy()
arrays = []
names = []
if append:
names = [x for x in self.index.names]
if isinstance(self.index, MultiIndex):
for i in range(self.index.nlevels):
arrays.append(self.index.get_level_values(i))
else:
arrays.append(self.index)
to_remove = []
for col in keys:
if isinstance(col, MultiIndex):
# append all but the last column so we don't have to modify
# the end of this loop
for n in range(col.nlevels - 1):
arrays.append(col.get_level_values(n))
level = col.get_level_values(col.nlevels - 1)
names.extend(col.names)
elif isinstance(col, Series):
level = col.values
names.append(col.name)
elif isinstance(col, Index):
level = col
names.append(col.name)
elif isinstance(col, (list, np.ndarray, Index)):
level = col
names.append(None)
else:
level = frame[col].values
names.append(col)
if drop:
to_remove.append(col)
arrays.append(level)
index = MultiIndex.from_arrays(arrays, names=names)
if verify_integrity and not index.is_unique:
duplicates = index.get_duplicates()
raise ValueError('Index has duplicate keys: %s' % duplicates)
for c in to_remove:
del frame[c]
# clear up memory usage
index._cleanup()
frame.index = index
if not inplace:
return frame
def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
col_fill=''):
"""
For DataFrame with multi-level index, return new DataFrame with
labeling information in the columns under the index names, defaulting
to 'level_0', 'level_1', etc. if any are None. For a standard index,
the index name will be used (if set), otherwise a default 'index' or
'level_0' (if 'index' is already taken) will be used.
Parameters
----------
level : int, str, tuple, or list, default None
Only remove the given levels from the index. Removes all levels by
default
drop : boolean, default False
Do not try to insert index into dataframe columns. This resets
the index to the default integer index.
inplace : boolean, default False
Modify the DataFrame in place (do not create a new object)
col_level : int or str, default 0
If the columns have multiple levels, determines which level the
labels are inserted into. By default it is inserted into the first
level.
col_fill : object, default ''
If the columns have multiple levels, determines how the other
levels are named. If None then the index name is repeated.
Returns
-------
resetted : DataFrame
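Examples
--------
A minimal sketch (illustrative):
>>> df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}).set_index('A')
>>> df.reset_index()  # 'A' moves back into the columns; a default index is created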
"""
if inplace:
new_obj = self
else:
new_obj = self.copy()
def _maybe_casted_values(index, labels=None):
if isinstance(index, PeriodIndex):
values = index.asobject.values
elif (isinstance(index, DatetimeIndex) and
index.tz is not None):
values = index.asobject
else:
values = index.values
if values.dtype == np.object_:
values = lib.maybe_convert_objects(values)
# if we have the labels, extract the values with a mask
if labels is not None:
mask = labels == -1
values = values.take(labels)
if mask.any():
values, changed = com._maybe_upcast_putmask(values,
mask, np.nan)
return values
new_index = np.arange(len(new_obj), dtype='int64')
if isinstance(self.index, MultiIndex):
if level is not None:
if not isinstance(level, (tuple, list)):
level = [level]
level = [self.index._get_level_number(lev) for lev in level]
if len(level) < len(self.index.levels):
new_index = self.index.droplevel(level)
if not drop:
names = self.index.names
zipped = lzip(self.index.levels, self.index.labels)
multi_col = isinstance(self.columns, MultiIndex)
for i, (lev, lab) in reversed(list(enumerate(zipped))):
col_name = names[i]
if col_name is None:
col_name = 'level_%d' % i
if multi_col:
if col_fill is None:
col_name = tuple([col_name] *
self.columns.nlevels)
else:
name_lst = [col_fill] * self.columns.nlevels
lev_num = self.columns._get_level_number(col_level)
name_lst[lev_num] = col_name
col_name = tuple(name_lst)
# to ndarray and maybe infer different dtype
level_values = _maybe_casted_values(lev, lab)
if level is None or i in level:
new_obj.insert(0, col_name, level_values)
elif not drop:
name = self.index.name
if name is None or name == 'index':
name = 'index' if 'index' not in self else 'level_0'
if isinstance(self.columns, MultiIndex):
if col_fill is None:
name = tuple([name] * self.columns.nlevels)
else:
name_lst = [col_fill] * self.columns.nlevels
lev_num = self.columns._get_level_number(col_level)
name_lst[lev_num] = name
name = tuple(name_lst)
values = _maybe_casted_values(self.index)
new_obj.insert(0, name, values)
new_obj.index = new_index
if not inplace:
return new_obj
#----------------------------------------------------------------------
# Reindex-based selection methods
def dropna(self, axis=0, how='any', thresh=None, subset=None,
inplace=False):
"""
Return object with labels on given axis omitted where any or all of
the data are missing (depending on `how`)
Parameters
----------
axis : {0, 1}, or tuple/list thereof
Pass tuple or list to drop on multiple axes
how : {'any', 'all'}
* any : if any NA values are present, drop that label
* all : if all values are NA, drop that label
thresh : int, default None
int value : require that many non-NA values
subset : array-like
Labels along other axis to consider, e.g. if you are dropping rows
these would be a list of columns to include
inplace : boolean, default False
If True, do operation inplace and return None.
Returns
-------
dropped : DataFrame
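Examples
--------
An illustrative sketch (``np.nan`` marks missing values):
>>> df = DataFrame({'a': [1.0, np.nan], 'b': [2.0, np.nan]})
>>> df.dropna()           # drop rows containing any NA value
>>> df.dropna(how='all')  # drop only rows that are entirely NA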
"""
if isinstance(axis, (tuple, list)):
result = self
for ax in axis:
result = result.dropna(how=how, thresh=thresh,
subset=subset, axis=ax)
else:
axis = self._get_axis_number(axis)
agg_axis = 1 - axis
agg_obj = self
if subset is not None:
ax = self._get_axis(agg_axis)
indices = ax.get_indexer_for(subset)
check = indices == -1
if check.any():
raise KeyError(list(np.compress(check, subset)))
agg_obj = self.take(indices, axis=agg_axis)
count = agg_obj.count(axis=agg_axis)
if thresh is not None:
mask = count >= thresh
elif how == 'any':
mask = count == len(agg_obj._get_axis(agg_axis))
elif how == 'all':
mask = count > 0
else:
if how is not None:
raise ValueError('invalid how option: %s' % how)
else:
raise TypeError('must specify how or thresh')
result = self.take(mask.nonzero()[0], axis=axis, convert=False)
if inplace:
self._update_inplace(result)
else:
return result
@deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
def drop_duplicates(self, subset=None, take_last=False, inplace=False):
"""
Return DataFrame with duplicate rows removed, optionally only
considering certain columns
Parameters
----------
subset : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns
take_last : boolean, default False
Keep the last observed row in a set of duplicate rows; by default the
first observed row is kept
inplace : boolean, default False
Whether to drop duplicates in place or to return a copy
cols : kwargs only argument of subset [deprecated]
Returns
-------
deduplicated : DataFrame
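Examples
--------
A minimal sketch (illustrative):
>>> df = DataFrame({'a': [1, 1, 2], 'b': [3, 3, 4]})
>>> df.drop_duplicates()                # keeps the first of each duplicate pair
>>> df.drop_duplicates(take_last=True)  # keeps the last occurrence instead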
"""
duplicated = self.duplicated(subset, take_last=take_last)
if inplace:
inds, = (-duplicated).nonzero()
new_data = self._data.take(inds)
self._update_inplace(new_data)
else:
return self[-duplicated]
@deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
def duplicated(self, subset=None, take_last=False):
"""
Return boolean Series denoting duplicate rows, optionally only
considering certain columns
Parameters
----------
subset : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns
take_last : boolean, default False
For a set of distinct duplicate rows, flag all but the last row as
duplicated. Default is for all but the first row to be flagged
cols : kwargs only argument of subset [deprecated]
Returns
-------
duplicated : Series
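Examples
--------
A minimal sketch (illustrative):
>>> df = DataFrame({'a': [1, 1, 2], 'b': [3, 3, 4]})
>>> df.duplicated()
0    False
1     True
2    False
dtype: bool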
"""
from pandas.core.groupby import get_group_index
from pandas.core.algorithms import factorize
from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
def f(vals):
labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
return labels.astype('i8', copy=False), len(shape)
if subset is None:
subset = self.columns
elif not np.iterable(subset) or \
isinstance(subset, compat.string_types) or \
isinstance(subset, tuple) and subset in self.columns:
subset = subset,
vals = (self[col].values for col in subset)
labels, shape = map(list, zip(*map(f, vals)))
ids = get_group_index(labels, shape, sort=False, xnull=False)
return Series(duplicated_int64(ids, take_last), index=self.index)
#----------------------------------------------------------------------
# Sorting
def sort(self, columns=None, axis=0, ascending=True,
inplace=False, kind='quicksort', na_position='last'):
"""
Sort DataFrame either by labels (along either axis) or by the values in
column(s)
Parameters
----------
columns : object
Column name(s) in frame. Accepts a column name or a list
for a nested sort. A tuple will be interpreted as the
levels of a multi-index.
ascending : boolean or list, default True
Sort ascending vs. descending. Specify list for multiple sort
orders
axis : {0, 1}
Sort index/rows versus columns
inplace : boolean, default False
Sort the DataFrame without creating a new instance
kind : {'quicksort', 'mergesort', 'heapsort'}, optional
This option is only applied when sorting on a single column or label.
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end
Examples
--------
>>> result = df.sort(['A', 'B'], ascending=[1, 0])
Returns
-------
sorted : DataFrame
"""
return self.sort_index(by=columns, axis=axis, ascending=ascending,
inplace=inplace, kind=kind, na_position=na_position)
def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
kind='quicksort', na_position='last'):
"""
Sort DataFrame either by labels (along either axis) or by the values in
a column
Parameters
----------
axis : {0, 1}
Sort index/rows versus columns
by : object
Column name(s) in frame. Accepts a column name or a list
for a nested sort. A tuple will be interpreted as the
levels of a multi-index.
ascending : boolean or list, default True
Sort ascending vs. descending. Specify list for multiple sort
orders
inplace : boolean, default False
Sort the DataFrame without creating a new instance
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end
kind : {'quicksort', 'mergesort', 'heapsort'}, optional
This option is only applied when sorting on a single column or label.
Examples
--------
>>> result = df.sort_index(by=['A', 'B'], ascending=[True, False])
Returns
-------
sorted : DataFrame
"""
from pandas.core.groupby import _lexsort_indexer, _nargsort
axis = self._get_axis_number(axis)
if axis not in [0, 1]: # pragma: no cover
raise AssertionError('Axis must be 0 or 1, got %s' % str(axis))
labels = self._get_axis(axis)
if by is not None:
if axis != 0:
raise ValueError('When sorting by column, axis must be 0 '
'(rows)')
if not isinstance(by, list):
by = [by]
if com.is_sequence(ascending) and len(by) != len(ascending):
raise ValueError('Length of ascending (%d) != length of by'
' (%d)' % (len(ascending), len(by)))
if len(by) > 1:
def trans(v):
if com.needs_i8_conversion(v):
return v.view('i8')
return v
keys = []
for x in by:
k = self[x].values
if k.ndim == 2:
raise ValueError('Cannot sort by duplicate column %s' % str(x))
keys.append(trans(k))
indexer = _lexsort_indexer(keys, orders=ascending,
na_position=na_position)
indexer = com._ensure_platform_int(indexer)
else:
by = by[0]
k = self[by].values
if k.ndim == 2:
# try to be helpful
if isinstance(self.columns, MultiIndex):
raise ValueError('Cannot sort by column %s in a multi-index'
'; you need to explicitly provide all the levels'
% str(by))
raise ValueError('Cannot sort by duplicate column %s'
% str(by))
if isinstance(ascending, (tuple, list)):
ascending = ascending[0]
indexer = _nargsort(k, kind=kind, ascending=ascending,
na_position=na_position)
elif isinstance(labels, MultiIndex):
# make sure that the axis is lexsorted to start
# if not we need to reconstruct to get the correct indexer
if not labels.is_lexsorted():
labels = MultiIndex.from_tuples(labels.values)
indexer = _lexsort_indexer(labels.labels, orders=ascending,
na_position=na_position)
indexer = com._ensure_platform_int(indexer)
else:
indexer = _nargsort(labels, kind=kind, ascending=ascending,
na_position=na_position)
bm_axis = self._get_block_manager_axis(axis)
new_data = self._data.take(indexer, axis=bm_axis,
convert=False, verify=False)
if inplace:
return self._update_inplace(new_data)
else:
return self._constructor(new_data).__finalize__(self)
def sortlevel(self, level=0, axis=0, ascending=True,
inplace=False, sort_remaining=True):
"""
Sort multilevel index by chosen axis and primary level. Data will be
lexicographically sorted by the chosen level followed by the other
levels (in order)
Parameters
----------
level : int
axis : {0, 1}
ascending : boolean, default True
inplace : boolean, default False
Sort the DataFrame without creating a new instance
sort_remaining : boolean, default True
Sort by the other levels too.
Returns
-------
sorted : DataFrame
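Examples
--------
A hedged sketch (assumes ``MultiIndex`` is importable from pandas):
>>> idx = MultiIndex.from_tuples([('b', 2), ('a', 1), ('b', 1), ('a', 2)])
>>> df = DataFrame({'x': [0, 1, 2, 3]}, index=idx)
>>> df.sortlevel(0)  # sort by the outer level, then the remaining levels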
"""
axis = self._get_axis_number(axis)
the_axis = self._get_axis(axis)
if not isinstance(the_axis, MultiIndex):
raise TypeError('can only sort by level with a hierarchical index')
new_axis, indexer = the_axis.sortlevel(level, ascending=ascending,
sort_remaining=sort_remaining)
if self._is_mixed_type and not inplace:
ax = 'index' if axis == 0 else 'columns'
if new_axis.is_unique:
return self.reindex(**{ax: new_axis})
else:
return self.take(indexer, axis=axis, convert=False)
bm_axis = self._get_block_manager_axis(axis)
new_data = self._data.take(indexer, axis=bm_axis,
convert=False, verify=False)
if inplace:
return self._update_inplace(new_data)
else:
return self._constructor(new_data).__finalize__(self)
def swaplevel(self, i, j, axis=0):
"""
Swap levels i and j in a MultiIndex on a particular axis
Parameters
----------
i, j : int, string (can be mixed)
Level of index to be swapped. Can pass level name as string.
Returns
-------
swapped : type of caller (new object)
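Examples
--------
A minimal sketch (illustrative; the level names 'outer' and 'inner' are
hypothetical and assume a two-level row MultiIndex):
>>> df.swaplevel(0, 1)              # swap by level number
>>> df.swaplevel('outer', 'inner')  # or by level name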
"""
result = self.copy()
axis = self._get_axis_number(axis)
if axis == 0:
result.index = result.index.swaplevel(i, j)
else:
result.columns = result.columns.swaplevel(i, j)
return result
def reorder_levels(self, order, axis=0):
"""
Rearrange index levels using input order.
May not drop or duplicate levels
Parameters
----------
order : list of int or list of str
List representing new level order. Reference level by number
(position) or by key (label).
axis : int
Where to reorder levels.
Returns
-------
type of caller (new object)
"""
axis = self._get_axis_number(axis)
if not isinstance(self._get_axis(axis),
MultiIndex): # pragma: no cover
raise TypeError('Can only reorder levels on a hierarchical axis.')
result = self.copy()
if axis == 0:
result.index = result.index.reorder_levels(order)
else:
result.columns = result.columns.reorder_levels(order)
return result
#----------------------------------------------------------------------
# Arithmetic / combination related
def _combine_frame(self, other, func, fill_value=None, level=None):
this, other = self.align(other, join='outer', level=level, copy=False)
new_index, new_columns = this.index, this.columns
def _arith_op(left, right):
if fill_value is not None:
left_mask = isnull(left)
right_mask = isnull(right)
left = left.copy()
right = right.copy()
# one but not both
mask = left_mask ^ right_mask
left[left_mask & mask] = fill_value
right[right_mask & mask] = fill_value
return func(left, right)
if this._is_mixed_type or other._is_mixed_type:
# unique
if this.columns.is_unique:
def f(col):
r = _arith_op(this[col].values, other[col].values)
return self._constructor_sliced(r, index=new_index,
dtype=r.dtype)
result = dict([(col, f(col)) for col in this])
# non-unique
else:
def f(i):
r = _arith_op(this.iloc[:, i].values,
other.iloc[:, i].values)
return self._constructor_sliced(r, index=new_index,
dtype=r.dtype)
result = dict([
(i, f(i)) for i, col in enumerate(this.columns)
])
result = self._constructor(result, index=new_index, copy=False)
result.columns = new_columns
return result
else:
result = _arith_op(this.values, other.values)
return self._constructor(result, index=new_index,
columns=new_columns, copy=False)
def _combine_series(self, other, func, fill_value=None, axis=None,
level=None):
if axis is not None:
axis = self._get_axis_name(axis)
if axis == 'index':
return self._combine_match_index(other, func, level=level, fill_value=fill_value)
else:
return self._combine_match_columns(other, func, level=level, fill_value=fill_value)
return self._combine_series_infer(other, func, level=level, fill_value=fill_value)
def _combine_series_infer(self, other, func, level=None, fill_value=None):
if len(other) == 0:
return self * NA
if len(self) == 0:
# Ambiguous case, use _series so works with DataFrame
return self._constructor(data=self._series, index=self.index,
columns=self.columns)
# teeny hack because one does DataFrame + TimeSeries all the time
if self.index.is_all_dates and other.index.is_all_dates:
warnings.warn(("TimeSeries broadcasting along DataFrame index "
"by default is deprecated. Please use "
"DataFrame.<op> to explicitly broadcast arithmetic "
"operations along the index"),
FutureWarning)
return self._combine_match_index(other, func, level=level, fill_value=fill_value)
else:
return self._combine_match_columns(other, func, level=level, fill_value=fill_value)
def _combine_match_index(self, other, func, level=None, fill_value=None):
left, right = self.align(other, join='outer', axis=0, level=level, copy=False)
if fill_value is not None:
raise NotImplementedError("fill_value %r not supported." %
fill_value)
return self._constructor(func(left.values.T, right.values).T,
index=left.index,
columns=self.columns, copy=False)
def _combine_match_columns(self, other, func, level=None, fill_value=None):
left, right = self.align(other, join='outer', axis=1, level=level, copy=False)
if fill_value is not None:
raise NotImplementedError("fill_value %r not supported" %
fill_value)
new_data = left._data.eval(
func=func, other=right, axes=[left.columns, self.index])
return self._constructor(new_data)
def _combine_const(self, other, func, raise_on_error=True):
if self.empty:
return self
new_data = self._data.eval(func=func, other=other, raise_on_error=raise_on_error)
return self._constructor(new_data)
def _compare_frame_evaluate(self, other, func, str_rep):
# unique
if self.columns.is_unique:
def _compare(a, b):
return dict([(col, func(a[col], b[col])) for col in a.columns])
new_data = expressions.evaluate(_compare, str_rep, self, other)
return self._constructor(data=new_data, index=self.index,
columns=self.columns, copy=False)
# non-unique
else:
def _compare(a, b):
return dict([(i, func(a.iloc[:, i], b.iloc[:, i]))
for i, col in enumerate(a.columns)])
new_data = expressions.evaluate(_compare, str_rep, self, other)
result = self._constructor(data=new_data, index=self.index,
copy=False)
result.columns = self.columns
return result
def _compare_frame(self, other, func, str_rep):
if not self._indexed_same(other):
raise ValueError('Can only compare identically-labeled '
'DataFrame objects')
return self._compare_frame_evaluate(other, func, str_rep)
def _flex_compare_frame(self, other, func, str_rep, level):
if not self._indexed_same(other):
self, other = self.align(other, 'outer', level=level, copy=False)
return self._compare_frame_evaluate(other, func, str_rep)
def combine(self, other, func, fill_value=None, overwrite=True):
"""
Combine two DataFrame objects element-wise using `func`, without
propagating NaN values: if for a (column, time) one frame is missing a
value, it will default to the other frame's value (which might be NaN as
well)
Parameters
----------
other : DataFrame
func : function
fill_value : scalar value
overwrite : boolean, default True
If True then overwrite values for common keys in the calling frame
Returns
-------
result : DataFrame
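Examples
--------
A hedged sketch; ``func`` receives one column from each frame as a Series
and must return a combined column (here an element-wise maximum):
>>> df1 = DataFrame({'A': [0, 0], 'B': [4, 4]})
>>> df2 = DataFrame({'A': [1, 1], 'B': [3, 3]})
>>> df1.combine(df2, lambda s1, s2: s1.where(s1 > s2, s2))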
"""
other_idxlen = len(other.index) # save for compare
this, other = self.align(other, copy=False)
new_index = this.index
if other.empty and len(new_index) == len(self.index):
return self.copy()
if self.empty and len(other) == other_idxlen:
return other.copy()
# sorts if possible
new_columns = this.columns.union(other.columns)
do_fill = fill_value is not None
result = {}
for col in new_columns:
series = this[col]
otherSeries = other[col]
this_dtype = series.dtype
other_dtype = otherSeries.dtype
this_mask = isnull(series)
other_mask = isnull(otherSeries)
# don't overwrite columns unnecessarily
# DO propagate if this column is not in the intersection
if not overwrite and other_mask.all():
result[col] = this[col].copy()
continue
if do_fill:
series = series.copy()
otherSeries = otherSeries.copy()
series[this_mask] = fill_value
otherSeries[other_mask] = fill_value
# if we have different dtypes, possibly promote
new_dtype = this_dtype
if this_dtype != other_dtype:
new_dtype = com._lcd_dtypes(this_dtype, other_dtype)
series = series.astype(new_dtype)
otherSeries = otherSeries.astype(new_dtype)
# see if we need to be represented as i8 (datetimelike)
# try to keep us at this dtype
needs_i8_conversion = com.needs_i8_conversion(new_dtype)
if needs_i8_conversion:
this_dtype = new_dtype
arr = func(series, otherSeries, True)
else:
arr = func(series, otherSeries)
if do_fill:
arr = com.ensure_float(arr)
arr[this_mask & other_mask] = NA
# try to downcast back to the original dtype
if needs_i8_conversion:
arr = com._possibly_cast_to_datetime(arr, this_dtype)
else:
arr = com._possibly_downcast_to_dtype(arr, this_dtype)
result[col] = arr
# convert_objects just in case
return self._constructor(result,
index=new_index,
columns=new_columns).convert_objects(
convert_dates=True,
copy=False)
def combine_first(self, other):
"""
Combine two DataFrame objects and default to non-null values in frame
calling the method. The resulting index and columns will be the union of
the respective indexes and columns
Parameters
----------
other : DataFrame
Examples
--------
a's values prioritized, use values from b to fill holes:
>>> a.combine_first(b)
Returns
-------
combined : DataFrame
"""
def combiner(x, y, needs_i8_conversion=False):
x_values = x.values if hasattr(x, 'values') else x
y_values = y.values if hasattr(y, 'values') else y
if needs_i8_conversion:
mask = isnull(x)
x_values = x_values.view('i8')
y_values = y_values.view('i8')
else:
mask = isnull(x_values)
return expressions.where(mask, y_values, x_values,
raise_on_error=True)
return self.combine(other, combiner, overwrite=False)
def update(self, other, join='left', overwrite=True, filter_func=None,
raise_conflict=False):
"""
Modify DataFrame in place using non-NA values from passed
DataFrame. Aligns on indices
Parameters
----------
other : DataFrame, or object coercible into a DataFrame
join : {'left'}, default 'left'
overwrite : boolean, default True
If True then overwrite values for common keys in the calling frame
filter_func : callable(1d-array) -> 1d-array<boolean>, default None
Can choose to replace values other than NA. Return True for values
that should be updated
raise_conflict : boolean
If True, will raise an error if the DataFrame and other both
contain data in the same place.
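Examples
--------
A minimal sketch (illustrative); only the non-NA values from ``other`` are
written into ``df``:
>>> df = DataFrame({'A': [1.0, 2.0], 'B': [3.0, 4.0]})
>>> other = DataFrame({'B': [np.nan, 40.0]})
>>> df.update(other)  # modifies df in place; df['B'] becomes [3.0, 40.0]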
"""
# TODO: Support other joins
if join != 'left': # pragma: no cover
raise NotImplementedError("Only left join is supported")
if not isinstance(other, DataFrame):
other = DataFrame(other)
other = other.reindex_like(self)
for col in self.columns:
this = self[col].values
that = other[col].values
if filter_func is not None:
mask = ~filter_func(this) | isnull(that)
else:
if raise_conflict:
mask_this = notnull(that)
mask_that = notnull(this)
if any(mask_this & mask_that):
raise ValueError("Data overlaps.")
if overwrite:
mask = isnull(that)
# don't overwrite columns unnecessarily
if mask.all():
continue
else:
mask = notnull(this)
self[col] = expressions.where(
mask, this, that, raise_on_error=True)
#----------------------------------------------------------------------
# Misc methods
def first_valid_index(self):
"""
Return label for first non-NA/null value
"""
return self.index[self.count(1) > 0][0]
def last_valid_index(self):
"""
Return label for last non-NA/null value
"""
return self.index[self.count(1) > 0][-1]
#----------------------------------------------------------------------
# Data reshaping
def pivot(self, index=None, columns=None, values=None):
"""
Reshape data (produce a "pivot" table) based on column values. Uses
unique values from index / columns to form axes and return either
DataFrame or Panel, depending on whether you request a single value
column (DataFrame) or all columns (Panel)
Parameters
----------
index : string or object
Column name to use to make new frame's index
columns : string or object
Column name to use to make new frame's columns
values : string or object, optional
Column name to use for populating new frame's values
Notes
-----
For finer-tuned control, see hierarchical indexing documentation along
with the related stack/unstack methods
Examples
--------