ENH: parser API changes, added parse_dates options, address GH #225, #…
wesm committed Oct 12, 2011
1 parent 99a4400 commit 0cc5616
Showing 7 changed files with 182 additions and 103 deletions.
21 changes: 21 additions & 0 deletions RELEASE.rst
@@ -10,11 +10,32 @@ pandas 0.4.4

**Release date:** not yet released

**New features / modules**

- Added `parse_dates` option to `read_csv` and `read_table` methods to
optionally try to parse dates in the index columns (see the usage sketch
following this list)
- Added ability to join on multiple columns in `DataFrame.join` (GH #214)
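
A minimal usage sketch of the new option, assuming a hypothetical file
`data.csv` whose first column holds date strings; in this version
`parse_dates` only applies to the index column(s)::

    from pandas.io.parsers import read_csv

    # use the first column as the index and attempt to parse it as dates
    df = read_csv('data.csv', index_col=0, parse_dates=True)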

**API Changes**

- `read_table`, `read_csv`, and `ExcelFile.parse` now default `index_col` to
None. To use one or more of the columns as the resulting DataFrame's index,
they must now be specified explicitly (a sketch follows this list)
- Parsing functions no longer parse dates by default (GH #225)
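
For illustration, a sketch of the new defaults (the file name is
hypothetical): no column becomes the index and dates are left as strings
unless explicitly requested::

    from pandas.io.parsers import read_table

    # new defaults: plain integer index, no date parsing
    df = read_table('data.tsv')

    # the old behavior now requires explicit options
    ts = read_table('data.tsv', index_col=0, parse_dates=True)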

**Improvements to existing features**

- Refactored merging / joining code into a tidy class and disabled unnecessary
computations in the float/object case, yielding about a 10% performance
improvement
- Improved speed of `DataFrame.xs` on mixed-type DataFrame objects by about
5x, addressing a regression from 0.3.0

**Bug fixes**

- Worked around a matplotlib "bug" in which `series[:, np.newaxis]` fails;
this should be reported upstream to matplotlib (GH #224)


pandas 0.4.3
============
3 changes: 3 additions & 0 deletions pandas/core/common.py
@@ -518,6 +518,9 @@ def _asarray_tuplesafe(values, dtype=None):
if not isinstance(values, (list, tuple, np.ndarray)):
values = list(values)

if isinstance(values, list) and dtype == np.object_:
return lib.list_to_object_array(values)

result = np.asarray(values, dtype=dtype)

if issubclass(result.dtype.type, basestring):
10 changes: 5 additions & 5 deletions pandas/core/frame.py
@@ -430,7 +430,8 @@ def to_records(self, index=True):
return np.rec.fromarrays(arrays, names=names)

@classmethod
def from_csv(cls, path, header=0, delimiter=',', index_col=0):
def from_csv(cls, path, header=0, delimiter=',', index_col=0,
parse_dates=True):
"""
Read delimited file into DataFrame
@@ -447,16 +448,15 @@ def from_csv(cls, path, header=0, delimiter=',', index_col=0):
Notes
-----
Will attempt to convert index to datetimes for time series
data. Use read_csv for more options
data. Use read_table for more options
Returns
-------
y : DataFrame
"""
from pandas.io.parsers import read_table
df = read_table(path, header=header, sep=delimiter,
index_col=index_col)
return df
return read_table(path, header=header, sep=delimiter,
parse_dates=parse_dates, index_col=index_col)

def to_sparse(self, fill_value=None, kind='block'):
"""
188 changes: 107 additions & 81 deletions pandas/io/parsers.py
@@ -9,36 +9,10 @@
from pandas.core.index import Index, MultiIndex
from pandas.core.frame import DataFrame

def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0,
na_values=None, date_parser=None, names=None):
"""
Read CSV file into DataFrame

Parameters
----------
filepath_or_buffer : string or file handle / StringIO
sep : string, default None
Delimiter to use. By default will try to automatically determine
this
header : int, default 0
Row to use for the column labels of the parsed DataFrame
skiprows : list-like
Row numbers to skip (0-indexed)
index_col : int or sequence, default 0
Column to use as the row labels of the DataFrame. Pass None if there is
no such column. If a sequence is given, a MultiIndex is used.
na_values : list-like, default None
List of additional strings to recognize as NA/NaN
date_parser : function
Function to use for converting dates to strings. Defaults to
dateutil.parser
names : array-like
List of column names
Returns
-------
parsed : DataFrame
"""
def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
skiprows=None, na_values=None, parse_dates=False,
date_parser=None):
import csv

if hasattr(filepath_or_buffer, 'read'):
@@ -71,43 +45,77 @@ def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0,
else:
lines = [l for l in reader]
f.close()
return _simple_parser(lines, header=header, indexCol=index_col,
colNames=names, na_values=na_values,
date_parser=date_parser)

def read_table(filepath_or_buffer, sep='\t', header=0, skiprows=None,
index_col=0, na_values=None, date_parser=None, names=None):
"""
Read delimited file into DataFrame
if date_parser is not None:
parse_dates = True

Parameters
----------
filepath_or_buffer : string or file handle
sep : string, default '\t'
Delimiter to use
header : int, default 0
Row to use for the column labels of the parsed DataFrame
skiprows : list-like
Row numbers to skip (0-indexed)
index_col : int or sequence, default 0
Column to use as the row labels of the DataFrame. Pass None if there is
no such column. If a sequence is given, a MultiIndex is used.
na_values : list-like, default None
List of additional strings to recognize as NA/NaN
date_parser : function
Function to use for converting dates to strings. Defaults to
dateutil.parser
names : array-like
List of column names
Returns
-------
parsed : DataFrame
"""
return read_csv(filepath_or_buffer, sep, header, skiprows,
index_col, na_values, date_parser, names)
return _simple_parser(lines,
header=header,
index_col=index_col,
colNames=names,
na_values=na_values,
parse_dates=parse_dates,
date_parser=date_parser)

def _simple_parser(lines, colNames=None, header=0, indexCol=0,

def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None,
names=None, skiprows=None, na_values=None, parse_dates=False,
date_parser=None):
return read_csv(filepath_or_buffer, sep=sep, header=header,
skiprows=skiprows, index_col=index_col,
na_values=na_values, date_parser=date_parser,
names=names, parse_dates=parse_dates)

_parser_params = """Parameters
----------
filepath_or_buffer : string or file handle / StringIO
%s
header : int, default 0
Row to use for the column labels of the parsed DataFrame
skiprows : list-like
Row numbers to skip (0-indexed)
index_col : int or sequence, default None
Column to use as the row labels of the DataFrame. If a sequence is
given, a MultiIndex is used.
na_values : list-like, default None
List of additional strings to recognize as NA/NaN
parse_dates : boolean, default False
Attempt to parse dates in the index column(s)
date_parser : function
Function to use for converting date strings to datetime objects. Defaults to
dateutil.parser
names : array-like
List of column names"""

_csv_sep = """sep : string, default None
Delimiter to use. By default will try to automatically determine
this"""

_table_sep = """sep : string, default \\t (tab-stop)
Delimiter to use"""

read_csv.__doc__ = """
Read CSV (comma-separated) file into DataFrame
%s
Returns
-------
parsed : DataFrame
""" % (_parser_params % _csv_sep)

read_table.__doc__ = """
Read delimited file into DataFrame
%s
Returns
-------
parsed : DataFrame
""" % (_parser_params % _table_sep)


def _simple_parser(lines, colNames=None, header=0, index_col=0,
na_values=None, date_parser=None, parse_dates=True):
"""
Workhorse function for processing nested list into DataFrame
@@ -142,30 +150,48 @@ def _simple_parser(lines, colNames=None, header=0, indexCol=0,
zipped_content = zip(*content)

if len(content) == 0: # pragma: no cover
raise Exception('No content to parse')
if index_col is not None:
if np.isscalar(index_col):
index = Index([], name=columns.pop(index_col))
else:
cp_cols = list(columns)
names = []
for i in index_col:
name = cp_cols[i]
columns.remove(name)
names.append(name)
index = MultiIndex.fromarrays([[]] * len(index_col),
names=names)
else:
index = Index([])

return DataFrame(index=index, columns=columns)

if index_col is None and len(content[0]) == len(columns) + 1:
index_col = 0

# no index column specified, so infer that's what is wanted
if indexCol is not None:
if np.isscalar(indexCol):
if indexCol == 0 and len(content[0]) == len(columns) + 1:
if index_col is not None:
if np.isscalar(index_col):
if index_col == 0 and len(content[0]) == len(columns) + 1:
index = zipped_content[0]
zipped_content = zipped_content[1:]
else:
index = zipped_content.pop(indexCol)
columns.pop(indexCol)
index = zipped_content.pop(index_col)
columns.pop(index_col)
else: # given a list of index
idx_names = []
index = []
for idx in indexCol:
for idx in index_col:
idx_names.append(columns[idx])
index.append(zipped_content[idx])
# remove index items from content and columns; don't pop in loop
for i in range(len(indexCol)):
for i in range(len(index_col)):
columns.remove(idx_names[i])
zipped_content.remove(index[i])


if np.isscalar(indexCol):
if np.isscalar(index_col):
if parse_dates:
index = _try_parse_dates(index, parser=date_parser)
index = Index(_maybe_convert_int(np.array(index, dtype=object)))
@@ -232,9 +258,6 @@ def _maybe_convert_int(arr):
return arr

def _maybe_convert_int_mindex(index, parse_dates, date_parser):
if len(index) == 0:
return index

for i in range(len(index)):
try:
int(index[i][0])
@@ -298,8 +321,8 @@ def __init__(self, path):
def __repr__(self):
return object.__repr__(self)

def parse(self, sheetname, header=0, skiprows=None, index_col=0,
na_values=None):
def parse(self, sheetname, header=0, skiprows=None, index_col=None,
parse_dates=False, date_parser=None, na_values=None):
"""
Read Excel table into DataFrame
@@ -348,7 +371,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=0,
value = datetime(*dt)
row.append(value)
data.append(row)
return _simple_parser(data, header=header, indexCol=index_col,
return _simple_parser(data, header=header, index_col=index_col,
parse_dates=parse_dates, date_parser=date_parser,
na_values=na_values)

#-------------------------------------------------------------------------------
@@ -363,7 +387,8 @@ def parseCSV(filepath, header=0, skiprows=None, indexCol=0,
"""
warnings.warn("parseCSV is deprecated. Use read_csv instead", FutureWarning)
return read_csv(filepath, header=header, skiprows=skiprows,
index_col=indexCol, na_values=na_values)
index_col=indexCol, na_values=na_values,
parse_dates=True)

def parseText(filepath, sep='\t', header=0,
indexCol=0, colNames=None): # pragma: no cover
@@ -374,7 +399,7 @@ def parseText(filepath, sep='\t', header=0,
warnings.warn("parseText is deprecated. Use read_table instead",
FutureWarning)
return read_table(filepath, sep=sep, header=header, index_col=indexCol,
names=colNames)
names=colNames, parse_dates=True)


def parseExcel(filepath, header=None, indexCol=0,
@@ -385,6 +410,7 @@ def parseExcel(filepath, header=None, indexCol=0,
warnings.warn("parseExcel is deprecated. Use the ExcelFile class instead",
FutureWarning)
excel_file = ExcelFile(filepath)
return excel_file.parse(sheetname, header=header, index_col=indexCol)
return excel_file.parse(sheetname, header=header, index_col=indexCol,
parse_dates=True)

