Skip to content

Commit

Permalink
apacheGH-35485: [CI][Python] Archery formats Python C++ codebase (apa…
Browse files Browse the repository at this point in the history
…che#35487)

### Rationale for this change

Python C++ files weren't being formatted. 

### What changes are included in this PR?

This adds a new format to Archery and formats all the files. The last commit contains just the formatting. The first two contain the substantive changes.

Adds a new linter step that activates when both `--python` and `--clang-format` are passed. This was easy to implement and works fine, but does have a disadvantage: You can't run `clang-format` on _just_ the Python C++ codebase. It also runs it on the C++ and that requires a CMake build to be setup. I think this is fine for now.

### Are these changes tested?

Tested in CI and locally. I'd recommend pulling this branch down and verifying you can run this locally as well. We want contributors to be able to run this as well.

### Are there any user-facing changes?

This adds a new linting step. Instructions in the developer documentation are updated to reflect these changes.

* Closes: apache#35485

Authored-by: Will Jones <willjones127@gmail.com>
Signed-off-by: Raúl Cumplido <raulcumplido@gmail.com>
  • Loading branch information
wjones127 authored and rtpsw committed May 16, 2023
1 parent a0acf96 commit e89334d
Show file tree
Hide file tree
Showing 28 changed files with 320 additions and 290 deletions.
5 changes: 5 additions & 0 deletions dev/archery/archery/lang/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@
from ..utils.command import Command, capture_stdout, default_bin


class PythonCommand(Command):
def __init__(self, python_bin=None):
self.bin = default_bin(python_bin, "python")


class Flake8(Command):
def __init__(self, flake8_bin=None):
self.bin = default_bin(flake8_bin, "flake8")
Expand Down
32 changes: 31 additions & 1 deletion dev/archery/archery/utils/lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from .git import git
from .logger import logger
from ..lang.cpp import CppCMakeDefinition, CppConfiguration
from ..lang.python import Autopep8, Flake8, CythonLint, NumpyDoc
from ..lang.python import Autopep8, Flake8, CythonLint, NumpyDoc, PythonCommand
from .rat import Rat, exclusion_from_globs
from .tmpdir import tmpdir

Expand Down Expand Up @@ -259,6 +259,31 @@ def python_linter(src, fix=False):
yield LintResult.from_cmd(cython_lint(*args))


def python_cpp_linter(src, clang_format=True, fix=False):
"""Run C++ linters on python/pyarrow/src/arrow/python."""
cpp_src = os.path.join(src.python, "pyarrow", "src", "arrow", "python")

python = PythonCommand()

if clang_format:
logger.info("Running clang-format for python/pyarrow/src/arrow/python")

if "CLANG_TOOLS_PATH" in os.environ:
clang_format_binary = os.path.join(
os.environ["CLANG_TOOLS_PATH"], "clang-format")
else:
clang_format_binary = "clang-format-14"

run_clang_format = os.path.join(src.cpp, "build-support",
"run_clang_format.py")
args = [run_clang_format, "--source_dir", cpp_src,
"--clang_format_binary", clang_format_binary]
if fix:
args += ["--fix"]

yield LintResult.from_cmd(python.run(*args))


def python_numpydoc(symbols=None, allow_rules=None, disallow_rules=None):
"""Run numpydoc linter on python.
Expand Down Expand Up @@ -434,6 +459,11 @@ def linter(src, fix=False, *, clang_format=False, cpplint=False,
if python:
results.extend(python_linter(src, fix=fix))

if python and clang_format:
results.extend(python_cpp_linter(src,
clang_format=clang_format,
fix=fix))

if numpydoc:
results.extend(python_numpydoc())

Expand Down
7 changes: 7 additions & 0 deletions docs/source/developers/python.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ Some of the issues can be automatically fixed by passing the ``--fix`` option:
$ archery lint --python --fix
The Python code base also includes some C++ files. To fix formatting in those
files, add the ``--clang-format`` option:

.. code-block::
$ archery lint --python --clang-format --fix
.. _python-unit-testing:

Unit Testing
Expand Down
78 changes: 33 additions & 45 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -808,18 +808,13 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
return Status::OK();
}

template<typename F1, typename F2, typename F3>
Status ConvertMapHelper(
F1 resetRow,
F2 addPairToRow,
F3 stealRow,
const ChunkedArray& data,
PyArrayObject* py_keys,
PyArrayObject* py_items,
// needed for null checks in items
const std::vector<std::shared_ptr<Array>> item_arrays,
PyObject** out_values) {

template <typename F1, typename F2, typename F3>
Status ConvertMapHelper(F1 resetRow, F2 addPairToRow, F3 stealRow,
const ChunkedArray& data, PyArrayObject* py_keys,
PyArrayObject* py_items,
// needed for null checks in items
const std::vector<std::shared_ptr<Array>> item_arrays,
PyObject** out_values) {
OwnedRef key_value;
OwnedRef item_value;

Expand Down Expand Up @@ -893,7 +888,8 @@ Status CheckMapAsPydictsTypeError() {
PyErr_Fetch(&type, &value, &traceback);
std::string message;
RETURN_NOT_OK(internal::PyObject_StdStringStr(value, &message));
message += ". If keys are not hashable, then you must use the option "
message +=
". If keys are not hashable, then you must use the option "
"[maps_as_pydicts=None (default)]";

// resets the error
Expand All @@ -902,8 +898,8 @@ Status CheckMapAsPydictsTypeError() {
return ConvertPyError();
}

Status CheckForDuplicateKeys(bool error_on_duplicate_keys,
Py_ssize_t total_dict_len, Py_ssize_t total_raw_len) {
Status CheckForDuplicateKeys(bool error_on_duplicate_keys, Py_ssize_t total_dict_len,
Py_ssize_t total_raw_len) {
if (total_dict_len < total_raw_len) {
const char* message =
"[maps_as_pydicts] "
Expand Down Expand Up @@ -979,11 +975,7 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data,
PyTuple_Pack(2, key_value.obj(), item_value.obj()));
return CheckPyError();
},
[&list_item]{ return list_item.detach(); },
data,
py_keys,
py_items,
item_arrays,
[&list_item] { return list_item.detach(); }, data, py_keys, py_items, item_arrays,
out_values);
} else {
// Use a native pydict
Expand All @@ -998,9 +990,8 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data,
error_on_duplicate_keys = true;
} else {
auto val = std::underlying_type_t<MapConversionType>(options.maps_as_pydicts);
return Status::UnknownError(
"Received unknown option for maps_as_pydicts: " + std::to_string(val)
);
return Status::UnknownError("Received unknown option for maps_as_pydicts: " +
std::to_string(val));
}

auto status = ConvertMapHelper(
Expand All @@ -1009,26 +1000,23 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data,
dict_item.reset(PyDict_New());
return CheckPyError();
},
[&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value, OwnedRef& item_value) {
[&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value,
OwnedRef& item_value) {
auto setitem_result =
PyDict_SetItem(dict_item.obj(), key_value.obj(), item_value.obj());
ARROW_RETURN_NOT_OK(CheckMapAsPydictsTypeError());
// returns -1 if there are internal errors around hashing/resizing
return setitem_result == 0 ?
Status::OK() :
Status::UnknownError("[maps_as_pydicts] "
"Unexpected failure inserting Arrow (key, value) pair into Python dict"
);
return setitem_result == 0 ? Status::OK()
: Status::UnknownError(
"[maps_as_pydicts] "
"Unexpected failure inserting Arrow (key, "
"value) pair into Python dict");
},
[&dict_item, &total_dict_len]{
[&dict_item, &total_dict_len] {
total_dict_len += PyDict_Size(dict_item.obj());
return dict_item.detach();
},
data,
py_keys,
py_items,
item_arrays,
out_values);
data, py_keys, py_items, item_arrays, out_values);

ARROW_RETURN_NOT_OK(status);
// If there were no errors generating the pydicts,
Expand Down Expand Up @@ -2202,14 +2190,14 @@ class PandasBlockCreator {

// Helper function for extension chunked arrays
// Constructing a storage chunked array of an extension chunked array
std::shared_ptr<ChunkedArray> GetStorageChunkedArray(std::shared_ptr<ChunkedArray> arr){
auto value_type = checked_cast<const ExtensionType&>(*arr->type()).storage_type();
ArrayVector storage_arrays;
for (int c = 0; c < arr->num_chunks(); c++) {
const auto& arr_ext = checked_cast<const ExtensionArray&>(*arr->chunk(c));
storage_arrays.emplace_back(arr_ext.storage());
}
return std::make_shared<ChunkedArray>(std::move(storage_arrays), value_type);
std::shared_ptr<ChunkedArray> GetStorageChunkedArray(std::shared_ptr<ChunkedArray> arr) {
auto value_type = checked_cast<const ExtensionType&>(*arr->type()).storage_type();
ArrayVector storage_arrays;
for (int c = 0; c < arr->num_chunks(); c++) {
const auto& arr_ext = checked_cast<const ExtensionArray&>(*arr->chunk(c));
storage_arrays.emplace_back(arr_ext.storage());
}
return std::make_shared<ChunkedArray>(std::move(storage_arrays), value_type);
};

class ConsolidatedBlockCreator : public PandasBlockCreator {
Expand Down Expand Up @@ -2239,7 +2227,7 @@ class ConsolidatedBlockCreator : public PandasBlockCreator {
} else {
// In case of an extension array default to the storage type
if (arrays_[column_index]->type()->id() == Type::EXTENSION) {
arrays_[column_index] = GetStorageChunkedArray(arrays_[column_index]);
arrays_[column_index] = GetStorageChunkedArray(arrays_[column_index]);
}
return GetPandasWriterType(*arrays_[column_index], options_, out);
}
Expand Down Expand Up @@ -2465,7 +2453,7 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options,

// In case of an extension array default to the storage type
if (arr->type()->id() == Type::EXTENSION) {
arr = GetStorageChunkedArray(arr);
arr = GetStorageChunkedArray(arr);
}

PandasWriter::type output_type;
Expand Down
7 changes: 4 additions & 3 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,10 @@ class Table;
namespace py {

enum class MapConversionType {
DEFAULT, // convert arrow maps to assoc lists (list of kev-value tuples) in Pandas
LOSSY, // report warnings when lossiness is encountered due to duplicate keys
STRICT_, // raise a Python exception when lossiness is encountered due to duplicate keys
DEFAULT, // convert arrow maps to assoc lists (list of kev-value tuples) in Pandas
LOSSY, // report warnings when lossiness is encountered due to duplicate keys
STRICT_, // raise a Python exception when lossiness is encountered due to duplicate
// keys
};

struct PandasOptions {
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/src/arrow/python/csv.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
#include <vector>

#include "arrow/csv/options.h"
#include "arrow/util/macros.h"
#include "arrow/python/common.h"
#include "arrow/util/macros.h"

namespace arrow {
namespace py {
Expand Down
8 changes: 4 additions & 4 deletions python/pyarrow/src/arrow/python/datetime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,16 @@
#include <string_view>

#include "arrow/array.h"
#include "arrow/python/arrow_to_python_internal.h"
#include "arrow/python/common.h"
#include "arrow/python/helpers.h"
#include "arrow/python/platform.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/logging.h"
#include "arrow/util/regex.h"
#include "arrow/util/value_parsing.h"
#include "arrow/python/arrow_to_python_internal.h"
#include "arrow/python/common.h"
#include "arrow/python/helpers.h"
#include "arrow/python/platform.h"

namespace arrow {

Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/src/arrow/python/datetime.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@
#include <algorithm>
#include <chrono>

#include "arrow/python/platform.h"
#include "arrow/python/visibility.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/util/int_util_overflow.h"
#include "arrow/util/logging.h"
#include "arrow/python/platform.h"
#include "arrow/python/visibility.h"

// By default, PyDateTimeAPI is a *static* variable. This forces
// PyDateTime_IMPORT to be called in every C/C++ module using the
Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/src/arrow/python/decimal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@
#include <algorithm>
#include <limits>

#include "arrow/type_fwd.h"
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
#include "arrow/python/common.h"
#include "arrow/python/decimal.h"
#include "arrow/python/helpers.h"
#include "arrow/type_fwd.h"
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"

namespace arrow {
namespace py {
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/src/arrow/python/deserialize.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
#include <memory>
#include <vector>

#include "arrow/status.h"
#include "arrow/python/serialize.h"
#include "arrow/python/visibility.h"
#include "arrow/status.h"

namespace arrow {

Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/src/arrow/python/extension_type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
#include <sstream>
#include <utility>

#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
#include "arrow/python/extension_type.h"
#include "arrow/python/helpers.h"
#include "arrow/python/pyarrow.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"

namespace arrow {

Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/src/arrow/python/extension_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
#include <string>

#include "arrow/extension_type.h"
#include "arrow/util/macros.h"
#include "arrow/python/common.h"
#include "arrow/python/visibility.h"
#include "arrow/util/macros.h"

namespace arrow {
namespace py {
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/src/arrow/python/filesystem.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
// specific language governing permissions and limitations
// under the License.

#include "arrow/util/logging.h"
#include "arrow/python/filesystem.h"
#include "arrow/util/logging.h"

namespace arrow {

Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/src/arrow/python/filesystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
#include <vector>

#include "arrow/filesystem/filesystem.h"
#include "arrow/util/macros.h"
#include "arrow/python/common.h"
#include "arrow/python/visibility.h"
#include "arrow/util/macros.h"

namespace arrow {
namespace py {
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/src/arrow/python/flight.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
#include <signal.h>
#include <utility>

#include "arrow/python/flight.h"
#include "arrow/util/io_util.h"
#include "arrow/util/logging.h"
#include "arrow/python/flight.h"

using arrow::flight::FlightPayload;

Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/src/arrow/python/gdb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "arrow/datum.h"
#include "arrow/extension_type.h"
#include "arrow/ipc/json_simple.h"
#include "arrow/python/gdb.h"
#include "arrow/record_batch.h"
#include "arrow/scalar.h"
#include "arrow/table.h"
Expand All @@ -33,7 +34,6 @@
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/python/gdb.h"

namespace arrow {

Expand Down
Loading

0 comments on commit e89334d

Please sign in to comment.