Merge pull request #3485 from pycaret/check_duplicate_cols

Check duplicate cols
pycaret · Apr 25, 2023 · 4610ee8
2 parents 87f398b + d8c4027
commit 4610ee8
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 0 deletions.
diff --git a/pycaret/internal/preprocess/preprocessor.py b/pycaret/internal/preprocess/preprocessor.py
@@ -119,6 +119,10 @@ def _prepare_dataset(self, X, y=None):
         # Make copy to not overwrite mutable arguments
         X = to_df(deepcopy(X))
 
+        # No duplicate column names are allowed
+        if len(set(X.columns)) != len(X.columns):
+            raise ValueError("Duplicate column names found in X.")
+
         # Prepare target column
         if isinstance(y, (list, tuple, np.ndarray, pd.Series)):
             if not isinstance(y, pd.Series):

diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py
@@ -104,6 +104,14 @@ def test_assign_index(index):
     assert pc.dataset.index[0] != 0
 
 
+def test_duplicate_columns():
+    """Assert that setup raises a ValueError when the data has duplicate column names."""
+    data = pycaret.datasets.get_data("juice")
+    data = data.rename(columns={"Purchase": "Id"})  # Duplicates the Id column name (assumes juice already has an Id column)
+    with pytest.raises(ValueError, match=".*Duplicate column names found in X.*"):
+        pycaret.classification.setup(data)
+
+
 def test_duplicate_indices():
     """Assert that an error is raised when there are duplicate indices."""
     data = pycaret.datasets.get_data("juice")