Merge pull request #6 from ray310/package

Prepared for package distribution
ray310 · Jun 4, 2022 · 8e4cbe1 · 8e4cbe1
2 parents 9505f5d + 6f0be82
commit 8e4cbe1
Show file tree

Hide file tree

Showing 14 changed files with 438 additions and 66 deletions.
diff --git a/.pylintrc b/.pylintrc
@@ -0,0 +1,40 @@
+# Pylint configuration settings
+
+[MASTER]
+fail-under=9.0
+jobs=0
+
+[MESSAGES CONTROL]
+disable=raw-checker-failed,
+        bad-inline-option,
+        locally-disabled,
+        file-ignored,
+        suppressed-message,
+        useless-suppression,
+        deprecated-pragma,
+        use-symbolic-message-instead
+
+
+[REPORTS]
+output-format=colorized
+
+[BASIC]
+good-names=i,
+           j,
+           k,
+           s,
+           x,
+           y,
+           z,
+           df,
+           fh,
+           _
+
+[FORMAT]
+max-line-length=88
+
+[STRING]
+check-quote-consistency=yes
+
+[DESIGN]
+max-args=5
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,9 @@
+# Changelog
+
+## Unreleased
+### Added
+- Improved documentation
+____
+## 0.0.1 - 2022-06-04
+### Added
+- First version of Panda-Helper
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2021 Ray310
+Copyright (c) 2022 Ray310
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,2 @@
+recursive-include tests *.py *.txt *.csv
+include requirements.txt
diff --git a/README.md b/README.md
@@ -1,47 +1,230 @@
-# Quickly inspect Pandas DataFrames and Series with Panda-Helper data profiles
-- Perform initial data exploration
-- Detect data issues and help with quality control 
+# Panda-Helper: Quickly and easily inspect data
+Panda-Helper creates data profiles for data in Pandas DataFrames and Series
 
-### DataFrameProfile:
-- Reports DataFrame shape, Series names, and Series data types
-- Checks for obvious duplicates
-- Provides distribution statistics on null values per row
+Assess data quality and usefulness with minimal effort
 
-```
-DataFrameProfile(df)
-```
-![Sample DataFrameProfile](https://github.com/ray310/Panda-Helper/blob/main/images/df_profile.png)
+Effortlessly perform initial data exploration, _so you can move on to more in-depth analysis_
 
+-----
+### DataFrame profiles quickly and easily:
+- Report shape
+- Detect duplicated rows
+- Display series names and data types
+- Provide distribution statistics on null values per row, giving a view of data completeness
 
-### SeriesProfile:
-- Reports data type, number of unique values, and number of null values
-- Displays a frequency table of the most and least common values
-- Provides distribution statistics (for numeric data)
+__Sample DataFrame profile__<br>
+_Vehicles passing through toll stations_
 
-#### Catgorical data
-```
-SeriesProfile(df["Direction"])
-```
-![Sample Categorical SeriesProfile](https://github.com/ray310/Panda-Helper/blob/main/images/series_profile_direction.png)
+    DataFrame-Level Info
+    -------------------------  ------------
+    DF Shape                   (1586280, 6)
+    Duplicated Rows             2184
+
+    Column Name                 Data Type
+    --------------------------  -----------
+    Plaza ID                    int64
+    Date                        object
+    Hour                        int64
+    Direction                   object
+    # Vehicles - ETC (E-ZPass)  int64
+    # Vehicles - Cash/VToll     int64
+
+    Summary of Nulls Per Row
+    --------------------------  -----------
+    count                       1.58628e+06
+    min                         0
+    1%                          0
+    5%                          0
+    25%                         0
+    50%                         0
+    75%                         0
+    95%                         0
+    99%                         0
+    max                         0
+    median                      0
+    mean                        0
+    median absolute deviation   0
+    standard deviation          0
+    skew                        0
 
+-----
+### Series profiles quickly and easily report the:
+- Series data type 
+- Count of non-null values in the series
+- Number of unique values
+- Count of null values
+- Counts and frequency of the most and least common values
+- Distribution statistics for numeric data
 
-#### Numeric data
-```
-SeriesProfile(df["# Vehicles - ETC (E-ZPass)"])
-```
-![Sample Numeric SeriesProfile](https://github.com/ray310/Panda-Helper/blob/main/images/series_profile_ez.png)
+__Sample profile of categorical data__<br>
+_Direction vehicles are traveling_
 
+    Direction Info
+    ----------------  -------
+    Data Type         object
+    Count             1586280
+    Unique Values     2
+    Null Values       0
+
+    Value      Count  % of total
+    -------  -------  ------------
+    I         814100  51.32%
+    O         772180  48.68%
 
-### Using Panda-Helper
-- Note that Panda-Helper is not currently a package
-- Install any required dependencies to your environment of choice
-- Copy `reports.py` (in `src/pandahelper` directory) and incorporate into your analyses
-- Cite this repo or let me know if this is helpful
+__Sample profile of numeric data__<br>
+_Hourly vehicle counts at tolling points_
 
+    # Vehicles - ETC (E-ZPass) Info
+    ---------------------------------  -------
+    Data Type                          int64
+    Count                              1586280
+    Unique Values                      8987
+    Null Values                        0
+
+      Value    Count  % of total
+    -------  -------  ------------
+          0     3137  0.20%
+         43     1762  0.11%
+         44     1743  0.11%
+         40     1712  0.11%
+         42     1699  0.11%
+         41     1682  0.11%
+         39     1676  0.11%
+         37     1673  0.11%
+         48     1659  0.10%
+         46     1654  0.10%
+         38     1646  0.10%
+         45     1641  0.10%
+         36     1636  0.10%
+         52     1574  0.10%
+         47     1572  0.10%
+         50     1571  0.10%
+         51     1555  0.10%
+         53     1547  0.10%
+         55     1543  0.10%
+         34     1534  0.10%
+       8269        1  0.00%
+       8438        1  0.00%
+       8876        1  0.00%
+       8261        1  0.00%
+       8694        1  0.00%
+
+    Statistic                            Value
+    -------------------------  ---------------
+    count                          1.58628e+06
+    min                            0
+    1%                            25
+    5%                            68
+    25%                          407
+    50%                         1054
+    75%                         2071
+    95%                         3583
+    99%                         6308
+    max                        16854
+    median                      1054
+    mean                        1373.16
+    median absolute deviation    751
+    standard deviation          1253.1
+    skew                           1.69154
 
-<br><br>Demonstration data obtained from: <br>
-https://data.ny.gov/Transportation/Hourly-Traffic-on-Metropolitan-Transportation-Auth/qzve-kjga/data
+-----
+### Installing Panda-Helper
+`pip install panda-helper`
 
+-----
+### Using Panda-Helper
+__Profiling a DataFrame__<br>
+Create the DataFrameProfile and then display it or save the profile.
+```python
+import pandas as pd
+import pandahelper.reports as ph
+
+data = {
+    "user_id": [1, 2, 3, 4, 4],
+    "transaction": ["purchase", "return", "purchase", "exchange", "exchange"],
+    "amount": [100.00, None, 1400.00, 85.12, 85.12],
+    "survey": [None, None, None, "online", "online"],
+}
+df = pd.DataFrame(data)
+df_profile = ph.DataFrameProfile(df)
+df_profile
+```
 
-<br><br>Test data obtained from: <br>
-https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95
+    DataFrame-Level Info
+    -------------------------  ------
+    DF Shape                   (5, 4)
+    Obviously Duplicated Rows  1
+
+    Column Name    Data Type
+    -------------  -----------
+    user_id        int64
+    transaction    object
+    amount         float64
+    survey         object
+
+    Summary of Nulls Per Row
+    --------------------------  --------
+    count                       5
+    min                         0
+    1%                          0
+    5%                          0
+    25%                         0
+    50%                         1
+    75%                         1
+    95%                         1.8
+    99%                         1.96
+    max                         2
+    median                      1
+    mean                        0.8
+    median absolute deviation   1
+    standard deviation          0.83666
+    skew                        0.512241
+
+```python
+df_profile.save_report("df_profile.txt")
+```
+
+__Profiling a Series__<br>
+Create the SeriesProfile and then display it or save it. That's it!
+```python
+series_profile = ph.SeriesProfile(df["amount"])
+series_profile
+```
+    amount Info
+    -------------  -------
+    Data Type      float64
+    Count          4
+    Unique Values  3
+    Null Values    1
+
+      Value    Count  % of total
+    -------  -------  ------------
+      85.12        2  50.00%
+     100           1  25.00%
+    1400           1  25.00%
+
+    Statistic                       Value
+    -------------------------  ----------
+    count                         4
+    min                          85.12
+    1%                           85.12
+    5%                           85.12
+    25%                          85.12
+    50%                          92.56
+    75%                         425
+    95%                        1205
+    99%                        1361
+    max                        1400
+    median                       92.56
+    mean                        417.56
+    median absolute deviation     7.44
+    standard deviation          654.998
+    skew                          1.99931
+
+```python
+series_profile.save_report("amount_profile.txt")
+```
+____
+### Sample data obtained from:
+- https://data.ny.gov/Transportation/Hourly-Traffic-on-Metropolitan-Transportation-Auth/qzve-kjga/data
+- https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95
diff --git a/conda_environment_dev.yaml b/conda_environment_dev.yaml
@@ -0,0 +1,20 @@
+name: panda_helper
+channels:
+    - defaults
+    - conda-forge
+dependencies:
+    - python=3.9
+    - black
+    - build
+    - coverage
+    - jupyter
+    - pandas
+    - pip
+    - pydocstyle
+    - pylint
+    - pytest
+    - notebook
+    - scipy
+    - twine
+    - pip:
+        - tabulate
diff --git a/conda_requirements.yaml b/conda_requirements.yaml
diff --git a/images/df_profile.png b/images/df_profile.png
diff --git a/images/series_profile_direction.png b/images/series_profile_direction.png
diff --git a/images/series_profile_ez.png b/images/series_profile_ez.png
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,37 @@
+[metadata]
+name = panda-helper
+version = 0.0.1
+author = Ray310
+author_email = ray310@pm.me
+classifiers =
+    Programming Language :: Python :: 3
+    License :: OSI Approved :: MIT License
+    Operating System :: OS Independent
+    Development Status :: 3 - Alpha
+    Topic :: Scientific/Engineering
+description = Data profiler for Pandas
+keywords = data-profiling
+license = MIT
+license_files = LICENSE
+long_description = file: README.md
+long_description_content_type = text/markdown
+platforms = any
+project_urls =
+    Project = https://github.com/ray310/Panda-Helper
+    Tracker = https://github.com/ray310/Panda-Helper/issues
+
+
+[options]
+include_package_data = true
+install_requires =
+    pandas
+    scipy>=1.5
+    tabulate
+packages = find:
+package_dir =
+    = src
+python_requires = >=3.6
+
+
+[options.packages.find]
+where = src