In [None]:
#! pip install --upgrade pip

In [None]:
!pip install pyjanitor

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import janitor
import pandas_flavor as pf
import os


## Installation
With pip: `pip install pyjanitor`

With conda: `conda install pyjanitor -c conda-forge`


# 0. PyJanitor - Definition

pyjanitor is a Python-based API on top of pandas inspired by the janitor R package. 

It aims to provide a clean, understandable interface based on method chaining for common and less-common tasks involving data cleaning and DataFrame manipulation.

Source: https://pyjanitor.readthedocs.io/notebooks/pyjanitor_intro.html#Why-pyjanitor?



In [None]:
# List every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Load the beer reviews data set used in sections 1-3.
df = pd.read_csv('/kaggle/input/beerreviews/beer_reviews.csv')


In [None]:
# Outside Kaggle, load a local copy instead: df = pd.read_csv("beer_reviews.csv")
df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## 1. Remove Columns, Drop NA, Rename Columns

In [None]:
# The chain is only displayed, not assigned.
(
    df
    .remove_columns(["review_time"])
    .dropna(subset=["beer_abv"])
    .rename_column("review_overall", "Overall Review")
    .reset_index(drop=True)
)


In [None]:
# Inspect the column that the next section flags for nulls.
df.loc[:, "review_profilename"]

### 1.1 Flag null values

In [None]:
# Flag nulls in review_profilename; the result replaces df.
df = df.flag_nulls(columns=["review_profilename"])


In [None]:
# First six rows after flagging.
df.head(n=6)

### 1.2 Clean up column names by removing whitespace, punctuation/symbols, and capitalization



In [None]:
# Normalise the column names, then preview the first five rows.
df = df.clean_names()
df.head()

### 1.3 Remove entirely empty rows / columns:



In [None]:
# Add a column that holds only NaN so remove_empty() has something to drop.
df["Empty"] = np.nan

# Preview a few rows instead of rendering the whole frame.
df.head()

In [None]:
# Drop the all-empty column added in the previous cell.
df = df.remove_empty()
df.head(n=3)

### 1.4 Add columns

In [None]:
# Seed the generator so the random demo columns are identical on every run.
rng = np.random.default_rng(42)

# Both upper bounds are exclusive, as with np.random.randint.
df.add_columns(
    lucky_number=rng.integers(0, 10, len(df)),
    age=rng.integers(0, 100, len(df)),
)

## 2. Encode Categorical data

In [None]:
# Column dtypes before beer_style is encoded in the next cell.
df.dtypes

In [None]:
# Encode beer_style as categorical, then list the dtypes again to compare.
df = df.encode_categorical(["beer_style"])
df.dtypes

## 3. Calculate the mean and median of all numerical columns after grouping by beer style
Use `.collapse_levels()`, a pyjanitor convenience function, to convert the DataFrame returned by `.agg()` from multi-level columns (a result of supplying a list of aggregation operations) to single-level columns by concatenating the level names with an underscore:

In [None]:
# Aggregate the numeric columns only. Non-numeric columns have no mean or
# median, and pandas >= 2.0 raises a TypeError instead of silently dropping
# them from the result.
numeric_columns = df.select_dtypes(include="number").columns.tolist()

(
    df
    .groupby("beer_style")[numeric_columns]
    .agg(["mean", "median"])
    .collapse_levels()  # join the two column levels with an underscore
    .reset_index()
)

## 4. Merging columns that hold the same information

In [None]:
# Load the messy spreadsheet used in sections 4 and 5.
data = pd.read_excel("dirty_data.xlsx")
data

In [None]:
# Clean the column names, drop empty rows/columns, and tidy two awkward names.
df = (
    data
    .clean_names()
    .remove_empty()
    .rename_column("%_allocated", "percent_allocated")
    .rename_column("full_time_", "full_time")
)
df


In [None]:
# Called on the raw frame, so the original (uncleaned) column names are used.
data.coalesce(
    ["Certification", "Certification.1"],
    new_column_name="Certification",
)

## 5. Convert Excel date into a date format

In [None]:
# convert_excel_date writes the converted values back into the frame it is
# called on, so run the demo on a copy. The raw serial numbers in `data` must
# stay intact for the full cleaning chain in the next cell, which converts
# the same column again.
data.copy().convert_excel_date('Hire Date')


In [None]:
# Full cleaning pipeline: every step from sections 4 and 5 in one chain.
df = (
    data
    .clean_names()
    .remove_empty()
    .rename_column("%_allocated", "percent_allocated")
    .rename_column("full_time_", "full_time")
    # Merge the two certification columns. The merged column keeps the
    # lower-case naming that clean_names() gave every other column.
    .coalesce(
        column_names=["certification", "certification_1"],
        new_column_name="certification")
    .convert_excel_date('hire_date')
)

df
   

## 6. Use `groupby_agg` to find the average price for each item and append it as a column to the DataFrame

In [None]:
# Small sales table: one row per sale, with the item sold and its price.
data = dict(
    item=['shoe', 'shoe', 'bag', 'shoe', 'bag'],
    MRP=[220, 450, 320, 200, 305],
    number_sold=[100, 40, 56, 38, 25],
)
df = pd.DataFrame(data)
df

In [None]:
# Mean MRP per item, added as a new column.
df.groupby_agg(
    by="item",
    agg_column_name="MRP",
    agg="mean",
    new_column_name="Avg. MRP",
)

## 7. Filtering and Counting

In [None]:
# Toy table with repeated (nulls, type) pairs and some missing values.
df = pd.DataFrame(
    dict(
        name=["black", "black", "black", "red", "red"],
        type=["chair", "chair", "sofa", "sofa", "plate"],
        num=[4, 5, 12, 4, 3],
        nulls=[1, 1, np.nan, np.nan, 3],
    )
)
df

In [None]:
# Count rows per (nulls, type) pair, then keep the pairs seen more than once.
(
    df
    .groupby_agg(
        by=["nulls", "type"],
        agg="size",
        agg_column_name="type",
        new_column_name="counter",
    )
    .query('counter > 1')
)

## 8. Sorting categorical data

In [None]:
# Check the column dtypes before sorting.
df.dtypes

In [None]:
# Sort the rows by the "type" column with pyjanitor's sort_naturally.
df.sort_naturally("type")


## 9. Expand Column

In [None]:
# Instruments table. "name" packs two values separated by ", " and is
# split apart in the next cell.
df = pd.DataFrame(
        {
            "name": ("black, Fender", "black, Taylor", "blue, Fender", "red, Idaho", "red, Fender"),
            # Last entry corrected from the misspelled "ukulel".
            "type": ("guitar", "guitar", "banjo", "ukulele", "ukulele"),
            "num": (4, 5, 12, 4, 3),
            "nulls": (1, 1, np.nan, np.nan, 3),
        }
    )

df

In [None]:
# The separator includes the space after the comma.
df.expand_column(column_name='name', sep=', ')

## 10. Change the type of the column

In [None]:
# Cast num to float and then back to int, chaining the two conversions.
df = (
    df
    .change_type('num', float)
    .change_type('num', int)
)
df

## 11. Convert Unix Date

In [None]:
# Two consecutive integer timestamps to convert in the next cell.
df = pd.DataFrame({"unix_date": [1607977803, 1607977804]})
df

In [None]:
# Convert the unix_date column with pyjanitor's convert_unix_date.
# NOTE(review): the sample values look like seconds since the epoch — confirm the unit the method expects.
df.convert_unix_date("unix_date")

## 12. Find and Replace: Exact and Regex


In [None]:
# One order per customer; the order text is rewritten in the next cells.
df = pd.DataFrame(
    dict(
        customer=['Mary', 'Tom', 'Lila'],
        order=['ice coffee', 'lemonade', 'regular coffee'],
    )
)
df


In [None]:
# Keep the result: the regex example in the next cell looks for "latte",
# which only exists in df once this replacement has been applied. Assigning
# explicitly avoids relying on find_replace modifying df in place.
df = df.find_replace(
    match='exact',
    order={'ice coffee': 'latte', 'regular coffee': 'latte'}
)
df

In [None]:
# Regex match: replace orders matching the pattern 'latte$' with 'water'.
df.find_replace(match='regex', order={'latte$': 'water'})

## 13. Update Where (Conditions)

In [None]:
data = {
    "a": [1, 2, 3, 4],
    "b": [5, 6, 7, 8],
    "c": [0, 0, 0, 0]
}
df = pd.DataFrame(data)

# Demonstrate the method this section is named after: set c to 10 in the
# rows where a > 2 and b < 8, and leave every other row as it is.
# Arguments are, in order: condition mask, target column, new value.
df = df.update_where((df["a"] > 2) & (df["b"] < 8), "c", 10)

df

## Documentation: 

https://pyjanitor.readthedocs.io/