# Pandas accessor for `redflag`

In [1]:
import pandas as pd

df = pd.read_csv("https://geocomp.s3.amazonaws.com/data/RPC_simple.csv")

df.head()

ModuleNotFoundError: No module named 'pandas'

In [2]:
import redflag as rf

rf.imbalance_degree([1,1,2,1,1,2,2,1,1,1,1,1,2,1,3,3,3,3,2,1,1,1,1])

1.4130434782602501

In [4]:
from pandas.api.extensions import register_dataframe_accessor

In [5]:
@register_dataframe_accessor("redflag")
class RedflagAccessor:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    def imbalance_degree(self, target=None):
        return rf.imbalance_degree(self._obj[target])

    def minority_classes(self, target=None):
        return rf.minority_classes(self._obj[target])

In [6]:
df.redflag.imbalance_degree(target='Lithology')

-1.0

Noice.

In [7]:
df.redflag.minority_classes(target='Lithology')

array([], dtype=float64)

In [8]:
import redflag as rf
# Eleven values arranged symmetrically about zero, repeated three times over
# to give a 33-element list.
base_values = [-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3]
data = base_values * 3
rf.get_outliers(data)
# array([], dtype=int64)

array([], dtype=int64)

In [9]:
import numpy as np
import redflag as rf
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Toy regression data: the integers 0-9 as a single feature column, shuffled
# in place, with a target that is an exact linear function of the feature.
X = np.arange(10).reshape(-1, 1)
np.random.shuffle(X)
y = np.squeeze(1 + 10 * X)

# Put redflag's DistributionComparator in front of the regressor, then train.
steps = [rf.DistributionComparator(), LinearRegression()]
pipe = make_pipeline(*steps)
pipe.fit(X, y)

# Dramatically different distribution: predict on the features divided by 100.
X_shrunk = X / 100
pipe.predict(X_shrunk)

array([1.1, 1.9, 1.4, 1.7, 1.5, 1.3, 1. , 1.8, 1.6, 1.2])

In [10]:
pipe.predict(X / 100)

array([1.1, 1.9, 1.4, 1.7, 1.5, 1.3, 1. , 1.8, 1.6, 1.2])

In [11]:
X

array([[1],
       [9],
       [4],
       [7],
       [5],
       [3],
       [0],
       [8],
       [6],
       [2]])

## Series Accessor

In [12]:
from pandas.api.extensions import register_series_accessor
from pandas.api.extensions import register_dataframe_accessor

@register_series_accessor("redflag")
class SeriesAccessor:
    """Make `redflag` class-imbalance checks available as `series.redflag.<method>()`."""

    def __init__(self, pandas_obj):
        # Keep a handle on the Series this accessor was reached through.
        self._obj = pandas_obj

    def imbalance_degree(self):
        """Pass the wrapped Series to `rf.imbalance_degree` and return the result."""
        series = self._obj
        return rf.imbalance_degree(series)

    def minority_classes(self):
        """Pass the wrapped Series to `rf.minority_classes` and return the result."""
        series = self._obj
        return rf.minority_classes(series)

registration of accessor <class '__main__.SeriesAccessor'> under name 'redflag' for type <class 'pandas.core.series.Series'> is overriding a preexisting attribute with the same name.


In [13]:
df['Lithology'].redflag.imbalance_degree()

-1.0

## Avoid depending on pandas

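We want to avoid importing pandas unless the user actually wants the accessors, so that `redflag` still imports cleanly in an environment without pandas. One option, sketched below, is a do-nothing decorator factory that can stand in for `register_series_accessor` and `register_dataframe_accessor` when pandas is not available.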

By the way, we can't (or don't want to) avoid depending on `sklearn`, so the `sklearn.py` module does not need the same treatment.

In [14]:
def identity(arg):
    """
    Build a decorator that hands back the decorated function unchanged.

    Args:
        arg: Accepted so the factory can be called with one argument; it is
            never used.

    Returns:
        A one-argument callable that returns the very object it is given.
    """
    def passthrough(func):
        # No wrapping at all: the caller gets the original function object.
        return func

    return passthrough

@identity('foo')
def hello(x):
    """Return the greeting 'Hello <x>' for the given value."""
    greeting = f"Hello {x}"
    return greeting

In [15]:
hello('Matt')

'Hello Matt'

Test with environment `foo`, which does not have `pandas`...

In [1]:
import pandas as pd

df = pd.read_csv("https://geocomp.s3.amazonaws.com/data/RPC_simple.csv")

df.head()

ModuleNotFoundError: No module named 'pandas'

In [2]:
import redflag as rf

rf.imbalance_degree([1,1,2,1,1,2,2,1,1,1,1,1,2,1,3,3,3,3,2,1,1,1,1])

1.4130434782602501