# Pandas accessor for `redflag`

In [2]:
import pandas as pd

# Read the example dataset directly from its remote CSV (needs network access).
df = pd.read_csv("https://geocomp.s3.amazonaws.com/data/RPC_simple.csv")

# Show the first rows as a quick check of the columns that were read.
df.head()

Unnamed: 0,Vp,Vs,rho,Lithology
0,3045.6,1595.7,2.109121,sandstone
1,3000.6,1517.1,2.090342,sandstone
2,3363.6,2041.5,2.13199,sandstone
3,3195.3,1606.2,2.184939,sandstone
4,4237.5,2448.6,2.472231,sandstone


In [3]:
import redflag as rf

rf.imbalance_degree([1,1,2,1,1,2,2,1,1,1,1,1,2,1,3,3,3,3,2,1,1,1,1])

1.4130434782602501

In [4]:
from pandas.api.extensions import register_dataframe_accessor

In [5]:
@register_dataframe_accessor("redflag")
class RedflagAccessor:
    """
    Make `redflag` target checks available as `df.redflag.<method>(target=...)`.

    Every method selects the column labelled `target` from the wrapped
    DataFrame and passes it to the `redflag` function of the same name.
    """

    def __init__(self, pandas_obj):
        # The DataFrame this accessor was created for.
        self._obj = pandas_obj

    def _target_column(self, target):
        """
        Return the column labelled `target` from the wrapped DataFrame.

        Raises:
            ValueError: if `target` was left as None and the DataFrame has no
                column labelled None.
        """
        # Omitting `target` used to reach `self._obj[None]` and fail with a
        # bare `KeyError: None`. A frame that really has a column labelled
        # None is still allowed through, so calls that worked keep working.
        if target is None and None not in self._obj.columns:
            raise ValueError(
                "`target` is required: pass the label of the column holding the target."
            )
        return self._obj[target]

    def imbalance_degree(self, target=None):
        """
        Return `rf.imbalance_degree` evaluated on the `target` column.

        Args:
            target: Label of the column to use as the target.

        Raises:
            ValueError: if no `target` is given.
        """
        return rf.imbalance_degree(self._target_column(target))

    def minority_classes(self, target=None):
        """
        Return `rf.minority_classes` evaluated on the `target` column.

        Args:
            target: Label of the column to use as the target.

        Raises:
            ValueError: if no `target` is given.
        """
        return rf.minority_classes(self._target_column(target))

In [6]:
df.redflag.imbalance_degree(target='Lithology')

-1.0

Noice.

In [7]:
df.redflag.minority_classes(target='Lithology')

array([], dtype=float64)

In [8]:
import redflag as rf
# Repeat an 11-value list, symmetric about zero, three times (33 values in all).
data = 3 * [-3, -2, -2, -1, 0, 0, 0, 1, 2, 2, 3]
rf.get_outliers(data)
# Expected result, matching the output recorded for this cell:
# array([], dtype=int64)

array([], dtype=int64)

In [9]:
import numpy as np
import redflag as rf
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# A single feature column holding the integers 0 to 9.
X = np.arange(10).reshape(-1, 1)
# NOTE(review): this in-place shuffle runs without any seed being set in the
# notebook as shown, so the row order differs from one run to the next.
np.random.shuffle(X)
# Exactly linear target, y = 10 * x + 1, squeezed down to one dimension.
y = np.squeeze(10 * X + 1)
# Place redflag's DistributionComparator ahead of the regressor in the pipeline.
pipe = make_pipeline(rf.DistributionComparator(), LinearRegression())
pipe.fit(X, y)
# Predict on inputs 100 times smaller than the data the pipeline was fitted on.
# NOTE(review): presumably the comparator is expected to flag this shift - confirm.
pipe.predict(X / 100)  # Dramatically different distribution.

array([1.3, 1.1, 1.8, 1.6, 1.5, 1.2, 1.7, 1.9, 1.4, 1. ])

In [10]:
pipe.predict(X / 100)

array([1.3, 1.1, 1.8, 1.6, 1.5, 1.2, 1.7, 1.9, 1.4, 1. ])

In [11]:
X

array([[3],
       [1],
       [8],
       [6],
       [5],
       [2],
       [7],
       [9],
       [4],
       [0]])

## Series Accessor

In [12]:
from pandas.api.extensions import register_series_accessor
from pandas.api.extensions import register_dataframe_accessor

@register_series_accessor("redflag")
class SeriesAccessor:
    """Make `redflag` target checks available as `series.redflag.<method>()`."""

    def __init__(self, pandas_obj):
        # Keep a handle on the Series this accessor was created for.
        self._obj = pandas_obj

    def imbalance_degree(self):
        """Pass the wrapped Series to `rf.imbalance_degree` and return the result."""
        series = self._obj
        return rf.imbalance_degree(series)

    def minority_classes(self):
        """Pass the wrapped Series to `rf.minority_classes` and return the result."""
        series = self._obj
        return rf.minority_classes(series)

In [13]:
df['Lithology'].redflag.imbalance_degree()

-1.0

## Avoid depending on pandas

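We want to avoid importing `pandas` when a user doesn't need the accessors, so that `pandas` can stay an optional dependency.

By contrast, we can't (or don't want to) avoid depending on `sklearn`, so the `sklearn.py` module does not need the same treatment.

One way to do this: when `pandas` is not installed, a do-nothing decorator factory with the same call shape can stand in for `register_dataframe_accessor` and `register_series_accessor`. The next cell sketches such a factory.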

In [14]:
def identity(arg):
    """
    Build a decorator that hands back the decorated object untouched.

    `arg` is accepted so that `@identity(...)` can be written with an
    argument, but it is never used.
    """
    def passthrough(func):
        # Nothing is wrapped or registered: the same object comes straight back.
        return func
    return passthrough

@identity('foo')
def hello(x):
    """Return a greeting that addresses `x`."""
    greeting = f"Hello {x}"
    return greeting

In [15]:
hello('Matt')

'Hello Matt'

Test with environment `foo`, which does not have `pandas`...

In [16]:
import pandas as pd

# Read the same remote CSV that was loaded at the top of the notebook
# (needs network access).
df = pd.read_csv("https://geocomp.s3.amazonaws.com/data/RPC_simple.csv")

# Show the first rows as a quick check of the columns that were read.
df.head()

Unnamed: 0,Vp,Vs,rho,Lithology
0,3045.6,1595.7,2.109121,sandstone
1,3000.6,1517.1,2.090342,sandstone
2,3363.6,2041.5,2.13199,sandstone
3,3195.3,1606.2,2.184939,sandstone
4,4237.5,2448.6,2.472231,sandstone


In [17]:
import redflag as rf

rf.imbalance_degree([1,1,2,1,1,2,2,1,1,1,1,1,2,1,3,3,3,3,2,1,1,1,1])

1.4130434782602501

## Dummy models

In [18]:
df['Lithology']

0      sandstone
1      sandstone
2      sandstone
3      sandstone
4      sandstone
         ...    
395        shale
396        shale
397        shale
398        shale
399        shale
Name: Lithology, Length: 400, dtype: object

In [67]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score

def dummy_classification_scores(y, random_state=None):
    """
    Score two baseline `DummyClassifier` strategies against the labels `y`.

    For each of the 'most_frequent' and 'stratified' strategies, a
    `DummyClassifier` is fitted on a placeholder column of ones together with
    `y`. Its predictions on that same placeholder are then scored with a
    weighted F1 and with ROC AUC (one-vs-rest unless `rf.is_binary(y)` is
    true, in which case the second probability column is used).

    Args:
        y (array-like): The class labels.
        random_state: Passed straight through to `DummyClassifier`.

    Returns:
        dict: One entry per strategy name, each a dict with the keys 'f1'
            and 'roc_auc'.
    """
    y = np.asanyarray(y)
    # Placeholder feature column of ones, shaped from `y`.
    X = np.ones_like(y).reshape(-1, 1)

    result = {}
    for method in ('most_frequent', 'stratified'):
        model = DummyClassifier(strategy=method, random_state=random_state)
        model.fit(X, y)
        # Hard predictions are drawn and scored before the probabilities are
        # requested, so the calls on the model happen in a fixed order.
        f1 = f1_score(y, model.predict(X), average='weighted')
        y_prob = model.predict_proba(X)
        if not rf.is_binary(y):
            auc = roc_auc_score(y, y_prob, multi_class='ovr')
        else:
            auc = roc_auc_score(y, y_prob[:, 1])
        result[method] = {'f1': f1, 'roc_auc': auc}
    return result

dummy_classification_scores(df['Lithology'], random_state=42)

{'most_frequent': {'f1': 0.33333333333333326, 'roc_auc': 0.5},
 'stratified': {'f1': 0.47233840363611357, 'roc_auc': 0.4725}}

In [68]:
# A three-class target given as a plain list: five 1s, three 2s and two 3s.
y_ = [1, 1, 1, 1, 1, 2, 2, 2, 3, 3]
dummy_classification_scores(y_, random_state=42)

{'most_frequent': {'f1': 0.3333333333333333, 'roc_auc': 0.5},
 'stratified': {'f1': 0.20000000000000004, 'roc_auc': 0.35654761904761906}}