In [21]:
import sys, pandas as pd, importlib.util
print("Python exe :", sys.executable)
print("pandas     :", pd.__version__)
print("has pyarrow:", importlib.util.find_spec("pyarrow") is not None)
print("has fparq  :", importlib.util.find_spec("fastparquet") is not None)


Python exe : /home/edward99/github/datenvorbearbeitung/.venv/bin/python
pandas     : 2.3.1
has pyarrow: True
has fparq  : True


In [23]:
import pandas as pd
df = pd.read_parquet('okcupid_profiles.parquet', engine='fastparquet')
print(df.head()['body_type'])
print(df.shape)


0    a little extra
1           average
2              thin
3              thin
4          athletic
Name: body_type, dtype: object
(59946, 21)


In [3]:
from sklearn.preprocessing import OneHotEncoder
# Create OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Fit and transform body_type column
encoded_data = encoder.fit_transform(df[['body_type']])
# Bonus 1: Get feature names
feature_names = encoder.get_feature_names_out(['body_type'])
print(f"Feature names: {feature_names}")


Feature names: ['body_type_a little extra' 'body_type_athletic' 'body_type_average'
 'body_type_curvy' 'body_type_fit' 'body_type_full figured'
 'body_type_jacked' 'body_type_overweight' 'body_type_rather not say'
 'body_type_skinny' 'body_type_thin' 'body_type_used up' 'body_type_None']


In [4]:
# Create DataFrame with encoded columns
encoded_df = pd.DataFrame(encoded_data, columns=feature_names)
print(encoded_df.head())

   body_type_a little extra  body_type_athletic  body_type_average  \
0                       1.0                 0.0                0.0   
1                       0.0                 0.0                1.0   
2                       0.0                 0.0                0.0   
3                       0.0                 0.0                0.0   
4                       0.0                 1.0                0.0   

   body_type_curvy  body_type_fit  body_type_full figured  body_type_jacked  \
0              0.0            0.0                     0.0               0.0   
1              0.0            0.0                     0.0               0.0   
2              0.0            0.0                     0.0               0.0   
3              0.0            0.0                     0.0               0.0   
4              0.0            0.0                     0.0               0.0   

   body_type_overweight  body_type_rather not say  body_type_skinny  \
0                   0.0          

In [10]:
#Dummy Coding
dummies = pd.get_dummies(df["body_type"], prefix="body", drop_first=True)
df_dummy = pd.concat([df.drop(columns=["body_type"]), dummies], axis=1)

print("Dummy coded DataFrame: ", dummies.head(), sep="\n")


Dummy coded DataFrame: 
   body_athletic  body_average  body_curvy  body_fit  body_full figured  \
0          False         False       False     False              False   
1          False          True       False     False              False   
2          False         False       False     False              False   
3          False         False       False     False              False   
4           True         False       False     False              False   

   body_jacked  body_overweight  body_rather not say  body_skinny  body_thin  \
0        False            False                False        False      False   
1        False            False                False        False      False   
2        False            False                False        False       True   
3        False            False                False        False       True   
4        False            False                False        False      False   

   body_used up  
0         False  
1       

In [6]:
#Effect Coding
import pandas as pd

def effect_code(series: pd.Series, prefix="x"):
    """Return an effect–coded DataFrame (K-1 columns, 1/0/-1)."""
    dummies = pd.get_dummies(series, prefix=prefix, drop_first=True)
    # rows that were dropped_first() become the reference --> turn the 0s into -1
    ref_mask = (~series.isna()) & (dummies.sum(axis=1) == 0)
    dummies.loc[ref_mask, :] = -1
    return dummies

ec = effect_code(df["body_type"], prefix="body")
df_ec = pd.concat([df.drop(columns="body_type"), ec], axis=1)
print("Effect coded DataFrame: ", ec.head(), sep="\n")


Effect coded DataFrame: 
  body_athletic body_average body_curvy body_fit body_full figured  \
0            -1           -1         -1       -1                -1   
1         False         True      False    False             False   
2         False        False      False    False             False   
3         False        False      False    False             False   
4          True        False      False    False             False   

  body_jacked body_overweight body_rather not say body_skinny body_thin  \
0          -1              -1                  -1          -1        -1   
1       False           False               False       False     False   
2       False           False               False       False      True   
3       False           False               False       False      True   
4       False           False               False       False     False   

  body_used up  
0           -1  
1        False  
2        False  
3        False  
4        False  


  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1


In [14]:
# Effect Coding
def frequency_cutoff(series: pd.Series, min_count: int = 100) -> pd.Series:
    """Replace infrequent levels by 'other'."""
    vc = series.value_counts()
    return series.where(series.map(vc) >= min_count, other="other")

df["body_type_cut"] = frequency_cutoff(df["body_type"], min_count=200)
dummies = pd.get_dummies(df["body_type_cut"], prefix="body")

print(dummies.head(2))

   body_a little extra  body_athletic  body_average  body_curvy  body_fit  \
0                 True          False         False       False     False   
1                False          False          True       False     False   

   body_full figured  body_jacked  body_other  body_overweight  body_skinny  \
0              False        False       False            False        False   
1              False        False       False            False        False   

   body_thin  body_used up  
0      False         False  
1      False         False  


In [15]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(
        handle_unknown="infrequent_if_exist",
        min_frequency=200,   # absolute cut-off
        drop="first"         # optional: keeps dummy coding
)
X_encoded = ohe.fit_transform(df[["body_type"]])
print("Encoded DataFrame with OHE: ", pd.DataFrame(X_encoded.toarray(), columns=ohe.get_feature_names_out()).head(2), sep="\n")



Encoded DataFrame with OHE: 
   body_type_athletic  body_type_average  body_type_curvy  body_type_fit  \
0                 0.0                0.0              0.0            0.0   
1                 0.0                1.0              0.0            0.0   

   body_type_full figured  body_type_jacked  body_type_overweight  \
0                     0.0               0.0                   0.0   
1                     0.0               0.0                   0.0   

   body_type_skinny  body_type_thin  body_type_used up  body_type_None  \
0               0.0             0.0                0.0             0.0   
1               0.0             0.0                0.0             0.0   

   body_type_infrequent_sklearn  
0                           0.0  
1                           0.0  


In [17]:
import pandas as pd
import numpy as np

# toy sample ─────────────────────────────────────────────
df = pd.DataFrame({
    "body_type": ["average", "athletic", "average", "curvy",
                  "athletic", "thin", "curvy", "average"],
    "liked":     [1,          0,          1,        0,
                  1,          1,       0,        1]       # 1 = liked, 0 = skipped
})
print("Sample DataFrame:\n", df, sep="\n")

# 1) build the 2-way contingency table
ct = (
    pd.crosstab(df.body_type, df.liked)
      .rename(columns={0: "non_like", 1: "like"})
      .assign(                                  # extra stats we care about
          total     = lambda x: x.like + x.non_like,
          # numerator of odds ratio  = like / (non_like + ε)
          odds_num  = lambda x: x.like / (x.non_like + 1e-6)
      )
)

print(ct)


Sample DataFrame:

  body_type  liked
0   average      1
1  athletic      0
2   average      1
3     curvy      0
4  athletic      1
5      thin      1
6     curvy      0
7   average      1
liked      non_like  like  total      odds_num
body_type                                     
athletic          1     1      2  9.999990e-01
average           0     3      3  3.000000e+06
curvy             2     0      2  0.000000e+00
thin              0     1      1  1.000000e+06


In [19]:
"""
Bin-count (target) encoding demo
✓ high-cardinality categorical → dense numeric columns
✓ no target leakage inside CV
✓ works end-to-end in an sklearn Pipeline
"""

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# ---------------------------------------------------------------------
# 1. toy data ─ exactly the little body_type / liked table you typed
# ---------------------------------------------------------------------
df = pd.DataFrame({
    "body_type": ["average", "athletic", "average", "curvy",
                  "athletic", "thin", "curvy", "average"],
    "liked":     [1, 0, 1, 0, 1, 1, 0, 1]
})


# ---------------------------------------------------------------------
# 2. helpers to create look-up tables   (fit on *training* data only!)
# ---------------------------------------------------------------------
def _build_lookups(series: pd.Series, y: pd.Series,
                   min_count: int = 1, eps: float = 1e-6):
    """return two dicts: positive counts, odds numerator"""
    vc = series.value_counts()
    safe = series.where(series.map(vc) >= min_count, other="other")

    ct = pd.crosstab(safe, y)              # rows = category, cols = {0,1}
    ct = ct.rename(columns={0: "neg", 1: "pos"})

    pos_cnt  = ct["pos"].to_dict()
    neg_cnt  = ct["neg"].to_dict()
    odds_num = {k: pos_cnt[k] / (neg_cnt.get(k, 0) + eps) for k in ct.index}
    return pos_cnt, odds_num


# ---------------------------------------------------------------------
# 3. proper sklearn transformer   (stateless once look-ups are given)
# ---------------------------------------------------------------------
class BinCountEncoder(BaseEstimator, TransformerMixin):
    """
    Replace a single categorical column by
    – positive count
    – odds-numerator  (= pos / neg)
    """
    def __init__(self, min_count: int = 30):
        self.min_count = min_count
        # placeholders – filled in fit()
        self._pos_lookup = None
        self._odds_lookup = None
        self._feat_names = np.array(["bin_pos_cnt", "bin_odds_num"])

    # sklearn API ------------------------------------------------------
    def fit(self, X, y):
        # X arrives as DataFrame with ONE column
        cat = X.iloc[:, 0]
        self._pos_lookup, self._odds_lookup = _build_lookups(
            cat, y, min_count=self.min_count
        )
        return self

    def transform(self, X):
        cat = X.iloc[:, 0]
        pos  = cat.map(self._pos_lookup).fillna(self._pos_lookup.get("other", 0))
        odds = cat.map(self._odds_lookup).fillna(self._odds_lookup.get("other", 0.0))
        return np.vstack([pos, odds]).T             # shape (n_samples, 2)

    def get_feature_names_out(self, in_names=None):
        return self._feat_names


# ---------------------------------------------------------------------
# 4. pipeline  → safe CV without leakage
# ---------------------------------------------------------------------
pipe = Pipeline([
    ("bin",  BinCountEncoder(min_count=1)),   # 1 for this tiny example
    ("clf",  LogisticRegression(solver="lbfgs"))
])

print("4-fold CV accuracy:",
      cross_val_score(pipe, df[["body_type"]], df["liked"],
                      cv=4, scoring="accuracy").mean())


# ---------------------------------------------------------------------
# 5. train / inference example
# ---------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    df[["body_type"]], df["liked"], test_size=0.25, random_state=0
)

pipe.fit(X_train, y_train)
print("\nencoded test rows:")
print(pipe.named_steps["bin"].transform(X_test))

print("\npredicted probabilities (like=1):")
print(pipe.predict_proba(X_test)[:, 1])


4-fold CV accuracy: 0.75

encoded test rows:
[[0.e+00 0.e+00]
 [2.e+00 2.e+06]]

predicted probabilities (like=1):
[0.25932963 1.        ]




In [28]:
from sklearn.feature_extraction import FeatureHasher
# ── 2. FeatureHasher expects a list / iterable of {feature_name: value} dicts
to_hash = df['body_type'].apply(lambda s: {s: 1})

# choose the vector width (power of two is common). 8 buckets   for the demo
hasher = FeatureHasher(n_features=8, input_type="dict", alternate_sign=True)
hashed = hasher.transform(to_hash)        # sparse CSR matrix

# ── 3. wrap for readability  ──────────────────────────────────────────
hashed_df = pd.DataFrame(
    hashed.toarray().astype(int),          # dense for the print-out
    columns=[f"h{i}" for i in range(hashed.shape[1])]
)
print("hashed representation:\n", hashed_df.head())

TypeError: feature names must be strings