In [4]:
import sys, pandas as pd, importlib.util
print("Python exe :", sys.executable)
print("pandas     :", pd.__version__)
print("has pyarrow:", importlib.util.find_spec("pyarrow") is not None)
print("has fparq  :", importlib.util.find_spec("fastparquet") is not None)


Python exe : /home/edward99/github/datenvorbearbeitung/.venv/bin/python
pandas     : 2.3.1
has pyarrow: True
has fparq  : True


In [37]:
import pandas as pd
df = pd.read_parquet('okcupid_profiles.parquet', engine='fastparquet')
print(df.head())
print(df.shape)


   age     status sex orientation       body_type               diet  \
0   22     single   m    straight  a little extra  strictly anything   
1   35     single   m    straight         average       mostly other   
2   38  available   m    straight            thin           anything   
3   23     single   m    straight            thin         vegetarian   
4   29     single   m    straight        athletic               None   

     drinks      drugs                          education  \
0  socially      never      working on college/university   
1     often  sometimes              working on space camp   
2  socially       None     graduated from masters program   
3  socially       None      working on college/university   
4  socially      never  graduated from college/university   

             ethnicity  ...  income                          job  \
0         asian, white  ...      -1               transportation   
1                white  ...   80000         hospitality / travel

In [6]:
from sklearn.preprocessing import OneHotEncoder
# Create OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Fit and transform body_type column
encoded_data = encoder.fit_transform(df[['body_type']])
# Bonus 1: Get feature names
feature_names = encoder.get_feature_names_out(['body_type'])
print(f"Feature names: {feature_names}")


Feature names: ['body_type_a little extra' 'body_type_athletic' 'body_type_average'
 'body_type_curvy' 'body_type_fit' 'body_type_full figured'
 'body_type_jacked' 'body_type_overweight' 'body_type_rather not say'
 'body_type_skinny' 'body_type_thin' 'body_type_used up' 'body_type_None']


In [7]:
# Create DataFrame with encoded columns
encoded_df = pd.DataFrame(encoded_data, columns=feature_names)
print(encoded_df.head())

   body_type_a little extra  body_type_athletic  body_type_average  \
0                       1.0                 0.0                0.0   
1                       0.0                 0.0                1.0   
2                       0.0                 0.0                0.0   
3                       0.0                 0.0                0.0   
4                       0.0                 1.0                0.0   

   body_type_curvy  body_type_fit  body_type_full figured  body_type_jacked  \
0              0.0            0.0                     0.0               0.0   
1              0.0            0.0                     0.0               0.0   
2              0.0            0.0                     0.0               0.0   
3              0.0            0.0                     0.0               0.0   
4              0.0            0.0                     0.0               0.0   

   body_type_overweight  body_type_rather not say  body_type_skinny  \
0                   0.0          

In [8]:
#Dummy Coding
dummies = pd.get_dummies(df["body_type"], prefix="body", drop_first=True)
df_dummy = pd.concat([df.drop(columns=["body_type"]), dummies], axis=1)

print("Dummy coded DataFrame: ", dummies.head(), sep="\n")


Dummy coded DataFrame: 
   body_athletic  body_average  body_curvy  body_fit  body_full figured  \
0          False         False       False     False              False   
1          False          True       False     False              False   
2          False         False       False     False              False   
3          False         False       False     False              False   
4           True         False       False     False              False   

   body_jacked  body_overweight  body_rather not say  body_skinny  body_thin  \
0        False            False                False        False      False   
1        False            False                False        False      False   
2        False            False                False        False       True   
3        False            False                False        False       True   
4        False            False                False        False      False   

   body_used up  
0         False  
1       

In [9]:
#Effect Coding
import pandas as pd

def effect_code(series: pd.Series, prefix="x"):
    """Return an effect–coded DataFrame (K-1 columns, 1/0/-1)."""
    dummies = pd.get_dummies(series, prefix=prefix, drop_first=True)
    # rows that were dropped_first() become the reference --> turn the 0s into -1
    ref_mask = (~series.isna()) & (dummies.sum(axis=1) == 0)
    dummies.loc[ref_mask, :] = -1
    return dummies

ec = effect_code(df["body_type"], prefix="body")
df_ec = pd.concat([df.drop(columns="body_type"), ec], axis=1)
print("Effect coded DataFrame: ", ec.head(), sep="\n")


Effect coded DataFrame: 
  body_athletic body_average body_curvy body_fit body_full figured  \
0            -1           -1         -1       -1                -1   
1         False         True      False    False             False   
2         False        False      False    False             False   
3         False        False      False    False             False   
4          True        False      False    False             False   

  body_jacked body_overweight body_rather not say body_skinny body_thin  \
0          -1              -1                  -1          -1        -1   
1       False           False               False       False     False   
2       False           False               False       False      True   
3       False           False               False       False      True   
4       False           False               False       False     False   

  body_used up  
0           -1  
1        False  
2        False  
3        False  
4        False  


  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1
  dummies.loc[ref_mask, :] = -1


In [10]:
# Effect Coding
def frequency_cutoff(series: pd.Series, min_count: int = 100) -> pd.Series:
    """Replace infrequent levels by 'other'."""
    vc = series.value_counts()
    return series.where(series.map(vc) >= min_count, other="other")

df["body_type_cut"] = frequency_cutoff(df["body_type"], min_count=200)
dummies = pd.get_dummies(df["body_type_cut"], prefix="body")

print(dummies.head(2))

   body_a little extra  body_athletic  body_average  body_curvy  body_fit  \
0                 True          False         False       False     False   
1                False          False          True       False     False   

   body_full figured  body_jacked  body_other  body_overweight  body_skinny  \
0              False        False       False            False        False   
1              False        False       False            False        False   

   body_thin  body_used up  
0      False         False  
1      False         False  


In [11]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(
        handle_unknown="infrequent_if_exist",
        min_frequency=200,   # absolute cut-off
        drop="first"         # optional: keeps dummy coding
)
X_encoded = ohe.fit_transform(df[["body_type"]])
print("Encoded DataFrame with OHE: ", pd.DataFrame(X_encoded.toarray(), columns=ohe.get_feature_names_out()).head(2), sep="\n")



Encoded DataFrame with OHE: 
   body_type_athletic  body_type_average  body_type_curvy  body_type_fit  \
0                 0.0                0.0              0.0            0.0   
1                 0.0                1.0              0.0            0.0   

   body_type_full figured  body_type_jacked  body_type_overweight  \
0                     0.0               0.0                   0.0   
1                     0.0               0.0                   0.0   

   body_type_skinny  body_type_thin  body_type_used up  body_type_None  \
0               0.0             0.0                0.0             0.0   
1               0.0             0.0                0.0             0.0   

   body_type_infrequent_sklearn  
0                           0.0  
1                           0.0  


In [12]:
import pandas as pd
import numpy as np

# toy sample ─────────────────────────────────────────────
df = pd.DataFrame({
    "body_type": ["average", "athletic", "average", "curvy",
                  "athletic", "thin", "curvy", "average"],
    "liked":     [1,          0,          1,        0,
                  1,          1,       0,        1]       # 1 = liked, 0 = skipped
})
print("Sample DataFrame:\n", df, sep="\n")

# 1) build the 2-way contingency table
ct = (
    pd.crosstab(df.body_type, df.liked)
      .rename(columns={0: "non_like", 1: "like"})
      .assign(                                  # extra stats we care about
          total     = lambda x: x.like + x.non_like,
          # numerator of odds ratio  = like / (non_like + ε)
          odds_num  = lambda x: x.like / (x.non_like + 1e-6)
      )
)

print(ct)


Sample DataFrame:

  body_type  liked
0   average      1
1  athletic      0
2   average      1
3     curvy      0
4  athletic      1
5      thin      1
6     curvy      0
7   average      1
liked      non_like  like  total      odds_num
body_type                                     
athletic          1     1      2  9.999990e-01
average           0     3      3  3.000000e+06
curvy             2     0      2  0.000000e+00
thin              0     1      1  1.000000e+06


In [34]:
"""
Bin-count (target) encoding demo
✓ high-cardinality categorical → dense numeric columns
✓ no target leakage inside CV
✓ works end-to-end in an sklearn Pipeline
"""

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# ---------------------------------------------------------------------
# 1. toy data ─ exactly the little body_type / liked table you typed
# ---------------------------------------------------------------------
df = pd.DataFrame({
    "body_type": ["average", "athletic", "average", "curvy",
                  "athletic", "thin", "curvy", "average"],
    "liked":     [1, 0, 1, 0, 1, 1, 0, 1]
})


# ---------------------------------------------------------------------
# 2. helpers to create look-up tables   (fit on *training* data only!)
# ---------------------------------------------------------------------
def _build_lookups(series: pd.Series, y: pd.Series,
                   min_count: int = 1, eps: float = 1e-6):
    """return two dicts: positive counts, odds numerator"""
    vc = series.value_counts()
    safe = series.where(series.map(vc) >= min_count, other="other")

    ct = pd.crosstab(safe, y)              # rows = category, cols = {0,1}
    ct = ct.rename(columns={0: "neg", 1: "pos"})

    pos_cnt  = ct["pos"].to_dict()
    neg_cnt  = ct["neg"].to_dict()
    odds_num = {k: pos_cnt[k] / (neg_cnt.get(k, 0) + eps) for k in ct.index}
    return pos_cnt, odds_num


# ---------------------------------------------------------------------
# 3. proper sklearn transformer   (stateless once look-ups are given)
# ---------------------------------------------------------------------
class BinCountEncoder(BaseEstimator, TransformerMixin):
    """
    Replace a single categorical column by
    – positive count
    – odds-numerator  (= pos / neg)
    """
    def __init__(self, min_count: int = 30):
        self.min_count = min_count
        # placeholders – filled in fit()
        self._pos_lookup = None
        self._odds_lookup = None
        self._feat_names = np.array(["bin_pos_cnt", "bin_odds_num"])

    # sklearn API ------------------------------------------------------
    def fit(self, X, y):
        # X arrives as DataFrame with ONE column
        cat = X.iloc[:, 0]
        self._pos_lookup, self._odds_lookup = _build_lookups(
            cat, y, min_count=self.min_count
        )
        return self

    def transform(self, X):
        cat = X.iloc[:, 0]
        pos  = cat.map(self._pos_lookup).fillna(self._pos_lookup.get("other", 0))
        odds = cat.map(self._odds_lookup).fillna(self._odds_lookup.get("other", 0.0))
        return np.vstack([pos, odds]).T             # shape (n_samples, 2)

    def get_feature_names_out(self, in_names=None):
        return self._feat_names


# ---------------------------------------------------------------------
# 4. pipeline  → safe CV without leakage
# ---------------------------------------------------------------------
pipe = Pipeline([
    ("bin",  BinCountEncoder(min_count=1)),   # 1 for this tiny example
    ("clf",  LogisticRegression(solver="lbfgs"))
])

print("4-fold CV accuracy:",
      cross_val_score(pipe, df[["body_type"]], df["liked"],
                      cv=4, scoring="accuracy").mean())


# ---------------------------------------------------------------------
# 5. train / inference example
# ---------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    df[["body_type"]], df["liked"], test_size=0.25, random_state=0
)

pipe.fit(X_train, y_train)
print("\nencoded test rows:")
print(pipe.named_steps["bin"].transform(X_test))

print("\npredicted probabilities (like=1):")
print(pipe.predict_proba(X_test)[:, 1])


4-fold CV accuracy: 0.75

encoded test rows:
[[0.e+00 0.e+00]
 [2.e+00 2.e+06]]

predicted probabilities (like=1):
[0.25932963 1.        ]




In [38]:
print("Unique body types (using .unique()):")
print(df['body_type'].unique())

print("\nBody type counts (using .value_counts()):")
print(df['body_type'].value_counts())

print(f"\nTotal number of unique body types: {df['body_type'].nunique()}")

print("\nUnique body types (using set()):")
print(set(df['body_type'].dropna()))  # dropna() removes NaN values

Unique body types (using .unique()):
['a little extra' 'average' 'thin' 'athletic' 'fit' None 'skinny' 'curvy'
 'full figured' 'jacked' 'rather not say' 'used up' 'overweight']

Body type counts (using .value_counts()):
body_type
average           14652
fit               12711
athletic          11819
thin               4711
curvy              3924
a little extra     2629
skinny             1777
full figured       1009
overweight          444
jacked              421
used up             355
rather not say      198
Name: count, dtype: int64

Total number of unique body types: 12

Unique body types (using set()):
{'average', 'a little extra', 'jacked', 'full figured', 'athletic', 'skinny', 'curvy', 'overweight', 'rather not say', 'thin', 'fit', 'used up'}


In [None]:
df_copy = df.copy()
# ── 1. create a new column with the hashed values
df_copy



Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,income,job,last_online,location,offspring,pets,religion,sign,smokes,speaks
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,-1,transportation,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,sometimes,english
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,80000,hospitality / travel,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,no,"english (fluently), spanish (poorly), french (..."
2,38,available,m,straight,thin,anything,socially,,graduated from masters program,,...,-1,,2012-06-27-09-10,"san francisco, california",,has cats,,pisces but it doesn&rsquo;t matter,no,"english, french, c++"
3,23,single,m,straight,thin,vegetarian,socially,,working on college/university,white,...,20000,student,2012-06-28-14-22,"berkeley, california",doesn't want kids,likes cats,,pisces,no,"english, german (poorly)"
4,29,single,m,straight,athletic,,socially,never,graduated from college/university,"asian, black, other",...,-1,artistic / musical / writer,2012-06-27-21-26,"san francisco, california",,likes dogs and likes cats,,aquarius,no,english
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,59,single,f,straight,,,socially,never,graduated from college/university,,...,-1,sales / marketing / biz dev,2012-06-12-21-47,"oakland, california",has kids,has dogs,catholicism but not too serious about it,cancer and it&rsquo;s fun to think about,no,english
59942,24,single,m,straight,fit,mostly anything,often,sometimes,working on college/university,"white, other",...,-1,entertainment / media,2012-06-29-11-01,"san francisco, california",doesn't have kids,likes dogs and likes cats,agnosticism,leo but it doesn&rsquo;t matter,no,english (fluently)
59943,42,single,m,straight,average,mostly anything,not at all,never,graduated from masters program,asian,...,100000,construction / craftsmanship,2012-06-27-23-37,"south san francisco, california",doesn't have kids,,christianity but not too serious about it,sagittarius but it doesn&rsquo;t matter,no,english (fluently)
59944,27,single,m,straight,athletic,mostly anything,socially,often,working on college/university,"asian, black",...,-1,medicine / health,2012-06-23-13-01,"san francisco, california","doesn't have kids, but wants them",likes dogs and likes cats,agnosticism but not too serious about it,leo and it&rsquo;s fun to think about,trying to quit,"english (fluently), spanish (poorly), chinese ..."


In [41]:
from sklearn.feature_extraction import FeatureHasher
# ── 2. FeatureHasher expects a list / iterable of {feature_name: value} dicts
to_hash = (
    df['body_type']
      .fillna('missing')          # make sure every key is a str
      .astype(str)                # in case something weird slipped through
      .apply(lambda s: {s: 1})
)
hasher  = FeatureHasher(n_features=8, input_type='dict', alternate_sign=True)
hashed  = hasher.transform(to_hash)


# choose the vector width (power of two is common). 8 buckets   for the demo
hasher = FeatureHasher(n_features=8, input_type="dict", alternate_sign=True)
hashed = hasher.transform(to_hash)        # sparse CSR matrix

# ── 3. wrap for readability  ──────────────────────────────────────────
hashed_df = pd.DataFrame(
    hashed.toarray().astype(int),          # dense for the print-out
    columns=[f"h{i}" for i in range(hashed.shape[1])]
)
print("hashed representation:\n", hashed_df.head())

hashed representation:
    h0  h1  h2  h3  h4  h5  h6  h7
0   0   0   0   0   0  -1   0   0
1   0   0   0   0   0   1   0   0
2   0   0   0   0  -1   0   0   0
3   0   0   0   0  -1   0   0   0
4   0   0   0   1   0   0   0   0


In [61]:
# Replace df_copy.concat with:
df_combined = pd.concat([df_copy, hashed_df], axis=1)
print(df_combined.head())

   age     status sex orientation       body_type               diet  \
0   22     single   m    straight  a little extra  strictly anything   
1   35     single   m    straight         average       mostly other   
2   38  available   m    straight            thin           anything   
3   23     single   m    straight            thin         vegetarian   
4   29     single   m    straight        athletic               None   

     drinks      drugs                          education  \
0  socially      never      working on college/university   
1     often  sometimes              working on space camp   
2  socially       None     graduated from masters program   
3  socially       None      working on college/university   
4  socially      never  graduated from college/university   

             ethnicity  ...     smokes  \
0         asian, white  ...  sometimes   
1                white  ...         no   
2                 None  ...         no   
3                white  ...     

In [63]:
mask       = df_combined['drinks'].notna()        # keeps only real strings
df_clean   = df_combined[mask].copy()

df_clean

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,smokes,speaks,h0,h1,h2,h3,h4,h5,h6,h7
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,sometimes,english,0,0,0,0,0,-1,0,0
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,no,"english (fluently), spanish (poorly), french (...",0,0,0,0,0,1,0,0
2,38,available,m,straight,thin,anything,socially,,graduated from masters program,,...,no,"english, french, c++",0,0,0,0,-1,0,0,0
3,23,single,m,straight,thin,vegetarian,socially,,working on college/university,white,...,no,"english, german (poorly)",0,0,0,0,-1,0,0,0
4,29,single,m,straight,athletic,,socially,never,graduated from college/university,"asian, black, other",...,no,english,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,59,single,f,straight,,,socially,never,graduated from college/university,,...,no,english,0,0,0,0,0,0,-1,0
59942,24,single,m,straight,fit,mostly anything,often,sometimes,working on college/university,"white, other",...,no,english (fluently),0,1,0,0,0,0,0,0
59943,42,single,m,straight,average,mostly anything,not at all,never,graduated from masters program,asian,...,no,english (fluently),0,0,0,0,0,1,0,0
59944,27,single,m,straight,athletic,mostly anything,socially,often,working on college/university,"asian, black",...,trying to quit,"english (fluently), spanish (poorly), chinese ...",0,0,0,1,0,0,0,0


In [64]:
y = df_clean['drinks']
X = df_clean.drop(columns=['drinks'])


X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)  


Training set shape: (45568, 28)
Test set shape: (11393, 28)


In [60]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, accuracy_score

# 1. fit a model ───────────────────────────────────────────────
dummy = DummyClassifier(strategy="most_frequent")   # always predicts the most common class
dummy.fit(X_train, y_train)

# 2. predict on the held-out test set ──────────────────────────
y_pred = dummy.predict(X_test)

# 3. evaluate ─────────────────────────────────────────────────
print("accuracy :", accuracy_score(y_test, y_pred))
print("f1_macro :", f1_score(y_test, y_pred, average="macro"))  # suitable for multi-class
print("\nfull report:\n", classification_report(y_test, y_pred))


accuracy : 0.7335205828140086
f1_macro : 0.1410464135021097

full report:
               precision    recall  f1-score   support

 desperately       0.00      0.00      0.00        64
  not at all       0.00      0.00      0.00       653
       often       0.00      0.00      0.00      1033
      rarely       0.00      0.00      0.00      1192
    socially       0.73      1.00      0.85      8357
  very often       0.00      0.00      0.00        94

    accuracy                           0.73     11393
   macro avg       0.12      0.17      0.14     11393
weighted avg       0.54      0.73      0.62     11393



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [66]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# example: one-hot all object-dtype columns, leave numeric columns unchanged
cat_cols  = X_train.select_dtypes(include="object").columns
num_cols  = X_train.select_dtypes(exclude="object").columns

pre = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough",                         num_cols)
])

pipe = Pipeline([
        ("prep", pre),
        ("clf",  LogisticRegression(max_iter=10000, random_state=42))
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("f1_macro :", f1_score(y_test, y_pred, average="macro"))


f1_macro : 0.1410464135021097


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
