In [13]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

# Load the OkCupid profiles table from parquet (fastparquet backend) and
# preview the first five raw values of the `location` column.
df = pd.read_parquet(
    path='okcupid_profiles.parquet',
    engine='fastparquet',
)
print(df.loc[:, 'location'].head(n=5))

0    south san francisco, california
1                oakland, california
2          san francisco, california
3               berkeley, california
4          san francisco, california
Name: location, dtype: object


In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the `location` column into a dense indicator matrix.
# handle_unknown='ignore' means a category that was not seen during fit is
# encoded as an all-zero row at transform time instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Double brackets select a single-column DataFrame (2-D) rather than a Series.
encoded_data = encoder.fit_transform(df[['location']])

# Bonus 1: one generated column name per learned category, each prefixed
# with 'location_'.
feature_names = encoder.get_feature_names_out(['location'])

# Sanity check: every encoded column must have exactly one name.
assert encoded_data.shape[1] == len(feature_names)

# This line prints the *count* of features, so label it as a count; the old
# "Feature names:" label promised a list of names but showed a number.
print(f"Number of one-hot features: {len(feature_names)}")


Feature names: ['location_alameda, california' 'location_albany, california'
 'location_amsterdam, netherlands' 'location_arcadia, california'
 'location_asheville, north carolina' 'location_ashland, california'
 'location_astoria, new york' 'location_atherton, california'
 'location_atlanta, georgia' 'location_austin, texas'
 'location_bayshore, california' 'location_bellingham, washington'
 'location_bellwood, illinois' 'location_belmont, california'
 'location_belvedere tiburon, california' 'location_benicia, california'
 'location_berkeley, california' 'location_billings, montana'
 'location_boise, idaho' 'location_bolinas, california'
 'location_bonaduz, switzerland' 'location_boston, massachusetts'
 'location_boulder, colorado' 'location_brea, california'
 'location_brisbane, california' 'location_brooklyn, new york'
 'location_burlingame, california' 'location_cambridge, massachusetts'
 'location_campbell, california' 'location_canyon country, california'
 'location_canyon, cali

In [11]:
# ------------------------------------------------------------------
# 2. Normalise location: missing values -> "missing", then cast to str
# ------------------------------------------------------------------
location_str = (
    df["location"]
    .fillna(value="missing")
    .astype(str)
)

# ------------------------------------------------------------------
# 3. One single-token sample per row
# ------------------------------------------------------------------
# Each row's location string is wrapped in its own one-element list, so
# `samples` has one entry per row and every entry is an iterable of str.
samples = [[location] for location in location_str.to_list()]

# ------------------------------------------------------------------
# 4. Hash each sample into a fixed-width space of 256 (2**8) columns
# ------------------------------------------------------------------
hasher = FeatureHasher(
    input_type="string",     # every sample is an iterable of raw string tokens
    alternate_sign=True,     # signed hashing: a token contributes +1 or -1
    n_features=256,          # number of output buckets; tune as needed
)

# Only transform is called here (no separate fit step). The result is a
# sparse matrix with one row per sample and 256 columns.
hashed = hasher.transform(samples)

# ------------------------------------------------------------------
# 5. Expose the hashed matrix as a labelled DataFrame
# ------------------------------------------------------------------
# The matrix is densified purely so the preview below is readable; a real
# pipeline should keep it sparse. Rows reuse df's index so the two frames
# line up, and columns are named hash_0 .. hash_<n-1>.
hashed_df = pd.DataFrame(
    data=hashed.toarray(),
    index=df.index,
    columns=[f"hash_{i}" for i in range(hashed.shape[1])],
)

# Header line followed by the first three rows, each on its own line.
print("first 3 hashed rows:", hashed_df.head(3), sep="\n")

# ------------------------------------------------------------------
# 6. Replace the raw location column with its hashed columns
# ------------------------------------------------------------------
# Drop `location` from a copy of df (df itself is left untouched) and
# append the hashed columns side by side.
df_hashed = pd.concat(
    objs=[df.drop(labels="location", axis="columns"), hashed_df],
    axis="columns",
)

print("final shape with hashed columns ->", df_hashed.shape)

first 3 hashed rows:
   hash_0  hash_1  hash_2  hash_3  hash_4  hash_5  hash_6  hash_7  hash_8  \
0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1     0.0     1.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

   hash_9  ...  hash_246  hash_247  hash_248  hash_249  hash_250  hash_251  \
0     0.0  ...       0.0       0.0       0.0       0.0       0.0       0.0   
1     0.0  ...       0.0       0.0       0.0       0.0       0.0       0.0   
2     0.0  ...       0.0       0.0       0.0       0.0       0.0       0.0   

   hash_252  hash_253  hash_254  hash_255  
0       0.0       0.0       0.0       0.0  
1       0.0       0.0       0.0       0.0  
2       0.0       0.0       0.0       0.0  

[3 rows x 256 columns]
final shape with hashed columns -> (59946, 276)
