##### Report for the okcupid_profile dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score


In [2]:

# -----------------------------
# 1. Load the Dataset
# -----------------------------
file_path = '../data/okcupid_profiles.csv'  # Updated file path
df = pd.read_csv(file_path)


In [7]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)  # Prevent line wrapping for better visibility


df.head()

   age     status sex orientation       body_type               diet    drinks      drugs                          education            ethnicity  height  income                          job       last_online                         location                               offspring                       pets                                  religion                                sign     smokes                                             speaks                                             essay0                                             essay1                                             essay2                                             essay3                                             essay4                                             essay5                       essay6                                             essay7                                             essay8                                             essay9
0   22     single   m    straight  a little extra  strictly any

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,height,income,job,last_online,location,offspring,pets,religion,sign,smokes,speaks,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",75.0,-1,transportation,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,sometimes,english,about me: i would love to think that i was so...,currently working as an international agent fo...,making people laugh. ranting about a good salt...,"the way i look. i am a six foot half asian, ha...","books: absurdistan, the republic, of mice and ...",food. water. cell phone. shelter.,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet! you are ti...
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,70.0,80000,hospitality / travel,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,no,"english (fluently), spanish (poorly), french (...",i am a chef: this is what that means. 1. i am ...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories. my b...,,,i am very open and will share just about anyth...,
2,38,available,m,straight,thin,anything,socially,,graduated from masters program,,68.0,-1,,2012-06-27-09-10,"san francisco, california",,has cats,,pisces but it doesn&rsquo;t matter,no,"english, french, c++","i'm not ashamed of much, but writing public te...","i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,okay this is where the cultural matrix gets so...,movement conversation creation contemplation t...,,viewing. listening. dancing. talking. drinking...,"when i was five years old, i was known as ""the...","you are bright, open, intense, silly, ironic, ..."
3,23,single,m,straight,thin,vegetarian,socially,,working on college/university,white,71.0,20000,student,2012-06-28-14-22,"berkeley, california",doesn't want kids,likes cats,,pisces,no,"english, german (poorly)",i work in a library and go to school. . .,reading things written by old dead people,playing synthesizers and organizing books acco...,socially awkward but i do my best,"bataille, celine, beckett. . . lynch, jarmusch...",,cats and german philosophy,,,you feel so inclined.
4,29,single,m,straight,athletic,,socially,never,graduated from college/university,"asian, black, other",66.0,-1,artistic / musical / writer,2012-06-27-21-26,"san francisco, california",,likes dogs and likes cats,,aquarius,no,english,hey how's it going? currently vague on the pro...,work work work work + play,creating imagery to look at: http://bagsbrown....,i smile a lot and my inquisitive nature,"music: bands, rappers, musicians at the moment...",,,,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   sex          59946 non-null  object 
 3   orientation  59946 non-null  object 
 4   body_type    54650 non-null  object 
 5   diet         35551 non-null  object 
 6   drinks       56961 non-null  object 
 7   drugs        45866 non-null  object 
 8   education    53318 non-null  object 
 9   ethnicity    54266 non-null  object 
 10  height       59943 non-null  float64
 11  income       59946 non-null  int64  
 12  job          51748 non-null  object 
 13  last_online  59946 non-null  object 
 14  location     59946 non-null  object 
 15  offspring    24385 non-null  object 
 16  pets         40025 non-null  object 
 17  religion     39720 non-null  object 
 18  sign         48890 non-null  object 
 19  smok

In [15]:
import pandas as pd
import re

# Assume df is already defined (e.g., loaded via pd.read_csv("your_data.csv"))

# Define columns to skip because they have too many unique values
skip_cols = ['age', 'income', 'last_online', 'location'] + [f"essay{i}" for i in range(10)]

# ---------------------------
# Helper functions for cleaning
# ---------------------------

def clean_speaks(entry):
    """
    Cleans an entry from the 'speaks' column by splitting on commas,
    removing any parenthetical information, stripping extra spaces,
    and lowercasing the language names.
    Returns a list of cleaned language tokens.
    """
    languages = entry.split(',')
    cleaned = []
    for lang in languages:
        # Remove parenthetical remarks (e.g., "english (fluently)" -> "english")
        lang_clean = re.sub(r'\s*\(.*\)', '', lang).strip().lower()
        if lang_clean:
            cleaned.append(lang_clean)
    return cleaned

def clean_ethnicity(entry):
    """
    Cleans an entry from the 'ethnicity' column.
    Splits on commas and also on forward-slashes if present,
    strips extra spaces, and lowercases the tokens.
    Returns a list of cleaned ethnicity tokens.
    """
    # First split on commas, then further split each piece on '/'
    tokens = []
    for part in entry.split(','):
        for token in part.split('/'):
            token_clean = token.strip().lower()
            if token_clean:
                tokens.append(token_clean)
    return tokens

def clean_job(entry):
    """
    Cleans an entry from the 'job' column.
    Many job entries include multiple values separated by commas and/or '/'
    (for example, "artistic / musical / writer" or "hospitality / travel").
    This function splits the string on commas and then on '/' to extract individual tokens,
    strips extra whitespace, and lowercases the result.
    Returns a list of cleaned job tokens.
    """
    tokens = []
    # Split by comma first (if there are multiple entries)
    for part in entry.split(','):
        # Then split further by '/'
        for sub in part.split('/'):
            token = sub.strip().lower()
            if token:
                tokens.append(token)
    return tokens

def split_religion(entry):
    """
    Splits a religion entry into two parts:
      - primary religion (e.g. "christianity", "atheism", etc.)
      - secondary descriptor (e.g. "very serious about it", "but not too serious about it")
      
    It uses a regular expression to look for "and" or "but" as a separator.
    If no separator is found, the entire entry is treated as the primary religion.
    """
    pattern = re.compile(r'^(.*?)\s*(?:and|but)\s*(.*)$', flags=re.IGNORECASE)
    match = pattern.match(entry)
    if match:
        primary = match.group(1).strip().lower()
        seriousness = match.group(2).strip().lower()
        return primary, seriousness
    else:
        return entry.strip().lower(), ""

def apply_religion_split(x):
    """
    Helper to return a pandas Series for the new religion columns.
    """
    if pd.isnull(x):
        return pd.Series(["", ""])
    else:
        return pd.Series(split_religion(x))

# -------------------------------------------
# Create new columns by splitting "religion"
# -------------------------------------------
df[['religion_primary', 'religion_seriousness']] = df['religion'].apply(apply_religion_split)

# -------------------------------------------------
# Build the unique summary string for the DataFrame
# -------------------------------------------------
unique_summary = ""

for col in df.columns:
    # Skip columns with too many unique values or the newly created religion columns
    if col in skip_cols or col in ['religion_primary', 'religion_seriousness']:
        continue

    # Special cleaning for the "speaks" column
    if col == 'speaks':
        unique_set = set()
        for entry in df['speaks'].dropna():
            cleaned_languages = clean_speaks(entry)
            unique_set.update(cleaned_languages)
        unique_str = ", ".join(sorted(unique_set))
        unique_summary += f"{col}: {unique_str}\n"

    # Special cleaning for the "ethnicity" column
    elif col == 'ethnicity':
        unique_set = set()
        for entry in df['ethnicity'].dropna():
            cleaned_ethnicities = clean_ethnicity(entry)
            unique_set.update(cleaned_ethnicities)
        unique_str = ", ".join(sorted(unique_set))
        unique_summary += f"{col}: {unique_str}\n"
    
    # Special cleaning for the "job" column
    elif col == 'job':
        unique_set = set()
        for entry in df['job'].dropna():
            cleaned_jobs = clean_job(entry)
            unique_set.update(cleaned_jobs)
        unique_str = ", ".join(sorted(unique_set))
        unique_summary += f"{col}: {unique_str}\n"

    # For the raw "religion" column, instead list the two new columns
    elif col == 'religion':
        unique_primary = df['religion_primary'].dropna().unique()
        unique_seriousness = df['religion_seriousness'].dropna().unique()
        unique_summary += f"religion_primary: {', '.join(sorted(unique_primary))}\n"
        unique_summary += f"religion_seriousness: {', '.join(sorted(unique_seriousness))}\n"
    
    else:
        # For all other columns, simply get the unique values
        unique_values = df[col].dropna().unique()
        unique_str = ", ".join(map(str, unique_values))
        unique_summary += f"{col}: {unique_str}\n"

# Print the final summary string
print(unique_summary)


status: single, available, seeing someone, married, unknown
sex: m, f
orientation: straight, bisexual, gay
body_type: a little extra, average, thin, athletic, fit, skinny, curvy, full figured, jacked, rather not say, used up, overweight
diet: strictly anything, mostly other, anything, vegetarian, mostly anything, mostly vegetarian, strictly vegan, strictly vegetarian, mostly vegan, strictly other, mostly halal, other, vegan, mostly kosher, strictly halal, halal, strictly kosher, kosher
drinks: socially, often, not at all, rarely, very often, desperately
drugs: never, sometimes, often
education: working on college/university, working on space camp, graduated from masters program, graduated from college/university, working on two-year college, graduated from high school, working on masters program, graduated from space camp, college/university, dropped out of space camp, graduated from ph.d program, graduated from law school, working on ph.d program, two-year college, graduated from two-

In [17]:
import pandas as pd
import re

# Assume df is already defined (for example, via pd.read_csv("your_data.csv"))

# Define columns to skip because they have too many unique values
skip_cols = ['age', 'income', 'last_online', 'location'] + [f"essay{i}" for i in range(10)]

# ---------------------------
# Helper functions for cleaning
# ---------------------------

def clean_speaks(entry):
    """
    Cleans an entry from the 'speaks' column by splitting on commas,
    removing any parenthetical information, stripping extra spaces,
    and lowercasing the language names.
    Returns a list of cleaned language tokens.
    """
    languages = entry.split(',')
    cleaned = []
    for lang in languages:
        lang_clean = re.sub(r'\s*\(.*\)', '', lang).strip().lower()
        if lang_clean:
            cleaned.append(lang_clean)
    return cleaned

def clean_ethnicity(entry):
    """
    Cleans an entry from the 'ethnicity' column by splitting on commas and forward-slashes,
    then stripping extra spaces and lowercasing each token.
    Returns a list of cleaned ethnicity tokens.
    """
    tokens = []
    for part in entry.split(','):
        for token in part.split('/'):
            token_clean = token.strip().lower()
            if token_clean:
                tokens.append(token_clean)
    return tokens

def clean_job(entry):
    """
    Cleans an entry from the 'job' column.
    Splits on commas and forward-slashes to extract individual tokens,
    then strips extra spaces and lowercases the result.
    Returns a list of cleaned job tokens.
    """
    tokens = []
    for part in entry.split(','):
        for token in part.split('/'):
            token_clean = token.strip().lower()
            if token_clean:
                tokens.append(token_clean)
    return tokens

def split_religion(entry):
    """
    Splits a religion entry into two parts:
      - primary religion (e.g., "christianity", "atheism", etc.)
      - secondary descriptor (e.g., "very serious about it", "but not too serious about it")
    Uses a regex to look for the words "and" or "but" as a separator.
    If no separator is found, the whole entry is treated as the primary religion.
    """
    pattern = re.compile(r'^(.*?)\s*(?:and|but)\s*(.*)$', flags=re.IGNORECASE)
    match = pattern.match(entry)
    if match:
        primary = match.group(1).strip().lower()
        seriousness = match.group(2).strip().lower()
        return primary, seriousness
    else:
        return entry.strip().lower(), ""

def apply_religion_split(x):
    """
    Applies the split_religion function and returns a pandas Series
    for the new religion columns.
    """
    if pd.isnull(x):
        return pd.Series(["", ""])
    else:
        return pd.Series(split_religion(x))

def split_sign(entry):
    """
    Splits an entry from the 'sign' column into two parts:
      - sign_clean: the zodiac sign (e.g., "gemini", "aries", etc.)
      - sign_importance: an optional descriptor (e.g., "fun to think about")
    The function uses a regex to look for the keywords "but" or "and" as a separator.
    It also normalizes apostrophes and removes common extraneous phrases.
    """
    if pd.isnull(entry):
        return "", ""
    # Normalize to lowercase and replace curly apostrophes with straight ones
    entry = entry.lower().replace("’", "'")
    pattern = re.compile(r'^(.*?)\s*(?:but|and)\s*(.*)$', flags=re.IGNORECASE)
    match = pattern.match(entry)
    if match:
        sign = match.group(1).strip()
        importance = match.group(2).strip()
        # Remove common extraneous phrases
        if "doesn't matter" in importance or "fun to think about" in importance:
            importance = ""
        # Remove isolated "it's" or "its" if that is all that remains
        if importance in ["it's", "its"]:
            importance = ""
        return sign, importance
    else:
        return entry.strip(), ""

def clean_pets(entry):
    """
    Cleans an entry from the 'pets' column.
    Splits the entry on commas and the word "and" to extract individual pet descriptors.
    For example, an entry like "likes dogs and likes cats, has cats" is split into:
      - "likes dogs"
      - "likes cats"
      - "has cats"
    Returns a list of cleaned pet tokens.
    """
    if pd.isnull(entry):
        return []
    # Lowercase the entry
    entry = entry.lower()
    # First, split on commas
    parts = re.split(r',', entry)
    tokens = []
    for part in parts:
        # Further split on the word "and"
        subparts = re.split(r'\s+and\s+', part)
        for token in subparts:
            token_clean = token.strip()
            if token_clean:
                tokens.append(token_clean)
    return tokens

# -------------------------------------------
# Create new columns by splitting "religion"
# -------------------------------------------
df[['religion_primary', 'religion_seriousness']] = df['religion'].apply(apply_religion_split)

# -------------------------------------------
# Create new columns by splitting "sign"
# -------------------------------------------
# The new columns will be named "sign_clean" and "sign_importance"
df[['sign_clean', 'sign_importance']] = df['sign'].apply(
    lambda x: pd.Series(split_sign(x)) if pd.notnull(x) else pd.Series(["", ""])
)

# -------------------------------------------------
# Build the unique summary string for the DataFrame
# -------------------------------------------------
unique_summary = ""

for col in df.columns:
    # Skip columns with too many unique values and the new columns we created
    if col in skip_cols or col in ['religion_primary', 'religion_seriousness', 'sign_clean', 'sign_importance']:
        continue

    # Special cleaning for the "speaks" column
    if col == 'speaks':
        unique_set = set()
        for entry in df['speaks'].dropna():
            cleaned_languages = clean_speaks(entry)
            unique_set.update(cleaned_languages)
        unique_str = ", ".join(sorted(unique_set))
        unique_summary += f"{col}: {unique_str}\n"

    # Special cleaning for the "ethnicity" column
    elif col == 'ethnicity':
        unique_set = set()
        for entry in df['ethnicity'].dropna():
            cleaned_ethnicities = clean_ethnicity(entry)
            unique_set.update(cleaned_ethnicities)
        unique_str = ", ".join(sorted(unique_set))
        unique_summary += f"{col}: {unique_str}\n"
    
    # Special cleaning for the "job" column
    elif col == 'job':
        unique_set = set()
        for entry in df['job'].dropna():
            cleaned_jobs = clean_job(entry)
            unique_set.update(cleaned_jobs)
        unique_str = ", ".join(sorted(unique_set))
        unique_summary += f"{col}: {unique_str}\n"

    # For the raw "religion" column, output the two new religion columns instead
    elif col == 'religion':
        unique_primary = df['religion_primary'].dropna().unique()
        unique_seriousness = df['religion_seriousness'].dropna().unique()
        unique_summary += f"religion_primary: {', '.join(sorted(unique_primary))}\n"
        unique_summary += f"religion_seriousness: {', '.join(sorted(unique_seriousness))}\n"
    
    # For the raw "sign" column, output the two new sign columns instead
    elif col == 'sign':
        unique_clean = df['sign_clean'].dropna().unique()
        unique_importance = df['sign_importance'].dropna().unique()
        unique_summary += f"sign_clean: {', '.join(sorted(unique_clean))}\n"
        unique_summary += f"sign_importance: {', '.join(sorted(unique_importance))}\n"
    
    # Special cleaning for the "pets" column
    elif col == 'pets':
        unique_set = set()
        for entry in df['pets'].dropna():
            cleaned_pets = clean_pets(entry)
            unique_set.update(cleaned_pets)
        unique_str = ", ".join(sorted(unique_set))
        unique_summary += f"{col}: {unique_str}\n"
    
    # For all other columns, simply list their unique values
    else:
        unique_values = df[col].dropna().unique()
        unique_str = ", ".join(map(str, unique_values))
        unique_summary += f"{col}: {unique_str}\n"

# Print the final summary string
print(unique_summary)


status: single, available, seeing someone, married, unknown
sex: m, f
orientation: straight, bisexual, gay
body_type: a little extra, average, thin, athletic, fit, skinny, curvy, full figured, jacked, rather not say, used up, overweight
diet: strictly anything, mostly other, anything, vegetarian, mostly anything, mostly vegetarian, strictly vegan, strictly vegetarian, mostly vegan, strictly other, mostly halal, other, vegan, mostly kosher, strictly halal, halal, strictly kosher, kosher
drinks: socially, often, not at all, rarely, very often, desperately
drugs: never, sometimes, often
education: working on college/university, working on space camp, graduated from masters program, graduated from college/university, working on two-year college, graduated from high school, working on masters program, graduated from space camp, college/university, dropped out of space camp, graduated from ph.d program, graduated from law school, working on ph.d program, two-year college, graduated from two-

In [None]:

# -----------------------------
# 2. Combine Essay Columns
# -----------------------------
essay_cols = [f'essay{i}' for i in range(10) if f'essay{i}' in df.columns]
df['essays'] = df[essay_cols].fillna('').agg(' '.join, axis=1)
df.drop(columns=essay_cols, inplace=True)

# -----------------------------
# 3. Clean Missing Values
# -----------------------------
categorical_cols = ['status', 'sex', 'orientation', 'body_type', 'diet',
                    'drinks', 'drugs', 'education', 'ethnicity', 'job',
                    'location', 'offspring', 'pets', 'religion', 'sign',
                    'smokes', 'speaks']

for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].fillna("unknown")

numeric_cols = ['age', 'height', 'income']
for col in numeric_cols:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())


In [3]:

# -----------------------------
# 4. Outlier Detection and Removal
# -----------------------------
def remove_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

df = remove_outliers(df, numeric_cols)

# -----------------------------
# 5. Create a Target Variable
# -----------------------------
df['target'] = (df['status'].str.lower() == 'single').astype(int)
df.drop(columns=['status'], inplace=True)


In [5]:

# -----------------------------
# 6. Feature Engineering & Preprocessing Setup
# -----------------------------
numeric_features = ['age', 'height', 'income']
categorical_features = [col for col in categorical_cols if col in df.columns and col != 'status']
text_feature = 'essays'

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
        ('text', TfidfVectorizer(max_features=1000), text_feature)
    ],
    remainder='drop'
)


X = preprocessor.fit_transform(df)
y = df['target'].values


In [None]:

# -----------------------------
# 7. Split the Dataset
# -----------------------------
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_test, X_eval, y_test, y_eval = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)


In [8]:

# -----------------------------
# 8. Train XGBoost Model
# -----------------------------
xgb_model = xgb.XGBClassifier(
    n_estimators=200, 
    max_depth=6, 
    learning_rate=0.1, 
    subsample=0.8, 
    colsample_bytree=0.8, 
    random_state=42,
    eval_metric="logloss"  # Removed deprecated use_label_encoder
)

xgb_model.fit(X_train, y_train)


In [9]:

# -----------------------------
# 9. Evaluate the Model
# -----------------------------
y_test_pred = xgb_model.predict(X_test)
y_test_prob = xgb_model.predict_proba(X_test)[:, 1]

test_accuracy = accuracy_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_prob)

print("Test Set Evaluation:")
print("  Accuracy: {:.4f}".format(test_accuracy))
print("  ROC AUC:  {:.4f}".format(test_roc_auc))


Test Set Evaluation:
  Accuracy: 0.9369
  ROC AUC:  0.7826


In [10]:

# -----------------------------
# 10. Evaluation on Validation Set
# -----------------------------
y_eval_pred = xgb_model.predict(X_eval)
y_eval_prob = xgb_model.predict_proba(X_eval)[:, 1]

eval_accuracy = accuracy_score(y_eval, y_eval_pred)
eval_roc_auc = roc_auc_score(y_eval, y_eval_prob)

print("\nEvaluation Set Evaluation:")
print("  Accuracy: {:.4f}".format(eval_accuracy))
print("  ROC AUC:  {:.4f}".format(eval_roc_auc))



Evaluation Set Evaluation:
  Accuracy: 0.9350
  ROC AUC:  0.7829
