##### Report for the OkCupid profiles dataset (`okcupid_profiles.csv`)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from fastFM import sgd
from sklearn.metrics import accuracy_score, roc_auc_score
from scipy import sparse

In [None]:
# Relative path to the OkCupid profiles CSV file.
file_path = 'data/okcupid_profiles.csv'
# Read the whole CSV into the DataFrame `data`.
data = pd.read_csv(file_path)


In [5]:
# Print a summary of `data`: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   sex          59946 non-null  object 
 3   orientation  59946 non-null  object 
 4   body_type    54650 non-null  object 
 5   diet         35551 non-null  object 
 6   drinks       56961 non-null  object 
 7   drugs        45866 non-null  object 
 8   education    53318 non-null  object 
 9   ethnicity    54266 non-null  object 
 10  height       59943 non-null  float64
 11  income       59946 non-null  int64  
 12  job          51748 non-null  object 
 13  last_online  59946 non-null  object 
 14  location     59946 non-null  object 
 15  offspring    24385 non-null  object 
 16  pets         40025 non-null  object 
 17  religion     39720 non-null  object 
 18  sign         48890 non-null  object 
 19  smok

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from fastFM import sgd
from sklearn.metrics import accuracy_score, roc_auc_score
from scipy import sparse

# -----------------------------
# 1. Load the dataset
# -----------------------------
# NOTE: 'dating_app_data.csv' is a placeholder; point it at the real file.
df = pd.read_csv('dating_app_data.csv')

# -----------------------------
# 2. Combine Essay Columns
# -----------------------------
# The free-text answers are expected in columns 'essay0' ... 'essay9'.
essay_cols = ['essay' + str(idx) for idx in range(10)]
# Blank out missing essays, then glue each row's answers into a single
# space-separated document stored in 'essays'.
essay_text = df[essay_cols].fillna('')
df['essays'] = essay_text.agg(' '.join, axis=1)
# The individual essay columns are redundant once merged.
df = df.drop(columns=essay_cols)

# -----------------------------
# 3. Clean Missing Values
# -----------------------------
# Categorical columns to clean (adjust to match the dataset).
categorical_cols = [
    'status', 'sex', 'orientation', 'body_type', 'diet',
    'drinks', 'drugs', 'education', 'ethnicity', 'job',
    'location', 'offspring', 'pets', 'religion', 'sign',
    'smokes', 'speaks',
]

# Missing categories become the explicit placeholder "unknown"; columns that
# are absent from the frame are skipped.
for name in categorical_cols:
    if name not in df.columns:
        continue
    df[name] = df[name].fillna("unknown")

# Numeric columns used downstream.
numeric_cols = ['age', 'height', 'income']
# Missing numeric values are imputed with that column's median; columns that
# are absent from the frame are skipped.
for name in numeric_cols:
    if name not in df.columns:
        continue
    column_median = df[name].median()
    df[name] = df[name].fillna(column_median)
# -----------------------------
# 4. Outlier Detection and Removal
# -----------------------------


def remove_outliers(df, columns):
    """Return ``df`` without rows that are IQR outliers in any of ``columns``.

    Columns are processed in the order given. For each one, the quartiles are
    taken from the rows that survived the previous columns, and only rows
    whose value lies within ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds
    inclusive) are kept. Rows whose value is NaN fail the range check and are
    dropped as well. The input frame itself is not modified.
    """
    for name in columns:
        q_low, q_high = df[name].quantile(0.25), df[name].quantile(0.75)
        whisker = 1.5 * (q_high - q_low)
        # `between` is inclusive on both ends, matching a >= / <= pair.
        in_range = df[name].between(q_low - whisker, q_high + whisker)
        df = df[in_range]
    return df


# Drop IQR outliers in the numeric columns. The result of boolean filtering
# is a selection of the original frame; take an explicit copy so that the
# column assignment and in-place drop below operate on an independent frame
# and do not trigger pandas' SettingWithCopyWarning.
df = remove_outliers(df, numeric_cols).copy()

# -----------------------------
# 5. Create a Target Variable
# -----------------------------
# For demonstration, assume that if a user's status is "single" (case-insensitive)
# then they are available for matching (target=1), otherwise 0.
df['target'] = (df['status'].str.lower() == 'single').astype(int)
# Since we used "status" to create the target, drop it from the features so
# the label cannot leak into the model inputs.
df.drop(columns=['status'], inplace=True)

# -----------------------------
# 6. Feature Engineering & Preprocessing Setup
# -----------------------------
# Define the features to be processed:
numeric_features = ['age', 'height', 'income']
# Note: Exclude 'status' since it was used for target creation.
categorical_features = ['sex', 'orientation', 'body_type', 'diet',
                        'drinks', 'drugs', 'education', 'ethnicity', 'job',
                        'location', 'offspring', 'pets', 'religion', 'sign',
                        'smokes', 'speaks']
text_feature = 'essays'  # our combined essay text

# Create a ColumnTransformer that:
# - Standardizes numeric features,
# - One-hot encodes categorical features,
# - Vectorizes the text (using TF–IDF) with a maximum of 1000 features.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # OneHotEncoder produces sparse output by default. The explicit
        # `sparse=True` keyword was renamed to `sparse_output` in
        # scikit-learn 1.2 and removed in 1.4, so relying on the default
        # keeps this working on both old and new releases.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        # A scalar column name hands the vectorizer a 1-D column of strings.
        ('text', TfidfVectorizer(max_features=1000), text_feature),
    ],
    remainder='drop'  # drop any columns not specified above (incl. 'target')
)

y = df['target'].values

# -----------------------------
# 7. Split the Dataset
# -----------------------------
# Split row positions rather than an already-transformed matrix, so that the
# preprocessor can be fitted on the training rows only. Fitting it on the
# whole dataset would leak test/eval statistics (scaler mean/std, TF-IDF
# vocabulary and IDF weights, known categories) into training.
row_positions = np.arange(len(df))

# First split into training (70%) and temporary (30%) sets, stratifying by target.
train_rows, temp_rows, y_train, y_temp = train_test_split(
    row_positions, y, test_size=0.3, random_state=42, stratify=y
)

# Then split the temporary set equally into test and evaluation sets (15% each of the original data).
test_rows, eval_rows, y_test, y_eval = train_test_split(
    temp_rows, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Learn the preprocessing from the training rows only, then apply it to every row.
preprocessor.fit(df.iloc[train_rows])
X = preprocessor.transform(df)

# fastFM works best with sparse input; make sure X is a row-sliceable CSR matrix.
if not sparse.issparse(X):
    X = sparse.csr_matrix(X)
else:
    X = X.tocsr()

# Materialize the per-split design matrices by row position.
X_train = X[train_rows]
X_temp = X[temp_rows]
X_test = X[test_rows]
X_eval = X[eval_rows]

# -----------------------------
# 8. Train Factorization Machines (FM) Model
# -----------------------------
# We use the SGD-based FMClassification from fastFM.
# Adjust hyperparameters (n_iter, rank, regularization, etc.) as needed.
fm = sgd.FMClassification(n_iter=100,
                          init_stdev=0.1,
                          rank=8,
                          l2_reg_w=0.1,
                          l2_reg_V=0.5,
                          random_state=42)

# fastFM classifiers expect binary targets encoded as -1/+1, while the
# pipeline's labels are 0/1. Re-encode for fitting only; y_train, y_test and
# y_eval keep their 0/1 encoding for the metrics below.
y_train_signed = np.where(y_train == 1, 1, -1)

# Train the model on the training set
fm.fit(X_train, y_train_signed)

# -----------------------------
# 9. Evaluate the Model
# -----------------------------
# For a fastFM classifier, `predict` returns hard class labels; the
# continuous scores that ROC AUC needs come from `predict_proba`
# (probability of the positive class). Threshold at 0.5 for 0/1 decisions.
y_test_pred = fm.predict_proba(X_test)
y_test_pred_binary = (y_test_pred > 0.5).astype(int)

# Compute evaluation metrics on the test set
test_accuracy = accuracy_score(y_test, y_test_pred_binary)
test_roc_auc = roc_auc_score(y_test, y_test_pred)

print("Test Set Evaluation:")
print("  Accuracy: {:.4f}".format(test_accuracy))
print("  ROC AUC:  {:.4f}".format(test_roc_auc))

# Similarly, evaluate on the evaluation set
y_eval_pred = fm.predict_proba(X_eval)
y_eval_pred_binary = (y_eval_pred > 0.5).astype(int)
eval_accuracy = accuracy_score(y_eval, y_eval_pred_binary)
eval_roc_auc = roc_auc_score(y_eval, y_eval_pred)

print("\nEvaluation Set Evaluation:")
print("  Accuracy: {:.4f}".format(eval_accuracy))
print("  ROC AUC:  {:.4f}".format(eval_roc_auc))