# Link to dataset: https://www.kaggle.com/datasets/ulrikthygepedersen/speed-dating

# 1. Data exploration

## 1.1 Import modules needed for analysis

In [6]:
import pandas as pd
from scipy.io import arff
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from tabulate import tabulate
import numpy as np
import time

## 1.2 Load the dataset

In [7]:
# Parse the ARFF file: loadarff returns the record array together with the attribute metadata.
path = './speeddating.arff'
arff_contents = arff.loadarff(path)
data, meta = arff_contents

# Wrap the records in a DataFrame; this is the frame every later cell starts from.
df = pd.DataFrame(data)

## 1.3 Check dataframe dimensions and explore attributes

In [10]:
# Print the frame's dimensions, a per-attribute summary table and the class balance of 'decision'.
print(f"Number of rows: {df.shape[0]}\nNumber of attributes: {df.shape[1]}\n\n")

# Count empty / whitespace-only strings as NaN.
# The pattern must use `\s` (whitespace); a bare `s` would only match strings made of the letter "s".
ds = df.replace(r'^\s*$', np.nan, regex=True)

table_data = []

# Build one summary row per attribute.
for column, dtype in ds.dtypes.items():
    null_count = ds[column].isna().sum()
    attribute_info = {"Attribute name": column, "Data type": dtype, "Null values": null_count}
    if dtype == 'object':
        # object columns: always list the distinct values
        distinct_values = ds[column].unique()
        attribute_info["Distinct values"] = distinct_values
    elif dtype in ['int64', 'float64']:
        min_value = ds[column].min()
        max_value = ds[column].max()
        mean_value = ds[column].mean()
        distinct_values = ds[column].unique()
        if len(distinct_values) <= 11:  # up to 10 unique instances + maybe NA: list them
            attribute_info["Distinct values"] = distinct_values
        else:  # otherwise only report how many there are
            attribute_info["Distinct values"] = f"Numer of unique values: {len(distinct_values)}"
        attribute_info["Range"] = f"{min_value} to {max_value}"
        attribute_info["Mean"] = mean_value
    table_data.append(attribute_info)

attribute_table = pd.DataFrame(table_data)
print(tabulate(attribute_table, headers='keys', tablefmt='fancy_grid', showindex=False))

# Relative frequency (in percent) of each value of the target attribute.
percentage_distribution = ds['decision'].value_counts(normalize=True) * 100

print("Percentage Distribution of Decision Attribute:")
print(percentage_distribution)

Number of rows: 8378
Number of attributes: 123


╒═════════════════════════════════╤═════════════╤═══════════════╤════════════════════════════════════════════════════════════════════════════╤═══════════════╤═════════════╕
│ Attribute name                  │ Data type   │   Null values │ Distinct values                                                            │ Range         │        Mean │
╞═════════════════════════════════╪═════════════╪═══════════════╪════════════════════════════════════════════════════════════════════════════╪═══════════════╪═════════════╡
│ has_null                        │ object      │             0 │ [b'0' b'1']                                                                │ nan           │ nan         │
├─────────────────────────────────┼─────────────┼───────────────┼────────────────────────────────────────────────────────────────────────────┼───────────────┼─────────────┤
│ wave                            │ float64     │             0 │ Numer of unique valu

## 1.4 Display first 10 rows of dataframe

In [None]:
# Lift the column display limit so every attribute is rendered, then preview the first ten rows.
pd.options.display.max_columns = None
display(df.iloc[:10])

# 2. Preprocessing

In [None]:
# Preprocessing: select attributes, handle missing values and one-hot encode the
# categorical attributes. The result is stored in df_PP; df itself is not modified.

# make a copy so the original dataframe stays unchanged
df_PP = df.copy()

# 1) attribute selection

# drop columns that represent metadata
columns = ['has_null', 'wave']
df_PP = df_PP.drop(columns, axis=1)

# drop column 'field' as it has too many categories
columns = ['field']
df_PP = df_PP.drop(columns, axis=1)

# drop columns that are related to partner's POV
columns = [
    'match'
]
df_PP = df_PP.drop(columns, axis=1)

# drop all categorical ('d_'-prefixed) columns that are already included as a rating
columns = [col for col in df_PP.columns if col.startswith('d_')]
df_PP = df_PP.drop(columns, axis=1)


# 2) handle missing values

# transform 'met' from float to a binary category
# NOTE: `NaN > threshold` is False, so rows where 'met' is missing end up as 'Not Met'.
threshold = 0.5
df_PP['met'] = (df_PP['met'] > threshold).astype(int)
df_PP['met'] = df_PP['met'].astype('category')
df_PP['met'] = df_PP['met'].cat.rename_categories({0: 'Not Met', 1: 'Met'})

# for categorical data, delete rows with missing values (encoded as b'?')
categorical_columns = df_PP.select_dtypes(include=['object', 'category']).columns.tolist()
for col in categorical_columns:
    df_PP = df_PP[df_PP[col] != b'?']
# take an explicit copy so the assignments below act on an independent frame
# rather than on the result of a boolean selection
df_PP = df_PP.copy()

# for numerical data, replace missing values with the mean value of that attribute
# ('number' selects every numeric dtype regardless of integer/float width)
# NOTE: the means are computed over all rows, i.e. before the train/test split in section 3.1.
numerical_columns = df_PP.select_dtypes(include='number').columns.tolist()
# Assign the filled block back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated and has no effect under Copy-on-Write.
df_PP[numerical_columns] = df_PP[numerical_columns].fillna(df_PP[numerical_columns].mean())


# 3) perform one-hot encoding for categorical variables
categorical_columns = df_PP.select_dtypes(include=['object', 'category']).columns.tolist()
# the 'decision' column is the prediction target and must stay a single column
columns_to_encode = [col for col in categorical_columns if col != 'decision']
# each encoded column is replaced by '<column>_<value>' indicator columns appended at the end
df_PP = pd.get_dummies(df_PP, columns=columns_to_encode)

# print new dimensions and head of dataframe
print(df_PP.shape)
display(df_PP.head(10))

# 3. Machine Learning

## 3.1 Split dataset

In [None]:
# Work on a copy so the preprocessed frame is left untouched.
df_ML = df_PP.copy()

# Target: the 'decision' column, encoded as integer class labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_ML['decision'])

# Predictors: every remaining column.
X = df_ML.drop('decision', axis=1)

# Hold out 20% of the rows as the test set; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 3.2 Scale data

In [None]:
# Standardise the numerical training features (zero mean, unit variance).
# Work on an explicit copy: `X_train[:]` is not an independent frame, and X_train must stay
# unscaled because section 3.6 trains on it as the "unscaled" baseline.
scaledX_train = X_train.copy()
train_scaler = StandardScaler()
scaledX_train[numerical_columns] = train_scaler.fit_transform(X_train[numerical_columns])

## 3.3 Machine Learning

### 3.3.1 Support Vector Machine

In [None]:
# Tune an SVM with cross-validated grid search, keep the best / middle / worst settings,
# and add one hold-out evaluation of the best setting as row "SVM_HO".

# cross validation + searching for the best parameters
model = SVC()
parameters = {'C':[0.07, 0.1, 0.2], 'kernel':('linear','poly', 'rbf', 'sigmoid'), 'class_weight':['balanced']}

gs = GridSearchCV(model, parameters, scoring=["roc_auc", "f1", "recall", "precision", "accuracy"], refit="f1")
gs.fit(scaledX_train, y_train)

SVM_res = pd.DataFrame(gs.cv_results_)
SVM_res = SVM_res.loc[:, ["mean_fit_time", "params", "mean_test_roc_auc", "mean_test_f1", "mean_test_recall",
                           "mean_test_precision", "mean_test_accuracy"]]
SVM_res = SVM_res.sort_values(by=['mean_test_f1'], ascending=False, ignore_index=True) # ordering dataframe by chosen metric

ss = np.round(np.linspace(0, len(SVM_res) - 1, 3)).astype(int) # indices for subsetting the res df, i.e., the best, the "middle" and the worst params setting
SVM_res = SVM_res.iloc[ss,:]
SVM_res.index = ['SVM_CV_1', 'SVM_CV_2', 'SVM_CV_3']

# Holdout (= determining performance on one separated portion of the train dataset)
X_tr, X_val, y_tr, y_val = train_test_split(scaledX_train, y_train, test_size=0.2, random_state=1298)
train_split = np.bincount(y_tr)[1] / len(y_tr)  # proportion of the True class in y_tr
val_split = np.bincount(y_val)[1] / len(y_val)  # proportion of the True class in y_val


params = gs.best_params_ # take best parameters from CV grid search

model = SVC(C = params.get("C"), kernel = params.get("kernel"), class_weight = "balanced")

start_time = time.time()
fit = model.fit(X_tr, y_tr)
fit_time_HO = time.time() - start_time

prediction_HO = fit.predict(X_val)
# ROC AUC needs continuous scores (as the "roc_auc" scorer in the grid search uses);
# computing it from hard 0/1 predictions would not be comparable with the CV column.
scores_HO = fit.decision_function(X_val)
pr_HO = precision_score(y_val, prediction_HO)
rc_HO = recall_score(y_val, prediction_HO)
f1_HO = f1_score(y_val, prediction_HO)
auc_HO = roc_auc_score(y_val, scores_HO)
acc_HO = accuracy_score(y_val, prediction_HO)

print(train_split, val_split)

# same column order as the CV rows: fit time, params, AUC, F1, recall, precision, accuracy
SVM_res.loc["SVM_HO"] = [fit_time_HO, params, auc_HO, f1_HO, rc_HO, pr_HO, acc_HO]

### 3.3.2 Light Gradient Boosting Machine

In [None]:
# Tune a LightGBM classifier with cross-validated grid search, keep the best / middle / worst
# settings, and add one hold-out evaluation of the best setting as row "LGBM_HO".

# Cross validation + searching for the best parameters
model = LGBMClassifier(verbosity = -1)
# ratio of negative to positive samples, used to up-weight the positive class;
# computed on the training labels only so the test set does not influence the model
sample_pos_weight = np.bincount(y_train)[0] / np.bincount(y_train)[1]
# LightGBM's parameter is called `scale_pos_weight`; the previous key `sample_pos_weight`
# is not a LightGBM parameter, so the weighting was never applied.
parameters = {'max_depth':(20, 50, 100), 'learning_rate':[0.1, 0.5, 0.55], 'num_leaves':(31, 20, 50),
             'n_estimators':(100, 80, 200), 'scale_pos_weight':[sample_pos_weight]}


gs = GridSearchCV(model, parameters, scoring=["roc_auc", "f1", "recall", "precision", "accuracy"], refit="f1")
gs.fit(scaledX_train, y_train)

LGBM_res = pd.DataFrame(gs.cv_results_)
LGBM_res = LGBM_res.loc[:, ["mean_fit_time", "params", "mean_test_roc_auc", "mean_test_f1", "mean_test_recall",
                           "mean_test_precision", "mean_test_accuracy"]]
LGBM_res = LGBM_res.sort_values(by=['mean_test_f1'], ascending=False, ignore_index=True) # ordering dataframe by chosen metric

ss = np.round(np.linspace(0, len(LGBM_res) - 1, 3)).astype(int) # indices for subsetting the res df, i.e., the best, the "middle" and the worst params setting
LGBM_res = LGBM_res.iloc[ss,:]
LGBM_res.index = ['LGBM_CV_1', 'LGBM_CV_2', 'LGBM_CV_3']

# Holdout (= determining performance on one separated portion of the dataset)
X_tr, X_val, y_tr, y_val = train_test_split(scaledX_train, y_train, test_size=0.2, random_state=1297)
train_split = np.bincount(y_tr)[1] / len(y_tr)  # proportion of the True class in y_tr
val_split = np.bincount(y_val)[1] / len(y_val)  # proportion of the True class in y_val

params = gs.best_params_ # take best parameters from CV grid search

model = LGBMClassifier(max_depth = params.get("max_depth"), learning_rate = params.get("learning_rate"),
                       num_leaves = params.get("num_leaves"), n_estimators = params.get("n_estimators"),
                       scale_pos_weight = sample_pos_weight, verbosity = -1)

start_time = time.time()
fit = model.fit(X_tr, y_tr)
fit_time_HO = time.time() - start_time

prediction_HO = fit.predict(X_val)
# ROC AUC needs continuous scores (as the "roc_auc" scorer in the grid search uses):
# take the predicted probability of the positive class instead of the hard labels.
scores_HO = fit.predict_proba(X_val)[:, 1]
pr_HO = precision_score(y_val, prediction_HO)
rc_HO = recall_score(y_val, prediction_HO)
f1_HO = f1_score(y_val, prediction_HO)
auc_HO = roc_auc_score(y_val, scores_HO)
acc_HO = accuracy_score(y_val, prediction_HO)

print(train_split, val_split)

# same column order as the CV rows: fit time, params, AUC, F1, recall, precision, accuracy
LGBM_res.loc["LGBM_HO"] = [fit_time_HO, params, auc_HO, f1_HO, rc_HO, pr_HO, acc_HO]

### 3.3.3 Neural networks - Multi-layer Perceptron classifier

In [None]:
# Tune a multi-layer perceptron with cross-validated grid search, keep the best / middle /
# worst settings, and add one hold-out evaluation of the best setting as row "MLP_HO".

# Cross validation + searching for the best parameters
model = MLPClassifier(random_state=1, early_stopping=True, validation_fraction=0.1, n_iter_no_change=10)
parameters = {
    'random_state': [1],
    'max_iter': [5000],  # Keep the maximum iterations
    'hidden_layer_sizes': [(100,), (50,), (200,)],  
    'solver': ['adam'],  # Use 'adam' solver only
    'alpha': [0.00001, 0.00003],  # Decrease alpha to smaller values
    'learning_rate_init': [0.001]  # Keep the learning rate
}


gs = GridSearchCV(model, parameters, scoring=["roc_auc", "f1", "recall", "precision", "accuracy"], refit="f1")
gs.fit(scaledX_train, y_train)

MLP_res = pd.DataFrame(gs.cv_results_)
MLP_res = MLP_res.loc[:, ["mean_fit_time", "params", "mean_test_roc_auc", "mean_test_f1", "mean_test_recall",
                           "mean_test_precision", "mean_test_accuracy"]]
MLP_res = MLP_res.sort_values(by=['mean_test_f1'], ascending=False, ignore_index=True) # ordering dataframe by chosen metric

ss = np.round(np.linspace(0, len(MLP_res) - 1, 3)).astype(int) # indices for subsetting the res df, i.e., the best, the "middle" and the worst params setting
MLP_res = MLP_res.iloc[ss,:]
MLP_res.index = ['MLP_CV_1', 'MLP_CV_2', 'MLP_CV_3']

# Holdout (= determining performance on one separated portion of the dataset)
X_tr, X_val, y_tr, y_val = train_test_split(scaledX_train, y_train, test_size=0.2, random_state=1297)
train_split = np.bincount(y_tr)[1] / len(y_tr)  # proportion of the True class in y_tr
val_split = np.bincount(y_val)[1] / len(y_val)  # proportion of the True class in y_val

params = gs.best_params_ # take best parameters from CV grid search

# Rebuild exactly the configuration that was cross-validated: the same early-stopping
# settings as the base estimator above plus every tuned grid parameter
# (previously early stopping and learning_rate_init were dropped here).
model = MLPClassifier(early_stopping=True, validation_fraction=0.1, n_iter_no_change=10, **params)

start_time = time.time()
fit = model.fit(X_tr,  y_tr)
fit_time_HO = time.time() - start_time

prediction_HO = fit.predict(X_val)
# ROC AUC needs continuous scores (as the "roc_auc" scorer in the grid search uses):
# take the predicted probability of the positive class instead of the hard labels.
scores_HO = fit.predict_proba(X_val)[:, 1]
pr_HO = precision_score(y_val, prediction_HO)
rc_HO = recall_score(y_val, prediction_HO)
f1_HO = f1_score(y_val, prediction_HO)
auc_HO = roc_auc_score(y_val, scores_HO)
acc_HO = accuracy_score(y_val, prediction_HO)

print(train_split, val_split)

# same column order as the CV rows: fit time, params, AUC, F1, recall, precision, accuracy
MLP_res.loc["MLP_HO"] = [fit_time_HO, params, auc_HO, f1_HO, rc_HO, pr_HO, acc_HO]

## 3.4 Comparison

In [None]:
# Show full cell contents so the parameter dictionaries are not cut off.
pd.set_option('display.max_colwidth', None)

# Stack the per-model result tables row-wise and give the columns readable names.
result_tables = [SVM_res, LGBM_res, MLP_res]
comparison = pd.concat(result_tables, axis=0)
comparison.columns = [
    "Fit_time",
    "Parameters",
    "ROC_AUC",
    "F1",
    "Recall_sens",
    "Precision",
    "Accuracy",
]
display(comparison)

## 3.5 Performance on test set

### 3.5.1 Scale data

In [None]:
# Scale the numerical test features with statistics learned from the training set only.
# Fitting a separate scaler on the test set would put train and test on different scales
# and let test-set statistics leak into the evaluation.
# NOTE(review): assumes X_train still holds the unscaled training features at this point.
scaledX_test = X_test.copy()  # explicit copy so X_test itself stays unscaled for section 3.6
train_fitted_scaler = StandardScaler().fit(X_train[numerical_columns])
scaledX_test[numerical_columns] = train_fitted_scaler.transform(X_test[numerical_columns])

### 3.5.2 Testing

In [None]:
# Refit the best configuration of each model family on the full (scaled) training set and
# evaluate it once on the (scaled) test set; results are collected in test_res.
best_models = ["SVM_CV_1", "LGBM_CV_1", "MLP_CV_1"]
best_params = comparison.loc[best_models,"Parameters"]
test_res = pd.DataFrame(index=["SVM", "LGBM", "MLP"], columns=["Fit_time", "Parameters", "ROC_AUC", "F1", "Recall_sens", "Precision", "Accuracy"])

svm_params = best_params["SVM_CV_1"]
lgbm_params = best_params["LGBM_CV_1"]
mlp_params = best_params["MLP_CV_1"]

# result row label -> (estimator to fit, parameter dict reported in the table)
final_models = {
    "SVM": (
        SVC(C = svm_params.get("C"), kernel = svm_params.get("kernel"), class_weight = "balanced"),
        svm_params,
    ),
    "LGBM": (
        # `scale_pos_weight` is LightGBM's class-weight parameter; the former keyword
        # `sample_pos_weight` is not a LightGBM parameter and had no effect.
        LGBMClassifier(max_depth = lgbm_params.get("max_depth"), learning_rate = lgbm_params.get("learning_rate"),
                       num_leaves = lgbm_params.get("num_leaves"), n_estimators = lgbm_params.get("n_estimators"),
                       scale_pos_weight = sample_pos_weight, verbosity = -1),
        lgbm_params,
    ),
    "MLP": (
        # same early-stopping settings as the base estimator of the grid search in 3.3.3,
        # plus every tuned grid parameter, so the model matches what was cross-validated
        MLPClassifier(early_stopping=True, validation_fraction=0.1, n_iter_no_change=10, **mlp_params),
        mlp_params,
    ),
}

for model_name, (model, used_params) in final_models.items():
    start_time = time.time()
    fit = model.fit(scaledX_train,  y_train)
    fit_time = time.time() - start_time

    prediction = fit.predict(scaledX_test)
    # ROC AUC needs continuous scores rather than hard labels: use the decision function
    # where the estimator has one (SVC), otherwise the positive-class probability.
    if hasattr(fit, "decision_function"):
        scores = fit.decision_function(scaledX_test)
    else:
        scores = fit.predict_proba(scaledX_test)[:, 1]

    pr = precision_score(y_test, prediction)
    rc = recall_score(y_test, prediction)
    f1 = f1_score(y_test, prediction)
    auc = roc_auc_score(y_test, scores)
    acc = accuracy_score(y_test, prediction)

    test_res.loc[model_name] = [fit_time, used_params, auc, f1, rc, pr, acc]

### 3.5.3 Results

In [None]:
# Render the test-set results table (one row per model: SVM, LGBM, MLP).
display(test_res)

## 3.6 Unscaled data

In [None]:
# Same evaluation as in 3.5.2, but training and predicting with X_train/X_test,
# not scaledX_train/scaledX_test, to see the effect of feature scaling.
unscaled_res = pd.DataFrame(index=["SVM", "LGBM", "MLP"], columns=["Fit_time", "Parameters", "ROC_AUC", "F1", "Recall_sens", "Precision", "Accuracy"])

svm_params = best_params["SVM_CV_1"]
lgbm_params = best_params["LGBM_CV_1"]
mlp_params = best_params["MLP_CV_1"]

# result row label -> (estimator to fit, parameter dict reported in the table)
unscaled_models = {
    "SVM": (
        SVC(C = svm_params.get("C"), kernel = svm_params.get("kernel"), class_weight = "balanced"),
        svm_params,
    ),
    "LGBM": (
        # `scale_pos_weight` is LightGBM's class-weight parameter; the former keyword
        # `sample_pos_weight` is not a LightGBM parameter and had no effect.
        LGBMClassifier(max_depth = lgbm_params.get("max_depth"), learning_rate = lgbm_params.get("learning_rate"),
                       num_leaves = lgbm_params.get("num_leaves"), n_estimators = lgbm_params.get("n_estimators"),
                       scale_pos_weight = sample_pos_weight, verbosity = -1),
        lgbm_params,
    ),
    "MLP": (
        # same early-stopping settings as the base estimator of the grid search in 3.3.3,
        # plus every tuned grid parameter, so the model matches what was cross-validated
        MLPClassifier(early_stopping=True, validation_fraction=0.1, n_iter_no_change=10, **mlp_params),
        mlp_params,
    ),
}

for model_name, (model, used_params) in unscaled_models.items():
    start_time = time.time()
    fit = model.fit(X_train,  y_train)
    fit_time = time.time() - start_time

    prediction = fit.predict(X_test)
    # ROC AUC needs continuous scores rather than hard labels: use the decision function
    # where the estimator has one (SVC), otherwise the positive-class probability.
    if hasattr(fit, "decision_function"):
        scores = fit.decision_function(X_test)
    else:
        scores = fit.predict_proba(X_test)[:, 1]

    pr = precision_score(y_test, prediction)
    rc = recall_score(y_test, prediction)
    f1 = f1_score(y_test, prediction)
    auc = roc_auc_score(y_test, scores)
    acc = accuracy_score(y_test, prediction)

    unscaled_res.loc[model_name] = [fit_time, used_params, auc, f1, rc, pr, acc]

display(unscaled_res)