In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.io import arff
import matplotlib.pyplot as plt
from timeit import default_timer as timer
from scripts.experiment_functions import kfold_experiments
from scripts.preprocessing_dictionaries import category_strategy_dict, numerical_strategy_dict
from scripts.preprocessing_functions import basic_processing, data_preprocessing, split_dataset, data_reduction

In [2]:
# Read the ARFF file and build the working DataFrame from the first element
# of what loadarff returns.
okcupid_stem = arff.loadarff("data/okcupid_stem.arff")
okcupid_records = okcupid_stem[0]
okcupid_df = pd.DataFrame(okcupid_records)

In [3]:
# Integer codes for the substance-use answers. Each answer is numbered by its
# position in the tuple below; the b"?" placeholder is mapped to NaN.
drinks_dict = {
    answer: code
    for code, answer in enumerate(
        (b"not at all", b"rarely", b"socially", b"often", b"very often", b"desperately")
    )
}
drinks_dict[b"?"] = np.nan

drugs_dict = {
    answer: code
    for code, answer in enumerate((b"never", b"sometimes", b"often"))
}
drugs_dict[b"?"] = np.nan

smokes_dict = {
    answer: code
    for code, answer in enumerate(
        (b"no", b"trying to quit", b"when drinking", b"sometimes", b"yes")
    )
}
smokes_dict[b"?"] = np.nan

# Replace each raw answer with its code. dict.get is used as the mapper, so an
# answer that is not listed above comes back as None rather than raising.
for column_name, answer_codes in (
    ("drinks", drinks_dict),
    ("drugs", drugs_dict),
    ("smokes", smokes_dict),
):
    okcupid_df[column_name] = okcupid_df[column_name].apply(answer_codes.get)

In [4]:
def determine_dtype(x):
    """Decode a bytes-like cell value to text; pass anything else through.

    Parameters
    ----------
    x : object
        A single cell value.

    Returns
    -------
    object
        ``x.decode("utf-8")`` when ``x`` is ``bytes`` or ``bytearray``,
        otherwise ``x`` unchanged.
    """
    # Only bytes-like values carry a .decode method. Values that are already
    # str, NaN floats or None are returned as-is, so applying this function a
    # second time to an already-decoded column does not raise AttributeError.
    if isinstance(x, (bytes, bytearray)):
        return x.decode("utf-8")
    return x

# Run determine_dtype over every value of each object-dtype column.
object_columns = okcupid_df.select_dtypes(include="object").columns
for column_name in object_columns:
    okcupid_df[column_name] = okcupid_df[column_name].apply(determine_dtype)

# Swap the '?' placeholder for NaN across the frame, then cast income to float.
okcupid_df = okcupid_df.replace('?', np.nan)
okcupid_df["income"] = okcupid_df["income"].astype('float64')

# Next: run the data through the pipeline steps (split, basic processing, encoding, reduction)

In [5]:
# Features are every column except "job"; the target is the "job" column
# reshaped to one entry per feature row.
target_column = "job"
X_cupid = okcupid_df.drop(columns=target_column)
n_rows = X_cupid.shape[0]
job_values = okcupid_df[target_column].values
y_cupid = np.reshape(job_values, n_rows)

In [6]:
# Split features and target with the project helper. The two trailing values
# are passed positionally exactly as before; presumably 0.1 is the held-out
# fraction and 42 the random seed — TODO confirm against split_dataset.
holdout_fraction, split_seed = 0.1, 42
splitted_data = split_dataset(X_cupid, y_cupid, holdout_fraction, split_seed)

In [7]:
# Run basic_processing on the train and validation features and store each
# result under "<name>_clean" (original note: removes NAs and adds a small
# pseudocount).
for split_key in ("X_train", "X_validation"):
    splitted_data[f"{split_key}_clean"] = basic_processing(splitted_data[split_key])

In [8]:
# Encode the cleaned training features (original note: get data with
# categorical features). Presumably the first list selects entries from
# category_strategy_dict and the second from numerical_strategy_dict — TODO
# confirm against data_preprocessing.
category_strategies = ["OHE_VAR"]
numerical_strategies = ["SSE_VAR"]
encoded_data, encoder_objs = data_preprocessing(
    splitted_data["X_train_clean"],
    category_strategies,
    numerical_strategies,
    category_strategy_dict,
    numerical_strategy_dict,
)


In [9]:
# Run data_reduction with the "FAMD" strategy on top of the existing encodings.
# Presumably 20 is the number of components to keep — TODO confirm against
# data_reduction.
reduction_strategies = ["FAMD"]
n_components = 20
encoded_data, encoder_objs = data_reduction(
    splitted_data["X_train_clean"],
    encoded_data,
    encoder_objs,
    reduction_strategies,
    n_components,
)
# Display the resulting dictionary of arrays.
encoded_data


{'OHE_VAR-SSE_VAR': array([[ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          2.19011591e+00,  0.00000000e+00,  5.19571719e-17],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -3.36103395e-01,  0.00000000e+00,  2.34514067e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          4.21762396e-01,  0.00000000e+00, -4.62790929e-01],
        ...,
        [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
          1.43225012e+00, -2.23629444e-01, -4.62790929e-01],
        [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
         -5.88725325e-01,  0.00000000e+00,  2.34514067e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          1.68487205e+00,  0.00000000e+00, -4.62790929e-01]]),
 'FAMD': array([[-1.30377369, -0.80572219,  1.25624282, ..., -1.570633  ,
          0.04313904,  0.32989056],
        [ 2.5113574 , -0.28136791,  0.83148983, ..., -3.11553875,
          0.72817239,  0.57575745]

In [10]:
# Apply the transform of the object stored under "FAMD" to the cleaned
# validation features and display the result.
famd_reducer = encoder_objs["FAMD"]
famd_reducer.transform(splitted_data["X_validation_clean"])

array([[-0.4340262 , -1.43679857, -0.67092224, ..., -1.27637973,
        -0.22898996,  0.6078747 ],
       [-2.10340171,  2.25563514,  2.49598662, ...,  1.30053128,
         1.32442278, -0.55516338],
       [ 0.82685425,  0.94191028, -0.12056374, ...,  1.33364424,
         0.38524246,  0.37850918],
       ...,
       [-0.54037522,  2.64707752, -1.97087568, ..., -1.37443192,
        -1.18525442,  0.66195473],
       [-1.15916035,  1.96906229,  1.46207295, ..., -0.9632268 ,
         0.09661166,  0.18204459],
       [ 1.14315572,  1.90496479,  0.62897992, ..., -0.86877995,
        -0.193219  ,  0.21116106]])