In [1]:
import numpy as np
import pandas as pd
# Never truncate columns when a DataFrame is displayed (None = no limit).
pd.set_option('display.max_columns', None)
# Show up to 124 rows before pandas abbreviates the displayed frame.
pd.set_option('display.max_rows', 124)

from sklearn.model_selection import train_test_split

from preprocess import *
from classify import *

# Speed Dating

In [2]:
# Read the raw speed-dating table from disk.
df = pd.read_csv("data/raw/speeddating/speeddating_raw.csv")

# "match" is the prediction target; every other column is kept as a feature.
# Both pieces are independent copies, so df itself is never modified downstream.
y_raw = df["match"].copy()
X_raw = df.copy().drop(columns=["match"])

In [3]:
# Start from a fresh copy so this cell can be re-run without touching X_raw.
X = X_raw.copy()

# Remove all columns with > 5% missing values
# (thresh = minimum number of non-missing entries a column needs to be kept).
X = X.dropna(thresh=len(df) * 0.95, axis=1)

# Remove the interval-ized ("d_"-prefixed) columns, keeping only "d_age".
# The candidates are collected from X, not df: a "d_" column that was already
# removed by the missing-value filter above would otherwise make drop() raise KeyError.
interval_columns = [
    attribute
    for attribute in X.columns
    if attribute.startswith("d_") and attribute != "d_age"
]
X = X.drop(columns=interval_columns)

# The row id is not used as a feature.
X = X.drop(columns="id")

# Encode gender numerically: female -> 0, male -> 1.
X["gender"] = X["gender"].map({"female": 0, "male": 1})

# NOTE: the column names below are reproduced exactly as written, including
# spellings such as "intellicence_important", "ambtition_important" and "sinsere_o".
# Presumably they mirror the CSV headers -- verify against the file before "fixing" them.

# Columns rescaled by 1/100 (presumably values on a 0-100 scale -- TODO confirm).
hundred_scaled = [
    "pref_o_attractive", "pref_o_sincere", "pref_o_intelligence", "pref_o_funny",
    "pref_o_ambitious", "pref_o_shared_interests",
    "attractive_important", "sincere_important", "intellicence_important",
    "funny_important", "ambtition_important", "shared_interests_important",
]
for attribute in hundred_scaled:
    X[attribute] /= 100

# Columns rescaled by 1/10 (presumably ratings on a 0-10 scale -- TODO confirm).
ten_scaled = [
    "importance_same_race", "importance_same_religion",
    "attractive_o", "sinsere_o", "intelligence_o", "funny_o",
    "attractive", "sincere", "intelligence", "funny", "ambition",
    "attractive_partner", "sincere_partner", "intelligence_partner", "funny_partner",
    "sports", "tvsports", "exercise", "dining", "museums", "art", "hiking",
    "gaming", "clubbing", "reading", "tv", "theater", "movies", "concerts",
    "music", "shopping", "yoga",
    "expected_happy_with_sd_people", "like", "guess_prob_liked", "met",
]
for attribute in ten_scaled:
    X[attribute] /= 10

# The target is used unchanged; copy it so y_raw stays untouched.
y = y_raw.copy()

In [4]:
# Column groups handed to preprocess(); the group keys ("ohe", "zsc", "rimp") are
# interpreted by the preprocess module (presumably one-hot encoding, z-scoring and
# an imputation strategy -- confirm there).
attributes = dict(
    ohe=["race", "race_o", "field"],
    zsc=["wave", "age", "age_o", "d_age"],
    rimp=[],
)

# For each "ohe" column, collect its distinct string values from the full frame
# (i.e. before the train/test split); non-string entries such as NaN are skipped.
categories = {}
for attribute in attributes["ohe"]:
    categories[attribute] = [
        category
        for category in X[attribute].unique()
        if type(category) is str
    ]

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

# Train and test parts are preprocessed by two separate preprocess() calls,
# each receiving the same attribute groups and category lists.
X_train_preprocessed, y_train_preprocessed = preprocess(
    X_train, y_train, attributes, categories
)
X_test_preprocessed, y_test_preprocessed = preprocess(
    X_test, y_test, attributes, categories
)

In [5]:
# Per-classifier value lists passed to classify(); presumably the settings to try
# for each model (an empty list would then mean "nothing to run") -- confirm in
# the classify module.
parameters = dict(
    KNN=[1, 2, 3],
    DTC=[10, 20],
    RFC=[],
    BNB=[],
    MNB=[],
)

# classify() returns two objects, unpacked here as the fitted classifiers and their scores.
classifiers_dict, scores_dict = classify(
    X_train_preprocessed,
    y_train_preprocessed,
    X_test_preprocessed,
    y_test_preprocessed,
    parameters,
)

In [6]:
# Display the scores returned by classify() for the speed-dating run.
# NOTE(review): the recorded output lists 1.0 for both DTC settings. A perfect score
# on the held-out split often signals target leakage -- check whether any remaining
# feature column is derived from "match".
scores_dict

{'KNN': [0.8573008849557522, 0.855641592920354, 0.8678097345132744],
 'DTC': [1.0, 1.0],
 'RFC': [],
 'BNB': [],
 'MNB': []}

# Mushrooms

In [7]:
# Read the raw mushroom table; the file uses ";" as its field separator.
df = pd.read_csv("data/raw/mushroom/mushrooms_raw.csv", delimiter=";")

# "class" is the prediction target; every other column is kept as a feature.
# Both pieces are independent copies, so df itself is never modified downstream.
y_raw = df["class"].copy()
X_raw = df.copy().drop(columns=["class"])

In [8]:
# Keep only columns that are at least 80% populated; the copy gives an
# independent frame for the column assignments below.
X = X_raw.dropna(axis=1, thresh=len(df) * 0.80).copy()

# "t"/"f" flags become booleans (any other value maps to NaN).
flag_to_bool = {"t": True, "f": False}
for attribute in ("does-bruise-or-bleed", "has-ring"):
    X[attribute] = X[attribute].map(flag_to_bool)

# Season letters become integer codes.
season_codes = {"s": 0, "u": 1, "a": 2, "w": 3}
X["season"] = X["season"].map(season_codes)

# Target as boolean: "e" -> True, "p" -> False.
class_to_bool = {"e": True, "p": False}
y = y_raw.map(class_to_bool).copy()

In [9]:
# Every column whose dtype is the NumPy object dtype goes into the "ohe" group.
object_columns = [
    attribute
    for attribute, dtype in X.dtypes.items()
    if dtype is np.dtype("O")
]

# Column groups handed to preprocess(); the group keys ("ohe", "zsc", "rimp") are
# interpreted by the preprocess module (presumably one-hot encoding, z-scoring and
# an imputation strategy -- confirm there).
attributes = dict(
    ohe=object_columns,
    zsc=["cap-diameter", "stem-height", "stem-width"],
    rimp=["gill-attachment", "ring-type"],
)

# For each "ohe" column, collect its distinct string values from the full frame
# (i.e. before the train/test split); non-string entries such as NaN are skipped.
categories = {}
for attribute in attributes["ohe"]:
    categories[attribute] = [
        category
        for category in X[attribute].unique()
        if type(category) is str
    ]

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

# Train and test parts are preprocessed by two separate preprocess() calls,
# each receiving the same attribute groups and category lists.
X_train_preprocessed, y_train_preprocessed = preprocess(
    X_train, y_train, attributes, categories
)
X_test_preprocessed, y_test_preprocessed = preprocess(
    X_test, y_test, attributes, categories
)

In [10]:
# Per-classifier value lists passed to classify(); presumably the settings to try
# for each model (an empty list would then mean "nothing to run") -- confirm in
# the classify module.
parameters = dict(
    KNN=[1, 2, 3],
    DTC=[10, 20],
    RFC=[],
    BNB=[],
    MNB=[],
)

# classify() returns two objects, unpacked here as the fitted classifiers and their scores.
classifiers_dict, scores_dict = classify(
    X_train_preprocessed,
    y_train_preprocessed,
    X_test_preprocessed,
    y_test_preprocessed,
    parameters,
)

In [11]:
# Display the scores returned by classify() for the mushroom run.
scores_dict

{'KNN': [0.9984280848834163, 0.9982970919570343, 0.9986245742729892],
 'DTC': [0.9107283206706838, 0.9863767356562746],
 'RFC': [],
 'BNB': [],
 'MNB': []}