# Phase 3 Project Office Hours

In [None]:
# Imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, plot_confusion_matrix

import category_encoders as ce

Data source: https://www.kaggle.com/c/cat-in-the-dat-ii

In [None]:
# Load the training data, using the `id` column as the row index
train_csv = 'data/cat_in_the_dat2_train.csv'
df = pd.read_csv(train_csv, index_col='id')

In [None]:
# Preview the first few rows of the raw data
df.head()

In [None]:
# Column-by-column overview (dtypes and non-null counts) to spot missing data
df.info()

In [None]:
# Exploring numeric cols: summary statistics from describe() with its default arguments
df.describe()

In [None]:
# Names of every column stored with the 'object' dtype
obj_cols = [name for name, col_dtype in df.dtypes.items() if col_dtype == 'object']

In [None]:
# Exploring object cols - reuse the obj_cols list built in the previous cell
# instead of recomputing the same column selection here
df[obj_cols].describe()

In [None]:
# Exploring target distribution
# normalize=True reports each class as a share of all rows rather than a raw count
df['target'].value_counts(normalize=True)

In [None]:
# Separate the label (y) from the features (X)
y = df['target']
X = df.drop(columns='target')

# Hold out 10% of the rows as a validation set; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

## Set Up Our Pipeline

Reference: https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

In [None]:
# Set up how to treat our columns: route every feature in X_train into exactly
# one of three preprocessing groups (numeric, one-hot encoded, frequency encoded).

# Non-numeric columns with fewer unique values than this are one-hot encoded;
# columns with this many or more are frequency encoded instead.
OHE_MAX_UNIQUE = 10

num_cols = []
cols_to_ohe = []
cols_to_freq = []

for c in X_train.columns:
    # Numeric columns: only the float64 and int64 dtypes are recognised here
    if X_train[c].dtype in ['float64', 'int64']:
        num_cols.append(c)

    # Low-cardinality columns. dropna=False counts NaN as its own level, which
    # matches len(X_train[c].unique()).
    elif X_train[c].nunique(dropna=False) < OHE_MAX_UNIQUE:
        cols_to_ohe.append(c)

    # Everything else (OHE_MAX_UNIQUE or more unique values): we won't OHE those
    else:
        cols_to_freq.append(c)

In [None]:
# Check our work: columns assigned to the numeric group
print(num_cols)

In [None]:
# Columns assigned to the one-hot-encoding group
print(cols_to_ohe)

In [None]:
# Columns assigned to the frequency-encoding group
print(cols_to_freq)

In [None]:
# Preprocessing for each type of column starts here.
# Numeric columns: fill missing values with the median, then min-max scale.
num_transformer = Pipeline([
    ('num_imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

In [None]:
# Low-cardinality columns: replace missing values with the literal 'Unknown',
# then one-hot encode (handle_unknown='ignore' is set on the encoder).
ohe_transformer = Pipeline([
    ('ohe_imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('ohencoder', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# High-cardinality columns: replace missing values with the literal 'Unknown',
# then apply category_encoders' CountEncoder.
# NOTE(review): per the CountEncoder docs, normalize=True should yield relative
# frequencies and min_group_size / min_group_name should pool rare levels under
# 'Other' - confirm against the installed category_encoders version.
freq_transformer = Pipeline([
    ('freq_imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('freq_enc', ce.count.CountEncoder(
        normalize=True,
        handle_unknown=0,
        min_group_size=0.001,
        min_group_name='Other',
    )),
])

In [None]:
# Combine the three per-group pipelines into one ColumnTransformer, pairing each
# pipeline with the list of columns it should receive.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('ohe', ohe_transformer, cols_to_ohe),
    ('freq', freq_transformer, cols_to_freq),
])

## Model 1: 

Evaluate:

- 


## Model 2: 

Evaluate:

- 


## Model 3: 

Evaluate:

- 
