In [18]:
import pandas as pd
import numpy as np
import re
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [4]:
# Load the training table from the zipped CSV. The path is relative, so it is
# resolved against the notebook's current working directory.
df = pd.read_csv('../data/train_data.zip')

In [6]:
# Separate the target (`unacast_session_count`) from the predictor columns.
y = df['unacast_session_count']
X = df.drop(columns='unacast_session_count')

In [7]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=2020,
    test_size=0.2,
)

In [8]:
def create_imputer(X_train):

    """
    Build the column-wise imputer for the feature table.

    The returned transformer is NOT fitted here; fitting happens when the
    caller invokes `fit` / `fit_transform` on it. `X_train` is used only to
    resolve the label ranges below into explicit column lists, so nothing is
    read from disk and the target column can never end up in a feature list.

    Parameters
    ----------
    X_train: pd.DataFrame
        Training set. Must contain the first and last column of every label
        range used below.

    Returns
    -------
    sklearn.compose._column_transformer.ColumnTransformer
        Unfitted transformer. Columns that are not listed in any group are
        passed through unchanged.

    """

    def columns_between(first, last):
        # Label-based slice on the columns: both end points are included.
        return X_train.loc[:, first:last].columns.to_list()

    #======================================
    # IDENTIFY COLUMNS TO IMPUTE
    #======================================

    # Impute with 0
    monthly_count_equipment = columns_between('monthly_count_slide_single', 'monthly_count_climber')
    historic_session = columns_between('historic_number_of_sessions', 'historic_avg_mod_plus_vig')
    historic_hour = columns_between('historic_hour_0', 'historic_hour_23')
    historic_count_equipment = columns_between('historic_count_bridge', 'historic_count_zipline')
    historic_weather = columns_between('historic_cloudy', 'historic_snow')
    OSM = columns_between('n', 'streets_per_node_proportion_7_osid')
    zero_misc = ['days_since_first_sess', 'perfect_days', 'Green_2016', 'Number_of_holidays']

    zero_imp_features = (
        monthly_count_equipment + historic_session + historic_hour
        + historic_count_equipment + historic_weather + OSM + zero_misc
    )

    # Impute with mean
    weather = columns_between('weather_clear', 'avg_wind_12_above')
    mean_misc = ['walk_score', 'bike_score', 'Poor_physical_health_days', 'Poor_mental_health_days', 'Adult_smoking']

    mean_imp_features = weather + mean_misc

    # Impute with a fixed, column-specific constant.
    # (transformer name, column, fill value)
    # NOTE(review): the fill values are hard-coded; presumably they were
    # precomputed from the data -- confirm their origin.
    constant_fills = [
        ('rep_08_votes', 'Republican_08_Votes', 193841),
        ('dem_08_votes', 'Democrats_08_Votes', 123594),
        ('rep_12_votes', 'Republican_12_Votes', 164676),
        ('dem_12_votes', 'Democrats_12_Votes', 122640),
        ('rep_2016', 'Republicans_2016', 163387),
        ('dem_2016', 'Democrats_2016', 116454),
        ('lib_2016', 'Libertarians_2016', 18725),
    ]

    #======================================
    # CREATE TRANSFORMERS
    #======================================

    transformers = [
        ('zero', SimpleImputer(strategy='constant', fill_value=0), zero_imp_features),
        ('mean', SimpleImputer(strategy='mean'), mean_imp_features),
    ]
    transformers += [
        (name, SimpleImputer(strategy='constant', fill_value=fill_value), [column])
        for name, column, fill_value in constant_fills
    ]

    #======================================
    # PUTTING IT ALL TOGETHER
    #======================================

    imputer = ColumnTransformer(
        transformers=transformers,
        remainder='passthrough'
    )

    return imputer

In [9]:
def impute_data(imputer, X_train, X_valid):
    """
    Fit `imputer` on `X_train` and return imputed versions of both splits.

    The imputer is fit on `X_train` only (through `fit_transform`) and then
    applied to `X_valid`. The array output is rebuilt into data frames that
    keep the column order of `X_train` and the row index of each input, so
    the results stay aligned with the matching target series.

    Note: add code later if you want to impute `X_test`

    Parameters
    ----------
    imputer: sklearn.compose._column_transformer.ColumnTransformer
        imputer; it is fit in place by this function

    X_train: pd.DataFrame
        `X_train`

    X_valid: pd.DataFrame
        `X_valid`

    Returns
    -------
    tuple
        `(imp_X_train, imp_X_valid)`, both pd.DataFrame

    """
    imp_X_train = imputer.fit_transform(X_train)
    imp_X_valid = imputer.transform(X_valid)

    # Grab old order of columns
    old_cols = X_train.columns.to_list()

    # Column names in the order the fitted transformer emitted them.
    # Every entry of `transformers_` is walked (instead of assuming the last
    # one is the remainder) because the remainder entry is only present when
    # some columns were left over.
    cols = []
    for _, transformer, selected in imputer.transformers_:
        # Dropped groups contribute no output columns.
        if isinstance(transformer, str) and transformer == 'drop':
            continue
        for col in selected:
            # Depending on the scikit-learn version, remainder columns are
            # stored either as integer positions or as column names.
            if isinstance(col, (int, np.integer)):
                cols.append(old_cols[col])
            else:
                cols.append(col)

    # Restore the original column order, restricted to columns that exist
    # in the output.
    produced = set(cols)
    ordered_cols = [col for col in old_cols if col in produced]

    # Create new dataframes
    # Reshuffle column order of new dataframes to match old one
    imp_X_train = pd.DataFrame(
        imp_X_train, columns=cols, index=X_train.index
    ).reindex(columns=ordered_cols)
    imp_X_valid = pd.DataFrame(
        imp_X_valid, columns=cols, index=X_valid.index
    ).reindex(columns=ordered_cols)

    imputed_dfs = (imp_X_train, imp_X_valid)

    return imputed_dfs

In [75]:
def comb_cols(input_df):

    """
    This function takes the entire data (all cols) and combines some of them
    into new features using expert knowledge.

    Two families of columns are combined:

    * equipment counts: for every equipment keyword, all columns whose name
      starts with ``monthly_`` (resp. ``historic_``) and contains the keyword
      are summed row-wise into ``monthly_<keyword>_count_comb``
      (resp. ``historic_<keyword>_count_comb``);
    * wind speed bins: the ``avg_wind_*``, ``monthly_ws_*`` and
      ``historic_ws_*`` bins are merged into five coarser bands each.

    The source columns are dropped afterwards and the new columns are
    appended after the remaining original columns. `input_df` itself is not
    modified.

    Parameters
    ---------------
    input_df : pandas.DataFrame
       the entire dataframe with all the columns that has been imputed

    Returns
    ---------------
    pandas.DataFrame
        with new features combining some existing features

    Raises
    ---------------
    KeyError
        if one of the wind speed columns listed below is missing
    """
    output_df = input_df.copy()

    # create a list of equipments to group together
    equipments = ['slide', 'climb', 'tube', 'overhang', 'bridge', 'swing', 'obsta', 'crawls']

    # new column prefix -> list of existing columns to sum.
    # 2 (monthly and historic) for each type of equipment
    equipment_groups = {}
    for equipment in equipments:
        monthly = [col for col in input_df.columns if re.match('monthly_.*' + equipment + '.*', col)]
        historic = [col for col in input_df.columns if re.match('historic_.*' + equipment + '.*', col)]
        # A monthly group is only created when at least one column matches;
        # the historic group is always created (all zeros when nothing matches).
        if monthly:
            equipment_groups['monthly_' + equipment] = monthly
        equipment_groups['historic_' + equipment] = historic

    # combine wind speed columns by Beaufort scale
    # new column -> single source column (copied) or list of source columns (summed)
    wind_groups = {
        # `avg_wind_*` columns
        'avg_wind_calm': 'avg_wind_0_1',
        'avg_wind_light_air': ['avg_wind_1_2', 'avg_wind_2_3', 'avg_wind_3_4'],
        'avg_wind_light_br': ['avg_wind_4_5', 'avg_wind_5_6', 'avg_wind_6_7', 'avg_wind_7_8'],
        'avg_wind_gentle_br': ['avg_wind_8_9', 'avg_wind_9_10', 'avg_wind_10_11', 'avg_wind_11_12'],
        'avg_wind_moderate_br': 'avg_wind_12_above',
        # `monthly_ws_*` columns
        'monthly_ws_calm': 'monthly_ws_below_2',
        'monthly_ws_light_air': 'monthly_ws_2_to_4',
        'monthly_ws_light_br': ['monthly_ws_4_to_6', 'monthly_ws_6_to_8'],
        'monthly_ws_gentle_br': ['monthly_ws_8_to_10', 'monthly_ws_10_to_12'],
        'monthly_ws_moderate_br': ['monthly_ws_12_to_14', 'monthly_ws_14_to_16', 'monthly_ws_above_16'],
        # `historic_ws_*` columns
        # NOTE(review): the source column here has the same name as the output
        # column, while the monthly counterpart reads 'monthly_ws_below_2' --
        # confirm the real column name in the data.
        'historic_ws_calm': 'historic_ws_calm',
        'historic_ws_light_air': 'historic_ws_2_to_4',
        'historic_ws_light_br': ['historic_ws_4_to_6', 'historic_ws_6_to_8'],
        'historic_ws_gentle_br': ['historic_ws_8_to_10', 'historic_ws_10_to_12'],
        'historic_ws_moderate_br': ['historic_ws_12_to_14', 'historic_ws_14_to_16', 'historic_ws_above_16'],
    }

    created_cols = set()   # names of the combined columns
    source_cols = []       # original columns that were folded into a combined one

    # equipment counts first so their combined columns precede the wind ones
    for key, cols in equipment_groups.items():
        new_name = key + "_count_comb"
        output_df[new_name] = input_df[cols].sum(axis=1)
        created_cols.add(new_name)
        source_cols.extend(cols)

    for new_name, sources in wind_groups.items():
        if isinstance(sources, str):
            output_df[new_name] = input_df[sources]
            source_cols.append(sources)
        else:
            output_df[new_name] = input_df[sources].sum(axis=1)
            source_cols.extend(sources)
        created_cols.add(new_name)

    # Drop each source column once, but never a column that was just created
    # (a source and an output may share a name, see the note above).
    cols_to_drop = [col for col in dict.fromkeys(source_cols) if col not in created_cols]
    output_df = output_df.drop(columns=cols_to_drop)

    return output_df