In [4]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# import other functions
from imputer import *
from feature_eng import *
from drop import *

In [2]:
# Load the raw training set; pandas' default compression handling is expected
# to unpack the .zip archive based on its extension.
df = pd.read_csv(filepath_or_buffer="../data/train_data.zip")
df.head(n=5)

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [8]:
def drop_columns_wo_state(input_data):
    """
    Drops some columns that we think are irrelevant from the original dataframe.

    Three groups of columns are removed:

    1. a fixed list of named columns (identifiers, location text, street-network
       metrics, ...);
    2. every remaining column whose name starts with `temp_min` or
       `streets_per_node_proportion`;
    3. four contiguous runs of columns (news events by state, news events by
       radius, census sex/age, historic survey) located by their first and
       last label, plus the two monthly survey columns.

    Parameters
    ---------------

    input_data : pandas.core.frame.DataFrame
        Frame to clean. It is not modified; the work is done on a copy.
        Every explicitly named column, including the first/last label of each
        contiguous run, must be present.

    Returns
    ---------------
    pandas.core.frame.DataFrame
        A new frame without the columns listed above.

    Raises
    ---------------
    KeyError
        If any explicitly named column is missing from `input_data`.

    """
    fixed_columns = [
        'external_id', 'monthly_count_of_holidays', 'B13016e2', 'B19113e1', 'name',
        'city', 'country', 'county', 'MonthYear', 'date',
        'streets_per_node_counts_0', 'streets_per_node_counts_0_osid',
        'streets_per_node_counts_0_osdw', 'self_loop_proportion', 'self_loop_proportion_osid',
        'self_loop_proportion_osdw', 'circuity_avg', 'circuity_avg_osid', 'circuity_avg_osdw',
        'clean_intersection_density_km', 'node_density_km', 'clean_intersection_count_osid',
        'node_density_km_osdw', 'intersection_density_km_osdw', 'street_density_km_osid',
        'edge_density_km_osid', 'intersection_density_km_osid', 'node_density_km_osid',
        'edge_density_km_osdw', 'street_density_km_osdw', 'clean_intersection_count',
        'clean_intersection_count_osdw', 'clean_intersection_density_km_osdw', 'street_density_km',
        'edge_density_km', 'intersection_density_km', 'clean_intersection_density_km_osid',
        'streets_per_node_counts_8', 'streets_per_node_proportion_8', 'streets_per_node_proportion_7_osid',
        'streets_per_node_counts_7_osid', 'streets_per_node_proportion_8_osdw',
        'streets_per_node_counts_8_osdw', 'streets_per_node_proportion_7', 'streets_per_node_counts_7',
        'streets_per_node_counts_7_osdw', 'streets_per_node_proportion_7_osdw',
        'streets_per_node_proportion_6_osid', 'streets_per_node_counts_6_osid',
        'streets_per_node_proportion_6', 'streets_per_node_counts_6', 'streets_per_node_counts_6_osdw',
        'streets_per_node_proportion_6_osdw', 'transit_score', 'closest_place_category',
        'closest_place_distance',
    ]

    # Work on a copy so the caller's frame can never be affected.
    data = input_data.copy()
    data = data.drop(columns=fixed_columns)

    # Plain prefix test instead of a regex: the match is on the bare prefix
    # (a trailing underscore is not required), and non-string labels are skipped.
    dropped_prefixes = ('temp_min', 'streets_per_node_proportion')
    cols_to_drop = [
        col for col in data.columns
        if isinstance(col, str) and col.startswith(dropped_prefixes)
    ]

    # Contiguous runs of columns, identified by their first and last label
    # (label slices are inclusive on both ends).
    contiguous_ranges = [
        # news events, state level
        ('total_events_across_state', 'material_conflict_events_across_state'),
        # news events, by radius
        ('total_events_500_meters', 'material_conflict_events_2000_meters'),
        # unneeded census columns
        ('B01001e27', 'B01001e6'),
        # historic Biba survey columns
        ('historic_weekday_0', 'historic_variety'),
    ]
    for first_label, last_label in contiguous_ranges:
        cols_to_drop += data.loc[:, first_label:last_label].columns.to_list()

    # Monthly Biba survey columns; a missing one raises KeyError in the drop below.
    cols_to_drop += ['monthly_weekday_counts', 'monthly_survey']

    return data.drop(columns=cols_to_drop)


In [9]:
# Remove the rows that have no value for the target variable.
df = drop_missing_unacast(df)

# Separate the target (y) from the features (X).
y = df['unacast_session_count']
X = df.drop(columns='unacast_session_count')

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=2020
)

# Impute NaN values; the helper returns the train and validation frames at
# positions 0 and 1 of its result.
result = impute_data(X_train, X_valid)
X_train, X_valid = result[0], result[1]

# Feature engineering first, then column dropping -- each step is applied to
# the training split and then to the validation split.
for step in (comb_cols, drop_columns_wo_state):
    X_train = step(X_train)
    X_valid = step(X_valid)

# One-hot encoding (climate, density_class, income_class) is currently disabled:
# X_train_valid = clean_categorical(X_train, X_valid)
# X_train = X_train_valid[0]
# X_valid = X_train_valid[1]

In [10]:
# Re-attach the target to the processed features and persist both splits.
#
# pd.concat(axis=1) pairs rows by index label, not by position. Both sides are
# therefore given a fresh 0..n-1 index so rows are matched positionally, instead
# of relying on the upstream helpers having already reset the feature index.
# NOTE(review): this assumes the preprocessing steps keep the row order of each
# split unchanged -- confirm against impute_data / comb_cols.
assert len(X_train) == len(y_train), "X_train and y_train have different lengths"
assert len(X_valid) == len(y_valid), "X_valid and y_valid have different lengths"

train_processed = pd.concat(
    [X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1
)
valid_processed = pd.concat(
    [X_valid.reset_index(drop=True), y_valid.reset_index(drop=True)], axis=1
)

# index=False: the positional index carries no information worth storing.
train_processed.to_csv('../data/train_processed.csv', index=False)
valid_processed.to_csv('../data/valid_processed.csv', index=False)

In [11]:
# Preview the first five rows of the processed training set.
train_processed.head(n=5)

Unnamed: 0,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,monthly_count_ramp,...,monthly_ws_light_br,monthly_ws_gentle_br,monthly_ws_moderate_br,historic_ws_calm,historic_ws_light_air,historic_ws_light_br,historic_ws_gentle_br,historic_ws_moderate_br,avg_fertility_rate,unacast_session_count
0,5,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,63.906875,190.0
1,4,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,0,0,0,1.0,18.0,63.0,41.0,54.0,61.173125,7.0
2,6,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,0,0,0,0.0,2.0,2.0,2.0,3.0,65.725,32.0
3,9,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,0,0,0,0.0,6.0,7.0,4.0,5.0,65.1275,2332.0
4,7,2019,176,16,160,1540398.0,438295.454545,156642.045455,56107.954545,0,...,54,58,35,0.0,13.0,50.0,23.0,20.0,58.47,156.0
