In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest

# Seed NumPy's global random number generator so np.random-based results repeat between runs.
np.random.seed(23)

In [2]:
# Read the un-processed survey CSV into the working DataFrame.
raw_survey_csv = '../data/survey_data_before_pre_processing.csv'
df = pd.read_csv(raw_survey_csv)

### Convert all Nulls to 'none'
- This is done before ordinal encoding so that missing answers become their own `'none'` category, which the encoder maps to 0.

In [3]:
# Swap every missing value for the literal string 'none'.
# NOTE(review): this applies to all columns, so a numeric column that
# contains nulls would end up holding strings — confirm that is intended.
df = df.fillna(value='none')

### Ordinal Encode

In [5]:
# Ordinal-encode the ordered survey features.
#
# Keys are the columns to encode; each value lists that column's categories
# from lowest to highest.  The first entry of every list is a placeholder
# that encodes to 0:
#   - 'none'     for features that got nulls changed
#   - '' / -999  for features with no nulls changed
# This allows us to peel off nulls by blanking every 0-encoded ordinal.
# Keeping column and categories together in one mapping means the two can
# never drift out of positional alignment.
ordinal_categories = {
    'year': [-999, 2008, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018],
    'wave': ['', '2008 Nov', '2010 Jan', '2010 Jun', '2011 May', '2011 Nov',
             '2012 Mar', '2012 Sep', '2013 Apr', '2013 Nov', '2014 Apr',
             '2014 Oct', '2015 Mar', '2015 Oct', '2016 Mar', '2016 Nov',
             '2017 May', '2017 Oct', '2018 Mar', '2018 Dec'],
    'cause_recoded': ['none', 'No', "Don't know", 'Yes'],
    'sci_consensus': ['none', 'No', "Don't know", 'Maybe', 'Yes'],
    'harm_personally': ['none', 'No', "Don't know", 'Yes'],
    'harm_US': ['none', 'No', "Don't know", 'Yes'],
    'harm_dev_countries': ['none', 'No', "Don't know", 'Yes'],
    'harm_future_gen': ['none', 'No', "Don't know", 'Yes'],
    'harm_plants_animals': ['none', 'No', "Don't know", 'Yes'],
    'when_harm_US': ['none', 'Never', 'In the Future', 'Now'],
    'generation': ['', 'Greatest (Before 1928)', 'Silent (1928 - 1945)',
                   'Baby Boomers (1946 - 1964)',
                   'Generation X (1965 - 1980)',
                   'Millennials (1981 - 1996)',
                   'iGen\u200e/Gen Z (1997 - )'],
    'educ_category': ['', 'Less than high school',
                      'High school',
                      'Some college',
                      'Bachelor\'s degree or higher'],
    'income_category': ['', 'Less than $50,000',
                        '$50,000 to $99,999',
                        '$100,000 or more'],
    'reg_CO2_pollutant': ['none', 'Oppose', 'Support'],
    'fund_research': ['none', 'Oppose', 'Support'],
    'discuss_GW': ['none', 'Never', 'At All'],
}

# the list of columns to ordinal
df_ord = df[list(ordinal_categories)]

# per-column category lists, in the same order as df_ord's columns
dogs = list(ordinal_categories.values())

# initialize
o_enc = OrdinalEncoder(categories=dogs)

# fit transform
X_ord = o_enc.fit_transform(df_ord)

# col names are original names + '_ord'
# Reuse df_ord's index so the join below aligns on the real row labels
# instead of assuming df carries a default 0..n-1 index.
X_ord_df = pd.DataFrame(X_ord,
                        columns=[col + '_ord' for col in df_ord.columns],
                        index=df_ord.index)

# Placeholder code 0 -> NaN.  mask() is vectorised and avoids the
# deprecated DataFrame.applymap.
X_ord_df = X_ord_df.mask(X_ord_df == 0)

# drop non-ordinal cols for ordinal cols
df = df.drop(df_ord.columns, axis=1).join(X_ord_df)

### Dummy encoding remaining features and adding features that were already ordinal encoded

In [7]:
# Nominal features that will be dummy encoded.
nominal_cols = [
    'party', 'party_x_ideo', 'region4',
    'religion', 'marit_status', 'employment',
    'house_head', 'house_type', 'house_own',
]

# Features carried over unchanged (per the section heading, these were
# already ordinal encoded in the raw data).
passthrough_cols = [
    'house_ages18plus', 'children', 'service_attendance', 'house_size',
]

dum_df = df[nominal_cols]
rdy_to_go = df[passthrough_cols]


In [8]:
# One indicator column per category level; drop_first omits the first
# level of each feature.
dum_df = pd.get_dummies(data=dum_df, drop_first=True)

In [9]:
# Place the dummy columns and the pass-through columns side by side.
additional_df = pd.concat([dum_df, rdy_to_go], axis='columns')

In [10]:
# Remove the original versions of every column that now lives in
# additional_df, so they are not duplicated when it is concatenated back.
superseded_cols = [
    'party', 'party_x_ideo', 'region4',
    'religion', 'marit_status', 'employment', 'house_head',
    'house_type', 'house_own', 'house_ages18plus',
    'children', 'service_attendance', 'house_size',
]
df = df.drop(columns=superseded_cols)

In [11]:
# Append the dummy / pass-through columns to the working frame.
# NOTE(review): running this cell twice appends the same columns again.
df = pd.concat([df, additional_df], axis='columns')

### Label Encoding target

In [32]:
# Encode the target: 'No' -> 0, "Don't know" -> 1, anything else -> 2.
# NOTE(review): "anything else" includes the 'none' placeholder written by
# the earlier fillna if this column had nulls — confirm none are present.
# NOTE(review): re-running this cell on already-encoded values sends every
# row to 2, so run it only once per kernel session.
target_codes = {'No': 0, "Don't know": 1}
df['happening'] = [target_codes.get(answer, 2) for answer in df.happening]

### Train Test Split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Target is the 'happening' column; features are every other column.
y = df['happening']
X = df.drop(columns='happening')

In [14]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=21,
)

### Iteratively Impute Nulls

Now that all of our data is ordinal-encoded or dummy-encoded, we iteratively impute the remaining nulls.

In [15]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [16]:
# describe() only summarises the numerical columns here, so the columns of
# its result identify the numerical features.
numeric_summary = X_train.describe()
cols = numeric_summary.columns

In [17]:
# For each numerical column, flag whether the training data has any null.
null_flags = X_train[cols].isnull().any()

# Columns with nulls get a companion missing-indicator column from the
# imputer; name each one '<column>_ind'.
cols_with_indicator = [
    name + '_ind' for name, has_null in null_flags.items() if has_null
]

In [19]:
# Set up the imputer: start from column means, append missing-value
# indicator columns to the output, and fix the seed for repeatability.
it_imp = IterativeImputer(
    random_state=21,
    add_indicator=True,
    initial_strategy='mean',
)

In [20]:
# Output layout: the imputed numerical columns followed by the indicators.
imputed_col_names = X_train[cols].columns.to_list() + cols_with_indicator

# Learn the imputation from the training rows only ...
train_imputed = it_imp.fit_transform(X_train[cols])
X_train_it_imp = pd.DataFrame(train_imputed,
                              columns=imputed_col_names,
                              index=X_train.index)

# ... then apply the fitted imputer to the held-out rows.
test_imputed = it_imp.transform(X_test[cols])
X_test_it_imp = pd.DataFrame(test_imputed,
                             columns=imputed_col_names,
                             index=X_test.index)

In [21]:
# Replace the un-imputed numerical columns with their imputed versions
# (plus the indicator columns), aligned on the row index.
X_train = X_train.drop(columns=cols).join(X_train_it_imp)
X_test = X_test.drop(columns=cols).join(X_test_it_imp)

### Standard Scale

Standardizing the features will let us explore KNN or clustering later if wanted.

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
# Standardise the numerical columns; the scaler is fitted on the training
# rows only and then reused for the test rows.
ss = StandardScaler()

train_scaled = ss.fit_transform(X_train[cols])
X_train_ss = pd.DataFrame(train_scaled,
                          columns=X_train[cols].columns,
                          index=X_train.index)

test_scaled = ss.transform(X_test[cols])
X_test_ss = pd.DataFrame(test_scaled,
                         columns=X_test[cols].columns,
                         index=X_test.index)

In [24]:
# Rebuild the full feature frames: every column that was not scaled, joined
# with the scaled versions of the numerical columns.
# Note that X_train_ss / X_test_ss are rebound here from "scaled columns
# only" to "all features".
X_train_ss = X_train.drop(columns=cols).join(X_train_ss)
X_test_ss = X_test.drop(columns=cols).join(X_test_ss)

### SelectKBest

In [33]:
# Build a selector that keeps 20 features and fit it on the scaled
# training data.
# NOTE(review): the saved output shows a divide warning (`f = msb / msw`);
# presumably at least one column has no variance — verify.
selector = SelectKBest(k=20).fit(X_train_ss, y_train)
selector

  f = msb / msw


SelectKBest(k=20)

In [34]:
# Positional indices of the columns the selector keeps.
# Note: this rebinds `cols`, which earlier held the numerical column names.
cols = np.flatnonzero(selector.get_support())

In [35]:
# Slice the same selected column positions out of both feature frames.
X_train_feat_select, X_test_feat_select = (
    frame.iloc[:, cols] for frame in (X_train_ss, X_test_ss)
)

In [36]:
# Collect the features the selector did NOT keep.
# The original compared against `X_train_ss_new`, a name that is never
# defined in this notebook (NameError on a fresh kernel); the selected
# features live in `X_train_feat_select`.
kept_cols = set(X_train_feat_select.columns)
removed_cols = [c for c in X_train_ss.columns if c not in kept_cols]

# Same rejected columns for train and test so the two frames stay parallel.
X_train_feat_removed = X_train_ss.loc[:, removed_cols]
X_test_feat_removed = X_test_ss.loc[:, removed_cols]

In [37]:
# Selected columns: display the names of the features kept in X_train_feat_select
X_train_feat_select.columns

Index(['cause_recoded_ord', 'sci_consensus_ord', 'harm_personally_ord',
       'harm_US_ord', 'harm_dev_countries_ord', 'harm_future_gen_ord',
       'harm_plants_animals_ord', 'when_harm_US_ord', 'educ_category_ord',
       'reg_CO2_pollutant_ord', 'fund_research_ord', 'discuss_GW_ord',
       'party_No party/not interested in politics', 'party_Republican',
       'party_x_ideo_Liberal Democrat',
       'party_x_ideo_Moderate/Conservative Democrat',
       'party_x_ideo_No Party/Not Interested in politics',
       'marit_status_Married',
       'house_type_One-family house detached from any other house',
       'service_attendance'],
      dtype='object')

### PCA

## Exports

In [29]:
# Not using these
# Write the scaled train / test feature frames to CSV; index=False leaves
# the row index out of the files.
for scaled_frame, out_path in (
    (X_train_ss, '../data/x_train_ss_processed.csv'),
    (X_test_ss, '../data/x_test_ss_processed.csv'),
):
    scaled_frame.to_csv(out_path, index=False)

# export after SS, selectkbest, pca