In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# load the raw (not yet pre-processed) survey export
df = pd.read_csv(
    filepath_or_buffer='../data/survey_data_before_pre_processing.csv',
)

### Convert all Refused to NaN

In [3]:
# Treat every 'Refused' answer as missing.
# DataFrame.replace does this in a single vectorised call; the element-wise
# DataFrame.applymap used previously is deprecated in recent pandas releases.
df = df.replace('Refused', np.nan)

### Ordinal Encode

In [4]:
# Replace nulls in these columns with 'none' to help with ordinal encoding:
# 'none' becomes an explicit category of its own.
none_fill_cols = ['harm_dev_countries', 'harm_future_gen',
                  'harm_plants_animals', 'when_harm_US',
                  'reg_CO2_pollutant', 'reg_utilities',
                  'fund_research', 'discuss_GW']

# Assign the filled columns back instead of `df[col].fillna(..., inplace=True)`.
# The in-place form is chained assignment on a column selection: it is
# deprecated in pandas 2.x and does not update `df` under Copy-on-Write.
df[none_fill_cols] = df[none_fill_cols].fillna('none')

In [5]:
# Features that will be ordinal encoded
ordinal_cols = [
    'year', 'wave', 'generation', 'educ_category', 'income_category',
    'harm_dev_countries', 'harm_future_gen', 'harm_plants_animals',
    'when_harm_US', 'reg_CO2_pollutant', 'reg_utilities',
    'fund_research', 'discuss_GW',
]

# sub-frame holding only those columns
df_ord = df[ordinal_cols]


# Category order for every ordinal column, one inner list per column, listed
# in the same order as the columns selected for ordinal encoding.
# The first entry of each list is a placeholder that encodes to 0:
#   * 'none' for features whose nulls were replaced with 'none'
#   * ''  (or -999 for year) for features with no nulls changed
# Zero-encoded values can therefore be peeled off as nulls afterwards.
harm_scale = ['none', 'No', "Don't know", 'Yes']
support_scale = ['none', 'Oppose', 'Support']

dogs = [
    # year
    [-999, 2008, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018],
    # wave
    ['',
     '2008 Nov', '2010 Jan', '2010 Jun', '2011 May', '2011 Nov',
     '2012 Mar', '2012 Sep', '2013 Apr', '2013 Nov', '2014 Apr',
     '2014 Oct', '2015 Mar', '2015 Oct', '2016 Mar', '2016 Nov',
     '2017 May', '2017 Oct', '2018 Mar', '2018 Dec'],
    # generation
    ['',
     'Greatest (Before 1928)',
     'Silent (1928 - 1945)',
     'Baby Boomers (1946 - 1964)',
     'Generation X (1965 - 1980)',
     'Millennials (1981 - 1996)',
     'iGen\u200e/Gen Z (1997 - )'],
    # educ_category
    ['',
     'Less than high school',
     'High school',
     'Some college',
     'Bachelor\'s degree or higher'],
    # income_category
    ['',
     'Less than $50,000',
     '$50,000 to $99,999',
     '$100,000 or more'],
    # harm_dev_countries, harm_future_gen, harm_plants_animals
    list(harm_scale),
    list(harm_scale),
    list(harm_scale),
    # when_harm_US
    ['none', 'Never', 'In the Future', 'Now'],
    # reg_CO2_pollutant, reg_utilities, fund_research
    list(support_scale),
    list(support_scale),
    list(support_scale),
    # discuss_GW
    ['none', 'Never', 'At All'],
]


# initialize the encoder with the explicit category order defined above
o_enc = OrdinalEncoder(categories=dogs)

# fit transform
X_ord = o_enc.fit_transform(df_ord)

# Wrap the encoded array in a DataFrame.
#   * col names are original names + '_ord'
#   * df_ord's index is reused so the join below lines rows up by label
#     instead of relying on df having a default 0..n-1 index
#   * the first category of each feature encodes to 0 and stands for a
#     missing answer, so 0 is turned back into NaN (vectorised replace
#     instead of the deprecated element-wise applymap)
X_ord_df = pd.DataFrame(X_ord,
                        index=df_ord.index,
                        columns=[col + '_ord' for col in df_ord.columns])\
                .replace(0, np.nan)

# swap the original columns for their ordinal-encoded versions
df = df.drop(columns=df_ord.columns).join(X_ord_df)

### Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# target is the 'happening' column; every other column is a feature
y = df["happening"]
X = df.loc[:, df.columns != "happening"]

In [8]:
# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=21,
    test_size=0.20,
)

### Iteratively Impute Nulls

Now that our data is ordinal encoded or dummied, we iteratively impute the remaining nulls in the numeric columns.

In [9]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [10]:
# identify numerical columns
# NOTE(review): this relies on DataFrame.describe() summarising only numeric
# columns by default, which presumes X_train has at least one numeric
# column — confirm against the pandas version in use.
# `cols` is reused by the imputing and scaling cells further down.
cols = X_train.describe().columns

In [11]:
# flag the numeric columns that contain at least one null in the training set
has_null = X_train[cols].isnull().any()

# indicator column names: '<original name>_ind' for each flagged column
cols_with_indicator = [name + '_ind' for name in has_null[has_null].index]

In [12]:
# columns outside the numeric set; these still need to be encoded
X_train.columns[~X_train.columns.isin(cols)].to_list()

['cause_other_text',
 'cause_recoded',
 'sci_consensus',
 'worry',
 'harm_personally',
 'harm_US',
 'party',
 'party_x_ideo',
 'region4',
 'religion',
 'religion_other_nonchristian',
 'evangelical',
 'marit_status',
 'employment',
 'house_head',
 'house_type',
 'house_own']

In [13]:
# set up the iterative imputer: start from column means, append
# missing-value indicator columns, and fix the seed
it_imp = IterativeImputer(
    random_state=21,
    add_indicator=True,
    initial_strategy='mean',
)

In [14]:
# Column layout of the imputer output: the numeric columns themselves,
# followed by one '<column>_ind' indicator per column that had nulls in train.
imputed_col_names = X_train[cols].columns.to_list() + cols_with_indicator

# fit on train
# Keep X_train's own index: after train_test_split the rows carry their
# original (shuffled) labels, and the later .join() aligns on those labels.
# A fresh 0..n-1 index would pair imputed values with the wrong rows and
# leave NaNs wherever a label has no match.
X_train_it_imp = pd.DataFrame(it_imp.fit_transform(X_train[cols]),
                              index=X_train.index,
                              columns=imputed_col_names)

# transform test (same imputer, same column layout, X_test's own index)
X_test_it_imp = pd.DataFrame(it_imp.transform(X_test[cols]),
                             index=X_test.index,
                             columns=imputed_col_names)

In [15]:
# swap the raw numeric columns for their imputed versions (plus indicators)
X_train = X_train.drop(columns=cols).join(X_train_it_imp)
X_test = X_test.drop(columns=cols).join(X_test_it_imp)

### Standard Scale

Standard scaling will let us explore KNN or clustering later, if wanted.

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
# initialize
ss = StandardScaler()

# fit on Train
# Carry X_train's index over to the scaled frame: the later .join() aligns on
# index labels, and X_train still has the shuffled labels from the split.
# A default 0..n-1 index would attach scaled values to the wrong rows.
X_train_ss = pd.DataFrame(ss.fit_transform(X_train[cols]),
                          index=X_train.index,
                          columns=X_train[cols].columns)

# transform test (scaler fitted on train only), keeping X_test's index
X_test_ss = pd.DataFrame(ss.transform(X_test[cols]),
                         index=X_test.index,
                         columns=X_test[cols].columns)

In [18]:
# scaled copies of the feature sets: unscaled numeric columns are replaced by
# their standardised versions, everything else is carried over unchanged
X_train_ss = X_train.drop(columns=cols).join(X_train_ss)
X_test_ss = X_test.drop(columns=cols).join(X_test_ss)

## Exports

In [19]:
# targets
# y_test goes to its own file; it was previously written to
# y_train_processed.csv, which overwrote the train target.
y_train.to_csv('../data/y_train_processed.csv', index=False)
y_test.to_csv('../data/y_test_processed.csv', index=False)

# imputed (unscaled) features
X_train.to_csv('../data/x_train_processed.csv', index=False)
X_test.to_csv('../data/x_test_processed.csv', index=False)

# imputed + standard-scaled features
X_train_ss.to_csv('../data/x_train_ss_processed.csv', index=False)
X_test_ss.to_csv('../data/x_test_ss_processed.csv', index=False)