In [263]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer

# Read in Google Sheet (CSV export endpoint). Plain string: nothing is interpolated,
# so no f-prefix is needed.
url = "https://docs.google.com/spreadsheets/d/1lUn0GcPchnKEPA6EnSYnC9wZ5hAqXZegj3KEz64sB7I/export?format=csv"

# Establish dataframe
data = pd.read_csv(url)

# Each step rebinds `data` to a new frame instead of mutating in place, so every
# line's input and output are explicit and the steps can be chained or inspected.
data = data.drop(columns=data.columns[0:15])  # Drop early columns with unnecessary data
# Positions 6 and 21 are evaluated on the frame AFTER the first 15 columns are gone.
data = data.drop(columns=data.columns[[6, 21]])  # Drop empty last column and "religious other" text
data = data.drop(index=[0])  # Drop row with descriptive information
data = data.rename(columns={'UserLanguage.1': 'Q1'})  # Rename column 1

In [264]:
#print(data)

In [265]:
# Label columns by encoding style
binary_cols = ['Q4']
one_hot_cols = ['Q1', 'Q3', 'Q6', 'Q9', 'Q12']
multi_hot_cols = ['Q5', 'Q17', 'Q18']
ordinal_cols = ['Q2', 'Q7', 'Q8', 'Q10', 'Q11', 'Q13', 'Q14', 'Q15', 'Q16', 'Q19', 'Q20']

# Copy data set for encoding.
# An explicit .copy() is required here: a plain `data_encoded = data` only binds a
# second name to the SAME DataFrame, so any in-place change made through one name
# would silently appear under the other.
data_encoded = data.copy()

In [266]:
# One encoder instance is reused; it is re-fitted from scratch for every question
encoder = OneHotEncoder()

# Replace each single-choice question with one indicator column per answer value
for column in one_hot_cols:

    # Fit on this question only and densify the result so it can back a DataFrame
    indicator_matrix = encoder.fit_transform(data[[column]]).toarray()

    # Indicator columns are labelled with the raw category values and share
    # data_encoded's row index so the concat below lines up row for row.
    # NOTE(review): labels are bare answer values, so an answer that occurs in two
    # questions would yield duplicate column names - confirm this is acceptable.
    indicator_df = pd.DataFrame(
        indicator_matrix,
        index=data_encoded.index,
        columns=encoder.categories_[0],
    )

    # Append the indicators on the right, then remove the question they replace
    data_encoded = pd.concat([data_encoded, indicator_df], axis=1).drop(columns=[column])

In [267]:
#print(data_encoded.head())

In [268]:
# Initialize encoder (re-fitted for every multi-select question below)
mlb = MultiLabelBinarizer()

# Replace each multi-select question with one 0/1 column per distinct answer
for column in multi_hot_cols:

    # Missing responses become the literal string 'NaN' so every row is a string,
    # then each comma-joined response is split into the list of selected answers.
    # NOTE(review): the split is on ',' with no whitespace stripping, and every
    # question with a missing response contributes a column named 'NaN' - confirm
    # both are intended.
    selections = data_encoded[column].fillna('NaN').str.split(',')

    # Binarize the answer lists; columns are the answer labels, rows keep
    # data_encoded's index so the concat below aligns
    indicator_df = pd.DataFrame(
        mlb.fit_transform(selections),
        index=data_encoded.index,
        columns=mlb.classes_,
    )

    # Append the indicators on the right, then remove the question they replace
    data_encoded = pd.concat([data_encoded, indicator_df], axis=1).drop(columns=[column])

In [269]:
#print(data_encoded.head())

In [270]:
# Binary encode Q4: 'Married' -> 1, 'Not married' -> 0 (any other value maps to NaN)
marital_codes = {'Married': 1, 'Not married': 0}
data_encoded['Q4'] = data_encoded['Q4'].map(marital_codes)

# NOTE(review): this rename is applied to `data`, not to `data_encoded`, so the
# encoded column keeps the name 'Q4' in this cell - confirm which frame was meant.
data.rename(columns={'Q4': 'Married'}, inplace=True)

In [271]:
#print(data_encoded.head())

In [272]:
# Ordinal Encoding - done manually, driven by a lookup table instead of one
# copy-pasted statement pair per question.

# Answer scales that several questions share are written once and reused
direction_scale = {'Definitely Wrong': 1, 'Somewhat Wrong': 2, 'Unsure': 3, 'Somewhat Right': 4, 'Definitely Right': 5}
confidence_scale = {'Not at All Confident': 1, 'Not Very Confident': 2, 'Unsure': 3, 'Somewhat Confident': 4, 'Very Confident': 5}
support_scale = {'Definitely Not': 1, 'Probably Not': 2, 'I would need more information to make a decision': 3, 'Probably': 4, 'Definitely': 5}

# question -> (answer-to-rank mapping, descriptive label used for the rename)
ordinal_spec = {
    'Q2': ({'18-24': 1, '25-29': 2, '30-39': 3, '40-49': 4, '50-64': 5, '65+': 6}, 'Age Group'),
    'Q7': ({'Less than High School': 1, 'High school graduate / GED': 2, 'Some college': 3, '2 year degree': 4, '4 year degree': 5, 'Professional degree': 6, 'Doctorate': 7}, 'Education Level'),
    'Q8': ({'Less than $30,000': 1, '$30,000-$49,999': 2, '$50,000-$100,000': 3, '$100,000-$199,999': 4, '$200,000 or more': 5}, 'Household Income'),
    'Q10': (direction_scale, 'Direction of US'),
    'Q11': (direction_scale, 'Direction of RI'),
    'Q13': ({'Not at all Excited': 1, 'Not Very Excited': 2, 'Somewhat Excited': 3, 'Very Excited': 4, 'Extremely Excited': 5}, 'Excitement about vote'),
    'Q14': (confidence_scale, 'Confidence in RI Politicians’ Care'),
    'Q15': (confidence_scale, 'Confidence in Local Voting Officials’ Election Management'),
    'Q16': (confidence_scale, 'Confidence in Accuracy of National Votes Cast'),
    'Q19': (support_scale, 'Same-Day Voter Registration Support'),
    'Q20': (support_scale, 'Ranked Choice Voting Support'),
}

for question, (scale, label) in ordinal_spec.items():
    # Replace each answer string with its rank; answers missing from the scale become NaN
    data_encoded[question] = data_encoded[question].map(scale)

    # NOTE(review): the descriptive label is applied to `data`, not `data_encoded`,
    # so the encoded frame keeps the Q-number names - confirm which frame was meant.
    data.rename(columns={question: label}, inplace=True)

In [273]:
#print(data_encoded.head())

In [274]:
# Persist the encoded frame as CSV so the analysis notebook can load it.
# The path is relative, so the file lands in the current working directory.
output_path = 'dataframe.csv'
data_encoded.to_csv(output_path, index=False)  # index=False: row labels are not written