## Data import

This notebook imports the FB and UAS .dta data files, assigns numeric codes to categorical variables in both surveys, and exports .csv files for future use.

In [45]:
import numpy as np
import pandas as pd

In [46]:
# Read the two Stata inputs: the FB survey file and the UAS file.
fb_path = "../input/week12.dta"
uas_path = "../input/uas244.dta"
fb = pd.read_stata(fb_path)
uas = pd.read_stata(uas_path)

In [47]:
# Dimensions of the FB survey frame as (rows, columns).
fb.shape

(1777, 42)

In [48]:
# Dimensions of the full UAS frame as (rows, columns), before any column selection.
uas.shape

(6407, 780)

In [49]:
# List the FB column names to see which survey items (Q1-Q12, Image, ...) are available for coding.
fb.columns

Index(['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
       'Duration__in_seconds_', 'Finished', 'RecordedDate', 'ResponseId',
       'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
       'ExternalReference', 'LocationLatitude', 'LocationLongitude',
       'DistributionChannel', 'UserLanguage', 'timer_First_Click',
       'timer_Last_Click', 'timer_Page_Submit', 'timer_Click_Count', 'Q1',
       'Q2', 'Q3', 'Q3_1', 'Q4', 'Q5', 'Q6', 'Q7_1', 'Q8', 'Q9', 'Q9_6_TEXT',
       'Q10', 'Q11', 'Q12', 'SC0', 'timeload', 'DeviceIdentifier',
       'ipaddress_0', 'ResponseID_0', 'Week', 'Image'],
      dtype='object')

In [50]:
# Find every UAS column whose name contains the substring "age".
# This is a plain substring match, so unrelated names such as "language" show up too.
age_cols = list(filter(lambda name: 'age' in name, uas.columns))
print(age_cols)

['language', 'age', 'agerange', 'hhmemberage_1', 'hhmemberage_2', 'hhmemberage_3', 'hhmemberage_4', 'hhmemberage_5', 'hhmemberage_6', 'hhmemberage_7', 'hhmemberage_8', 'hhmemberage_9', 'hhmemberage_10', 'hhmemberage_11', 'hhmemberage_12', 'hhmemberage_13', 'hhmemberage_14', 'hhmemberage_15', 'hhmemberage_16', 'hhmemberage_17', 'hhmemberage_18']


In [51]:
# "agerange" is not the variable to use for age: in this file it is filled in for
# only a handful of rows and NaN for the rest. Tabulate it to confirm; it is left
# out of the column selection further down.
agerange_counts = uas['agerange'].value_counts(dropna=False)
agerange_counts.sort_index()

1 18-29       3
2 30-39       1
3 40-49       4
4 50-59       4
5 60-69       1
NaN        6394
Name: agerange, dtype: int64

In [52]:
# Derive age_cat from the continuous "age" variable, cutting at the FB survey's
# age-bracket thresholds (25, 35, 45, 55, 65). pd.cut uses right-closed intervals
# by default, so the bins are (17, 25], (25, 35], ..., (65, inf); ages outside
# the bins and missing ages become NaN.
# NOTE(review): the labels are the strings '1'-'6', whereas the FB Q11 mapping in
# this notebook codes "19-25 years old" as 2 through "66+ years old" as 7 --
# confirm that downstream comparisons account for this offset.
age_bin_edges = [17, 25, 35, 45, 55, 65, float('Inf')]
age_bin_labels = ['1', '2', '3', '4', '5', '6']
uas['age_cat'] = pd.cut(uas['age'], bins=age_bin_edges, labels=age_bin_labels)

In [53]:
# Tabulate the new age_cat codes; NaN is kept so rows that received no category stay visible.
age_cat_counts = uas['age_cat'].value_counts(dropna=False)
age_cat_counts.sort_index()

1       347
2       970
3      1258
4      1187
5      1331
6      1307
NaN       7
Name: age_cat, dtype: int64

In [54]:
# Narrow uas to the columns listed below (including the derived age_cat).
# Note: this overwrites `uas`, so the raw "age" column is gone afterwards; to
# re-run the age_cat cell, reload the data first.
uas_keep_cols = [
    "uasid",
    "cr027a", "cr027c", "cr030", "cr018a", "lr015", "ei002",
    "cr015c", "cr015d", "cr015k", "cr015l",
    "gender", "maritalstatus", "age_cat", "education",
    "final_weight",
]
uas = uas[uas_keep_cols]

In [55]:
# Confirm that only the selected UAS columns remain.
uas.columns

Index(['uasid', 'cr027a', 'cr027c', 'cr030', 'cr018a', 'lr015', 'ei002',
       'cr015c', 'cr015d', 'cr015k', 'cr015l', 'gender', 'maritalstatus',
       'age_cat', 'education', 'final_weight'],
      dtype='object')

##### Variable coding for FB survey

Codebook: https://docs.google.com/spreadsheets/d/1q9l31woXPygsYv5hhMlmM75qozfVtgO1/edit#gid=1487881949

In [56]:
# Answer scales that several FB questions share. Each question receives its own
# copy below, so editing one question's mapping never affects another.
# NOTE(review): the UAS mapping in this notebook spells the third frequency
# option "More than half the days"; confirm "More than half of days" is the
# exact label stored in the FB file, since unmatched labels are not recoded.
fb_frequency = {
    "Not at all": 1,
    "Several days": 2,
    "More than half of days": 3,
    "Nearly every day": 4,
}
fb_acceptability = {
    "Not acceptable": 1,
    "Barely acceptable": 2,
    "Medium acceptable": 3,
    "Quite acceptable": 4,
    "Very acceptable": 5,
}
fb_yes_no_unsure = {
    "Yes": 1,
    "No": 2,
    "Unsure": 3,
}

# Per-column mapping from FB answer label to numeric code (see the codebook linked above).
cat_to_numeric = {
    "Q1": dict(fb_frequency),
    "Q2": dict(fb_frequency),
    "Q3": dict(fb_acceptability),
    "Q3_1": dict(fb_acceptability),
    "Q4": dict(fb_acceptability),
    # "Unsure" is coded 5, i.e. after the four likelihood levels.
    "Q5": {
        "Very unlikely": 1,
        "Somewhat unlikely": 2,
        "Somewhat likely": 3,
        "Very likely": 4,
        "Unsure": 5,
    },
    "Q6": dict(fb_yes_no_unsure),
    "Q8": dict(fb_yes_no_unsure),
    "Q9": {
        "Woman": 1,
        "Man": 2,
        "Trans woman": 3,
        "Trans man": 4,
        "Non binary, gender nonconforming, or gender queer": 5,
        "Some other description (please specify)": 6,
    },
    "Q10": dict(fb_yes_no_unsure),
    "Q11": {
        "Under 18 years old": 1,
        "19-25 years old": 2,
        "26-35 years old": 3,
        "36-45 years old": 4,
        "46-55 years old": 5,
        "56-65 years old": 6,
        "66+ years old": 7,
    },
    "Q12": {
        "Less than high school diploma": 1,
        "High school diploma": 2,
        "Some college": 3,
        "Bachelor's degree": 4,
        "Graduate degree": 5,
    },
    "Image": {
        "Control": 1,
        "COVID": 2,
        "Data Privacy": 3,
        "Finance": 4,
        "Mental Health": 5,
    },
}

In [57]:
# Recode the FB answers column by column into a new frame; fb itself is not modified.
# Columns absent from the mapping are untouched, and a value with no entry in its
# column's mapping is left as it is.
fb_num = fb.replace(to_replace=cat_to_numeric)

In [58]:
# Sanity check: the recoded frame should have the same (rows, columns) as fb.
fb_num.shape

(1777, 42)

In [59]:
# Answer scales shared by several UAS items; the UAS labels carry their numeric
# code as a prefix. Each item receives its own copy below.
uas_frequency = {
    "1 Not at all": 1,
    "2 Several days": 2,
    "3 More than half the days": 3,
    "4 Nearly every day": 4,
}
uas_yes_no_unsure = {
    "1 Yes": 1,
    "2 No": 2,
    "3 Unsure": 3,
}

# Per-column mapping from UAS answer label to numeric code. Only these five
# columns are recoded; the other selected UAS columns have no entry here.
cat_to_numeric_uas = {
    "cr027a": dict(uas_frequency),
    "cr027c": dict(uas_frequency),
    "cr030": {
        "1 Very unlikely": 1,
        "2 Somewhat unlikely": 2,
        "3 Somewhat likely": 3,
        "4 Very likely": 4,
        "5 Unsure": 5,
    },
    "cr018a": dict(uas_yes_no_unsure),
    "ei002": dict(uas_yes_no_unsure),
}

In [60]:
# Recode the UAS answers column by column into a new frame; uas itself is not modified.
# Columns absent from the mapping are untouched, and a value with no entry in its
# column's mapping is left as it is.
uas_num = uas.replace(to_replace=cat_to_numeric_uas)

In [61]:
# Sanity check: the recoded frame should have the same (rows, columns) as the selected uas frame.
uas_num.shape

(6407, 16)

In [62]:
# Export each dataset twice: as loaded/selected (fb, uas) and numerically recoded
# (fb_num, uas_num). to_csv is called with its defaults, so the row index is
# written as the first, unnamed column of every file.
csv_exports = (
    ("../output/fb.csv", fb),
    ("../output/fb_numeric.csv", fb_num),
    ("../output/uas.csv", uas),
    ("../output/uas_numeric.csv", uas_num),
)
for csv_path, frame in csv_exports:
    frame.to_csv(csv_path)