In [94]:
import numpy as np
import pandas as pd

## Load Data

In [100]:
# load data
var_list = ['id',
            'age',
            'sex',
            'educ',
            'marital',
            'childs',
            'wrkstat',
            'income',
            'incom16', # family income while growing up
            'partyid',
            'relig',
            'fund', # fundamentalism/ liberalism of respondents religion
            'reliten',
            'courts',
            'cappun'
] # List of variables you want to save
#
output_filename = 'selected_gss_data.csv' # Name of the file you want to save the data to
#
modes = ['w','a'] # Has write mode and append mode
phase = 0 # Starts in write mode; after one iteration of loop, switches to append mode
#
col_names = []
#
for k in range(3): # for each chunk of the data
    url = 'https://github.com/DS3001/project_gss/raw/main/gss_chunk_' + str(1+k) + '.parquet' # Create url to the chunk to be processed
    print(url) # Check the url is correct
    df = pd.read_parquet(url) # Download this chunk of data
    # print(df.head()) # Visually inspect the first few rows
    global col_names
    col_names = df.columns
    df.loc[:,var_list].to_csv(output_filename, # specifies target file to save the chunk to
                              mode=modes[phase], # control write versus append
                              header=var_list, # variable names
                              index=False) # no row index saved
    phase = 1 # Switch from write mode to append mode

https://github.com/DS3001/project_gss/raw/main/gss_chunk_1.parquet
https://github.com/DS3001/project_gss/raw/main/gss_chunk_2.parquet
https://github.com/DS3001/project_gss/raw/main/gss_chunk_3.parquet


In [101]:
# df = pd.read_csv("/content/selected_gss_data.csv", low_memory=False) # colab line
df = pd.read_csv("selected_gss_data.csv", low_memory=False) # rivanna line

## Intial Exploration

In [102]:
df.head()

Unnamed: 0,id,age,sex,educ,marital,childs,wrkstat,income,incom16,partyid,relig,fund,reliten,courts,cappun
0,1,23.0,female,16.0,never married,0.0,working full time,,average,"independent, close to democrat",jewish,liberal,,about right,
1,2,70.0,male,10.0,married,5.0,retired,,above average,not very strong democrat,catholic,moderate,,not harshly enough,
2,3,48.0,female,12.0,married,4.0,working part time,,average,"independent (neither, no response)",protestant,moderate,,not harshly enough,
3,4,27.0,female,17.0,married,0.0,working full time,,average,not very strong democrat,other,,,about right,
4,5,61.0,female,12.0,married,2.0,keeping house,,below average,strong democrat,protestant,moderate,,not harshly enough,


In [103]:
df.dtypes

id         object
age        object
sex        object
educ       object
marital    object
childs     object
wrkstat    object
income     object
incom16    object
partyid    object
relig      object
fund       object
reliten    object
courts     object
cappun     object
dtype: object

The code below is to see if there are any exact duplicate rows, it turns out there are 2 that are just the column headers, probably from the way we imported the 3 chunks of data. I will drop both of these. 

In [104]:
# find duplicate rows
dup_mask = df.duplicated(keep=False)
dup_rows = df[dup_mask]

dup_rows

Unnamed: 0,id,age,sex,educ,marital,childs,wrkstat,income,incom16,partyid,relig,fund,reliten,courts,cappun
24130,id,age,sex,educ,marital,childs,wrkstat,income,incom16,partyid,relig,fund,reliten,courts,cappun
48261,id,age,sex,educ,marital,childs,wrkstat,income,incom16,partyid,relig,fund,reliten,courts,cappun


In [106]:
# drop duplicate rows 
df = df.drop_duplicates(keep=False)

In [107]:
df.shape

(72390, 15)

In [108]:
for each in df.columns:
    print(f"{each}: {df[each].isnull().sum()}")

id: 0
age: 769
sex: 112
educ: 263
marital: 51
childs: 261
wrkstat: 36
income: 8951
incom16: 13741
partyid: 485
relig: 437
fund: 5333
reliten: 11304
courts: 16279
cappun: 11543


## Clean CAPPUN Variable 

Choices made: encode "nan" as new category "no response"

In [109]:
df['cappun'].value_counts()

cappun
favor     42181
oppose    18666
Name: count, dtype: int64

In [110]:
df['cappun'] = df['cappun'].replace(np.nan, 'no response')

In [111]:
df['cappun'].value_counts()

cappun
favor          42181
oppose         18666
no response    11543
Name: count, dtype: int64

## Clean COURTS variable

Choices made: encode "nan" as new category "no response"

In [112]:
df['courts'].value_counts()

courts
not harshly enough    43636
about right            7906
too harshly            4569
Name: count, dtype: int64

In [113]:
df['courts'] = df['courts'].replace(np.nan, 'no response')

In [114]:
df['courts'].value_counts()

courts
not harshly enough    43636
no response           16279
about right            7906
too harshly            4569
Name: count, dtype: int64

## Clean INCOME variables 

income = total family income last year before taxes <br>
incom16 = family income while growing up 

Choices made: for incom16, encode "nan" as new category "no response"

In [115]:
df['incom16'].value_counts()

incom16
average                 28368
below average           14732
above average            9069
far below average        5334
far above average        1136
lived in institution       10
Name: count, dtype: int64

In [116]:
df['incom16'] = df['incom16'].replace(np.nan, 'no response')

In [117]:
df['incom16'].value_counts()

incom16
average                 28368
below average           14732
no response             13741
above average            9069
far below average        5334
far above average        1136
lived in institution       10
Name: count, dtype: int64

In [118]:
df['income'].value_counts()

income
$25,000 or more       34785
$10,000 to $14,999     6850
$20,000 to $24,999     5528
$15,000 to $19,999     5301
$8,000 to $9,999       2285
$1,000 to $2,999       1412
$7,000 to $7,999       1315
$5,000 to $5,999       1314
$3,000 to $3,999       1309
$6,000 to $6,999       1249
$4,000 to $4,999       1189
under $1,000            902
Name: count, dtype: int64

In [119]:
df['income'] = df['income'].replace(np.nan, 'no response')

In [120]:
df['income'].value_counts()

income
$25,000 or more       34785
no response            8951
$10,000 to $14,999     6850
$20,000 to $24,999     5528
$15,000 to $19,999     5301
$8,000 to $9,999       2285
$1,000 to $2,999       1412
$7,000 to $7,999       1315
$5,000 to $5,999       1314
$3,000 to $3,999       1309
$6,000 to $6,999       1249
$4,000 to $4,999       1189
under $1,000            902
Name: count, dtype: int64