# Data Collection and Formatting

In [1]:
import pandas as pd
import tarfile
from bs4 import BeautifulSoup
import requests
import re
import matplotlib.pyplot as plt
import seaborn as sns

%run -i "functions/unzip.py"
%run -i "functions/scrape_platforms.py"
%run -i "functions/scrape_SotU.py"

# Text Data
### Political Platforms

In [None]:
years = ['1972', '1976', '1980', '1984', '1988', '1992', '1996', 
         '2000', '2008', '2012', '2016']
platforms = scrape_platforms(years)
platforms.head()

### State of the Union Addresses

In [None]:
speeches = scrape_SotU()

### Debates

In [None]:
url = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
debate = soup.find_all('p')
for deb in debate:
    print(deb.text)

# Survey Data
### Convert Data to Dataframes
###### Weights
Based on the sample weights provided by the General Social Survey, it appears that some samples were significantly over and underrepresented. In order to account for this weights will be applied to this analysis. In order to maintain a reasonable size of the data rather than achieving the right proportions by using the complete weights, I will use the rounded values to the nearest tenth.  

In [2]:
weights = unzip('Weights')

In [3]:
weights.head()

Unnamed: 0,year,id,Weight variable
0,1972,0,0.4446
1,1972,1,0.8893
2,1972,2,0.8893
3,1972,3,0.8893
4,1972,4,0.8893


In [4]:
weights['Weight variable'].describe()

count    64814.000000
mean         1.000015
std          0.468172
min          0.391825
25%          0.550100
50%          0.970900
75%          1.098500
max          8.739876
Name: Weight variable, dtype: float64

In [5]:
def weight(df):
    return df.loc[df.index.repeat(round(weights['Weight variable']*10, 0))].reset_index()

###### Abortion Opinions
To reduce dimensionality in the data, I will combine the reasons for abortion into three categories: health, economics, and right-to-choose. There will also be a category for abolishing legal abortion. 

In [6]:
abortion = unzip('Abortion')

In [7]:
abortion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64814 entries, 0 to 64813
Data columns (total 9 columns):
year                                      64814 non-null int64
id                                        64814 non-null int32
Strong chance of serious defect           64814 non-null object
Married--wants no more children           64814 non-null object
Woman's health seriously endangered       64814 non-null object
Low income--cant afford more children     64814 non-null object
Pregnant as result of rape                64814 non-null object
Not married                               64814 non-null object
Abortion if woman wants for any reason    64814 non-null object
dtypes: int32(1), int64(1), object(7)
memory usage: 4.2+ MB


In [8]:
abortion.loc[(abortion['Strong chance of serious defect'] == 'Yes')|
             (abortion['Married--wants no more children'] == 'Yes')|
             (abortion["Woman's health seriously endangered"] == 'Yes')|
             (abortion["Low income--cant afford more children"] == 'Yes')|
             (abortion["Pregnant as result of rape"] == 'Yes')|
             (abortion['Not married']=='Yes'), 'abortion'] = 'conditional'

abortion.loc[(abortion['Strong chance of serious defect'] == 'Yes')&
             (abortion['Married--wants no more children'] == 'Yes')&
             (abortion["Woman's health seriously endangered"] == 'Yes')&
             (abortion["Low income--cant afford more children"] == 'Yes')&
             (abortion["Pregnant as result of rape"] == 'Yes')&
             (abortion['Not married']=='Yes'), 'abortion'] = 'unrestricted'

abortion.loc[abortion['Abortion if woman wants for any reason']== 'Yes', 'abortion']= 'unrestricted'
abortion['abortion'].fillna('never', inplace = True)

In [9]:
abortion.head()

Unnamed: 0,year,id,Strong chance of serious defect,Married--wants no more children,Woman's health seriously endangered,Low income--cant afford more children,Pregnant as result of rape,Not married,Abortion if woman wants for any reason,abortion
0,1972,0,Yes,Yes,Yes,Yes,Yes,Yes,Not applicable,unrestricted
1,1972,1,Yes,No,Yes,No,Yes,Yes,Not applicable,conditional
2,1972,2,Yes,Yes,Yes,Yes,Yes,Yes,Not applicable,unrestricted
3,1972,3,No,No,Yes,Yes,Yes,Yes,Not applicable,conditional
4,1972,4,Yes,Yes,Yes,Yes,Yes,Yes,Not applicable,unrestricted


In [10]:
abortion['abortion'].value_counts(dropna=False)

never           23338
conditional     22653
unrestricted    18823
Name: abortion, dtype: int64

In [11]:
abortion = abortion[['year', 'id', 'abortion']]

In [12]:
abortion = weight(abortion)

In [13]:
abortion.to_csv('data/cleaned_and_weighted/abortion.csv')

###### Family/ SES Background

In [14]:
background = unzip('Background')

In [15]:
background.columns

Index(['year',
       'Father's occupational prestige score using threshold method (2010)',
       'Rs living standard compared to parents', 'Religion in which raised',
       'How many grandparents born outside u.s.',
       'Were rs parents born in this country', 'Was r born in this country',
       'Mothers highest degree', 'Fathers highest degree', 'id',
       'Mother's occupational prestige score using threshold method (2010)'],
      dtype='object')

In [16]:
for col in background.columns:
    print(background[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


0     13710
58     6953
29     2876
18     2458
50     1724
      ...  
74       24
7        21
77       12
67       10
5         5
Name: Father's occupational prestige score using threshold method (2010), Length: 93, dtype: int64


Not applicable     44520
Much better         6446
Somewhat better     6005
About the same      4448
Somewhat worse      2269
Much worse           790
Don't know           307
No answer             29
Name: Rs living standard compared to parents, dtype: int64


Protestant                 37042
Catholic                   179

In [17]:
background.rename(columns = {"Father's occupational prestige score using threshold method (2010)": 'father_prestige',
                             "Mother's occupational prestige score using threshold method (2010)": 'mother_prestige',
                             'Religion in which raised': 'raised_relig', 
                             'How many grandparents born outside u.s.': 'immigrant_gpar', 
                             'Were rs parents born in this country': 'immigrant_par', 
                             'Was r born in this country': 'immigrant'}, inplace=True)

I will encode the immigration status of parents so that the number of grandparents and parents born outside of the country are counted. Parents born in the U.S will be coded as 0s. Because this data will likely not be used for modeling, I will keep not applicable values as they are. 

In [18]:
background.loc[(background['raised_relig']=='Not applicable')|
               (background['raised_relig']=='No answer')|
               (background['raised_relig']=="Don't know"), 'raised_relig']= 'Other'
background.loc[(background['immigrant_gpar']== 'All in u.s'), 'immigrant_gpar']= 0
background.loc[(background['immigrant_par']=='Both in u.s')|
               (background['immigrant_par']=='Mother; fa. dk')|
               (background['immigrant_par']=='Father; mo.dk'), 'immigrant_par']= 0
background.loc[(background['immigrant_par']=='Neither in u.s')|
               (background['immigrant_par']=='Not mother;fa.dk')|
               (background['immigrant_par']=='Not father;mo.dk'), 'immigrant_par']= 2
background.loc[(background['immigrant_par']=='Mother only')|
               (background['immigrant_par']=='Father only'), 'immigrant_par']= 1
background.loc[(background['immigrant']=='Yes'), 'immigrant']= 'no'
background.loc[(background['immigrant']=='No'), 'immigrant']= 'yes'
background.loc[(background['immigrant']=='Not applicable')|
               (background['immigrant']=='No answer')|
               (background['immigrant']=="Don't know"), 'immigrant']= 'n/a'

The 'Rs living standard compared to parents' is messy and highly incomplete, so I will drop it for this analysis. In addition, after consideration, parental education levels are more appropriately placed in the education table so they will also be dropped from this table.  

In [19]:
background.drop(['Rs living standard compared to parents', 
                 'Fathers highest degree', 'Mothers highest degree'], axis= 1, inplace= True)

In [20]:
background = weight(background)

In [21]:
background.to_csv('data/cleaned_and_weighted/background.csv')

##### Criminal Justice Attitudes

In [22]:
crim_justice = unzip('Criminal_Justice')
crim_justice.columns

Index(['year', 'Citizen attacking policeman with fists',
       'Citizen attempting to escape custody',
       'Citizen questioned as murder suspect',
       'Citizen said vulgar or obscene things',
       'Ever approve of police striking citizen',
       'Should marijuana be made legal', 'Courts dealing with criminals',
       'Favor or oppose death penalty for murder', 'id',
       'Afraid to walk at night in neighborhood'],
      dtype='object')

In [23]:
crim_justice[crim_justice['Ever approve of police striking citizen']=='Yes']['Citizen attacking policeman with fists'].value_counts()

Yes           25692
No              799
Don't know       99
No answer        18
Name: Citizen attacking policeman with fists, dtype: int64

The police striking citizens columns lack information on how the police response is defined, and will likely be difficult to interpret given the nature of this analysis. Unfortunately I will have to drop them. 

In [24]:
crim_justice.drop(['Citizen attacking policeman with fists', 
                   'Citizen attempting to escape custody', 
                   'Citizen questioned as murder suspect', 
                   'Citizen said vulgar or obscene things', 
                   'Ever approve of police striking citizen'], axis = 1, inplace = True)

In [25]:
crim_justice.rename(columns = {'Should marijuana be made legal': 'legalize_marijuana',
                             'Courts dealing with criminals': 'sentencing',
                             'Favor or oppose death penalty for murder': 'death_penalty', 
                             'Afraid to walk at night in neighborhood': 'fear'}, inplace=True)

In [26]:
crim_justice = weight(crim_justice)

In [27]:
crim_justice.to_csv('data/cleaned_and_weighted/crim_justice.csv')

##### Education

In [28]:
education = unzip('Education')
education.columns

Index(['year', 'id', 'Highest year of school completed',
       'Highest year school completed, father',
       'Highest year school completed, mother', 'Rs highest degree',
       'Fathers highest degree', 'Mothers highest degree',
       'The field of degree r earned'],
      dtype='object')

In [29]:
for col in education.columns:
    print(education[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


2047     1
39558    1
12947    1
14994    1
8849     1
        ..
48445    1
46396    1
36155    1
34106    1
0        1
Name: id, Length: 64814, dtype: int64


12            19663
16             8355
14             7160
13             5360
11             3743
15             2910
10             2880
8              2724
18             2384
9              2083
17             1967
20             1439
19              920
7               879
6               828
5               402
4               319
3               257
0               165
2               

With so many not applicable entries for the field of degree, it is more appropriate to remove them from the analysis. 

In [31]:
education.drop(['The field of degree r earned'], axis=1, inplace=True)

In [32]:
education.rename(columns = {'Highest year of school completed':'school_years',
                            'Highest year school completed, father':'school_years_dad', 
                            'Highest year school completed, mother':'school_years_mom',
                            'Rs highest degree':'degree',
                            'Fathers highest degree':'degree_dad',
                            'Mothers highest degree': 'degree_mom'}, inplace=True)

Encode nan information as a placeholder so that it can be easily removed later in the analysis. 

In [35]:
education.loc[(education['school_years']=='No answer')|
              (education['school_years']=='Not applicable')|
              (education['school_years']== "Don't know"), 'school_years']= -9
education.loc[(education['school_years_dad']=='No answer')|
              (education['school_years_dad']=='Not applicable')|
              (education['school_years_dad']== "Don't know"), 'school_years_dad']= -9
education.loc[(education['school_years_mom']=='No answer')|
              (education['school_years_mom']=='Not applicable')|
              (education['school_years_mom']== "Don't know"), 'school_years_mom']= -9

In [36]:
education.head()

Unnamed: 0,year,id,school_years,school_years_dad,school_years_mom,degree,degree_dad,degree_mom
0,1972,0,16,10,-9,Bachelor,Lt high school,Not applicable
1,1972,1,10,8,8,Lt high school,Lt high school,Lt high school
2,1972,2,12,8,8,High school,Lt high school,Lt high school
3,1972,3,17,16,12,Bachelor,Bachelor,High school
4,1972,4,12,8,8,High school,Lt high school,Lt high school


In [37]:
education = pd.get_dummies(data= education, columns=['degree', 'degree_dad', 'degree_mom'])

In [38]:
education = weight(education)

In [39]:
education.to_csv('data/cleaned_and_weighted/education.csv')

##### Employment

In [40]:
employment = unzip('Employment')
employment.columns

Index(['year', 'Workers need strong unions',
       'Does r or spouse belong to union', 'If rich, continue or stop working',
       'Job or housework', 'Could r find equally good job',
       'Is r likely to lose job', 'Number of hours usually work a week',
       'Labor force status', 'id',
       'Rs occupational prestige score using threshold method (2010)'],
      dtype='object')

In [41]:
for col in employment.columns:
    print(employment[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


Not applicable       57399
Disagree              2911
Agree                 2546
Strongly disagree      751
Strongly agree         736
Dont know              355
No answer              116
Name: Workers need strong unions, dtype: int64


Neither belongs        36808
Not applicable         20003
R belongs               4854
Spouse belongs          2217
R and spouse belong      680
No answer                197
Don't know                55
Name: Does r or spouse belong to union, dtype: int64


Not applicable      38978
Continue working    17715
Stop work

In [42]:
employment.drop(['Is r likely to lose job', 'Could r find equally good job', 
                 'Job or housework', 'If rich, continue or stop working', 
                 'Workers need strong unions'], axis=1, inplace = True)

In [43]:
employment.rename(columns= {'Does r or spouse belong to union': 'union',
                            'Number of hours usually work a week': 'wrk_hrs',
                            'Labor force status': 'job_status',
                            'Rs occupational prestige score using threshold method (2010)': 'prestige'}, 
                  inplace = True)

In [44]:
employment.loc[(employment['union']=='Neither belongs', 'union')]= 'non-member'
employment.loc[(employment['union']=='R belongs')|
               (employment['union']=='Spouse belongs')|
               (employment['union']=='R and spouse belong'), 'union']= 'member'

In [45]:
employment.head()

Unnamed: 0,year,union,wrk_hrs,job_status,id,prestige
0,1972,Not applicable,Not applicable,Working fulltime,0,49
1,1972,Not applicable,Not applicable,Retired,1,62
2,1972,Not applicable,Not applicable,Working parttime,2,69
3,1972,Not applicable,Not applicable,Working fulltime,3,85
4,1972,Not applicable,Not applicable,Keeping house,4,21


In [46]:
employment = weight(employment)

In [47]:
employment.to_csv('data/cleaned_and_weighted/employment.csv')

###### Family Values

In [48]:
fam_vals = unzip('Family_Values')
fam_vals.columns

Index(['year', 'Better for man to work, woman tend home',
       'Preschool kids suffer if mother works',
       'Favor spanking to discipline child', 'Ideal number of children',
       'Rs kids living standard compared to r',
       'Rs living standard compared to parents', 'To help others',
       'To work hard', 'To think for ones self', 'To be well liked or popular',
       'To obey', 'Highest year school completed, mother',
       'Highest year school completed, father', 'Number of children',
       'Number of brothers and sisters', 'id',
       'Number of family generations in household'],
      dtype='object')

In [49]:
for col in fam_vals.columns:
    print(fam_vals[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


Not applicable       34111
Disagree             13061
Agree                 8992
Strongly disagree     5479
Strongly agree        2543
Don't know             534
No answer               94
Name: Better for man to work, woman tend home, dtype: int64


Not applicable       34111
Disagree             13640
Agree                10220
Strongly disagree     3588
Strongly agree        2602
Don't know             568
No answer               85
Name: Preschool kids suffer if mother works, dtype: int64


Not applicable       37644
Agree                12555
Str

In [50]:
fam_vals.rename(columns= {'Better for man to work, woman tend home': 'trad_roles', 
                          'Preschool kids suffer if mother works': 'mom_wrk_kids_suffer', 
                          'Favor spanking to discipline child': 'spanking',
                          'Ideal number of children': 'bst_num_chld', 
                          'To help others': 'tch_to_help',
                          'To work hard': 'tch_to_wrk', 
                          'To think for ones self': 'tch_ind', 
                          'To be well liked or popular': 'tch_pop',
                          'To obey': 'tch_obey', 
                          'Number of children': 'num_chld',
                          'Number of family generations in household': 'num_gen'}, inplace=True)

In [51]:
fam_vals.drop(['Rs kids living standard compared to r', 'Rs living standard compared to parents', 
               'Highest year school completed, mother', 'Highest year school completed, father', 
               'Number of brothers and sisters'], axis=1, inplace=True)

In [52]:
fam_vals.loc[fam_vals['num_gen']=='1 gen', 'num_gen']= 1
fam_vals.loc[(fam_vals['num_gen']=='2 gens, children')|
             (fam_vals['num_gen']=='2 gens, parents')|
             (fam_vals['num_gen']=='2 gens, grandchldrn'), 'num_gen']= 2
fam_vals.loc[(fam_vals['num_gen']=='3 gens, grandchldrn')|
             (fam_vals['num_gen']=='3 gens, chld, par'), 'num_gen']= 3
fam_vals.loc[fam_vals['num_gen']=='4 gens', 'num_gen']= 4

In [53]:
fam_vals.head()

Unnamed: 0,year,trad_roles,mom_wrk_kids_suffer,spanking,bst_num_chld,tch_to_help,tch_to_wrk,tch_ind,tch_pop,tch_obey,num_chld,id,num_gen
0,1972,Not applicable,Not applicable,Not applicable,2,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,0,0,Not applicable
1,1972,Not applicable,Not applicable,Not applicable,3,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,5,1,Not applicable
2,1972,Not applicable,Not applicable,Not applicable,2,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,4,2,Not applicable
3,1972,Not applicable,Not applicable,Not applicable,2,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,0,3,Not applicable
4,1972,Not applicable,Not applicable,Not applicable,2,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,2,4,Not applicable


In [54]:
fam_vals = weight(fam_vals)

In [55]:
fam_vals.to_csv('data/cleaned_and_weighted/fam_vals.csv')

###### Gender and Sexuality

In [56]:
gender = unzip('Gender_and_Sexuality')
gender.columns

Index(['year', 'Should hire and promote women                            ',
       'For or against preferential hiring of women      ',
       'Better for man to work, woman tend home',
       'Preschool kids suffer if mother works',
       'Sex before marriage -- teens 14-16', 'Sex before marriage',
       'Divorce laws', 'Sex education in public schools',
       'Birth control to teenagers 14-16', 'Women not suited for politics',
       'id', 'Homosexuals should have right to marry'],
      dtype='object')

In [57]:
for col in gender.columns:
    print(gender[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


Not applicable                55917
Agree                          4092
Disagree                       1674
Strongly agree                 1639
Neither agree nor disagree      958
Strongly disagree               351
No answer                       123
Dont know                        60
Name: Should hire and promote women                            , dtype: int64


Not applicable      56391
Strongly against     3105
Against              2263
Strongly for         1739
For                   862
Don't know            371
No answer              83
Name: F

In [58]:
gender.drop(['Should hire and promote women                            ', 
             'For or against preferential hiring of women      ', 
             'Better for man to work, woman tend home', 'Preschool kids suffer if mother works',
             'Sex before marriage -- teens 14-16', 'Birth control to teenagers 14-16'], 
            axis = 1, inplace = True)

In [59]:
gender.rename(columns ={'Sex before marriage':'sex_before_marr',
                        'Divorce laws': 'divorce', 
                        'Sex education in public schools':'sex_ed',
                        'Women not suited for politics': 'no_women_pol',
                        'Homosexuals should have the right to marry':'gay_marr'})

Unnamed: 0,year,sex_before_marr,divorce,sex_ed,no_women_pol,id,Homosexuals should have right to marry
0,1972,Not wrong at all,Not applicable,Not applicable,Not applicable,0,Not applicable
1,1972,Always wrong,Not applicable,Not applicable,Not applicable,1,Not applicable
2,1972,Always wrong,Not applicable,Not applicable,Not applicable,2,Not applicable
3,1972,Always wrong,Not applicable,Not applicable,Not applicable,3,Not applicable
4,1972,Sometimes wrong,Not applicable,Not applicable,Not applicable,4,Not applicable
...,...,...,...,...,...,...,...
64809,2018,Always wrong,No answer,Favor,Disagree,64809,Strongly agree
64810,2018,Not wrong at all,More difficult,Favor,Disagree,64810,Strongly agree
64811,2018,Not applicable,Not applicable,Not applicable,Not applicable,64811,Agree
64812,2018,Not wrong at all,More difficult,Favor,Disagree,64812,Not applicable


In [60]:
gender = weight(gender)
gender.to_csv('data/cleaned_and_weighted/gender.csv')

###### Identity

In [61]:
pers_id = unzip('Identity')
pers_id.columns

Index(['year', 'id', 'Age of respondent', 'Respondents sex',
       'Race of respondent', 'Region of interview'],
      dtype='object')

In [62]:
for col in pers_id.columns:
    print(pers_id[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


2047     1
39558    1
12947    1
14994    1
8849     1
        ..
48445    1
46396    1
36155    1
34106    1
0        1
Name: id, Length: 64814, dtype: int64


30            1450
28            1432
32            1431
34            1422
27            1391
              ... 
85             197
86             186
87             148
88             121
Don't know       1
Name: Age of respondent, Length: 74, dtype: int64


Female    36200
Male      28614
Name: Respondents sex, dtype: int64


White    52033
Black     9187
Other     3594
Name: Race of respon

In [63]:
pers_id.rename(columns={'Age of respondent':'age', 
                      'Respondents sex':'sex', 
                      'Race of respondent':'race',
                      'Region of interview':'region'}, inplace = True)

In [64]:
pers_id = weight(pers_id)

In [65]:
pers_id.to_csv('data/cleaned_and_weighted/pers_id.csv')

###### Lifestyle

In [66]:
lifestyle = unzip('Lifestyle')
lifestyle.columns

Index(['year', 'Hours per day watching tv', 'How often does r read newspaper',
       'Does r or spouse hunt', 'Have gun in home', 'Spend evening at bar',
       'Spend evening with friends', 'Spend evening with neighbor',
       'Spend evening with relatives', 'Is life exciting or dull',
       'General happiness', 'R's age when 1st child born', 'Age of respondent',
       'Number of children', 'Marital status',
       'Number of hours usually work a week', 'id',
       'Does r own or rent home?'],
      dtype='object')

In [67]:
for col in lifestyle.columns:
    print(lifestyle[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


Not applicable    25608
2                 10328
1                  7786
3                  7126
4                  5074
5                  2578
0                  2084
6                  1713
8                   817
10                  396
7                   377
12                  279
No answer           189
9                    82
15                   57
Don't know           55
14                   53
20                   46
24                   36
16                   35
11                   33
13                   26
18                   21
17   

In [68]:
lifestyle.rename(columns = {'Hours per day watching tv':'hrs_tv',
                            'How often does r read newspaper':'paper',
                            'Have gun in home':'gun',
                            'Is life exciting or dull':'life',
                            'General happiness':'happy',
                            "R's age when 1st child born":'age_at_frst_chld',
                            'Marital status':'married',
                            'Does r own or rent home?':'rent_or_own'}, inplace = True)

In [69]:
lifestyle.drop(['Does r or spouse hunt', 'Spend evening at bar', 
                'Spend evening with friends', 'Spend evening with neighbor', 
                'Spend evening with relatives', 'Age of respondent', 'Number of children',
                'Number of hours usually work a week'], axis=1, inplace=True)

In [70]:
lifestyle = weight(lifestyle)
lifestyle.to_csv('data/cleaned_and_weighted/lifestyle.csv')

###### Opinions

In [71]:
opinions = unzip('Opinions')
opinions.columns

Index(['year', 'Foreign aid', 'Welfare', 'Highways and bridges',
       'Social security', 'Mass transportation', 'Parks and recreation',
       'Assistance for childcare', 'Supporting scientific research        ',
       'Military, armaments, and defense',
       'Improving the conditions of blacks', 'id', 'Space exploration program',
       'Improving & protecting environment',
       'Improving & protecting nations health',
       'Solving problems of big cities', 'Halting rising crime rate',
       'Dealing with drug addiction', 'Improving nations education system',
       'Developing alternative energy sources'],
      dtype='object')

In [72]:
for col in opinions.columns:
    print(opinions[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


Not applicable    27662
Too much          24317
About right        8477
Too little         2355
Don't know         1891
No answer           112
Name: Foreign aid, dtype: int64


Not applicable    27662
Too much          16980
About right       11144
Too little         7376
Don't know         1525
No answer           127
Name: Welfare, dtype: int64


About right       21903
Too little        18568
Not applicable    17581
Too much           4635
Don't know         1823
No answer           304
Name: Highways and bridges, dtype: int64


Too little        

In [74]:
opinions = weight(opinions)

In [75]:
opinions.to_csv('data/cleaned_and_weighted/opinions.csv')

###### Race Relations

In [6]:
race = unzip('Race_Relations')
race.columns

Index(['year', 'Whites hurt by aff. action', 'Hard working - lazy',
       'Hard working - lazy.1', 'Rich - poor', 'Rich - poor.1',
       'Should govt aid blacks?', 'Differences due to lack of will',
       'Differences due to lack of education',
       'Differences due to inborn disability',
       'Differences due to discrimination',
       'Blacks overcome prejudice without favors ',
       'Favor preference in hiring blacks', 'Any opp. race in neighborhood',
       'Favor law against racial intermarriage',
       'Improving the conditions of blacks', 'id',
       'Number of immigrants nowadays should be'],
      dtype='object')

In [7]:
for col in race.columns:
    print(race[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


Not applicable     45027
Somewhat likely     8526
Not very likely     6999
Very likely         3483
Dont know            642
No answer            137
Name: Whites hurt by aff. action, dtype: int64


Not applicable    43808
4                  9405
5                  4069
3                  2593
6                  1842
2                   988
Lazy                726
Hardworking         602
Dont know           542
No answer           192
98                   26
99                   21
Name: Hard working - lazy, dtype: int64


Not applicable    43808
4   

Because of the ways in which these variables were coded, many of them are either redundant with other questions, or difficult to gain meaningful information from.  

In [10]:
race.drop(['Hard working - lazy', 'Hard working - lazy.1', 
           'Rich - poor', 'Rich - poor.1', 'Blacks overcome prejudice without favors ',
           'Any opp. race in neighborhood', 'Favor law against racial intermarriage', 
           'Improving the conditions of blacks'], axis = 1, inplace = True)

In [11]:
race.loc[race['Should govt aid blacks?']=='Agree with both', 'Should govt aid blacks?']= 'no opinion'
race.loc[race['Should govt aid blacks?']=='No special treatment', 'Should govt aid blacks?']= 'strongly oppose'
race.loc[race['Should govt aid blacks?']== 4, 'Should govt aid blacks?']= 'oppose'
race.loc[race['Should govt aid blacks?']=='Govt help blks', 'Should govt aid blacks?']= 'strongly favor'
race.loc[race['Should govt aid blacks?']== 2, 'Should govt aid blacks?']= 'favor'
race.loc[race['Should govt aid blacks?']=="Don't know", 'Should govt aid blacks?']= 'no opinion'
race.loc[race['Should govt aid blacks?']=="No answer", 'Should govt aid blacks?']= 'no opinion'

In [13]:
race = weight(race)

In [14]:
race.to_csv('data/cleaned_and_weighted/race.csv')

###### Religion

In [15]:
religion = unzip('Religion')
religion.columns

Index(['year', 'id', 'Rs religious preference',
       'How often r attends religious services', 'Strength of affiliation',
       'How often does r pray', 'Bible prayer in public schools'],
      dtype='object')

In [16]:
for col in religion.columns:
    print(religion[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


2047     1
39558    1
12947    1
14994    1
8849     1
        ..
48445    1
46396    1
36155    1
34106    1
0        1
Name: id, Length: 64814, dtype: int64


Protestant                 37117
Catholic                   15674
None                        7797
Jewish                      1285
Other                       1086
Christian                    791
No answer                    258
Buddhism                     198
Moslem/islam                 153
Inter-nondenominational      136
Orthodox-christian           118
Hinduism                     100


In [19]:
religion.drop(['Bible prayer in public schools'], axis = 1, inplace = True)

In [20]:
religion = weight(religion)

ValueError: operands could not be broadcast together with shape (646691,) (64814,)

In [18]:
religion.to_csv('data/cleaned_and_weighted/religion.csv')

###### Socioeconomic Status

In [21]:
ses = unzip('SES')
ses.columns

Index(['year', 'Rs income in constant $', 'Family income in constant $',
       'Standard of living of r will improve',
       'Rs kids living standard compared to r',
       'Rs living standard compared to parents', 'Opinion of family income',
       'Change in financial situation',
       'Satisfaction with financial situation',
       'Rs self ranking of social position', 'id',
       'Rs occupational prestige score using threshold method (2010)'],
      dtype='object')

In [22]:
for col in ses.columns:
    print(ses[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


0.00         26927
25582.50       301
18476.25       222
31267.50       198
15633.75       193
             ...  
56405.00        12
2951.00         12
135131.00        7
68600.00         7
74970.00         6
Name: Rs income in constant $, Length: 603, dtype: int64


0.00        6521
25582.50     394
38373.75     360
32761.00     348
27793.00     345
            ... 
2409.75        9
2951.00        9
2331.00        8
2695.00        8
444.00         6
Name: Family income in constant $, Length: 624, dtype: int64


Not applicable       42956
Agree       

In [23]:
ses.drop(['Rs income in constant $', 'Standard of living of r will improve', 
          'Rs kids living standard compared to r', 'Rs living standard compared to parents', 
          'Change in financial situation', 'Rs occupational prestige score using threshold method (2010)'], 
         axis = 1, inplace = True)

In [24]:
ses = weight(ses)

In [None]:
ses.to_csv('data/cleaned_and_weighted/ses.csv')

###### Systems

In [25]:
systems = unzip('Systems')
systems.columns

Index(['year', 'Confidence in congress', 'Confidence in scientific community',
       'Confid. in united states supreme court', 'Confidence in television',
       'Confidence in medicine', 'Confidence in press',
       'Confidence in organized labor', 'Confid. in exec branch of fed govt',
       'Confidence in education', 'Confidence in organized religion',
       'Confidence in major companies',
       'Confid in banks & financial institutions', 'id',
       'Confidence in military'],
      dtype='object')

In [26]:
for col in systems.columns:
    print(systems[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


Only some         23881
Not applicable    20759
Hardly any        13640
A great deal       5185
Don't know         1210
No answer           139
Name: Confidence in congress, dtype: int64


Not applicable    20759
Only some         20308
A great deal      17514
Don't know         3104
Hardly any         2962
No answer           167
Name: Confidence in scientific community, dtype: int64


Only some         22238
Not applicable    20760
A great deal      13436
Hardly any         6503
Don't know         1740
No answer           137
Name: Confid. in united

In [None]:
systems = weight(systems)
systems.to_csv('data/cleaned_and_weighted/systems.csv')

###### Voter Identity

In [None]:
party = unzip('Voter_Identity')
party.columns

In [None]:
party = party[['year', 'Political party affiliation']]

In [None]:
party = weight(party)

In [None]:
party.to_csv('data/cleaned_and_weighted/party.csv')