# Data Collection and Formatting

In [1]:
import pandas as pd
import tarfile
from bs4 import BeautifulSoup
import requests
import re
import matplotlib.pyplot as plt
import seaborn as sns

%run -i "functions/unzip.py"
%run -i "functions/scrape_platforms.py"
%run -i "functions/scrape_SotU.py"

# Text Data
### Political Platforms

In [None]:
# Every fourth year from 1972 through 2016, as strings.
# NOTE(review): 2004 is skipped, exactly as in the hand-written list this
# replaces -- confirm whether that gap is intentional.
platform_years = range(1972, 2017, 4)
years = [str(year) for year in platform_years if year != 2004]
platforms = scrape_platforms(years)
platforms.head()

### State of the Union Addresses

In [None]:
# Collect the State of the Union addresses with the scrape_SotU helper
# (presumably defined by functions/scrape_SotU.py, run in the setup cell -- confirm).
speeches = scrape_SotU()

### Debates

In [None]:
# Download one debate transcript page from presidency.ucsb.edu and print its paragraphs.
url = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas'
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing and printing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
# Every <p> element on the page is kept; no filtering of non-transcript paragraphs happens here.
debate = soup.find_all('p')
for deb in debate:
    print(deb.text)

# Survey Data
### Convert Data to Dataframes
###### Weights
Based on the sample weights provided by the General Social Survey, it appears that some samples were significantly over- and underrepresented. To account for this, weights will be applied in this analysis. To keep the data at a reasonable size, rather than reproducing the exact proportions with the complete weights, I will round each weight to the nearest tenth and repeat each row `round(weight × 10)` times.

In [2]:
# Load the sample-weights table with the unzip helper
# (presumably defined by functions/unzip.py, run in the setup cell -- confirm).
# The weight() function below reads this frame's 'Weight variable' column.
weights = unzip('Weights')

In [3]:
weights.head()

Unnamed: 0,year,id,Weight variable
0,1972,0,0.4446
1,1972,1,0.8893
2,1972,2,0.8893
3,1972,3,0.8893
4,1972,4,0.8893


In [4]:
weights['Weight variable'].describe()

count    64814.000000
mean         1.000015
std          0.468172
min          0.391825
25%          0.550100
50%          0.970900
75%          1.098500
max          8.739876
Name: Weight variable, dtype: float64

In [5]:
def weight(df, weights_df=None):
    """Expand ``df`` so each row appears in proportion to its survey weight.

    Every row is repeated ``round(weight * 10)`` times: the weight is rounded
    to the nearest tenth and expressed as a whole number of copies. A row
    whose weight rounds to zero copies is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Unweighted frame whose rows line up positionally with the weights
        table (row ``i`` of ``df`` receives the weight in row ``i``).
    weights_df : pandas.DataFrame, optional
        Table with a ``'Weight variable'`` column. When omitted, the
        notebook-level ``weights`` frame is used, as before.

    Returns
    -------
    pandas.DataFrame
        The expanded frame with a fresh integer index; the original index
        labels are preserved in an ``index`` column by ``reset_index()``.

    Raises
    ------
    ValueError
        If ``df`` and the weights table differ in length (for example when an
        already-weighted frame is passed in a second time), or if any weight
        is missing.
    """
    if weights_df is None:
        weights_df = weights

    if len(df) != len(weights_df):
        raise ValueError(
            'weight() needs one weight per row: got %d rows and %d weights'
            % (len(df), len(weights_df))
        )

    raw_weights = weights_df['Weight variable']
    if raw_weights.isna().any():
        raise ValueError("'Weight variable' contains missing values")

    # Index.repeat needs integer counts; cast explicitly instead of relying on
    # an implicit float-to-int coercion inside pandas/numpy.
    copies = (raw_weights * 10).round(0).astype(int).to_numpy()
    return df.loc[df.index.repeat(copies)].reset_index()

###### Abortion Opinions
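To reduce dimensionality in the data, I will combine the individual abortion questions into a single stance with three categories: *conditional* (approves of abortion in at least one of the listed circumstances), *unrestricted* (approves in all six circumstances, or for any reason the woman wants), and *never* (no "Yes" answer to any of the questions). Note that respondents with no "Yes" answer because the questions were not applicable to them are also counted as *never*.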

In [6]:
abortion = unzip('Abortion')

In [7]:
abortion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64814 entries, 0 to 64813
Data columns (total 9 columns):
year                                      64814 non-null int64
id                                        64814 non-null int32
Strong chance of serious defect           64814 non-null object
Married--wants no more children           64814 non-null object
Woman's health seriously endangered       64814 non-null object
Low income--cant afford more children     64814 non-null object
Pregnant as result of rape                64814 non-null object
Not married                               64814 non-null object
Abortion if woman wants for any reason    64814 non-null object
dtypes: int32(1), int64(1), object(7)
memory usage: 4.2+ MB


In [8]:
# Collapse the seven abortion questions into one 'abortion' stance column.
# The six circumstance-specific questions:
condition_cols = [
    'Strong chance of serious defect',
    'Married--wants no more children',
    "Woman's health seriously endangered",
    "Low income--cant afford more children",
    "Pregnant as result of rape",
    'Not married',
]
# True where the respondent answered exactly 'Yes' to a circumstance.
approves = abortion[condition_cols].eq('Yes')

# 'Yes' to at least one circumstance -> conditional support.
abortion.loc[approves.any(axis=1), 'abortion'] = 'conditional'

# 'Yes' to all six circumstances overrides 'conditional'.
abortion.loc[approves.all(axis=1), 'abortion'] = 'unrestricted'

# 'Yes' to the any-reason question is unrestricted regardless of the other answers.
abortion.loc[abortion['Abortion if woman wants for any reason'] == 'Yes', 'abortion'] = 'unrestricted'

# Everyone left has no 'Yes' answer at all. Assign the result back rather than
# calling fillna(inplace=True) on the selected column: that chained in-place
# form warns in pandas 2.x and leaves the frame unchanged under Copy-on-Write.
# NOTE(review): rows whose answers are all 'Not applicable' (not asked) also
# end up as 'never' -- confirm that is intended.
abortion['abortion'] = abortion['abortion'].fillna('never')

In [9]:
abortion.head()

Unnamed: 0,year,id,Strong chance of serious defect,Married--wants no more children,Woman's health seriously endangered,Low income--cant afford more children,Pregnant as result of rape,Not married,Abortion if woman wants for any reason,abortion
0,1972,0,Yes,Yes,Yes,Yes,Yes,Yes,Not applicable,unrestricted
1,1972,1,Yes,No,Yes,No,Yes,Yes,Not applicable,conditional
2,1972,2,Yes,Yes,Yes,Yes,Yes,Yes,Not applicable,unrestricted
3,1972,3,No,No,Yes,Yes,Yes,Yes,Not applicable,conditional
4,1972,4,Yes,Yes,Yes,Yes,Yes,Yes,Not applicable,unrestricted


In [10]:
abortion['abortion'].value_counts(dropna=False)

never           23338
conditional     22653
unrestricted    18823
Name: abortion, dtype: int64

In [11]:
# Keep only the respondent keys and the combined stance, then one-hot encode the stance.
stance_frame = abortion[['year', 'id', 'abortion']]
abortion = pd.get_dummies(stance_frame, columns=['abortion'])

In [12]:
abortion.head()

Unnamed: 0,year,id,abortion_conditional,abortion_never,abortion_unrestricted
0,1972,0,0,0,1
1,1972,1,1,0,0
2,1972,2,0,0,1
3,1972,3,1,0,0
4,1972,4,0,0,1


In [13]:
abortion = weight(abortion)

In [14]:
abortion.to_csv('data/cleaned_and_weighted/abortion.csv')

###### Family/ SES Background

In [15]:
background = unzip('Background')

In [16]:
background.columns

Index(['year',
       'Father's occupational prestige score using threshold method (2010)',
       'Rs living standard compared to parents', 'Religion in which raised',
       'How many grandparents born outside u.s.',
       'Were rs parents born in this country', 'Was r born in this country',
       'Mothers highest degree', 'Fathers highest degree', 'id',
       'Mother's occupational prestige score using threshold method (2010)'],
      dtype='object')

In [17]:
# Print the frequency table of every background column, one after another.
for column_name, column_values in background.items():
    print(column_values.value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


0     13710
58     6953
29     2876
18     2458
50     1724
      ...  
74       24
7        21
77       12
67       10
5         5
Name: Father's occupational prestige score using threshold method (2010), Length: 93, dtype: int64


Not applicable     44520
Much better         6446
Somewhat better     6005
About the same      4448
Somewhat worse      2269
Much worse           790
Don't know           307
No answer             29
Name: Rs living standard compared to parents, dtype: int64


Protestant                 37042
Catholic                   179

In [18]:
# Short snake_case names for the long survey-question column labels.
background_renames = {
    "Father's occupational prestige score using threshold method (2010)": 'father_prestige',
    "Mother's occupational prestige score using threshold method (2010)": 'mother_prestige',
    'Religion in which raised': 'raised_relig',
    'How many grandparents born outside u.s.': 'immigrant_gpar',
    'Were rs parents born in this country': 'immigrant_par',
    'Was r born in this country': 'immigrant',
}
background.rename(columns=background_renames, inplace=True)

I will encode the immigration status of parents and grandparents so that the number born outside of the country is counted. Parents born in the U.S. will be coded as 0. Because this data will likely not be used for modeling, I will keep "not applicable" values as they are.

In [19]:
# Religion raised in: fold the non-substantive answers into a single 'Other' bucket.
no_religion_answer = background['raised_relig'].isin(['Not applicable', 'No answer', "Don't know"])
background.loc[no_religion_answer, 'raised_relig'] = 'Other'

# Grandparents: only the "all born in the U.S." answer is recoded (to 0);
# every other answer is left as it is.
background.loc[background['immigrant_gpar'].isin(['All in u.s']), 'immigrant_gpar'] = 0

# Parents: recode the listed answers to the number of parents born outside the U.S.
# Answers not listed here (e.g. 'Not applicable') keep their original text.
parent_codes = [
    (0, ['Both in u.s', 'Mother; fa. dk', 'Father; mo.dk']),
    (2, ['Neither in u.s', 'Not mother;fa.dk', 'Not father;mo.dk']),
    (1, ['Mother only', 'Father only']),
]
for foreign_born_parents, answers in parent_codes:
    background.loc[background['immigrant_par'].isin(answers), 'immigrant_par'] = foreign_born_parents

# Respondent: the source question is "Was r born in this country", so the
# answer is inverted to express immigrant status ('Yes' -> 'no', 'No' -> 'yes').
immigrant_codes = [
    ('no', ['Yes']),
    ('yes', ['No']),
    ('n/a', ['Not applicable', 'No answer', "Don't know"]),
]
for immigrant_label, answers in immigrant_codes:
    background.loc[background['immigrant'].isin(answers), 'immigrant'] = immigrant_label

The 'Rs living standard compared to parents' column is messy and highly incomplete, so I will drop it for this analysis. In addition, parental education levels are more appropriately placed in the education table, so they will also be dropped from this table.

In [20]:
# Columns not carried forward in the background table.
unused_background_cols = [
    'Rs living standard compared to parents',
    'Fathers highest degree',
    'Mothers highest degree',
]
background.drop(columns=unused_background_cols, inplace=True)

In [21]:
background = pd.get_dummies(data= background)

In [23]:
background = weight(background)

In [24]:
background.to_csv('data/cleaned_and_weighted/background.csv')

###### Criminal Justice Attitudes

In [25]:
crim_justice = unzip('Criminal_Justice')
crim_justice.columns

Index(['year', 'Citizen attacking policeman with fists',
       'Citizen attempting to escape custody',
       'Citizen questioned as murder suspect',
       'Citizen said vulgar or obscene things',
       'Ever approve of police striking citizen',
       'Should marijuana be made legal', 'Courts dealing with criminals',
       'Favor or oppose death penalty for murder', 'id',
       'Afraid to walk at night in neighborhood'],
      dtype='object')

In [31]:
# Among respondents who answered 'Yes' to ever approving of police striking a
# citizen, tabulate the answers to the "attacking policeman with fists" question.
approves_striking = crim_justice['Ever approve of police striking citizen'] == 'Yes'
crim_justice.loc[approves_striking, 'Citizen attacking policeman with fists'].value_counts()

Yes           25692
No              799
Don't know       99
No answer        18
Name: Citizen attacking policeman with fists, dtype: int64

The police-striking-citizen columns lack information on how the police response is defined, and will likely be difficult to interpret given the nature of this analysis. Unfortunately, I will have to drop them.

In [39]:
# The five police-striking-citizen questions are removed from the table.
police_striking_cols = [
    'Citizen attacking policeman with fists',
    'Citizen attempting to escape custody',
    'Citizen questioned as murder suspect',
    'Citizen said vulgar or obscene things',
    'Ever approve of police striking citizen',
]
crim_justice.drop(columns=police_striking_cols, inplace=True)

In [40]:
# Short names for the remaining criminal-justice questions.
crim_justice_renames = {
    'Should marijuana be made legal': 'legalize_marijuana',
    'Courts dealing with criminals': 'sentencing',
    'Favor or oppose death penalty for murder': 'death_penalty',
    'Afraid to walk at night in neighborhood': 'fear',
}
crim_justice.rename(columns=crim_justice_renames, inplace=True)

In [43]:
crim_justice = pd.get_dummies(data = crim_justice)
crim_justice.head()

Unnamed: 0,year,id,legalize_marijuana_Don't know,legalize_marijuana_Legal,legalize_marijuana_No answer,legalize_marijuana_Not applicable,legalize_marijuana_Not legal,sentencing_About right,sentencing_Don't know,sentencing_No answer,...,death_penalty_Don't know,death_penalty_Favor,death_penalty_No answer,death_penalty_Not applicable,death_penalty_Oppose,fear_Don't know,fear_No,fear_No answer,fear_Not applicable,fear_Yes
0,1972,0,0,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
1,1972,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,1972,2,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,1972,3,0,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
4,1972,4,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [46]:
crim_justice = weight(crim_justice)

In [None]:
crim_justice.to_csv('data/cleaned_and_weighted/crim_justice.csv')

###### Education

In [44]:
education = unzip('Education')
education.columns

Index(['year', 'id', 'Highest year of school completed',
       'Highest year school completed, father',
       'Highest year school completed, mother', 'Rs highest degree',
       'Fathers highest degree', 'Mothers highest degree',
       'The field of degree r earned'],
      dtype='object')

In [45]:
# Print the frequency table of every education column, one after another.
for column_name, column_values in education.items():
    print(column_values.value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


2047     1
39558    1
12947    1
14994    1
8849     1
        ..
48445    1
46396    1
36155    1
34106    1
0        1
Name: id, Length: 64814, dtype: int64


12            19663
16             8355
14             7160
13             5360
11             3743
15             2910
10             2880
8              2724
18             2384
9              2083
17             1967
20             1439
19              920
7               879
6               828
5               402
4               319
3               257
0               165
2               

With so many "not applicable" entries for the field of degree, it is more appropriate to remove this column from the analysis.

In [47]:
# Remove the degree-field column from the education table.
degree_field_col = 'The field of degree r earned'
education.drop(columns=[degree_field_col], inplace=True)

In [52]:
# Short names for the schooling and degree columns (respondent, father, mother).
education_renames = {
    'Highest year of school completed': 'school_years',
    'Highest year school completed, father': 'school_years_dad',
    'Highest year school completed, mother': 'school_years_mom',
    'Rs highest degree': 'degree',
    'Fathers highest degree': 'degree_dad',
    'Mothers highest degree': 'degree_mom',
}
education.rename(columns=education_renames, inplace=True)

In [53]:
education.head()

Unnamed: 0,year,id,school_years,school_years_dad,school_years_mom,degree,degree_dad,degree_mom
0,1972,0,16,10,Not applicable,Bachelor,Lt high school,Not applicable
1,1972,1,10,8,8,Lt high school,Lt high school,Lt high school
2,1972,2,12,8,8,High school,Lt high school,Lt high school
3,1972,3,17,16,12,Bachelor,Bachelor,High school
4,1972,4,12,8,8,High school,Lt high school,Lt high school


In [54]:
education = pd.get_dummies(data= education, columns=['degree', 'degree_dad', 'degree_mom'])

In [55]:
education.head()

Unnamed: 0,year,id,school_years,school_years_dad,school_years_mom,degree_Bachelor,degree_Don't know,degree_Graduate,degree_High school,degree_Junior college,...,degree_dad_No answer,degree_dad_Not applicable,degree_mom_Bachelor,degree_mom_Don't know,degree_mom_Graduate,degree_mom_High school,degree_mom_Junior college,degree_mom_Lt high school,degree_mom_No answer,degree_mom_Not applicable
0,1972,0,16,10,Not applicable,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1972,1,10,8,8,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1972,2,12,8,8,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,1972,3,17,16,12,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1972,4,12,8,8,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [56]:
education = weight(education)

In [None]:
education.to_csv('data/cleaned_and_weighted/education.csv')

###### Employment

In [57]:
employment = unzip('Employment')
employment.columns

Index(['year', 'Workers need strong unions',
       'Does r or spouse belong to union', 'If rich, continue or stop working',
       'Job or housework', 'Could r find equally good job',
       'Is r likely to lose job', 'Number of hours usually work a week',
       'Labor force status', 'id',
       'Rs occupational prestige score using threshold method (2010)'],
      dtype='object')

###### Family Values

In [58]:
fam_vals = unzip('Family_Values')
fam_vals.columns

Index(['year', 'Better for man to work, woman tend home',
       'Preschool kids suffer if mother works',
       'Favor spanking to discipline child', 'Ideal number of children',
       'Rs kids living standard compared to r',
       'Rs living standard compared to parents', 'To help others',
       'To work hard', 'To think for ones self', 'To be well liked or popular',
       'To obey', 'Highest year school completed, mother',
       'Highest year school completed, father', 'Number of children',
       'Number of brothers and sisters', 'id',
       'Number of family generations in household'],
      dtype='object')

###### Gender and Sexuality

In [59]:
gender = unzip('Gender_and_Sexuality')
gender.columns

Index(['year', 'Should hire and promote women                            ',
       'For or against preferential hiring of women      ',
       'Better for man to work, woman tend home',
       'Preschool kids suffer if mother works',
       'Sex before marriage -- teens 14-16', 'Sex before marriage',
       'Divorce laws', 'Sex education in public schools',
       'Birth control to teenagers 14-16', 'Women not suited for politics',
       'id', 'Homosexuals should have right to marry'],
      dtype='object')

###### Identity

In [60]:
idend = unzip('Identity')
idend.columns

Index(['year', 'id', 'Age of respondent', 'Respondents sex',
       'Race of respondent', 'Region of interview'],
      dtype='object')

###### Interests

In [61]:
interests = unzip('Interests')
interests.columns

Index(['year', 'Interested in environmental issues',
       'Interested in space exploration', 'Interested in medical discoveries',
       'Interested in technologies', 'Interested in economic issues',
       'Interested in new scientific discoveries',
       'Interested in local school issues', 'Interested in farm issues',
       'Interested in international issues', 'id',
       'Interested in military policy'],
      dtype='object')

###### Lifestyle

In [62]:
lifestyle = unzip('Lifestyle')
lifestyle.columns

Index(['year', 'Hours per day watching tv', 'How often does r read newspaper',
       'Does r or spouse hunt', 'Have gun in home', 'Spend evening at bar',
       'Spend evening with friends', 'Spend evening with neighbor',
       'Spend evening with relatives', 'Is life exciting or dull',
       'General happiness', 'R's age when 1st child born', 'Age of respondent',
       'Number of children', 'Marital status',
       'Number of hours usually work a week', 'id',
       'Does r own or rent home?'],
      dtype='object')

###### Opinions

In [None]:
opinions = unzip('Opinions')
opinions.columns

In [None]:
opinions = weight(opinions)


In [None]:
# Save the weighted opinions table under its own file name. The previous
# target, 'abortion.csv', would have overwritten the abortion table written
# earlier in this notebook.
opinions.to_csv('data/cleaned_and_weighted/opinions.csv')

###### Race Relations

In [None]:
race = unzip('Race_Relations')
race.columns

###### Religion

In [None]:
religion = unzip('Religion')
religion.columns

###### Socioeconomic Status

In [None]:
ses = unzip('SES')
ses.columns

###### Systems

In [None]:
systems = unzip('Systems')
systems.columns

###### Voter Identity

In [None]:
vote_id = unzip('Voter_Identity')
vote_id.columns