# Data Collection and Formatting

In [1]:
import pandas as pd
import tarfile
from bs4 import BeautifulSoup
import requests
import re
import matplotlib.pyplot as plt
import seaborn as sns

%run -i "functions/unzip.py"
%run -i "functions/scrape_platforms.py"
%run -i "functions/scrape_SotU.py"

# Text Data
### Political Platforms

In [2]:
# Election years whose party platforms are scraped: every fourth year from
# 1972 through 2016, with 2004 left out.
# NOTE(review): 2004 is excluded only because the original hand-written list
# did not contain it -- confirm whether that omission is intentional.
years = [str(year) for year in range(1972, 2017, 4) if year != 2004]
platforms = scrape_platforms(years)
platforms.head()

Unnamed: 0,platform,party,year
1972-Rep,\nPreamble\nThis year our Republican Party has...,Republican,1972
1972-Dem,\nNew Directions: 1972-76\nSkepticism and cyni...,Democratic,1972
1976-Rep,\nAdopted by the Republican National Conventio...,Republican,1976
1976-Dem,\nPreamble\nWe meet to adopt a Democratic plat...,Democratic,1976
1980-Rep,\nAdopted by the Republican National Conventio...,Republican,1980


### State of the Union Addresses

In [3]:
# Collect the State of the Union addresses with the project's scraper
# (presumably defined by the `%run -i "functions/scrape_SotU.py"` line in the first cell).
# NOTE(review): the return type is not visible in this notebook section -- confirm before use.
speeches = scrape_SotU()

### Debates

In [None]:
# Download one presidential-debate page from presidency.ucsb.edu and print the
# text of every <p> element found on it.
url = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas'
# requests does not time out by default; cap the wait so a stalled connection
# cannot hang the notebook during Run All.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page as a transcript.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
debate = soup.find_all('p')
for deb in debate:
    print(deb.text)

# Survey Data
### Convert Data to Dataframes
###### Weights
Based on the sample weights provided by the General Social Survey, some groups of respondents appear to be significantly over- or underrepresented. To account for this, the weights will be applied in this analysis.

In [4]:
# Load the 'Weights' table through the project's `unzip` helper
# (presumably defined by the `%run -i "functions/unzip.py"` line in the first cell).
weights = unzip('Weights')

In [5]:
# Preview the first rows: one weight per (year, id) pair.
weights.head()

Unnamed: 0,year,id,Weight variable
0,1972,0,0.4446
1,1972,1,0.8893
2,1972,2,0.8893
3,1972,3,0.8893
4,1972,4,0.8893


In [6]:
# Summarise the distribution of the weights. The recorded output shows a mean
# of about 1.0 with values ranging from roughly 0.39 to 8.74.
weights['Weight variable'].describe()

count    64814.000000
mean         1.000015
std          0.468172
min          0.391825
25%          0.550100
50%          0.970900
75%          1.098500
max          8.739876
Name: Weight variable, dtype: float64

###### Abortion Opinions
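To reduce dimensionality in the data, I will combine the reasons for abortion into three categories: health, economics, and right-to-choose. There will also be a category for respondents who oppose legal abortion in every circumstance. (Note: the cell below currently implements a coarser version of this plan, labelling each respondent as supporting abortion conditionally, without restriction, or never.)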

In [7]:
# Load the 'Abortion' table (one row per respondent, keyed by year and id).
abortion = unzip('Abortion')

In [8]:
# Check dtypes and non-null counts; in the recorded output the seven response
# columns are stored as strings (object dtype) with no missing values.
abortion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64814 entries, 0 to 64813
Data columns (total 9 columns):
year                                      64814 non-null int64
id                                        64814 non-null int32
Strong chance of serious defect           64814 non-null object
Married--wants no more children           64814 non-null object
Woman's health seriously endangered       64814 non-null object
Low income--cant afford more children     64814 non-null object
Pregnant as result of rape                64814 non-null object
Not married                               64814 non-null object
Abortion if woman wants for any reason    64814 non-null object
dtypes: int32(1), int64(1), object(7)
memory usage: 4.2+ MB


In [22]:
# Collapse the seven abortion items into one stance label per respondent:
#   'Unrestricted' - 'Yes' to all six specific circumstances, or 'Yes' to
#                    "Abortion if woman wants for any reason"
#   'Conditional'  - 'Yes' to at least one specific circumstance (but not all)
#   'Never'        - everyone else
# Labels are capitalised consistently so that the dummy columns derived from
# them match (abortion_Conditional / abortion_Never / abortion_Unrestricted).
reason_cols = [
    'Strong chance of serious defect',
    'Married--wants no more children',
    "Woman's health seriously endangered",
    "Low income--cant afford more children",
    "Pregnant as result of rape",
    'Not married',
]
said_yes = abortion[reason_cols].eq('Yes')
any_reason_yes = abortion['Abortion if woman wants for any reason'].eq('Yes')

# Start from the default and overwrite, so the cell gives the same result each
# time it is run and needs no chained `fillna(inplace=True)` on a column.
# NOTE(review): 'Never' also absorbs respondents with no 'Yes' for other
# reasons (e.g. 'Not applicable' on every item) -- confirm this is intended.
abortion['abortion'] = 'Never'
abortion.loc[said_yes.any(axis=1), 'abortion'] = 'Conditional'
abortion.loc[said_yes.all(axis=1) | any_reason_yes, 'abortion'] = 'Unrestricted'

In [23]:
# Spot-check the new `abortion` stance column against the raw answers.
abortion.head()

Unnamed: 0,year,id,Strong chance of serious defect,Married--wants no more children,Woman's health seriously endangered,Low income--cant afford more children,Pregnant as result of rape,Not married,Abortion if woman wants for any reason,abortion
0,1972,0,Yes,Yes,Yes,Yes,Yes,Yes,Not applicable,Unrestricted
1,1972,1,Yes,No,Yes,No,Yes,Yes,Not applicable,Conditional
2,1972,2,Yes,Yes,Yes,Yes,Yes,Yes,Not applicable,Unrestricted
3,1972,3,No,No,Yes,Yes,Yes,Yes,Not applicable,Conditional
4,1972,4,Yes,Yes,Yes,Yes,Yes,Yes,Not applicable,Unrestricted


In [24]:
# Tally the stance labels; dropna=False would surface any rows left unlabelled.
abortion['abortion'].value_counts(dropna=False)

Never           23338
Conditional     22653
Unrestricted    18823
Name: abortion, dtype: int64

In [32]:
# One-hot encode the stance label, keeping only the respondent keys beside it.
# NOTE: this rebinds `abortion`; the raw answer columns and the `abortion`
# label column are gone afterwards, so the cell cannot be re-run without
# reloading and relabelling the table first.
stance_cols = ['year', 'id', 'abortion']
abortion = pd.get_dummies(abortion[stance_cols], columns=['abortion'])

In [33]:
# Confirm the encoded frame: year, id and one indicator column per stance.
abortion.head()

Unnamed: 0,year,id,abortion_Conditional,abortion_Never,abortion_Unrestricted
0,1972,0,0,0,1
1,1972,1,1,0,0
2,1972,2,0,0,1
3,1972,3,1,0,0
4,1972,4,0,0,1


In [None]:
# Plot, for each survey year, the share of respondents in each abortion-stance
# category. The frame is aggregated first: handing the respondent-level frame
# straight to seaborn would draw `year`, `id` and the raw indicator columns
# against the row index, which is not a trend over time.
# NOTE(review): these shares are unweighted; the `weights` table is not applied here.
stance_share = abortion.filter(like='abortion_').groupby(abortion['year']).mean()

fig, ax = plt.subplots()
sns.lineplot(data=stance_share, ax=ax)
ax.set(title='Abortion stance by survey year',
       xlabel='Survey year',
       ylabel='Share of respondents');

###### Family/SES Background

In [34]:
# Load the 'Background' table (family origin and parental status variables).
background = unzip('Background')

In [35]:
# List the raw column labels; they are long survey-question strings that get
# shortened by the rename a few cells below.
background.columns

Index(['year',
       'Father's occupational prestige score using threshold method (2010)',
       'Rs living standard compared to parents', 'Religion in which raised',
       'How many grandparents born outside u.s.',
       'Were rs parents born in this country', 'Was r born in this country',
       'Mothers highest degree', 'Fathers highest degree', 'id',
       'Mother's occupational prestige score using threshold method (2010)'],
      dtype='object')

In [39]:
# Print the frequency table of every column, each followed by blank lines so
# the tables are easy to tell apart in the output.
for col in background:
    print(background[col].value_counts(), end='\n\n\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


0     13710
58     6953
29     2876
18     2458
50     1724
      ...  
74       24
7        21
77       12
67       10
5         5
Name: Father's occupational prestige score using threshold method (2010), Length: 93, dtype: int64


Not applicable     44520
Much better         6446
Somewhat better     6005
About the same      4448
Somewhat worse      2269
Much worse           790
Don't know           307
No answer             29
Name: Rs living standard compared to parents, dtype: int64


Protestant                 37042
Catholic                   179

In [47]:
# Replace the long survey-question labels with short snake_case names.
short_names = {
    "Father's occupational prestige score using threshold method (2010)": 'father_prestige',
    "Mother's occupational prestige score using threshold method (2010)": 'mother_prestige',
    'Religion in which raised': 'raised_relig',
    'How many grandparents born outside u.s.': 'immigrant_gpar',
    'Were rs parents born in this country': 'immigrant_par',
    'Was r born in this country': 'immigrant',
}
background.rename(columns=short_names, inplace=True)

I will encode the immigration status of parents and grandparents so that the number born outside of the country is counted; respondents whose parents were both born in the U.S. will be coded as 0. Because these data will likely not be used for modeling, I will keep 'Not applicable' values as they are.

In [49]:
# Recode the background answers in place. Each rule is
# (column, answer labels to match, replacement value); rules are applied in
# the order listed, exactly one `.loc` assignment per rule.
#   raised_relig   - non-answers are folded into 'Other'
#   immigrant_gpar - 'All in u.s' becomes 0 (other answers are left unchanged)
#   immigrant_par  - number of parents born outside the U.S. (0, 1 or 2)
#   immigrant      - "born in this country" is inverted: 'Yes' -> 'no', 'No' -> 'yes'
# NOTE(review): when one parent's birthplace is "dk", the rules code the pair
# as if both parents matched the known one (0 or 2) -- confirm this is intended.
non_answers = ['Not applicable', 'No answer', "Don't know"]
recode_rules = [
    ('raised_relig', non_answers, 'Other'),
    ('immigrant_gpar', ['All in u.s'], 0),
    ('immigrant_par', ['Both in u.s', 'Mother; fa. dk', 'Father; mo.dk'], 0),
    ('immigrant_par', ['Neither in u.s', 'Not mother;fa.dk', 'Not father;mo.dk'], 2),
    ('immigrant_par', ['Mother only', 'Father only'], 1),
    ('immigrant', ['Yes'], 'no'),
    ('immigrant', ['No'], 'yes'),
    ('immigrant', non_answers, 'n/a'),
]
for column, labels, code in recode_rules:
    background.loc[background[column].isin(labels), column] = code

The 'Rs living standard compared to parents' column is highly incomplete (44,520 of its 64,814 values are 'Not applicable'), so I will drop it for this analysis. In addition, parental education levels are more appropriately placed in the education table, so they will also be dropped from this table.

In [52]:
# Remove the sparse living-standard item and the parental-degree columns
# (the latter also appear in the education table).
unused_cols = [
    'Rs living standard compared to parents',
    'Fathers highest degree',
    'Mothers highest degree',
]
background.drop(columns=unused_cols, inplace=True)

In [53]:
# Preview the cleaned background table with its shortened column names.
background.head()

Unnamed: 0,year,father_prestige,raised_relig,immigrant_gpar,immigrant_par,immigrant,id,mother_prestige
0,1972,55,Other,Not applicable,Not applicable,,0,0
1,1972,42,Other,Not applicable,Not applicable,,1,0
2,1972,27,Other,Not applicable,Not applicable,,2,0
3,1972,65,Other,Not applicable,Not applicable,,3,0
4,1972,37,Other,Not applicable,Not applicable,,4,0


###### Criminal Justice Attitudes

In [56]:
# Load the 'Criminal_Justice' table and list its columns (police use of force,
# courts, death penalty, marijuana and neighbourhood-safety items).
crim_justice = unzip('Criminal_Justice')
crim_justice.columns

Index(['year', 'Citizen attacking policeman with fists',
       'Citizen attempting to escape custody',
       'Citizen questioned as murder suspect',
       'Citizen said vulgar or obscene things',
       'Ever approve of police striking citizen',
       'Should marijuana be made legal', 'Courts dealing with criminals',
       'Favor or oppose death penalty for murder', 'id',
       'Afraid to walk at night in neighborhood'],
      dtype='object')

###### Education

In [57]:
# Load the 'Education' table and list its columns. It carries the parental
# degree columns ('Fathers highest degree', 'Mothers highest degree') that are
# dropped from `background`.
education = unzip('Education')
education.columns

Index(['year', 'id', 'Highest year of school completed',
       'Highest year school completed, father',
       'Highest year school completed, mother', 'Rs highest degree',
       'Fathers highest degree', 'Mothers highest degree',
       'The field of degree r earned'],
      dtype='object')

###### Employment

In [58]:
# Load the 'Employment' table and list its columns (union, job security,
# working hours, labour-force status and occupational prestige items).
employment = unzip('Employment')
employment.columns

Index(['year', 'Workers need strong unions',
       'Does r or spouse belong to union', 'If rich, continue or stop working',
       'Job or housework', 'Could r find equally good job',
       'Is r likely to lose job', 'Number of hours usually work a week',
       'Labor force status', 'id',
       'Rs occupational prestige score using threshold method (2010)'],
      dtype='object')

###### Family Values

In [59]:
# Load the 'Family_Values' table and list its columns.
# NOTE(review): several labels here also appear in other tables (e.g. the
# parental schooling and living-standard items) -- watch for duplicate column
# names if these frames are later merged on year/id.
fam_vals = unzip('Family_Values')
fam_vals.columns

Index(['year', 'Better for man to work, woman tend home',
       'Preschool kids suffer if mother works',
       'Favor spanking to discipline child', 'Ideal number of children',
       'Rs kids living standard compared to r',
       'Rs living standard compared to parents', 'To help others',
       'To work hard', 'To think for ones self', 'To be well liked or popular',
       'To obey', 'Highest year school completed, mother',
       'Highest year school completed, father', 'Number of children',
       'Number of brothers and sisters', 'id',
       'Number of family generations in household'],
      dtype='object')

###### Gender and Sexuality

In [60]:
# Load the 'Gender_and_Sexuality' table and list its columns.
# NOTE: the recorded output shows trailing whitespace in some labels (e.g.
# 'Should hire and promote women   ...'), so exact-string column lookups must
# include that padding or strip the labels first.
gender = unzip('Gender_and_Sexuality')
gender.columns

Index(['year', 'Should hire and promote women                            ',
       'For or against preferential hiring of women      ',
       'Better for man to work, woman tend home',
       'Preschool kids suffer if mother works',
       'Sex before marriage -- teens 14-16', 'Sex before marriage',
       'Divorce laws', 'Sex education in public schools',
       'Birth control to teenagers 14-16', 'Women not suited for politics',
       'id', 'Homosexuals should have right to marry'],
      dtype='object')

###### Identity

In [62]:
# Load the 'Identity' table (respondent age, sex, race and interview region)
# and list its columns.
idend = unzip('Identity')
idend.columns

Index(['year', 'id', 'Age of respondent', 'Respondents sex',
       'Race of respondent', 'Region of interview'],
      dtype='object')

###### Interests

In [63]:
# Load the 'Interests' table (one "Interested in ..." item per topic) and list
# its columns.
interests = unzip('Interests')
interests.columns

Index(['year', 'Interested in environmental issues',
       'Interested in space exploration', 'Interested in medical discoveries',
       'Interested in technologies', 'Interested in economic issues',
       'Interested in new scientific discoveries',
       'Interested in local school issues', 'Interested in farm issues',
       'Interested in international issues', 'id',
       'Interested in military policy'],
      dtype='object')

###### Lifestyle

In [64]:
# Load the 'Lifestyle' table and list its columns.
# NOTE(review): 'Age of respondent' and 'Number of children' are also present
# in other tables loaded in this notebook -- watch for duplicates when merging.
lifestyle = unzip('Lifestyle')
lifestyle.columns

Index(['year', 'Hours per day watching tv', 'How often does r read newspaper',
       'Does r or spouse hunt', 'Have gun in home', 'Spend evening at bar',
       'Spend evening with friends', 'Spend evening with neighbor',
       'Spend evening with relatives', 'Is life exciting or dull',
       'General happiness', 'R's age when 1st child born', 'Age of respondent',
       'Number of children', 'Marital status',
       'Number of hours usually work a week', 'id',
       'Does r own or rent home?'],
      dtype='object')

###### Opinions

In [65]:
# Load the 'Opinions' table (one column per spending/priority topic) and list
# its columns.
# NOTE: the recorded output shows trailing whitespace in
# 'Supporting scientific research        ', so strip labels before selecting by name.
opinions = unzip('Opinions')
opinions.columns

Index(['year', 'Foreign aid', 'Welfare', 'Highways and bridges',
       'Social security', 'Mass transportation', 'Parks and recreation',
       'Assistance for childcare', 'Supporting scientific research        ',
       'Military, armaments, and defense',
       'Improving the conditions of blacks', 'id', 'Space exploration program',
       'Improving & protecting environment',
       'Improving & protecting nations health',
       'Solving problems of big cities', 'Halting rising crime rate',
       'Dealing with drug addiction', 'Improving nations education system',
       'Developing alternative energy sources'],
      dtype='object')

###### Race Relations

In [66]:
# Load the 'Race_Relations' table and list its columns.
# NOTE: the recorded output shows repeated labels disambiguated with a '.1'
# suffix ('Hard working - lazy.1', 'Rich - poor.1') and a trailing space in
# 'Blacks overcome prejudice without favors '.
# NOTE(review): which question each '.1' column corresponds to is not visible here.
race = unzip('Race_Relations')
race.columns

Index(['year', 'Whites hurt by aff. action', 'Hard working - lazy',
       'Hard working - lazy.1', 'Rich - poor', 'Rich - poor.1',
       'Should govt aid blacks?', 'Differences due to lack of will',
       'Differences due to lack of education',
       'Differences due to inborn disability',
       'Differences due to discrimination',
       'Blacks overcome prejudice without favors ',
       'Favor preference in hiring blacks', 'Any opp. race in neighborhood',
       'Favor law against racial intermarriage',
       'Improving the conditions of blacks', 'id',
       'Number of immigrants nowadays should be'],
      dtype='object')

###### Religion

In [67]:
# Load the 'Religion' table (preference, attendance, strength of affiliation,
# prayer items) and list its columns.
religion = unzip('Religion')
religion.columns

Index(['year', 'id', 'Rs religious preference',
       'How often r attends religious services', 'Strength of affiliation',
       'How often does r pray', 'Bible prayer in public schools'],
      dtype='object')

###### Socioeconomic Status

In [68]:
# Load the 'SES' table (income, financial satisfaction, social position and
# occupational prestige items) and list its columns.
ses = unzip('SES')
ses.columns

Index(['year', 'Rs income in constant $', 'Family income in constant $',
       'Standard of living of r will improve',
       'Rs kids living standard compared to r',
       'Rs living standard compared to parents', 'Opinion of family income',
       'Change in financial situation',
       'Satisfaction with financial situation',
       'Rs self ranking of social position', 'id',
       'Rs occupational prestige score using threshold method (2010)'],
      dtype='object')

###### Systems

In [69]:
# Load the 'Systems' table (confidence-in-institution items) and list its columns.
systems = unzip('Systems')
systems.columns

Index(['year', 'Confidence in congress', 'Confidence in scientific community',
       'Confid. in united states supreme court', 'Confidence in television',
       'Confidence in medicine', 'Confidence in press',
       'Confidence in organized labor', 'Confid. in exec branch of fed govt',
       'Confidence in education', 'Confidence in organized religion',
       'Confidence in major companies',
       'Confid in banks & financial institutions', 'id',
       'Confidence in military'],
      dtype='object')

###### Voter Identity

In [70]:
# Load the 'Voter_Identity' table (per-election turnout and vote-choice items
# plus party affiliation) and list its columns.
vote_id = unzip('Voter_Identity')
vote_id.columns

Index(['year', 'Did r vote in 1992 election', 'Vote for clinton, bush, perot',
       'Who would r have voted for-1992 election',
       'Did r vote in 1996 election', 'Vote for clinton, dole, perot',
       'Who would r have voted for-1996 election',
       'Did r vote in 2000 election', 'Vote for gore, bush, nader',
       'Who would r has voted for in 2000 election',
       'Did r vote in 2004 election', 'Vote for kerry, bush, nader',
       'Who would r has voted for in 2004 election',
       'Did r vote in 2008 election', 'Vote obama or mccain',
       'Who you would have voted for', 'Did r vote in 2012 election',
       'Vote obama or romney', 'Who would r have voted for-1988 election',
       'Vote for dukakis or bush', 'Did r vote in 1988 election', 'id',
       'Political party affiliation', 'Did r vote in 1968 election',
       'Vote for humphrey, nixon, or wallace',
       'Who would r have voted for-1968 election',
       'Did r vote in 1972 election', 'Vote for mcgovern or