# Data Collection and Formatting

In [1]:
import pandas as pd
import tarfile
from bs4 import BeautifulSoup
import requests
import re
import matplotlib.pyplot as plt
import seaborn as sns

%run -i "functions/unzip.py"
%run -i "functions/scrape_platforms.py"
%run -i "functions/scrape_SotU.py"

# Text Data
### Political Platforms

In [None]:
# Every fourth year from 1972 through 2016, as strings, except 2004.
# NOTE(review): 2004 is missing from the original hand-written list; it is
# excluded here only to preserve behavior -- confirm the omission is intended.
years = [str(year) for year in range(1972, 2020, 4) if year != 2004]
platforms = scrape_platforms(years)
platforms.head()

### State of the Union Addresses

In [None]:
speeches = scrape_SotU()

### Debates

In [None]:
# Fetch one debate page and print the text of every <p> element on it.
url = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas'
# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() stops on an HTTP error instead of parsing and printing an
# error page as though it were the debate text.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
debate = soup.find_all('p')
for deb in debate:
    print(deb.text)

# Survey Data
### Convert Data to Dataframes
###### Weights
Based on the sample weights provided by the General Social Survey, it appears that some samples were significantly over- and underrepresented. To account for this, weights will be applied in this analysis. To keep the data at a reasonable size, rather than reproducing the exact proportions with the full-precision weights, I will round each weight to the nearest tenth: each row is repeated ten times its rounded weight (for example, a weight of 0.4446 becomes 4 copies of the row).

In [7]:
weights = unzip('Weights')

In [8]:
weights.head()

Unnamed: 0,year,id,Weight variable
0,1972,0,0.4446
1,1972,1,0.8893
2,1972,2,0.8893
3,1972,3,0.8893
4,1972,4,0.8893


In [9]:
weights['Weight variable'].describe()

count    64814.000000
mean         1.000015
std          0.468172
min          0.391825
25%          0.550100
50%          0.970900
75%          1.098500
max          8.739876
Name: Weight variable, dtype: float64

In [10]:
def weight(df, weight_values=None, scale=10):
    """Replicate each row of ``df`` in proportion to its sample weight.

    Row ``i`` is repeated ``round(weight_i * scale)`` times, so with the
    default ``scale=10`` a weight is effectively rounded to the nearest tenth.
    Weights are matched to rows by position, not by index label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are replicated.
    weight_values : sequence of float, optional
        One weight per row of ``df``. Defaults to the notebook-level
        ``weights['Weight variable']`` column.
    scale : int, optional
        Multiplier applied to each weight before rounding. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        The replicated rows with a fresh 0..n-1 index. Because
        ``reset_index()`` is called without ``drop=True``, the original index
        is kept as a leading ``'index'`` column.

    Raises
    ------
    ValueError
        If the number of weights differs from the number of rows, or if any
        weight is missing.
    """
    if weight_values is None:
        weight_values = weights['Weight variable']
    weight_values = pd.Series(weight_values)

    if len(weight_values) != len(df):
        raise ValueError(
            'weight() needs one weight per row: got %d weights for %d rows'
            % (len(weight_values), len(df)))
    if weight_values.isna().any():
        raise ValueError('weight() cannot replicate rows with missing weights')

    # Repeat counts must be integers; cast explicitly instead of relying on an
    # implicit float-to-int conversion inside repeat().
    repeats = (weight_values * scale).round(0).astype(int).to_numpy()

    # Repeat by position so a non-unique index on df cannot multiply rows.
    positions = pd.RangeIndex(len(df)).repeat(repeats).to_numpy()
    return df.iloc[positions].reset_index()

###### Abortion Opinions
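To reduce dimensionality in the data, I will combine the reasons for abortion into a single variable. Rather than separate health, economics, and right-to-choose groupings, the code below uses three levels: `unrestricted` (approves of abortion for every listed reason, or for any reason the woman wants), `conditional` (approves for at least one listed reason), and `never` (all remaining respondents), which serves as the category for abolishing legal abortion.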

In [None]:
abortion = unzip('Abortion')

In [None]:
abortion.info()

In [None]:
# Collapse the situational abortion questions into a single 'abortion' column
# with three levels: 'conditional', 'unrestricted' and 'never'.
reason_cols = ['Strong chance of serious defect',
               'Married--wants no more children',
               "Woman's health seriously endangered",
               "Low income--cant afford more children",
               "Pregnant as result of rape",
               'Not married']
approves = abortion[reason_cols].eq('Yes')

# 'conditional': answered 'Yes' for at least one of the six situations.
abortion.loc[approves.any(axis=1), 'abortion'] = 'conditional'

# 'unrestricted': answered 'Yes' for all six situations, or 'Yes' to abortion
# for any reason. This overrides 'conditional' where both apply.
any_reason = abortion['Abortion if woman wants for any reason'].eq('Yes')
abortion.loc[approves.all(axis=1) | any_reason, 'abortion'] = 'unrestricted'

# Assign the filled column back rather than calling fillna(inplace=True) on
# abortion['abortion']: the in-place form acts on a selected column, which
# pandas warns about and which leaves the frame unchanged under Copy-on-Write.
# NOTE(review): every row without a 'Yes' is labelled 'never' -- that includes
# any respondents who were not asked or gave no answer, if such rows exist.
# Confirm that this is the intended coding.
abortion['abortion'] = abortion['abortion'].fillna('never')

In [None]:
abortion.head()

In [None]:
abortion['abortion'].value_counts(dropna=False)

In [None]:
abortion = abortion[['year', 'id', 'abortion']]

In [None]:
abortion = weight(abortion)

In [None]:
abortion.to_csv('data/cleaned_and_weighted/abortion.csv')

###### Family/ SES Background

In [None]:
background = unzip('Background')

In [None]:
background.columns

In [None]:
for col in background.columns:
    print(background[col].value_counts())
    print('\n')

In [None]:
background.rename(columns = {"Father's occupational prestige score using threshold method (2010)": 'father_prestige',
                             "Mother's occupational prestige score using threshold method (2010)": 'mother_prestige',
                             'Religion in which raised': 'raised_relig', 
                             'How many grandparents born outside u.s.': 'immigrant_gpar', 
                             'Were rs parents born in this country': 'immigrant_par', 
                             'Was r born in this country': 'immigrant'}, inplace=True)

I will encode the immigration status of parents and grandparents as the number born outside of the country, so parents who were both born in the U.S. are coded as 0. Because this data will likely not be used for modeling, I will keep not applicable values as they are.

In [None]:
# Religion raised in: fold the three non-response labels into 'Other'.
relig_missing = background['raised_relig'].isin(
    ['Not applicable', 'No answer', "Don't know"])
background.loc[relig_missing, 'raised_relig'] = 'Other'

# Grandparents: 'All in u.s' becomes 0; every other value is left untouched.
gpar_all_us = background['immigrant_gpar'].eq('All in u.s')
background.loc[gpar_all_us, 'immigrant_gpar'] = 0

# Parents: recode to 0, 2 or 1 (applied in that order, as before). Labels not
# listed here keep their original text.
par_code_0 = background['immigrant_par'].isin(
    ['Both in u.s', 'Mother; fa. dk', 'Father; mo.dk'])
background.loc[par_code_0, 'immigrant_par'] = 0
par_code_2 = background['immigrant_par'].isin(
    ['Neither in u.s', 'Not mother;fa.dk', 'Not father;mo.dk'])
background.loc[par_code_2, 'immigrant_par'] = 2
par_code_1 = background['immigrant_par'].isin(['Mother only', 'Father only'])
background.loc[par_code_1, 'immigrant_par'] = 1

# Respondent: the source question is "Was r born in this country", so a 'Yes'
# answer means *not* an immigrant -- hence the flipped labels.
background.loc[background['immigrant'].eq('Yes'), 'immigrant'] = 'no'
background.loc[background['immigrant'].eq('No'), 'immigrant'] = 'yes'
immigrant_missing = background['immigrant'].isin(
    ['Not applicable', 'No answer', "Don't know"])
background.loc[immigrant_missing, 'immigrant'] = 'n/a'

The 'Rs living standard compared to parents' column is messy and highly incomplete, so I will drop it for this analysis. In addition, after consideration, parental education levels are more appropriately placed in the education table, so they will also be dropped from this table.

In [None]:
background.drop(['Rs living standard compared to parents', 
                 'Fathers highest degree', 'Mothers highest degree'], axis= 1, inplace= True)

In [None]:
background = weight(background)

In [None]:
background.to_csv('data/cleaned_and_weighted/background.csv')

###### Criminal Justice Attitudes

In [None]:
crim_justice = unzip('Criminal_Justice')
crim_justice.columns

In [None]:
# Among rows answering 'Yes' to ever approving of police striking a citizen,
# tabulate the answers to the "attacking policeman with fists" question.
ever_approves = crim_justice['Ever approve of police striking citizen'] == 'Yes'
crim_justice.loc[ever_approves, 'Citizen attacking policeman with fists'].value_counts()

The police-striking-citizens columns lack information on how the police response is defined, and will likely be difficult to interpret given the nature of this analysis. Unfortunately, I will have to drop them.

In [None]:
crim_justice.drop(['Citizen attacking policeman with fists', 
                   'Citizen attempting to escape custody', 
                   'Citizen questioned as murder suspect', 
                   'Citizen said vulgar or obscene things', 
                   'Ever approve of police striking citizen'], axis = 1, inplace = True)

In [None]:
crim_justice.rename(columns = {'Should marijuana be made legal': 'legalize_marijuana',
                             'Courts dealing with criminals': 'sentencing',
                             'Favor or oppose death penalty for murder': 'death_penalty', 
                             'Afraid to walk at night in neighborhood': 'fear'}, inplace=True)

In [None]:
crim_justice = weight(crim_justice)

In [None]:
crim_justice.to_csv('data/cleaned_and_weighted/crim_justice.csv')

###### Education

In [None]:
education = unzip('Education')
education.columns

In [None]:
for col in education.columns:
    print(education[col].value_counts())
    print('\n')

With so many 'Not applicable' entries for the field of degree, it is more appropriate to remove this column from the analysis.

In [None]:
education.drop(['The field of degree r earned'], axis=1, inplace=True)

In [None]:
education.rename(columns = {'Highest year of school completed':'school_years',
                            'Highest year school completed, father':'school_years_dad', 
                            'Highest year school completed, mother':'school_years_mom',
                            'Rs highest degree':'degree',
                            'Fathers highest degree':'degree_dad',
                            'Mothers highest degree': 'degree_mom'}, inplace=True)

Encode missing responses ('No answer', 'Not applicable', "Don't know") in the years-of-schooling columns as the placeholder -9 so that they can be easily removed later in the analysis.

In [None]:
# Replace the non-response labels in each years-of-schooling column with the
# placeholder value -9.
non_response = ['No answer', 'Not applicable', "Don't know"]
for years_col in ['school_years', 'school_years_dad', 'school_years_mom']:
    education.loc[education[years_col].isin(non_response), years_col] = -9

In [None]:
education.head()

In [None]:
education = pd.get_dummies(data= education, columns=['degree', 'degree_dad', 'degree_mom'])

In [None]:
education = weight(education)

In [None]:
education.to_csv('data/cleaned_and_weighted/education.csv')

###### Employment

In [None]:
employment = unzip('Employment')
employment.columns

In [None]:
for col in employment.columns:
    print(employment[col].value_counts())
    print('\n')

In [None]:
employment.drop(['Is r likely to lose job', 'Could r find equally good job', 
                 'Job or housework', 'If rich, continue or stop working', 
                 'Workers need strong unions'], axis=1, inplace = True)

In [None]:
employment.rename(columns= {'Does r or spouse belong to union': 'union',
                            'Number of hours usually work a week': 'wrk_hrs',
                            'Labor force status': 'job_status',
                            'Rs occupational prestige score using threshold method (2010)': 'prestige'}, 
                  inplace = True)

In [None]:
# Reduce union membership to two labels; values outside these four original
# labels are left as they are.
not_member = employment['union'] == 'Neither belongs'
is_member = employment['union'].isin(
    ['R belongs', 'Spouse belongs', 'R and spouse belong'])
employment.loc[not_member, 'union'] = 'non-member'
employment.loc[is_member, 'union'] = 'member'

In [None]:
employment.head()

In [None]:
employment = weight(employment)

In [None]:
employment.to_csv('data/cleaned_and_weighted/employment.csv')

###### Family Values

In [None]:
fam_vals = unzip('Family_Values')
fam_vals.columns

In [None]:
for col in fam_vals.columns:
    print(fam_vals[col].value_counts())
    print('\n')

In [None]:
fam_vals.rename(columns= {'Better for man to work, woman tend home': 'trad_roles', 
                          'Preschool kids suffer if mother works': 'mom_wrk_kids_suffer', 
                          'Ideal number of children': 'bst_num_chld', 
                          'To help others': 'tch_to_help',
                          'To work hard': 'tch_to_wrk', 
                          'To think for ones self': 'tch_ind', 
                          'To be well liked or popular': 'tch_pop',
                          'To obey': 'tch_obey', 
                          'Number of children': 'num_chld',
                          'Number of family generations in household': 'num_gen'}, inplace=True)

In [None]:
fam_vals.drop(['Rs kids living standard compared to r', 'Rs living standard compared to parents', 
               'Highest year school completed, mother', 'Highest year school completed, father', 
               'Number of brothers and sisters', 'Favor spanking to discipline child'], axis=1, inplace=True)

In [None]:
# Turn the household-generation labels into the integer count they describe.
# Labels not listed below are left unchanged.
generation_labels = {
    1: ['1 gen'],
    2: ['2 gens, children', '2 gens, parents', '2 gens, grandchldrn'],
    3: ['3 gens, grandchldrn', '3 gens, chld, par'],
    4: ['4 gens'],
}
for n_gens, labels in generation_labels.items():
    fam_vals.loc[fam_vals['num_gen'].isin(labels), 'num_gen'] = n_gens

In [None]:
fam_vals.head()

In [None]:
fam_vals = weight(fam_vals)

In [None]:
fam_vals.to_csv('data/cleaned_and_weighted/fam_vals.csv')

###### Gender and Sexuality

In [None]:
gender = unzip('Gender_and_Sexuality')
gender.columns

In [None]:
for col in gender.columns:
    print(gender[col].value_counts())
    print('\n')

In [None]:
gender.drop(['Should hire and promote women                            ', 
             'For or against preferential hiring of women      ', 
             'Better for man to work, woman tend home', 'Preschool kids suffer if mother works',
             'Sex before marriage -- teens 14-16', 'Birth control to teenagers 14-16'], 
            axis = 1, inplace = True)

In [None]:
# Shorten the remaining column names. rename() returns a new frame unless
# inplace=True is passed; without it the renamed frame was only displayed and
# discarded, so the weighted/saved data kept the long names.
gender.rename(columns={'Sex before marriage': 'sex_before_marr',
                       'Divorce laws': 'divorce',
                       'Sex education in public schools': 'sex_ed',
                       'Women not suited for politics': 'no_women_pol',
                       'Homosexuals should have the right to marry': 'gay_marr'},
              inplace=True)

In [None]:
gender = weight(gender)
gender.to_csv('data/cleaned_and_weighted/gender.csv')

###### Identity

In [None]:
pers_id = unzip('Identity')
pers_id.columns

In [None]:
for col in pers_id.columns:
    print(pers_id[col].value_counts())
    print('\n')

In [None]:
pers_id.rename(columns={'Age of respondent':'age', 
                      'Respondents sex':'sex', 
                      'Race of respondent':'race',
                      'Region of interview':'region'}, inplace = True)

In [None]:
pers_id = weight(pers_id)

In [None]:
pers_id.to_csv('data/cleaned_and_weighted/pers_id.csv')

###### Lifestyle

In [None]:
lifestyle = unzip('Lifestyle')
lifestyle.columns

In [None]:
for col in lifestyle.columns:
    print(lifestyle[col].value_counts())
    print('\n')

In [None]:
lifestyle.rename(columns = {'Hours per day watching tv':'hrs_tv',
                            'How often does r read newspaper':'paper',
                            'Have gun in home':'gun',
                            'Is life exciting or dull':'life',
                            'General happiness':'happy',
                            "R's age when 1st child born":'age_at_frst_chld',
                            'Marital status':'married',
                            'Does r own or rent home?':'rent_or_own'}, inplace = True)

In [None]:
lifestyle.drop(['Does r or spouse hunt', 'Spend evening at bar', 
                'Spend evening with friends', 'Spend evening with neighbor', 
                'Spend evening with relatives', 'Age of respondent', 'Number of children',
                'Number of hours usually work a week'], axis=1, inplace=True)

In [None]:
lifestyle = weight(lifestyle)
lifestyle.to_csv('data/cleaned_and_weighted/lifestyle.csv')

###### Opinions

In [None]:
opinions = unzip('Opinions')
opinions.columns

In [None]:
for col in opinions.columns:
    print(opinions[col].value_counts())
    print('\n')

In [None]:
opinions = weight(opinions)

In [None]:
opinions.to_csv('data/cleaned_and_weighted/opinions.csv')

###### Race Relations

In [None]:
race = unzip('Race_Relations')
race.columns

In [None]:
for col in race.columns:
    print(race[col].value_counts())
    print('\n')

Because of the ways in which these variables were coded, many of them are either redundant with other questions, or difficult to gain meaningful information from.  

In [None]:
race.drop(['Hard working - lazy', 'Hard working - lazy.1', 
           'Rich - poor', 'Rich - poor.1', 'Blacks overcome prejudice without favors ',
           'Any opp. race in neighborhood', 'Favor law against racial intermarriage', 
           'Improving the conditions of blacks'], axis = 1, inplace = True)

In [None]:
# Relabel the five-point "Should govt aid blacks?" scale, and fold the
# non-responses into 'no opinion'. Values not listed are left unchanged.
aid_col = 'Should govt aid blacks?'
aid_labels = {
    'Govt help blks': 'strongly favor',
    2: 'favor',
    'Agree with both': 'no opinion',
    4: 'oppose',
    'No special treatment': 'strongly oppose',
    "Don't know": 'no opinion',
    'No answer': 'no opinion',
    # The unlabelled scale points may be stored as text rather than integers
    # (presumably when the column is loaded alongside the text labels -- TODO
    # confirm against unzip()); an integer-only comparison would then silently
    # leave them unrecoded, so both forms are accepted.
    '2': 'favor',
    '4': 'oppose',
}
race[aid_col] = race[aid_col].map(lambda answer: aid_labels.get(answer, answer))

In [None]:
race = weight(race)

In [None]:
race.to_csv('data/cleaned_and_weighted/race.csv')

###### Religion

In [None]:
religion = unzip('Religion')
religion.columns

In [None]:
for col in religion.columns:
    print(religion[col].value_counts())
    print('\n')

In [None]:
religion.drop(['Bible prayer in public schools'], axis = 1, inplace = True)

In [None]:
religion = weight(religion)

In [None]:
religion.to_csv('data/cleaned_and_weighted/religion.csv')

###### Socioeconomic Status

In [None]:
ses = unzip('SES')
ses.columns

In [None]:
for col in ses.columns:
    print(ses[col].value_counts())
    print('\n')

In [None]:
ses.drop(['Rs income in constant $', 'Standard of living of r will improve', 
          'Rs kids living standard compared to r', 'Rs living standard compared to parents', 
          'Change in financial situation', 'Rs occupational prestige score using threshold method (2010)'], 
         axis = 1, inplace = True)

In [None]:
ses = weight(ses)

In [None]:
ses.to_csv('data/cleaned_and_weighted/ses.csv')

###### Systems

In [2]:
systems = unzip('Systems')
systems.columns

Index(['year', 'Confidence in congress', 'Confidence in scientific community',
       'Confid. in united states supreme court', 'Confidence in television',
       'Confidence in medicine', 'Confidence in press',
       'Confidence in organized labor', 'Confid. in exec branch of fed govt',
       'Confidence in education', 'Confidence in organized religion',
       'Confidence in major companies',
       'Confid in banks & financial institutions', 'id',
       'Confidence in military'],
      dtype='object')

In [3]:
for col in systems.columns:
    print(systems[col].value_counts())
    print('\n')

2006    4510
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: year, dtype: int64


Only some         23881
Not applicable    20759
Hardly any        13640
A great deal       5185
Don't know         1210
No answer           139
Name: Confidence in congress, dtype: int64


Not applicable    20759
Only some         20308
A great deal      17514
Don't know         3104
Hardly any         2962
No answer           167
Name: Confidence in scientific community, dtype: int64


Only some         22238
Not applicable    20760
A great deal      13436
Hardly any         6503
Don't know         1740
No answer           137
Name: Confid. in united

In [5]:
systems.drop(['Confidence in medicine'], axis = 1, inplace = True)

In [None]:
systems = weight(systems)
systems.to_csv('data/cleaned_and_weighted/systems.csv')

###### Voter Identity

In [None]:
party = unzip('Voter_Identity')
party.columns

In [None]:
party = party[['year', 'Political party affiliation']]

In [None]:
party = weight(party)

In [None]:
party.to_csv('data/cleaned_and_weighted/party.csv')