# Data Collection and Formatting

In [1]:
import pandas as pd
import tarfile
from bs4 import BeautifulSoup
import requests
import re
import matplotlib.pyplot as plt
import seaborn as sns

%run -i "functions/unzip.py"
%run -i "functions/scrape_platforms.py"
%run -i "functions/scrape_SotU.py"

# Text Data
### Political Platforms

In [None]:
years = ['1972', '1976', '1980', '1984', '1988', '1992', '1996', 
         '2000', '2008', '2012', '2016']
platforms = scrape_platforms(years)
platforms.head()

### State of the Union Addresses

In [None]:
speeches = scrape_SotU()

### Debates

In [None]:
url = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
debate = soup.find_all('p')
for deb in debate:
    print(deb.text)

# Survey Data
### Convert Data to Dataframes
###### Weights
Based on the sample weights provided by the General Social Survey, it appears that some samples were significantly over and underrepresented. In order to account for this weights will be applied to this analysis. 

In [2]:
weights = unzip('Weights')

In [3]:
weights.head()

Unnamed: 0,year,id,Weight variable
0,1972,0,0.4446
1,1972,1,0.8893
2,1972,2,0.8893
3,1972,3,0.8893
4,1972,4,0.8893


In [4]:
weights['Weight variable'].describe()

count    64814.000000
mean         1.000015
std          0.468172
min          0.391825
25%          0.550100
50%          0.970900
75%          1.098500
max          8.739876
Name: Weight variable, dtype: float64

In [5]:
weight_t = weights.loc[weights.index.repeat(round(weights['Weight variable']*10, 0))].reset_index(inplace = True)
weight_t.head()

AttributeError: 'NoneType' object has no attribute 'head'

In [None]:
def weight(df):
    return df.loc[df.index.repeat(round(weights['Weight variable']*10, 0))].reset_index(inplace = True)

###### Abortion Opinions
To reduce dimensionality in the data, I will combine the reasons for abortion into three categories: health, economics, and right-to-choose. There will also be a category for abolishing legal abortion. 

In [None]:
abortion = unzip('Abortion')

In [None]:
abortion.info()

In [None]:
abortion.loc[(abortion['Strong chance of serious defect'] == 'Yes')|
             (abortion['Married--wants no more children'] == 'Yes')|
             (abortion["Woman's health seriously endangered"] == 'Yes')|
             (abortion["Low income--cant afford more children"] == 'Yes')|
             (abortion["Pregnant as result of rape"] == 'Yes')|
             (abortion['Not married']=='Yes'), 'abortion'] = 'conditional'

abortion.loc[(abortion['Strong chance of serious defect'] == 'Yes')&
             (abortion['Married--wants no more children'] == 'Yes')&
             (abortion["Woman's health seriously endangered"] == 'Yes')&
             (abortion["Low income--cant afford more children"] == 'Yes')&
             (abortion["Pregnant as result of rape"] == 'Yes')&
             (abortion['Not married']=='Yes'), 'abortion'] = 'unrestricted'

abortion.loc[abortion['Abortion if woman wants for any reason']== 'Yes', 'abortion']= 'unrestricted'
abortion['abortion'].fillna('never', inplace = True)

In [None]:
abortion.head()

In [None]:
abortion['abortion'].value_counts(dropna=False)

In [None]:
abortion = pd.get_dummies(data= abortion[['year', 'id', 'abortion']],columns=['abortion'])

In [None]:
abortion.head()

In [None]:
abortion = weight(abortion)

In [None]:
abortion.head()

###### Family/ SES Background

In [None]:
background = unzip('Background')

In [None]:
background.columns

In [None]:
for col in background.columns:
    print(background[col].value_counts())
    print('\n')

In [None]:
background.rename(columns = {"Father's occupational prestige score using threshold method (2010)": 'father_prestige',
                             "Mother's occupational prestige score using threshold method (2010)": 'mother_prestige',
                             'Religion in which raised': 'raised_relig', 
                             'How many grandparents born outside u.s.': 'immigrant_gpar', 
                             'Were rs parents born in this country': 'immigrant_par', 
                             'Was r born in this country': 'immigrant'}, inplace=True)

I will encode the immigration status of parents so that the number of grandparents and parents born outside of the country are counted. Parents born in the U.S will be coded as 0s. Because this data will likely not be used for modeling, I will keep not applicable values as they are. 

In [None]:
background.loc[(background['raised_relig']=='Not applicable')|
               (background['raised_relig']=='No answer')|
               (background['raised_relig']=="Don't know"), 'raised_relig']= 'Other'
background.loc[(background['immigrant_gpar']== 'All in u.s'), 'immigrant_gpar']= 0
background.loc[(background['immigrant_par']=='Both in u.s')|
               (background['immigrant_par']=='Mother; fa. dk')|
               (background['immigrant_par']=='Father; mo.dk'), 'immigrant_par']= 0
background.loc[(background['immigrant_par']=='Neither in u.s')|
               (background['immigrant_par']=='Not mother;fa.dk')|
               (background['immigrant_par']=='Not father;mo.dk'), 'immigrant_par']= 2
background.loc[(background['immigrant_par']=='Mother only')|
               (background['immigrant_par']=='Father only'), 'immigrant_par']= 1
background.loc[(background['immigrant']=='Yes'), 'immigrant']= 'no'
background.loc[(background['immigrant']=='No'), 'immigrant']= 'yes'
background.loc[(background['immigrant']=='Not applicable')|
               (background['immigrant']=='No answer')|
               (background['immigrant']=="Don't know"), 'immigrant']= 'n/a'

The 'Rs living standard compared to parents' is messy and highly incomplete, so I will drop it for this analysis. In addition, after consideration, parental education levels are more appropriately placed in the education table so they will also be dropped from this table.  

In [None]:
background.drop(['Rs living standard compared to parents', 
                 'Fathers highest degree', 'Mothers highest degree'], axis= 1, inplace= True)

In [None]:
background.head()

##### Criminal Justice Attitudes

In [None]:
crim_justice = unzip('Criminal_Justice')
crim_justice.columns

##### Education

In [None]:
education = unzip('Education')
education.columns

##### Employment

In [None]:
employment = unzip('Employment')
employment.columns

###### Family Values

In [None]:
fam_vals = unzip('Family_Values')
fam_vals.columns

###### Gender and Sexuality

In [None]:
gender = unzip('Gender_and_Sexuality')
gender.columns

###### Identity

In [None]:
idend = unzip('Identity')
idend.columns

###### Interests

In [None]:
interests = unzip('Interests')
interests.columns

###### Lifestyle

In [None]:
lifestyle = unzip('Lifestyle')
lifestyle.columns

###### Opinions

In [None]:
opinions = unzip('Opinions')
opinions.columns

###### Race Relations

In [None]:
race = unzip('Race_Relations')
race.columns

###### Religion

In [None]:
religion = unzip('Religion')
religion.columns

###### Socioeconomic Status

In [None]:
ses = unzip('SES')
ses.columns

###### Systems

In [None]:
systems = unzip('Systems')
systems.columns

###### Voter Identity

In [None]:
vote_id = unzip('Voter_Identity')
vote_id.columns