<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Presidential-Election-Data" data-toc-modified-id="Presidential-Election-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Presidential Election Data</a></span><ul class="toc-item"><li><span><a href="#Existing-files-online" data-toc-modified-id="Existing-files-online-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Existing files online</a></span></li><li><span><a href="#Web-Scraping" data-toc-modified-id="Web-Scraping-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Web Scraping</a></span></li></ul></li><li><span><a href="#Movie-list-(LGBT-&amp;-Feminism)" data-toc-modified-id="Movie-list-(LGBT-&amp;-Feminism)-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Movie list (LGBT &amp; Feminism)</a></span></li><li><span><a href="#Average-Movie-ratings" data-toc-modified-id="Average-Movie-ratings-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Average Movie ratings</a></span></li><li><span><a href="#Tweets" data-toc-modified-id="Tweets-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Tweets</a></span></li></ul></div>

In [33]:
# import all necessary packages
import tweepy
import pandas as pd
import requests as req
from lxml import etree
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from tqdm import tqdm_notebook
import csv

## Presidential Election Data
**Data set**: Presidential election data from 1996 to 2016. <br>
**Goal**: Identify conservative, liberal and swing states. <br><br>
**Data Source:** 
1. Existing csv/xls/xlw files online
2. Web Scraping 1996 election data online

### Existing files online
Online source: https://transition.fec.gov/pubrec/electionresults.shtml <br>
It contains data of 2000, 2004, 2008, 2012, 2016.

### Web Scraping
Web scraping source: https://transition.fec.gov/pubrec/fe1996/elecpop.htm <br>
It contains data from 1996.

In [35]:
# 1996 ELECTORAL AND POPULAR VOTE SUMMARY is in html format
def simple_get(url):
    """
    Attempts to get the content at 'url' by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                # r.text is the content of the response in unicode, 
                # and r.content is the content of the response in bytes.
                return resp.content
            else:
                return None
            
    except RequestException as e:
        log_error('Error during requests to {0}:{1}'.format(url, str(e)))
        return None

In [36]:
def is_good_response(resp):
    """
    Return True if the response seems to be HTML/HTM, Flase otherwise.
    """
    content_type = resp.headers['Content-Type']
    return (resp.status_code == 200 
            and content_type is not None
            and content_type.find('html') > -1)

In [37]:
def log_error(e):
    """
    Print log errors.
    """
    print(e)

In [6]:
# Download 1996 presidential ELECTORAL AND POPULAR VOTE 
url = 'https://transition.fec.gov/pubrec/fe1996/elecpop.htm'
response = simple_get(url)
if response is not None:
    htm = BeautifulSoup(response, 'html.parser')
    # cast to string
    para = str(htm.find_all('pre'))
    temp_content = para[para.find('>AL'):]
    table_content = temp_content[1:temp_content.find('<st')]

In [7]:
table_content_li = [x for x in table_content.split('\r\n')]

In [8]:
content = []
for row in table_content_li[:-1]:
    a = row.split('        ')
    if a[1] == '':
        a[1] = 'n'
    if a[2] == '':
        a[2] = 'n'
    content.append(a)

In [9]:
# process exception: different length of row
content[8]

['DC', '3', 'n', '     158,220       17,339', '3,611', '      185,726 ']

In [10]:
update = content[8][3] + ' ' + content[8][4]
content[8][3] = update

In [11]:
temp_row = content[8]
del temp_row[4]
temp_row

['DC', '3', 'n', '     158,220       17,339 3,611', '      185,726 ']

In [12]:
# replace exception with updated row
content[8] = temp_row

In [13]:
# convert list to numpy array then to dataframe
df96 = pd.DataFrame(content)

In [14]:
df96.columns = ['State', 'Clinton','Dole','Popular vote','Total Popular vote']

In [15]:
df96

Unnamed: 0,State,Clinton,Dole,Popular vote,Total Popular vote
0,AL,n,9,"662,165 769,044 92,149",1534349
1,AK,n,3,"80,380 122,746 26,333",241620
2,AZ,8,n,"653,288 622,073 112,072",1404405
3,AR,6,n,"475,171 325,416 69,884",884262
4,CA,54,n,"5,119,835 3,828,380 697,847",10019484
5,CO,n,8,"671,152 691,848 99,629",1510704
6,CT,8,n,"735,740 483,109 139,523",1392614
7,DE,3,n,"140,355 99,062 28,719",270845
8,DC,3,n,"158,220 17,339 3,611",185726
9,FL,25,n,"2,546,870 2,244,536 483,870",5303794


## Movie list (LGBT & Feminism)

In [28]:
# LGBT related movie
origin_page = req.get("https://en.wikipedia.org/wiki/List_of_LGBT-related_films")

soup = BeautifulSoup(origin_page.text, "html.parser")

movie_name1 = ''
for element in soup.find_all('a'):
    if element.get('title') is not None:
        movie_name1 += (str(element.string) + "***")

# Get movie names part only
chunks = movie_name1.split('edit***')
for chunk in chunks:
    if chunk.startswith('Z'):
        z_index = chunks.index(chunk)
    if chunk.startswith('$'):
        a_index = chunks.index(chunk)

movie_list1 = chunks[a_index : z_index+1]

# convert each movie into an element of a list
movie_names1 = []
for movie_chunk in movie_list1:
    movie_temp = movie_chunk.split('***')
    movie = movie_temp[:-1]
    movie_names1.extend(movie)
    
# a list of all lgbt movie names from Wiki page
print(movie_names1)



In [65]:
# Feminism related movies
origin_page = req.get("https://en.wikipedia.org/wiki/Category:Feminist_films")

soup = BeautifulSoup(origin_page.text, "html.parser")

movie_name2 = ''
for element in soup.find_all('a'):
    if element.get('title') is not None:
        movie_name2 += (str(element.string) + "***")

# Get movie names part only
chunks = movie_name2.split('***')
for chunk in chunks:
#     print(chunk)
    if chunk.startswith('Nor'):
        z_index = chunks.index(chunk)
    if chunk.startswith('5'):
        a_index = chunks.index(chunk)

movie_list2 = chunks[a_index : z_index+1]

# convert each movie into an element of a list
movie_names2 = []
for movie_chunk in movie_list2:
    if movie_chunk.endswith("film)"):
        movie_names2.append(movie_chunk[0:movie_chunk.index("(")-1])
    else:
        movie_names2.append(movie_chunk)
    
# a list of all lgbt movie names from Wiki page
print(movie_names2)

['5 Girls', '9 to 5', '10 Hours of Walking in NYC as a Woman', '10 Things I Hate About You', '20th Century Women', '22 Female Kottayam', '36 Vayadhinile', 'Aadavantha Deivam', 'Aandhi', 'Aaravalli', 'Aayiram Thalai Vaangi Apoorva Chinthamani', 'The Accused', 'Akka Thangai', 'Aletta Jacobs: Het Hoogste Streven', "Alice Doesn't Live Here Anymore", 'Alice in Wonderland', 'Aliens', 'All About My Mother', 'Anastasia', 'Anatomy of Hell', 'An Angel at My Table', 'Angry Indian Goddesses', 'Anthuleni Katha', "Antonia's Line", 'Arangetram', 'Archana IAS', 'Arth', 'Ask for Jane', 'Assassination Nation', 'The Associate', 'Astitva', 'Attack of the 50 Ft. Woman', 'Aval Appadithan', 'Aval Oru Thodar Kathai', 'Avargal', 'Bad Girls', 'Bad Moms', 'Bagdad Cafe', 'The Ballad of Josie', 'The Ballad of Little Jo', 'Bandit Queen', 'Barb Wire', 'Basic Instinct', 'Battle of the Sexes', 'Becoming Jane', 'Bed and Sofa', 'The Beguiled', 'Big Eyes', 'Bol', 'Born in Flames', 'Brave', 'Brimstone', 'Buddha Collapsed 

## Average Movie ratings

## Tweets
Using Twitter API to get people's reviews on movies.

In [2]:
# authorization


auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Create tweepy object for twitter API
api = tweepy.API(auth)

In [25]:
# Read from Twitter ID file
tweet_summary_map = {}
nameList = ['being julia', 'brokeback mountain']
for movie in nameList:
    with open("Tweet Data/Tweepy-API-xPath/"+movie+".txt", 'r') as f:
        x = f.read().splitlines()
    id_list = [line.split('/')[-1] for line in x]
    tweet_summary = pd.DataFrame(columns=['Timezone', 'Full Tweet', 'user_name', 'user_location', 'coordinates', 'country_code', 'place'])
    tweet_summary.index.name = 'Tweet Time'
    for id in tqdm_notebook(id_list):
        tweet_info = api.get_status(id, lang = 'en', tweet_mode='extended')
        if 'retweeted_status' in dir(tweet_info):
            tweet=tweet_info.retweeted_status.full_text
        else:
            tweet=tweet_info.full_text
        if tweet_info.place:
            place = tweet_info.place.full_name
            country_code = tweet_info.place.country_code
        else:
            place = None
            country_code = None

        tweet_summary.loc[tweet_info.created_at] = [tweet_info.user.time_zone, tweet, tweet_info.user.name, tweet_info.user.location, tweet_info.coordinates, country_code, place]
    tweet_summary_map[movie] = tweet_summary

HBox(children=(IntProgress(value=0, max=101), HTML(value='')))

TweepError: [{'code': 179, 'message': 'Sorry, you are not authorized to see this status.'}]