In [1]:
import wikipedia
import pandas as pd
import numpy as np
from lxml import html
import requests
from bs4 import BeautifulSoup
import spacy
import en_core_web_sm
import arcgis
from arcgis.gis import GIS
from arcgis.mapping import WebMap
from arcgis.features import SpatialDataFrame
from arcgis.geocoding import geocode
from arcgis.features import GeoAccessor, GeoSeriesAccessor
from IPython.display import display
import arcpy

In [2]:
# Wikipedia "List of Western films" index pages to scrape; every URL shares one prefix
# and only the trailing period slug differs.
_WIKI_LIST_PREFIX = 'https://en.wikipedia.org/wiki/List_of_Western_films_'
_wiki_list_slugs = [
    'of_the_1920s',
    'of_the_1930s',
    'of_the_1940s',
    '1950-54',
    '1955-59',
    'of_the_1960s',
]
wiki_western_lists = [_WIKI_LIST_PREFIX + slug for slug in _wiki_list_slugs]

In [3]:
# Download each list page and collect the href and title of every film link on it.

links = []
titles = []
plots = []  # filled by the plot-retrieval cell further down
iteration = 0
for wiki_western_list in wiki_western_lists:
    # Bound to `response` (not `html`) so the `html` module imported from lxml
    # is not overwritten by a Response object.
    response = requests.get(wiki_western_list, timeout=30)
    # Fail loudly instead of silently parsing an HTTP error page.
    response.raise_for_status()

    # stick HTML into a BeautifulSoup object
    soup = BeautifulSoup(response.text, features="lxml")

    # movie titles/links were the only thing italicized with <i> tag, so we 'find_all' on 'i'
    for italic in soup.find_all(name='i'):

        # every anchor inside the italic tag that carries an href
        for link in italic.find_all('a', href=True):
            # An anchor without a title attribute cannot be looked up later;
            # skip it so links and titles always grow together.
            title = link.get('title')
            if title is None:
                continue

            links.append(link['href'])
            titles.append(title)
            iteration += 1
            if iteration % 250 == 0:
                print(str(iteration) + ' titles appended')

# Not all these movies have wiki articles. Missing articles are marked by "redlink"
# in the href and "(page does not exist)" in the title. Filter link/title PAIRS in a
# single pass so the two lists cannot drift out of alignment.
existing_pairs = [
    (href, title)
    for href, title in zip(links, titles)
    if 'redlink' not in href and '(page does not exist)' not in title
]
links_existing = [href for href, _ in existing_pairs]
titles_existing = [title for _, title in existing_pairs]

# the scraped hrefs are site-relative, so prepend the host to get full URLs
wiki_western_links = ['https://en.wikipedia.org' + href for href in links_existing]
wiki_western_titles = titles_existing

250 titles appended
500 titles appended
750 titles appended
1000 titles appended
1250 titles appended
1500 titles appended
1750 titles appended
2000 titles appended
2250 titles appended


In [4]:
# Report how many links and titles survived the filtering step.
link_count = len(wiki_western_links)
title_count = len(wiki_western_titles)
print(f'Number of Western Links: {link_count}')
print(f'Number of Western Titles: {title_count}')

Number of Western Links: 1995
Number of Western Titles: 1995


In [5]:
# Pair each film title with its article URL; the trailing expression displays the pair count.
wiki_western_pages = [
    (title, url)
    for title, url in zip(wiki_western_titles, wiki_western_links)
]
len(wiki_western_pages)

1995

In [6]:
# Section headings to try when looking for a film's plot text. Wikipedia heading
# names are loosely standardized, so several variants are tried, each one also
# with an 'Edit' suffix appended.
# (there is a good article about this on Medium.com by This Time Is Different)
possibles = ['Plot', 'Synopsis', 'Plot synopsis', 'Plot summary',
             'Story', 'Plotline', 'The Beginning', 'Summary',
             'Content', 'Premise']
possibles_edit = [heading + 'Edit' for heading in possibles]
all_possibles = possibles + possibles_edit

# Start from an empty list so re-running this cell does not append duplicates.
plots = []

# go ask wikipedia for page info using the titles we put together
iteration = 0
exception = 0
for t in wiki_western_pages:
    # count one page per pass and report every 250 pages
    iteration += 1
    if iteration % 250 == 0:
        print(str(iteration) + ' plots appended')

    try:
        # The page lookup itself can raise (missing page, disambiguation, network
        # error), so it lives inside the try: one bad title must not stop the run.
        wik = wikipedia.WikipediaPage(t[0])

        # is there plot info of some sort?
        for p in all_possibles:
            section_text = wik.section(p)
            if section_text:
                # Join paragraphs with a space rather than gluing the last word of
                # one paragraph to the first word of the next, then strip apostrophes.
                plot = section_text.replace('\n', ' ').replace("\'", "")
                plots.append(plot)

    # Best effort: skip pages that cannot be fetched or parsed. `Exception` (not a
    # bare except) keeps Ctrl-C / kernel interrupt working.
    except Exception:
        exception += 1
        if exception % 250 == 0:
            print(str(exception) + ' exceptions skipped')

250 plots appended
500 plots appended
750 plots appended
1000 plots appended
1250 plots appended
1500 plots appended
1750 plots appended


In [7]:
# Feed every plot through spaCy and keep the text of each entity labelled 'GPE'
# (geopolitical entity).
# The en_core_web_sm model must be installed first: https://spacy.io/models/en
nlp = en_core_web_sm.load()
gpe = []
iteration = 0
appendnum = 0
for plot in plots:
    for ent in nlp(plot).ents:
        # guard clause: ignore everything that is not a geopolitical entity
        if ent.label_ != 'GPE':
            continue
        gpe.append(ent.text)
        appendnum += 1
        if not appendnum % 250:
            print(f'{appendnum} entities appended')

250 entities appended
500 entities appended
750 entities appended
1000 entities appended
1250 entities appended
1500 entities appended
1750 entities appended
2000 entities appended
2250 entities appended
2500 entities appended
2750 entities appended
3000 entities appended
3250 entities appended
3500 entities appended
3750 entities appended


In [8]:
# Clean up the entity list by querying wikipedia: an entity is kept only when its
# wiki summary contains a city-like word AND the name of a western state.
addresses = []
valid_gpe_list = ['city', 'municipality', 'capital', 'town', 'village', 'census']
# continental USA west of the mississippi
valid_state_list = ['Arizona','California','Colorado','Idaho','Montana','Nevada','New Mexico','Oregon',
                    'Utah','Washington','Wyoming','North Dakota','South Dakota','Nebraska','Kansas',
                    'Oklahoma','Texas','Minnesota','Iowa','Missouri','Arkansas','Louisiana']
dice = 0      # entities accepted
no_dice = 0   # entities rejected
lost = 0      # entities whose summary lookup failed

# The same place name occurs many times in `gpe`; remember the verdict per distinct
# name so each one costs a single wikipedia request.
# Values: True = accept, False = reject, None = lookup failed.
verdicts = {}

for e in gpe:
    if e not in verdicts:
        try:
            summary = str(wikipedia.summary(e))
        # Best effort: any lookup failure (no page, disambiguation, network) marks
        # the entity as lost. `Exception` rather than a bare except so that a
        # keyboard interrupt still stops the loop.
        except Exception:
            verdicts[e] = None
        else:
            verdicts[e] = (any(v in summary for v in valid_gpe_list)
                           and any(s in summary for s in valid_state_list))

    verdict = verdicts[e]
    if verdict is None:
        lost += 1
        if lost % 250 == 0:
            print(str(lost) + ' entities not in wiki')
    elif verdict:
        # one address per occurrence, so frequently mentioned places weigh more
        addresses.append(e + ', United States')
        dice += 1
        if dice % 250 == 0:
            print(str(dice) + ' entities accepted')
    else:
        # single rejection path, so the progress message covers every rejection
        no_dice += 1
        if no_dice % 250 == 0:
            print(str(no_dice) + ' entities rejected')



  lis = BeautifulSoup(html).find_all('li')


250 entities not in wiki
250 entities rejected
500 entities accepted
750 entities accepted
500 entities not in wiki
500 entities rejected
750 entities not in wiki
1250 entities accepted
750 entities rejected
1500 entities accepted
1000 entities not in wiki
1750 entities accepted


In [9]:
# Geocode every accepted address and collect the address plus its x/y coordinates
# into one dataframe.
gis = GIS()  # anonymous connection; presumably what geocode() uses as the active GIS

# Rows are gathered in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row is quadratic.
records = []
iteration = 0
geocode_pass = 0
for address in addresses:
    try:
        # first candidate returned by the geocoder; IndexError when there is none
        location = geocode(address)[0]
        attributes = location['attributes']
        record = {
            # NOTE: [:-5] leaves '<name>, United S'. The truncated form is kept on
            # purpose because the drop_list cell further down matches exactly it.
            'address': address[:-5],
            'x_column': attributes['DisplayX'],
            'y_column': attributes['DisplayY'],
        }
    # Best effort: skip addresses that cannot be geocoded. `Exception` rather than a
    # bare except so that a keyboard interrupt still stops the loop.
    except Exception:
        geocode_pass += 1
        if geocode_pass % 250 == 0:
            print(str(geocode_pass) + ' geocodes passed')
        continue

    records.append(record)
    iteration += 1
    if iteration % 250 == 0:
        print(str(iteration) + ' addresses geocoded')

# Explicit columns keep the frame well-formed even when nothing was geocoded.
df = pd.DataFrame(records, columns=['address', 'x_column', 'y_column'])

250 addresses geocoded
500 addresses geocoded
750 addresses geocoded
1000 addresses geocoded


In [106]:
# Turn the plain dataframe into a spatially enabled one, building point geometry
# from the x/y columns with spatial reference 4326. GeoAccessor is the class that
# backs the `.spatial` accessor, so it is called directly here.
wsdf = GeoAccessor.from_xy(df, 'x_column', 'y_column', sr=4326)

In [118]:
# Country-level matches spotted while inspecting the plotted points. The address
# strings end in a truncated ', United S', hence the odd-looking entries.
drop_list = [
    'US, United S',
    'United S',
    'U.S., United S',
    'America, United S',
    'the United States, United S',
    'United States, United S',
    'United States of America, United S',
]
is_country_level = wsdf['address'].isin(drop_list)
wsdf = wsdf[~is_country_level]
# work on an independent copy for mapping
mdf = wsdf.copy()

In [119]:
# Display the centroid of the remaining points as reported by the spatial accessor;
# the displayed tuple is presumably (x, y) — confirm against the arcgis documentation.
mdf.spatial.centroid

(-106.2693241648988, 36.35037924014938)

In [120]:
# Create a map widget for 'United States' from a fresh GIS connection made with no
# credentials; the trailing expression renders the widget in the notebook.
anonymous_gis = GIS()
m1 = anonymous_gis.map('United States')
m1

MapView(layout=Layout(height='400px', width='100%'))

In [121]:
# Frame the map (zoom first, then center; [39, -98] is presumably latitude, longitude),
# then overlay the points as a half-transparent heatmap layer.
m1.zoom, m1.center = 4, [39, -98]
heatmap_options = {"renderer": "HeatmapRenderer", "opacity": 0.5}
layer = mdf.spatial.to_feature_collection()
m1.add_layer(layer, heatmap_options)

In [122]:
# Draw the individual points on the same map widget; the styling options are
# collected in one dict and unpacked into the call.
point_style = dict(
    symbol_type='simple',
    symbol_style='s',
    cmap='Blues_r',
    cstep=35,
    outline_color='binary',
    marker_size=5,
    line_width=.5,
)
mdf.spatial.plot(map_widget=m1, **point_style)

True