# Points

Counting the tweets that fall inside each county polygon (a point-in-polygon spatial join). In this notebook I:

1. Read in tweets from database
2. Read in counties
3. Clean up the data (e.g. drop rows with missing geometry)
4. Spatially join
5. *Save an updated counties geojson file, and a county tweet counts csv*

In [1]:
# Libraries

%run utilities.py

import sqlite3

import geopandas as gpd
from matplotlib import pyplot as plt
from mpl_toolkits.basemap import Basemap
from matplotlib.collections import PatchCollection
from descartes import PolygonPatch
from shapely.geometry import Point, Polygon, MultiPolygon

%matplotlib inline

---

In [2]:
# Open the SQLite database that holds the tweets; `tweet_db` is not defined
# in this notebook's visible cells (presumably it comes from utilities.py).
conn = sqlite3.connect(database=tweet_db)

In [3]:
# Pull the tweet id and coordinates of every row in the Raw table.
tweets = pd.read_sql_query("select tweetID, longitude, latitude from Raw;", conn)

# Create a geometry column in our point dataset holding shapely Points for
# geopandas to use. The points are built by zipping the two coordinate
# columns: DataFrame.apply(axis=1) materialises a full Series per row, which
# is very slow on a large table. Shapely expects (x, y) == (longitude, latitude).
tweets['geometry'] = gpd.GeoSeries(
    [Point(lon, lat) for lon, lat in zip(tweets['longitude'], tweets['latitude'])],
    index=tweets.index,  # keep the geometries aligned with their source rows
)

tweets.head(2)

KeyboardInterrupt: 

In [None]:
### Maps

# Fill / outline colours used when drawing the base map.
land_color, water_color = '#DDDDDD', '#D2F5FF'
coastline_color, border_color = '#333333', '#999999'

# Map extent in metres (5000 km wide, 3500 km tall).
map_width_m = 5000 * 1000
map_height_m = 3500 * 1000

# Parameters for an Albers Conical Equal Area projection of the USA.
# The false easting / northing (x_0, y_0) are set to half the map extent.
albers_usa = dict(
    datum='NAD83',
    ellps='GRS80',
    proj='aea',
    lat_1=33,
    lat_2=45,
    lon_0=-97,
    lat_0=39,
    x_0=map_width_m/2,
    y_0=map_height_m/2,
    units='m',
)

In [None]:
# matplotlib
# Create the Basemap instance that every drawing call below operates on.
# This has to be a real statement: if the constructor call is wrapped in a
# string literal, `m` is never bound and the first draw call raises NameError.
m = Basemap(projection='merc',
            llcrnrlat=-80, urcrnrlat=80,
            llcrnrlon=-180, urcrnrlon=180,
            lat_ts=20, resolution='c')

# Draw the base layers using the colours defined in the map-settings cell.
m.drawcoastlines(color=coastline_color)
m.drawcountries(color=border_color)
m.fillcontinents(color=land_color, lake_color=water_color)
m.drawstates(color=border_color)
m.drawmapboundary(fill_color=water_color)

# TODO: the scatter below is still disabled. It reads `tweets['x']` and
# `pts_df['y']`, but no 'x'/'y' columns and no `pts_df` are created in the
# cells visible in this notebook, so it cannot run as written.
#m.scatter(x=tweets['x'], y=pts_df['y'], s=5, color='r', edgecolor='None', alpha=0.4, zorder=10)
#plt.show()

#### Spatial Join

In [None]:
# Grab the counties to join with the tweets, discarding any rows whose
# geometry is missing.
counties = gpd.read_file(counties_f).dropna(subset=['geometry'])

# Wrap the tweets table in a GeoDataFrame so geopandas can use its
# 'geometry' column.
tweets = gpd.GeoDataFrame(tweets)

In [None]:
# Plot the county geometries; the trailing ';' suppresses the text repr of
# the returned object so only the figure is shown.
counties.plot();

In [None]:
# CHECK THE PROJECTIONS!!!
# Both layers must share a coordinate reference system before they can be
# spatially joined, so print the two CRS values side by side.
crs_report = [
    'Tweet CRS: {}'.format(tweets.crs),
    'U.S. Counties CRS: {}'.format(counties.crs),
]
for line in crs_report:
    print(line)

In [None]:
# fix that: give the tweets the same CRS as the counties.
# NOTE(review): this assignment only *labels* the tweet geometries with the
# counties' CRS; it does not transform any coordinates. Presumably that is
# acceptable because the tweet points are raw longitude/latitude and the
# county layer is also in geographic (lon/lat) coordinates. Confirm by
# inspecting `counties.crs`; if it is a projected CRS the points would need
# to be reprojected instead.
tweets.crs = counties.crs

In [None]:
# Only the FIPS code and the polygon are needed from the county layer.
county_keys = counties[['FIPS', 'geometry']]

# Inner join: keep only the tweets that fall within some county, tagging
# each with that county's FIPS code. Per the original author's note,
# op='within' uses the rtree spatial index for a much faster operation
# (op='intersects' was the alternative considered).
# NOTE(review): newer geopandas releases rename `op` to `predicate`; confirm
# against the installed version before upgrading.
us_tweets = gpd.sjoin(tweets, county_keys, how="inner", op='within')

In [None]:
# Check the number of joined rows to make sure we didn't get too much.
us_tweets.shape[0]

# 536239

In [None]:
# Count the joined tweets per county: one row per FIPS code, most-tweeted
# county first (value_counts sorts by descending count).
county_tweets = (
    us_tweets['FIPS']
    .value_counts()
    .rename_axis('FIPS')
    .reset_index(name='tweets')
)

county_tweets.head()

In [None]:
# join county counts to our dataframe

print ('counties: ', len(counties))
print ('county tweet counts: ', len(county_tweets))

# Match the counts to the counties on the FIPS code. DataFrame.join would
# align on the row index instead, pairing county row i with the i-th entry
# of the count table rather than with its own count.
# Any `tweets` column left over from an earlier run of this cell is dropped
# first so that re-running does not create tweets_x / tweets_y columns.
# Counties with no joined tweets end up with NaN in `tweets` (left merge).
counties = (
    counties
    .drop(['tweets'], axis=1, errors='ignore')
    .merge(county_tweets, on='FIPS', how='left')
)

# A left merge against one-row-per-FIPS counts keeps the county row count.
print (len(counties))

**Write out the files**

In [None]:
# Write the county-tagged tweets out as GeoJSON.
# The driver is given explicitly: older geopandas releases default to
# 'ESRI Shapefile' regardless of the file extension, which would not produce
# a JSON file at this path.
# NOTE(review): some older fiona versions refuse to overwrite an existing
# GeoJSON file; delete the previous output first if the write fails.
us_tweets_f = '../../data/processed/web/county-tweets.json'
us_tweets.to_file(us_tweets_f, driver='GeoJSON')

In [None]:
# Save the per-county tweet counts as a CSV without the row index.
# NOTE(review): this path is under data/finals/ while the GeoJSON output
# goes to data/processed/. Confirm the two directories are intentional.
county_tweets_f = '../../data/finals/web/raw-county-tweets.csv'
county_tweets.to_csv(path_or_buf=county_tweets_f, index=False)