# Features - Space

Joining tweets to the U.S. County polygons

---

**Helpful link on Postgres geographic manipulations:** [How-To: Manipulate Coordinates in Postgres](https://www.dataiku.com/learn/guide/other/geo/convert-coordinates-with-PostGIS.html)

**Caglar SQL cheatsheet:** 

    --List the counties where age groups 18_21 and 22_29 stand for more than 40% of the total population.
     SELECT C.name, C.state_name, (C.age_18_21 + C.age_22_29)* 100.0 / C.pop2010 as young
      FROM twitter.census_counties_2010 as C 
      WHERE (C.age_18_21 + C.age_22_29) * 100.0 / C.pop2010 > 60

    --How many tweets are there in Johnson County, Iowa? 
      SELECT Count(T.message)
      FROM twitter.tweet as T, twitter.census_counties_2010 as C
      WHERE ST_Within(T.geom, C.geom) and C.name = 'Johnson' and C.state_name = 'Iowa';

    --How many distinct twitter users are there in Johnson County, Iowa?
      SELECT count(distinct T.username)
      FROM twitter.tweet as T, twitter.census_counties_2010 as C
      WHERE ST_Within(T.geom, C.geom) and C.name = 'Johnson' and C.state_name = 'Iowa';

    --Display the name, state_name, count of distinct twitter users and pop2010 of the counties that have a median age (med_age) above 45. 
      SELECT C.name, C.state_name, count(distinct(T.username)),C.pop2010
      FROM twitter.tweet as T, twitter.census_counties_2010 as C
      WHERE ST_Within(T.geom, C.geom) and C.med_age > 45
      group by C.name, C.state_name, C.pop2010;

    --Display the county name, hashtag name and the number of times each hashtag was tweeted in the State of Iowa?
      SELECT T.hashtag1,C.name,Count(T.hashtag1)
      FROM twitter.tweet as T, twitter.census_counties_2010 as C
      WHERE ST_Within(T.geom, C.geom) and C.state_name = 'Iowa' and hashtag1 <> ''
      GROUP BY T.hashtag1, C.name
      order by Count(T.hashtag1) desc;


In [12]:
# Environment vars
import os
from dotenv import find_dotenv, load_dotenv

# Used in analysis
import pandas as pd
import psycopg2 as pg

from time import time
from multiprocessing import Pool

In [2]:
# Set up shop
load_dotenv(find_dotenv())
db_url = os.environ.get('DATABASE_URL')

conn = pg.connect(db_url)
curr = conn.cursor()

In [3]:
test_sql = 'SELECT "tweetID", longitude, latitude FROM filter_tweets LIMIT 10000;'
test_def = pd.io.sql.read_sql_query(test_sql, conn)

test_def.head()

Unnamed: 0,tweetID,longitude,latitude
0,826472736822325248,-76.61553,39.301903
1,826472737132670976,4.202226,51.059372
2,826472738319630341,-82.99527,39.96379
3,826472738399338497,-79.38579,43.65546
4,826472739028340740,-0.161341,51.612667


### Casting to Geographic Point

Lat/lng => geography

    > (ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)
    > ST_SetSRID(ST_Point(longitude,latitude),4326)

In [4]:
cast_sql = """
SELECT 
    "tweetID", 
    ST_SetSRID(ST_MakePoint(filter_tweets.longitude, filter_tweets.latitude), 4326) AS "gps" 
FROM 
    filter_tweets 
LIMIT 10000;
"""
cast_df = pd.io.sql.read_sql_query(cast_sql, conn)

cast_df.head()

Unnamed: 0,tweetID,gps
0,826478334561370114,0101000020E6100000E605D847A7CA52C07172BF4351F8...
1,826478334838239232,0101000020E6100000E90E62670AC952C05EF415A419FB...
2,826478335047798784,0101000020E61000004EE262FB7FFB0A405BD6581D94E3...
3,826478335060561920,0101000020E61000008282F7FAFF5DFABF09336DFFCAE6...
4,826478335362490370,0101000020E61000001DC9E53FA44A5DC0988922A46E5B...


In [5]:
cast_df.iloc[:10]

Unnamed: 0,tweetID,gps
0,826478334561370114,0101000020E6100000E605D847A7CA52C07172BF4351F8...
1,826478334838239232,0101000020E6100000E90E62670AC952C05EF415A419FB...
2,826478335047798784,0101000020E61000004EE262FB7FFB0A405BD6581D94E3...
3,826478335060561920,0101000020E61000008282F7FAFF5DFABF09336DFFCAE6...
4,826478335362490370,0101000020E61000001DC9E53FA44A5DC0988922A46E5B...
5,826478335362527237,0101000020E610000080A77FFF9C080B4028CC20E3F618...
6,826478335626616833,0101000020E6100000F0DC7BB8E41A5940A1BDFA78E80B...
7,826478336260009984,0101000020E61000003CA583F57FCE0A405BD6581D94E3...
8,826478336545222656,0101000020E6100000A5FCB50600B80A40309E4143FF3C...
9,826478336931160065,0101000020E6100000CB9C2E8B893158C0A165DD3F1666...


### Counties Selection

In [6]:
cnty_sql = 'SELECT name, countyfp, statefp, geoid FROM counties;'
cnty_df = pd.io.sql.read_sql_query(cnty_sql, conn)

cnty_df.head()

Unnamed: 0,name,countyfp,statefp,geoid
0,Cuming,39,31,31039
1,Wahkiakum,69,53,53069
2,De Baca,11,35,35011
3,Lancaster,109,31,31109
4,Nuckolls,129,31,31129


### Point in Polygon

`SELECT b.the_geom As bgeom, p.the_geom As pgeom, 
		ST_Intersection(b.the_geom, p.the_geom) As intersect_bp
	FROM buildings b INNER JOIN parcels p ON ST_Intersection(b,p)
	WHERE ST_Overlaps(b.the_geom, p.the_geom)
	LIMIT 1;`
    
    SELECT name
        FROM jacksonco_schools, medford_citylimits
        WHERE ST_Within(jacksonco_schools.the_geom, medford_citylimits.the_geom);
        
    SELECT a.id, b.id 
        FROM pointTableName a, polygonTableName b 
        WHERE ST_Intersects(a.myPointGeo, b.myPolygonGeo);

In [10]:
# Grabs all tweets, regardless of if they're matched or not
# FOR EVALUATING HOW SUCCESFUL/LOSSY FILTERING BY LOCATION IS
intersect_all_sql = """
SELECT 
    filter_tweets."tweetID",
    counties.geoid 
FROM
    filter_tweets 
LEFT OUTER JOIN
    counties 
ON
    ST_Intersects(
        ST_SetSRID(
            ST_MakePoint(filter_tweets.longitude, filter_tweets.latitude), 
            4326),
        counties.geom
    );
"""

# For selecting only the valid tweets
intersect_valid_sql = """
SELECT 
    T."tweetID",
    T.message,
    T."userID",
    T.date,
    C.geoid
FROM
    filter_tweets T,
    counties C
WHERE
    ST_Intersects( 
       ST_SetSRID( 
            ST_MakePoint(T.longitude, T.latitude), 
            4326), 
        C.geom
    ) 
LIMIT 1000;
"""

In [11]:
intersect_df[intersect_df.geoid.isnull() == False].head(20)

Unnamed: 0,tweetID,geoid
3,826484233627643904,36061
5,826484234261057537,4013
6,826484234801930242,17031
8,826484235393499136,20137
9,826484236383293440,36061
11,826484238207836162,11001
12,826484239247998976,26163
14,826484239592017925,1125
17,826484240753782785,6073
18,826484240896368640,17019


In [14]:
start = time()

In [15]:
start - time()

-4.59832501411438

In [18]:
start = time()

tw_all_df = pd.io.sql.read_sql_query(intersect_all_sql, conn)

elapsed = time() - start
print ('{} tweets geolocated in {} secs'.format(len(tw_all_df.index), elapsed))

8497484 tweets geolocated in 3000.6679813861847 secs


In [19]:
tw_all_df.to_csv('../geoids.csv')

In [22]:
# number of null-located tweets

not_located = len(tw_all_df[tw_all_df['geoid'].isnull()])
located = len(tw_all_df[tw_all_df['geoid'].isnull() == False])

print ('Total:\t\t{}'.format(len(tw_all_df)))
print ('Located:\t{}'.format(located))
print ('Not ^^:\t\t{}'.format(not_located))

Total:		8497484
Located:	2195819
Not ^^:		6301665


In [23]:
# close up
curr.close()
conn.close()