In [2]:
from pyspark import SparkContext

In [3]:
def createIndex(shapefile):
    """Read a zone file and build an R-tree over its geometries' bounding boxes.

    The file is loaded with geopandas and reprojected to EPSG:4326 before
    indexing. Each geometry is inserted under its 0-based position in
    ``zones.geometry``, so an id returned by the index can be used to look
    the zone up again.

    Args:
        shapefile: Path handed to ``geopandas.read_file``.

    Returns:
        A ``(index, zones)`` tuple: the populated ``rtree.Rtree`` and the
        reprojected GeoDataFrame.
    """
    import rtree
    import fiona.crs
    import geopandas as gpd

    target_crs = fiona.crs.from_epsg(4326)
    zones = gpd.read_file(shapefile).to_crs(target_crs)

    index = rtree.Rtree()
    all_bounds = (shape.bounds for shape in zones.geometry)
    for position, bounds in enumerate(all_bounds):
        index.insert(position, bounds)
    return (index, zones)

def findZone(p, index, zones):
    """Return the id of the first zone whose geometry contains point ``p``.

    The spatial index is queried with the degenerate box
    ``(p.x, p.y, p.x, p.y)`` to get candidate ids; each candidate is then
    confirmed with an exact ``contains`` test on ``zones.geometry``.

    Returns:
        The first confirmed candidate id, or ``None`` if no candidate
        contains the point.
    """
    query_box = (p.x, p.y, p.x, p.y)
    candidates = index.intersection(query_box)
    return next(
        (candidate for candidate in candidates
         if zones.geometry[candidate].contains(p)),
        None,
    )

In [4]:
def getDrugsFromFileToSet(fn1,fn2):
    """Load drug terms from two text files, one term per line.

    Each line is stripped of surrounding whitespace first. Blank lines are
    skipped so that the empty string never ends up in either set (an empty
    term would match almost any text downstream). A term that still contains
    a space after stripping is treated as a multi-word phrase; everything
    else is a single word.

    Args:
        fn1: Path to the first term file.
        fn2: Path to the second term file.

    Returns:
        A ``(words, phrases)`` tuple of sets holding the terms of both files.
    """
    w = set()
    phrase = set()
    for path in (fn1, fn2):
        with open(path) as reader:
            for row in reader:
                term = row.strip()
                if not term:
                    # Blank or whitespace-only line: not a term.
                    continue
                # Classify on the stripped text so trailing whitespace does
                # not turn a single word into a "phrase".
                if " " in term:
                    phrase.add(term)
                else:
                    w.add(term)
    return w,phrase
# def lineToBigrams(tweets):
#     import nltk
#     tweet_bigram = ()
#     tweet_token = ()
#     for tweet in tweets[0][3:]:
#         tweet_token+=tuple(tweet.split(" "))
#         tweet_bigram += tuple(nltk.bigrams(nltk.word_tokenize(tweet)))
#     tweet_bigram_refine = tuple(map(lambda x: " ".join(x).lower(),tweet_bigram))
#     tweet_token_refine = tuple(map(lambda x: x.lower(),tweet_token))
#     return (tweet_token_refine+tweet_bigram_refine)
# def testToBigram(tweets):
#     drug_tweet = 0
#     tweet_token = lineToBigrams(tweets)
#     drug_name_set = getDrugsFromFileToSet('drug_illegal.txt','drug_sched2.txt')
#     if(drug_name_set.intersection(set(tweet_token))):
#         drug_tweet = 1
#     return tweets[0][1:3],drug_tweet
# def filterDrugs(tweets):
#     import re
#     a,b = getDrugsFromFileToSet('drug_illegal.txt','drug_sched2.txt')
#     for tweet in tweets[2]:
#         for drug in b:
#             sets = re.findall(r' '+drug+' |^'+drug+' | '+drug+'$',tweet)
#             if sets:
#                 return True
#     return False
# Drug vocabularies keyed by their source files. They are loaded on first use
# so the term files are not re-read from disk for every record being filtered.
# Re-running this cell resets the cache.
_DRUG_SETS_CACHE = {}


def filterDrugs(tweets):
    """Return True if any text field of the record mentions a known drug.

    ``tweets[2]`` is iterated as a sequence of text fields. A field matches
    when one of its space-separated tokens is in the single-word drug set, or
    when a multi-word drug phrase occurs in it as a substring. Matching is
    case-sensitive and uses the terms exactly as stored in the term files.

    Args:
        tweets: Record whose element at index 2 is an iterable of strings.

    Returns:
        True on the first match, False if no field matches.
    """
    files = ('drug_illegal.txt', 'drug_sched2.txt')
    if files not in _DRUG_SETS_CACHE:
        _DRUG_SETS_CACHE[files] = getDrugsFromFileToSet(*files)
    words, phrases = _DRUG_SETS_CACHE[files]

    for tweet in tweets[2]:
        tokens = set(tweet.split(" "))
        # Consecutive or leading/trailing spaces produce empty tokens; those
        # must never count as a drug mention.
        tokens.discard("")
        if not words.isdisjoint(tokens):
            return True
        # An empty phrase is a substring of every string, so skip it.
        if any(drug and drug in tweet for drug in phrases):
            return True
    return False

In [5]:
def partition(tweetData):
    """Map each pipe-delimited tweet line to the tract that contains it.

    Intended for ``RDD.mapPartitions``: the tract index is built once per
    call and then reused for every line of the iterator.

    For each line, field 2 is parsed as the x coordinate and field 1 as the
    y coordinate (presumably longitude and latitude; confirm against the
    input format). Lines whose coordinates cannot be parsed, or that fall in
    no tract, are skipped.

    Args:
        tweetData: Iterable of ``|``-separated text lines.

    Yields:
        ``(plctract10, plctrpop10, fields)`` for each located tweet, where
        ``fields`` is the list of fields from index 5 onward.
    """
    import shapely.geometry as geom
    main_index, main_zone = createIndex('500cities_tracts.geojson')
    tract_ids = main_zone['plctract10']
    tract_pops = main_zone['plctrpop10']
    for tweet in tweetData:
        tweet_split = tweet.split('|')
        try:
            x = float(tweet_split[2])
            y = float(tweet_split[1])
        except (IndexError, ValueError):
            # Too few fields or non-numeric coordinates: skip this line
            # instead of reusing the tract found for an earlier tweet.
            continue
        try:
            city_index = findZone(geom.Point((x, y)), main_index, main_zone)
        except Exception:
            # Best effort: a geometry/index failure on one point must not
            # abort the whole partition.
            continue
        # Compare against None explicitly: index 0 is a valid tract.
        if city_index is None:
            continue
        yield (tract_ids[city_index], tract_pops[city_index], tweet_split[5:])

In [6]:
def mapper(data):
    """Turn a record into a ``((data[0], data[1]), 1)`` pair for counting.

    The first two elements of the record form the key; the value is always 1
    so that a later reduce can sum occurrences per key.
    """
    key = (data[0], data[1])
    return key, 1
def normalizer(data):
    """Convert a ``((tract, population), count)`` pair to ``(tract, rate)``.

    ``rate`` is ``count / population``. When the population is zero or
    missing the rate is undefined, so NaN is returned instead of raising
    ``ZeroDivisionError`` and failing the whole job.

    Args:
        data: ``((tract, population), count)`` as produced by reducing the
            output of ``mapper``.

    Returns:
        ``(tract, rate)`` where ``rate`` is a float (NaN if undefined).
    """
    tract = data[0][0]
    population = data[0][1]
    count = data[1]
    if not population:
        return tract, float('nan')
    # float() forces true division even under an interpreter where ``/`` on
    # two ints truncates.
    return tract, float(count) / population

In [7]:
if __name__=='__main__':
    # Reuse the context a PySpark kernel already provides, or create one when
    # running on a fresh kernel / as a script, so `sc` is always defined.
    sc = SparkContext.getOrCreate()
    rdd = sc.textFile('input.txt')
    # Locate tweets -> keep drug-related ones -> count per (tract, population)
    # -> divide the count by the tract population.
    rdd1 = (rdd.mapPartitions(partition)
               .filter(filterDrugs)
               .map(mapper)
               .reduceByKey(lambda x, y: x + y)
               .map(normalizer))
    # Sort by tract id and write the result as text files.
    rdd1.sortBy(lambda x: x[0]).saveAsTextFile("final05")


In [10]:
import fiona
import fiona.crs
import shapely
import rtree

import pandas as pd
import geopandas as gpd
# Load the tract polygons for interactive inspection, reprojected to EPSG:2263.
# NOTE(review): createIndex() above reprojects the same file to EPSG:4326;
# confirm that EPSG:2263 is really the intended CRS for this exploration.
neighborhoods = gpd.read_file('500cities_tracts.geojson').to_crs(fiona.crs.from_epsg(2263))

In [11]:
# Display the reprojected tract table (columns: plctract10, plctrpop10, geometry).
neighborhoods

Unnamed: 0,plctract10,plctrpop10,geometry
0,0107000-01073005910,4612,(POLYGON ((-2901259.247383744 -2109114.1476620...
1,0107000-01073010801,168,(POLYGON ((-2916943.229076727 -2144480.3017811...
2,0107000-01073012701,44,(POLYGON ((-2884835.522685619 -2145459.4804165...
3,0107000-01073012703,498,(POLYGON ((-2884278.553546395 -2163666.8747638...
4,0107000-01073012704,113,(POLYGON ((-2884976.178560413 -2164988.8798753...
5,0135896-01073012704,27,(POLYGON ((-2881277.422380826 -2167670.9543521...
6,0135896-01073012907,2696,(POLYGON ((-2935742.825081142 -2189570.4728454...
7,0135896-01073014409,3266,(POLYGON ((-2950089.16421522 -2191174.67947150...
8,0135896-01117030217,4878,(POLYGON ((-2885250.550574879 -2171205.3496426...
9,0135896-01117030303,2728,(POLYGON ((-2910024.639962874 -2178759.4991512...


In [15]:
# List the tracts whose plctrpop10 is 0; normalizer() divides by this value,
# so these rows are the ones with an undefined per-population rate.
neighborhoods.loc[neighborhoods['plctrpop10'] == 0]

Unnamed: 0,plctract10,plctrpop10,geometry
112,0107000-01073010302,0,(POLYGON ((-2979029.533029025 -2175916.8297865...
113,0150000-01097007700,0,(POLYGON ((-3486144.980344494 -3094954.5355395...
115,0177256-01125010404,0,(POLYGON ((-3185684.258056872 -2215306.2843363...
116,0477000-04019004112,0,(POLYGON ((-10228086.23919678 -533934.65594960...
125,0524550-05131010201,0,(POLYGON ((-5068944.739245834 -1081292.0859682...
324,0662000-06065040201,0,(POLYGON ((-11725719.99927591 959544.565645482...
325,0665000-06071007603,0,(POLYGON ((-11660629.30787544 973203.089993610...
327,0666000-06073020401,0,(POLYGON ((-11806042.52520934 614306.630368226...
328,0816000-08041005000,0,(POLYGON ((-7588354.178474437 1045708.22489064...
329,1239550-12011060208,0,(POLYGON ((-1124875.220016089 -5075941.2160069...
