Search Data Processing
=========================



In [347]:
import numpy as np
import pandas as pd
import pyspark
import re
import string

In [5]:
search_data = pd.read_csv("./data/sample_openData_searchTerms_clean.csv")

In [6]:
search_data.shape

(5000, 8)

In [7]:
search_data.tail()

Unnamed: 0,Search Term,Exit Page,Total Unique Searches,Results Pageviews / Search,% Search Exits,% Search Refinements,Time after Search,Average Search Depth
4995,address range,/Geographic-Locations-and-Boundaries/Streets-o...,2,1.5,0.00%,0.00%,0:02:09,8.0
4996,address shapefile,/browse/embed?category=&limit=20&limitTo=&q=ad...,2,1.5,0.00%,66.67%,0:00:18,3.0
4997,address to geographic location,/browse/embed?category=&limit=20&limitTo=&q=ad...,2,1.0,0.00%,0.00%,0:00:03,1.5
4998,address with resident names,/browse/embed?category=&limit=20&limitTo=&q=ad...,2,1.0,0.00%,0.00%,0:00:05,1.5
4999,addresse,/browse/embed?category=&limit=20&limitTo=&q=po...,2,1.0,0.00%,100.00%,0:00:03,1.5


In [8]:
#search_data[search_data['Total Unique Searches']<5]

In [9]:
search_terms = list(set(search_data['Search Term']))

In [10]:
len(search_terms)

2451

In [11]:
#print search_terms

In [12]:
search_data.ix[search_data["Search Term"] == "194415"]

Unnamed: 0,Search Term,Exit Page,Total Unique Searches,Results Pageviews / Search,% Search Exits,% Search Refinements,Time after Search,Average Search Depth
3519,194415,/Economy-and-Community/Registered-Business-Loc...,2,1,0.00%,0.00%,0:00:06,2.5


In [13]:
search_terms_data = search_data[["Search Term"]]
search_terms_data = search_terms_data.rename(columns={"Search Term": "search_term"})

In [14]:
search_terms_data.head()

Unnamed: 0,search_term
0,business
1,crime
2,crime
3,311
4,streets


In [15]:
search_terms_data['processed_data'] = search_terms_data.search_term\
                                        .apply(lambda text: text.decode('ascii' ,"ignore" ).decode('utf-8','ignore'))\
                                        .apply(lambda text: text.lower())

In [16]:
search_terms_list =  list(set(search_terms_data.processed_data))
#print search_terms_list

#### We need to generally tag searches in categories of quality

Many search terms indicate that users are not informed on what the purpose of the website is or how to properly use search

Search Tags
* Good Quality Search
    * complete words or phrases
    * minor typos
* Bad Quality
    * Addresses
    * Dates
    * Zipcodes or just a string of numbers
    * General nonsenses eg. ('></script><script>alert(1)</script>', '///', '16exc-3031') 

In [17]:
dates_r = re.compile('[0-9]/[0-9]/[0-9]')
numbers_r = re.compile('^[0-9][0-9]*[0-9]$')
html_r = re.compile('^<.*>$')


In [18]:
filter(dates_r.match, search_terms_list)

[u'9/1/01', u'3/1/01']

In [19]:
print filter(html_r.match, search_terms_list)

[u'</script><script>alert(1)</script>']


In [20]:
# removing punctuation

def removePunctuation(text):

    for c in string.punctuation:
        text = text.replace(c,"").strip().lower()
    return text

In [21]:
# iterative process

def text_processing(search):
    
    return [removePunctuation(i) for i in search]
        
        

In [22]:
search = text_processing(search_terms_list)
#search

In [254]:
regex = "\d{1,4}.?\d{0,4}\s[a-zA-Z|\d+]{2,30}\s[a-zA-Z]|\s[a-zA-Z]*"

f = [re.findall(regex, i) for i in search
     if re.findall(regex, i)!= [] 
     if re.findall(regex, i)[0][:3] != '311'
     ]

# http://regexlib.com/REDetails.aspx?regexp_id=430

In [255]:
year = [str(j) for j in range(2000,2017)]

addresses = [i for i in f if i[0][:4] not in year ]
#addresses

In [409]:
# applying existing code to the full data

query = pd.read_csv("./data/all_queries.csv")

In [410]:
query.head()

Unnamed: 0,ga.searchKeyword,ga.searchStartPage,ga.searchAfterDestinationPage,ga.searchUniques,ga.avgSearchResultViews,ga.avgSearchDepth,ga.percentSearchRefinements,ga.searchDuration,ga.searchExitRate
0,crime,'/,'/data?search=crime,451,1.066519,0.0,0.0,62,0
1,business,'/,'/data?search=business,319,1.106583,0.0,0.0,5,0
2,311,'/,'/data?search=311,221,1.135747,0.0,0.0,7,0
3,crime,'/,'/browse/embed?Department-Metrics_Publishing-D...,200,1.07,4.54,12.149533,48646,0
4,streets,'/,'/data?search=streets,169,1.047337,0.0,0.0,3,0


In [412]:
query.shape


(116339, 9)

In [413]:
# seeing the data

query_list =  list(query['ga.searchKeyword'].values)
query_list = [str(word).decode('ascii' ,"ignore" ).decode('utf-8','ignore') for word in query_list]

In [414]:
set_query = len(set(query_list))
set_query

20600

In [415]:
#adresses
regex = "\d{1,4}.?\d{0,4}\s[a-zA-Z|\d+|\W+]{2,30}\s[a-zA-Z]{2,15}"

full = [re.findall(regex, i) for i in query_list
     if re.findall(regex, i)!= [] 
     if re.findall(regex, i)[0][:3] != '311'
     ]


In [416]:
len(full)

1967

In [417]:
full

[[u'2010 census tracts'],
 [u'460 forms Schedule'],
 [u'1010 fitzgerald ave, san francisco, ca'],
 [u'1600 California street'],
 [u'1996-2013 city survey'],
 [u'2 hour parking'],
 [u'2 hour parking'],
 [u'2011 herrera campaign contributions'],
 [u'2012 SFO Customer Survey'],
 [u'2013 housing inventory'],
 [u'2598 Mission Street'],
 [u'405 howard st'],
 [u'5 thomas mellon circle'],
 [u'2000 Blocks (no water'],
 [u'2000 Blocks (no water'],
 [u'2000 Tracts (no water'],
 [u'2000 Tracts (no water'],
 [u'2000 Blocks (no water'],
 [u'2000 blocks (no water'],
 [u'2000 tracts (no water'],
 [u'2000 tracts (no water'],
 [u'120 lake street'],
 [u'0 Beatrice Rd, Brisbane, CA'],
 [u'19/2015 dashiell hammet'],
 [u'1 Embarcadero San Francisco'],
 [u'1 Hawkins Ln SF CA'],
 [u'1 Longview Court'],
 [u'1 Polk st'],
 [u'1 Tuscany Alley'],
 [u'1 hour parking'],
 [u'1 post st'],
 [u'1 scott street'],
 [u'1 south van ness ave'],
 [u'1 south vanness'],
 [u'1. BRIAN DUSSEAULT'],
 [u'10 gb dataset'],
 [u'10 glen

In [418]:
# flatten full list 
full = sc.parallelize(full).flatMap(lambda row: row).collect()

In [419]:
len(full)

1978

In [420]:
(696.0 / 18520.0) * 100

3.7580993520518358

In [421]:
year_full = [str(j) for j in range(2000,2017)]
# taking out the years

addresses_full = [i.lower() for i in full if i[:4] not in year ]
addresses_full = list(set(addresses_full))
addresses_full

[u'141 milton street san francisco, ca',
 u'220 30th ave',
 u'631 folsom st',
 u'939 jackson st',
 u'1480 larkin st',
 u'425 10th st',
 u'135 post street',
 u'400 duboce street',
 u'466 9th avenue',
 u'32 richland avenue',
 u'56 moss st',
 u'1. brian dusseault',
 u'619 union street',
 u'1700 newhall street',
 u'3067 25th street san francisco ca',
 u'20 jones st',
 u'425 market street energy',
 u'2201 market street',
 u'1416 polk street',
 u'944 treat av',
 u'3864 23rd street',
 u'1500 mc allister street san francisco',
 u'201 hale street',
 u'1407 egbert street san francisco',
 u'141 eddy street sf ca',
 u'5 thomas mellon circle',
 u'729 shotwell st',
 u'322 30th ave',
 u'200 mississippi street',
 u'521 clement street',
 u'3143 folsom st',
 u'33 allston way',
 u'5300 3rd street',
 u'1146a guerrero street, san francisco',
 u'601 van ness ave',
 u'1630 30th avenue',
 u'1385 carroll ave',
 u'3060 16th street building',
 u'60 agua way',
 u'111 powell street',
 u'4182 mission street',
 u'33

In [422]:
year_listing = [i.lower() for i in full if i[:4] in year ]
year_listing = list(set(year_listing))
year_listing

[u'2000 block group',
 u'2007 housing inventory',
 u'2010 census block for san francisco',
 u'2000 blocks (no water',
 u'2010 census tracts no water',
 u'2010: census blocks for san',
 u'2016 assessor map',
 u'2010 census population',
 u'2015 crime statistics',
 u'2014 energy benchmark 285 geary',
 u'2009 sfo survey',
 u'2015 enacted budget',
 u'2010 tract population',
 u'2015 housing inventory',
 u'2013 city survey report',
 u'2012 campaign finance',
 u'2005 pipeline report',
 u'2000 tracts no water',
 u'2014 city serve report',
 u'2016 budget publication',
 u'2000 census tract',
 u'2011 mayor campaign contributions',
 u'2000 block goup',
 u'2012 sfo customer survey',
 u'2015 richmond district',
 u'2010 demographic profile data, sf county',
 u'2015 in san francisco',
 u'2015 form 460; schedule',
 u'2014 map:crime incidents',
 u'2011 herrera campaign contributions',
 u'2015 residential development pipeline',
 u'2004 bicycle network',
 u'2010 vehicle ownership',
 u'2008 crime data',
 u'

In [423]:
clean_list = sc.parallelize(query_list).filter(lambda word: word.lower() not in addresses_full).collect()

In [424]:
links = "((https?|http):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)"

https = [re.findall(links, i) for i in clean_list
     if re.findall(links, i)!= [] 
     if re.findall(links, i)!= "//" 
     ]
# http://stackoverflow.com/questions/6718633/python-regular-expression-again-match-url

In [425]:
# finding all the unique links that people have put into the search query

https = (sc.parallelize(https).flatMap(lambda word: word)
        .flatMap(lambda word: word)
        .filter(lambda word: word != '//')
        .filter(lambda word: word != 'https')
        .filter(lambda word: word != '''''')
        .filter(lambda word: word != 'http')
        .collect())

In [426]:
https

[u'https://data.sfgov.org/Public-Health/Excessive-Rent-Burden-San-Francisco-CA/9wty-qwgq',
 u'https://data.sfgov.org/Public-Health/Restaurant-Scores/stya-26eb',
 u'http://googlewebmastercentral.blogspot.com/2014/11/helping-users-find-mobile-friendly-pages.html',
 u'http://www.bsis.ca.gov/forms_pubs/fire_fact.shtml',
 u'http://www.datasf.org/story.php?title=street-sweeper-schedule-and-route-',
 u'https://10.183.241.201/rgcertprintv2default.aspx',
 u'https://data.sfgov.org/Economy-and-Community/Off-Sale-Alcohol-Outlets-San-Francisco-CA/fIy-9zhp',
 u'https://data.sfgov.org/Economy-and-Community/Registered-Business-Locations-San-Francisco/g8m3-pdis',
 u'https://data.sfgov.org/Economy-and-Community/Registered-Business-Locations-San-Francisco/g8m3-pdis?',
 u'https://data.sfgov.org/d/ejmn-jyk6',
 u'https://data.sfgov.org/data?category=',
 u'https://data.sfgov.org/data?category=Transportation',
 u'https://extxfer.sfdph.org/food/',
 u'https://www.fiverr.com/kawsarhossain',
 u'https://www.kaggle

In [427]:
clean_list = (sc.parallelize(query_list).map(lambda word: word.lower()).filter(lambda word: word not in https)
              .filter(lambda word: word not in addresses_full)
              .collect())

In [430]:
from collections import Counter
word_count = Counter(clean_list)
word_count_sorted = sorted(word_count.items(),key = lambda x: x[1], reverse=True)

In [431]:
word_count_sorted = [i[0] for i in word_count_sorted if i[1]>2]
word_count_sorted

[u'crime',
 u'streets',
 u'shapefile',
 u'street',
 u'parking',
 u'census',
 u'bike',
 u'business',
 u'building',
 u'water',
 u'neighborhood',
 u'parks',
 u'population',
 u'311',
 u'traffic',
 u'bicycle',
 u'muni',
 u'parcel',
 u'income',
 u'fire',
 u'housing',
 u'school',
 u'rent',
 u'bart',
 u'zoning',
 u'land use',
 u'neighborhoods',
 u'bus',
 u'gis',
 u'parcels',
 u'homeless',
 u'elevation',
 u'food',
 u'restaurant',
 u'buildings',
 u'san francisco',
 u'boundary',
 u'pipeline',
 u'roads',
 u'police',
 u'schools',
 u'park',
 u'address',
 u'transit',
 u'budget',
 u'assessor',
 u'tree',
 u'transportation',
 u'taxi',
 u'sfmta',
 u'restaurants',
 u'building footprints',
 u'property',
 u'city',
 u'trees',
 u'pedestrian',
 u'construction',
 u'health',
 u'businesses',
 u'shapefiles',
 u'bridge',
 u'block',
 u'education',
 u'business license',
 u'bay area',
 u'district',
 u'graffiti',
 u'building permits',
 u'building footprint',
 u'street cleaning',
 u'road',
 u'sfpd',
 u'permit',
 u'censu

In [458]:
string_word = removePunctuation(re.sub("\u" , '', str(word_count_sorted)))
string_word1 = removePunctuation(re.sub("\u" , '', str(query_list)))

In [465]:
from polyglot.text import Text

NER = Text(string_word1)

NER = words.entities

for entity in NER:
    if entity.tag == "I-PER":
        print entity
    


[u'bildings']
[u'san']
[u'bernal']
[u'castro']
[u'bsiness', u'maher']
[u'san']
[u'bart']
[u'ellis']
[u'diana', u'lerner']
[u'ellis']
[u'bart']
[u'levis']
[u'ord']
[u'valle']
[u'steven', u'holland']
[u'consmer']
[u'cort']
[u'san', u'francisco']
[u'drgs']
[u'brigitte', u'wilson']
[u'jane', u'kim']
[u'aaron', u'peskin']
[u'bart']
[u'thomas', u'angelo', u'hetch']
[u'stadim']
[u'stitt']
[u'salesforcecom', u'maher']
[u'stosp']
[u'aaron', u'apperson']
[u'alain', u'gervais']
[u'mintes']
[u'soth', u'san']
[u'soltions', u'san']
[u'jeff', u'startps']
[u'sadi']
[u'jng', u'dccc']
[u'christensen']
[u'bilding', u'rajewen', u'haight']
[u'watson']
[u'lewis']
[u'francisco']
[u'bondary']
[u'noe']
[u'cort']
[u'san']
[u'jim', u'jones']
[u'andrew', u'rex']
[u'weiner']
[u'bart']
[u'sanborn']
[u'ariel', u'tandoori']
[u'kate', u'spasde', u'constrctionlist']
[u'hosing', u'condit']
[u'ca94118']
[u'daly']
[u'carman', u'tomarah', u'l']
[u'anda', u'bsinesse', u'david', u'jordan', u'sfchamp']
[u'apn']
[u'christopher

# important people:
- dennis herrera , city attorney
- aaron peskin , sf supervisor
- jane kim , SF District 6 Supervisor
- hilary ronen , SF District 9 supervisor
- raymond chow , san francisco gangster/felon (high possibility)
- Kenneth Malvar , City Sightseeing operator (high possibility)
http://www.sfgate.com/bayarea/article/Driver-blamed-for-tour-bus-crash-in-SF-s-Union-6994435.php


- Blake Rawdin , private MD(possibly)
- Antonin Scalia, Supreme Court Justice (possibly)
- charle, walton Co-Founder and Partner, Kindred Partners, LLC (possibly)
- Henry albert , private eye doctor(possibly)
- melanie lok , owner of mlok consulting private consulting (possibly)
- bateman, bateman group(possibly)
- David Owen , SF Attorney(possibly)
- Jonah Yee, Private Eye Doctor(possibly)
- Molly Seager, Private Therapist(possibly)
- anson mooney(arrested)
https://jailalert.com/arrest-records/anson-mooney-310655.html
- alain gervais, private owner salon

In [398]:
#not_caught = [ '3180 18th street' , '100 church', '17th street' , '800 university avenue, palo alto, california', '17 san andreas way, san francisco' , '2631 23rd']

In [399]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

In [400]:
vec=CountVectorizer(stop_words='english',analyzer='word')
X_train_counts = vec.fit_transform(word_count_sorted)
vocab = vec.get_feature_names()
nmf = NMF(n_components = 10, random_state = 1)
nmf.fit(X_train_counts)

NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=200,
  n_components=10, nls_max_iter=2000, random_state=1, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [401]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [402]:
print(print_top_words(nmf, clean_list, 50))

Topic #0:
gtfs imagery graffiti ocean pipeline realtor_neighborhoods employment shapefile movie parking meters elevation buildings sanfrancisco.gdb sewer city owned race parking dem companies dog business license school fte hospital BART murder homeless recidivism income calls for service land-use lidar current year road network bicycle parking traffic citylots muni home sales elevation sfpd fires shapefile street neighborhood business license trees json street cleaning contracts building
Topic #1:
noise bay area parking taxi parcels SFMTA Bikeway Network san francisco streets supervisor districts building data graffiti Gentrification parcels historic districts meters airport Fire senior centers population recidivism chapter 20 pedestrian high injury SF solar map GPS crash human rights affordable housing evictions Street parking city map parcels BAN grafitti neighborhood city survey budget zoning house census bicycle accidents population neighborhood crime crime median income charity s

In [403]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(word_count_sorted)
indices = np.argsort(vectorizer.idf_)
features = vectorizer.get_feature_names()
top_n = 100
top_features = [features[i] for i in indices[:top_n]] 

print top_features

[u'san', u'francisco', u'street', u'city', u'business', u'shapefile', u'data', u'of', u'census', u'building', u'map', u'parking', u'sf', u'and', u'bay', u'water', u'crime', u'housing', u'bike', u'area', u'district', u'public', u'traffic', u'property', u'block', u'in', u'streets', u'food', u'open', u'code', u'districts', u'campaign', u'population', u'income', u'use', u'tax', u'car', u'neighborhood', u'bicycle', u'2000', u'fire', u'by', u'buildings', u'health', u'gis', u'2015', u'community', u'transit', u'pipeline', u'planning', u'school', u'park', u'market', u'permit', u'land', u'parks', u'permits', u'number', u'bus', u'police', u'parcel', u'zip', u'muni', u'2010', u'for', u'zoning', u'center', u'460', u'sfpd', u'rent', u'finance', u'2014', u'neighborhoods', u'blocks', u'county', u'businesses', u'lines', u'commercial', u'ca', u'facilities', u'locations', u'space', u'service', u'construction', u'schedule', u'registered', u'no', u'rate', u'boundary', u'311', u'development', u'tree', u'inc

# Recommendations:

- spell corrector for mispelled words
- recommendations for words while typing of words of correct search keywords, fuzzy matching
- education for how to properly use the open data portal