Search Data Processing
=========================



In [10]:
import pandas as pd
import re
from search_terms import *
from enchant.checker import SpellChecker
import enchant

# for paralellization
from functools import partial
from rosetta.parallel.parallel_easy import map_easy

In [114]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
#search_data = pd.read_csv("./data/sample_openData_searchTerms_clean.csv") # sample data only
all_search_data_df = pd.read_csv("./data/all_queries.csv")
all_search_data_df = all_search_data_df.rename(columns={"ga.searchKeyword": "search_term"})

In [3]:
all_search_data_df.shape

(116291, 9)

In [4]:
all_search_data_df.head()

Unnamed: 0,search_term,ga.searchStartPage,ga.searchAfterDestinationPage,ga.searchUniques,ga.avgSearchResultViews,ga.avgSearchDepth,ga.percentSearchRefinements,ga.searchDuration,ga.searchExitRate
0,crime,'/,'/data?search=crime,451,1.066519,0.0,0.0,62,0
1,business,'/,'/data?search=business,319,1.106583,0.0,0.0,5,0
2,311,'/,'/data?search=311,221,1.135747,0.0,0.0,7,0
3,crime,'/,'/browse/embed?Department-Metrics_Publishing-D...,200,1.07,4.54,12.149533,48646,0
4,streets,'/,'/data?search=streets,169,1.047337,0.0,0.0,3,0


In [5]:
search_terms_df = all_search_data_df[['search_term']]
search_terms_list = list(set(all_search_data_df['search_term']))

In [6]:
search_terms_df.head()

Unnamed: 0,search_term
0,crime
1,business
2,311
3,crime
4,streets


In [7]:
print search_terms_list[0:100]

['sea temperature', '"Basemap Street Centerlines', 'tree coverage', '631 folsom st', 'farmers market', 'vilations', 'active business locations', 'Lyft drivers', 'lot LINE', 'lot and block numbers', 'traffic calming', 'salla vaerma-jadlos', 'INCIDENTS', 'community based organizations', 'city-owned', 'technology staff', 'pdf parcel', 'energy performance', 'historical weather data', 'street tress', 'nature veature', 'car traffic', 'electricity', 'sexual assault', 'survey 2014', 'survey 2015', 'consulate', 'Casey', 'sffd', 'traffic control camera location', 'business San Francisco', 'massage establishments', 'muni stop', 'zipcar', 'Bike crime data', 'socrata', 'san francisco parking', 'Tank hill', 'San Francisco Bay region', 'Existing SF Commercial Wireless Facilities', 'lobbyist payments', 'lyft driver', 'illegal immigrant', 'same sex marriage', 'census block shapefile', 'solar panel cost', 'city council candidate', 'wind data', 'parking regulations', 'narcotic', '0309953', 'elevations', 

In [115]:
search_terms_df['processed_search_term'] = search_terms_df.search_term\
                                    .apply(lambda text: str(text).lower())\
                                    .apply(lambda text: text.decode('utf-8','ignore').encode("utf-8"))\
                                        

In [116]:
search_terms_df.head()

Unnamed: 0,search_term,processed_search_term,spelling_errors
0,crime,crime,0
1,business,business,0
2,311,311,0
3,crime,crime,0
4,streets,streets,0


## Each search term needs to be tagged in categories of quality

Many search terms indicate that users are not informed on what the purpose of the website is or how to properly use search.
### Search Tags
* #### Good Quality Search
    * complete words or phrases
    * minor typos
* #### Bad Quality
    * Addresses
    * Dates
    * Zipcodes or just a string of numbers
    * General nonsenses eg. ('></script><script>alert(1)</script>', '///', '16exc-3031') 

## Implementing Spell Checker

In [117]:
# Load Spellchecker objects
d = enchant.Dict('en_US')
chkr = SpellChecker('en_US')

def check_phrase(phrase):
    '''
    Uses enchant spellchecker to count the number of mispelled words in the phrase
    returns the number of errors found
    '''
    errors = 0
    chkr.set_text(phrase)
    for err in chkr:
        errors += 1
        
    return errors

In [118]:
search_terms_df['spelling_errors'] = search_terms_df.processed_search_term.apply(lambda x : check_phrase(x))

In [119]:
search_spelling_errors = search_terms_df.ix[search_terms_df.spelling_errors >0]

In [108]:
search_spelling_errors.head()

Unnamed: 0,index,search_term,processed_search_term,spelling_errors
0,16,shapefile,shapefile,1
1,38,inclusionary,inclusionary,1
2,64,shapefiles,shapefiles,1
3,67,gis,gis,1
4,84,sfpd,sfpd,1


In [113]:
search_spelling_errors_count = pd.DataFrame(search_spelling_errors.groupby(['processed_search_term','spelling_errors']).size())

search_spelling_errors_count = search_spelling_errors_count.reset_index()
search_spelling_errors_count.columns = ['processed_search_term','spelling_errors' 'count']
search_spelling_errors_count = search_spelling_errors_count.sort('count', ascending=False)

ValueError: Length mismatch: Expected axis has 3 elements, new values have 2 elements

In [112]:
search_spelling_errors_count.head()

Unnamed: 0,processed_search_term,spelling_errors,0
0,! cannot open c:\users\cafethai\appdata\local\...,12,2
1,!dfsd,1,2
2,""" ami"" 1990",1,2
3,"""326 santiago""",1,1
4,"""960 howard""",1,1


In [107]:
search_spelling_errors_count.ix[search_spelling_errors_count['count'] < 5]

Unnamed: 0,processed_search_term,count
56,.drg,4
3115,holland,4
1419,bus stop shapefile,4
1418,bus shapefile,4
4297,northrop,4
4298,nosie,4
3771,lung canser,4
5113,richard chiriboga,4
5316,san francisco city boundary,4
3110,hiv planning council,4


In [25]:
d.suggest('GIS')

['GUYS',
 'GEES',
 'IS',
 'GS',
 'GINS',
 'GIGS',
 'GIST',
 'SIS',
 'TIS',
 'DIS',
 'MIS',
 'PIS',
 'HIS',
 'BIS',
 'VIS']

#### For Testing of Regex

In [10]:
import re
dates_r = re.compile('[0-9]*[0-9]/[0-9]*[0-9]/[0-9]*[0-9]')
numbers_r = re.compile('^[0-9][0-9]*[0-9]$')
html_r = re.compile('^<.*>$')
address_r = re.compile('[0-9].*[0-9] [a-z].*[a-z]')

In [11]:
bool(address_r.match("1190 mission stree"))

True

In [12]:
processed_search_terms_list = list(search_terms_df['processed_search_term'])

In [13]:
print processed_search_terms_list[100:150]

['operate business', 'pipeline', 'trees', 'building footprint', 'fire incidents', 'land use', 'shapefile', 'shapefile', 'shapefiles', 'zoning districts', 'address', 'addresses', 'assessor', 'business', 'inventory', 'shapefile', 'street cleaning', 'trees', 'business license', 'census', 'demographics', 'inventory', 'lobbyist', 'neighborhood', 'parking', 'roads', 'school', 'zoning', 'building permits', 'citylots', 'contours', 'open business', 'parking', 'parks', 'permit', 'traffic', 'transit', 'water', '311', 'eviction', 'graffiti', 'human waste', 'meter', 'parcel', 'parking meter', 'restaurant', 'topography', 'weather', '497', 'crime']


In [26]:
search_terms_df['search_tag'] = search_terms_df.search_term.apply(lambda x: search_term_type(x))

In [59]:
search_terms_tagged = pd.read_csv("./processed_search_term_data/tagged_search_terms.csv")

In [65]:
search_terms_tagged.tail()

Unnamed: 0,search_term,processed_search_term,search_tag
114092,incidents,incidents,Search Term
114093,incidents,incidents,Search Term
114094,incidents,incidents,Search Term
114095,incidents,incidents,Search Term
114096,incidents,incidents,Search Term


In [60]:
#%time tokenized_queries_corrections = map_easy(search_term_type, processed_search_terms_list, -1 )

In [63]:
numbers = search_terms_tagged.ix[search_terms_tagged.search_tag == "Number"]

In [62]:
numbers.head()

Unnamed: 0,search_term,processed_search_term,search_tag


In [37]:
#pd.DataFrame(numbers['processed_search_term'].value_counts()).head()

In [55]:
dates = search_terms_tagged.ix[search_terms_tagged.search_tag == "Date"]

In [56]:
print list(set(dates['search_term']))

[]


In [40]:
links = search_terms_df.ix[search_terms_df.search_tag == 'Link']

In [41]:
print list(set(links['search_term']))

['https://data.sfgov.org/Economy-and-Community/Off-Sale-Alcohol-Outlets-San-Francisco-CA/fIy-9zhp', 'https://10.183.241.201/rgcertprintv2default.aspx', 'https://www.fiverr.com/kawsarhossain', 'Scores https://data.sfgov.org/Public-Health/Restaurant-Scores/stya-26eb', 'https', 'https://data.sfgov.org/Economy-and-Community/Registered-Business-Locations-San-Francisco/g8m3-pdis?', 'https://data.sfgov.org/data?category=', 'https://data.sfgov.org/d/ejmn-jyk6', 'httplinks', 'https://data.sfgov.org/Economy-and-Community/Registered-Business-Locations-San-Francisco/g8m3-pdis', 'https://www.kaggle.com/c/sf-crime', 'http://www.bsis.ca.gov/forms_pubs/fire_fact.shtml', 'https://data.sfgov.org/data?category=Transportation', 'http://www.datasf.org/story.php?title=street-sweeper-schedule-and-route-', ': https://data.sfgov.org/Public-Health/Excessive-Rent-Burden-San-Francisco-CA/9wty-qwgq', 'http://googlewebmastercentral.blogspot.com/2014/11/helping-users-find-mobile-friendly-pages.html', 'https://extxfe

In [45]:
addresses = search_terms_df.ix[search_terms_df.search_tag == 'Address']

In [46]:
print list(set(addresses['search_term']))

['501 beale', '555 california', '1480 larkin st', '56 moss st', '631 folsom st', '619 union street', '20 jones st', '425 market street energy', '1416 polk street', '311 cases by channe;', '2315 clement', '988 filbert', '944 treat av', '445 anderson', '1454 shafter', '2660 diamond', '407 jackson st', '450 golden gate', '199 new Montgomery st', '800 detroit', '311 cases by channel', '3143 folsom st', '33 allston way', '560 davis st san francisco', '2649 polk s', '63 woodland', '711 clayton st.', '1385 carroll ave', '60 agua way', '1235 bay', '111 powell street', '337 spurce st san francisco ca', '420 grove', '409 illinois', '1023 vallejo street', '1485 bayshore', '301 baltimore way', '855 folsom', '94 stratford drive', '560 brannan', '162 landers st', '100 year storm flood plain', '311 cases', '1798 grove street', '208 caselli ave', '1359 sacramento st', '3 rudden avenue sf ca', '2385 bush street', '62 linden avenue, san bruno, ca', '106 grand view', '1363 alabama street', '3247 reserve 

In [42]:
search_terms = search_terms_df.ix[search_terms_df.search_tag == 'Search Term']

In [15]:
print list(set(search_terms['search_term']))

NameError: name 'search_terms' is not defined

In [47]:
search_terms.to_csv("tagged_search_terms.csv", index=False)

In [25]:
search_terms = pd.read_csv('tagged_search_terms.csv')
search_terms['processed_search_term'] = search_terms['processed_search_term'].apply(lambda x: removePunctuation(x))

In [26]:
search_terms.head()

Unnamed: 0,search_term,processed_search_term,search_tag
0,crime,crime,Search Term
1,business,business,Search Term
2,311,311,Search Term
3,crime,crime,Search Term
4,streets,streets,Search Term


In [55]:
search_count = pd.DataFrame(search_terms.groupby(['processed_search_term'])['processed_search_term'].count())
#search_count.reset_index()
search_count.columns = [ 'count']
#search_count.shape

In [56]:
search_count = search_count.reset_index()
search_count = search_count.sort('count', ascending=False)

  from ipykernel import kernelapp as app


In [57]:
search_count.head()

Unnamed: 0,processed_search_term,count
4436,crime,1118
15258,streets,851
14293,shapefile,744
15030,street,697
10750,parking,693


In [72]:
common_searches = search_count.ix[search_count['count'] > 10]
common_searches.to_csv('common_searches.csv', index=False)

In [68]:
common_searches.head()

Unnamed: 0,processed_search_term,count
4436,crime,1118
15258,streets,851
14293,shapefile,744
15030,street,697
10750,parking,693


In [69]:
common_searches.shape

(1681, 2)

In [71]:
common_search_list = list(common_searches['processed_search_term'])

['crime', 'streets', 'shapefile', 'street', 'parking', 'census', 'bike', 'business', 'building', 'water', 'neighborhood', 'population', 'parks', '311', 'traffic', 'bicycle', 'muni', 'income', 'parcel', 'fire', 'housing', 'school', 'rent', 'bart', 'land use', 'zoning', 'bus', 'neighborhoods', 'gis', 'parcels', 'restaurant', 'homeless', 'food', 'elevation', 'buildings', 'san francisco', 'boundary', 'pipeline', 'roads', 'police', 'schools', 'park', 'address', 'transit', 'budget', 'assessor', 'tree', 'transportation', 'taxi', 'restaurants', 'sfmta', 'building footprints', 'property', 'city', 'trees', 'pedestrian', 'construction', 'health', 'businesses', 'shapefiles', 'bridge', 'education', 'block', 'business license', 'district', 'bay area', 'graffiti', 'street cleaning', 'building permits', 'building footprint', 'census tract', 'road', 'sfpd', 'permit', 'sewer', 'noise', 'blocks', 'car', 'map', 'hospital', 'districts', 'city boundary', 'airport', 'zip code', 'topography', 'business licens