Search Data Processing
=========================



In [104]:
import pandas as pd
import re
import string
import pyspark

In [25]:
search_data = pd.read_csv("./data/sample_openData_searchTerms_clean.csv")

In [26]:
search_data.shape

(5000, 8)

In [27]:
search_data.tail()

Unnamed: 0,Search Term,Exit Page,Total Unique Searches,Results Pageviews / Search,% Search Exits,% Search Refinements,Time after Search,Average Search Depth
4995,address range,/Geographic-Locations-and-Boundaries/Streets-o...,2,1.5,0.00%,0.00%,0:02:09,8.0
4996,address shapefile,/browse/embed?category=&limit=20&limitTo=&q=ad...,2,1.5,0.00%,66.67%,0:00:18,3.0
4997,address to geographic location,/browse/embed?category=&limit=20&limitTo=&q=ad...,2,1.0,0.00%,0.00%,0:00:03,1.5
4998,address with resident names,/browse/embed?category=&limit=20&limitTo=&q=ad...,2,1.0,0.00%,0.00%,0:00:05,1.5
4999,addresse,/browse/embed?category=&limit=20&limitTo=&q=po...,2,1.0,0.00%,100.00%,0:00:03,1.5


In [28]:
#search_data[search_data['Total Unique Searches']<5]

In [29]:
search_terms = list(set(search_data['Search Term']))

In [30]:
len(search_terms)

2451

In [31]:
#print search_terms

In [32]:
search_data.ix[search_data["Search Term"] == "194415"]

Unnamed: 0,Search Term,Exit Page,Total Unique Searches,Results Pageviews / Search,% Search Exits,% Search Refinements,Time after Search,Average Search Depth
3519,194415,/Economy-and-Community/Registered-Business-Loc...,2,1,0.00%,0.00%,0:00:06,2.5


In [33]:
search_terms_data = search_data[["Search Term"]]
search_terms_data = search_terms_data.rename(columns={"Search Term": "search_term"})

In [34]:
search_terms_data.head()

Unnamed: 0,search_term
0,business
1,crime
2,crime
3,311
4,streets


In [35]:
search_terms_data['processed_data'] = search_terms_data.search_term\
                                        .apply(lambda text: text.decode('ascii' ,"ignore" ).decode('utf-8','ignore'))\
                                        .apply(lambda text: text.lower())

In [36]:
search_terms_list =  list(set(search_terms_data.processed_data))
#print search_terms_list

#### We need to generally tag searches in categories of quality

Many search terms indicate that users are not informed on what the purpose of the website is or how to properly use search

Search Tags
* Good Quality Search
    * complete words or phrases
    * minor typos
* Bad Quality
    * Addresses
    * Dates
    * Zipcodes or just a string of numbers
    * General nonsenses eg. ('></script><script>alert(1)</script>', '///', '16exc-3031') 

In [37]:
dates_r = re.compile('[0-9]/[0-9]/[0-9]')
numbers_r = re.compile('^[0-9][0-9]*[0-9]$')
html_r = re.compile('^<.*>$')


In [38]:
filter(dates_r.match, search_terms_list)

[u'9/1/01', u'3/1/01']

In [39]:
print filter(html_r.match, search_terms_list)

[u'</script><script>alert(1)</script>']


In [40]:
# removing punctuation

def removePunctuation(text):

    for c in string.punctuation:
        text = text.replace(c,"").strip().lower()
    return text

In [41]:
# iterative process

def text_processing(search):
    
    return [removePunctuation(i) for i in search]
        
        

In [42]:
search = text_processing(search_terms_list)
#search

In [85]:
regex = "\d{1,4}.?\d{0,4}\s[a-zA-Z|\d+]{2,30}\s[a-zA-Z]{2,15}"

f = [re.findall(regex, i) for i in search
     if re.findall(regex, i)!= [] 
     if re.findall(regex, i)[0][:3] != '311'
     ]

# http://regexlib.com/REDetails.aspx?regexp_id=430

In [86]:
len(f)

409

In [87]:
year = [str(j) for j in range(2000,2017)]

addresses = [i for i in f if i[0][:4] not in year ]
addresses

[[u'220 30th ave'],
 [u'1746 church street'],
 [u'560 davis st'],
 [u'573 andover street'],
 [u'301 baltimore way'],
 [u'3 rudden avenue'],
 [u'106 grand view'],
 [u'4610 mission street'],
 [u'123 retiro way'],
 [u'64 perine place'],
 [u'3067 25th street'],
 [u'469 union st'],
 [u'400 castro soul'],
 [u'1931 35th ave'],
 [u'530 brannan street'],
 [u'301 king st'],
 [u'600 van ness'],
 [u'240 2nd street'],
 [u'1050 sansome st'],
 [u'2945 baker st'],
 [u'337 spurce street'],
 [u'701 teresita san'],
 [u'258 9th st'],
 [u'1660 stockton at'],
 [u'1 tuscany alley'],
 [u'141 eddy street'],
 [u'665 pine st'],
 [u'600 fell st'],
 [u'67 silverview drive'],
 [u'525 29th st'],
 [u'453 masonic st'],
 [u'13 melrose ave'],
 [u'1230 market st'],
 [u'3d building models'],
 [u'106 powell street'],
 [u'709 geary st'],
 [u'278 states street'],
 [u'882 31st ave'],
 [u'430 lake st'],
 [u'41 genebern way'],
 [u'331 jersey st'],
 [u'177 stillman st'],
 [u'5 hudson ct'],
 [u'1952 larkin street'],
 [u'235 presi

In [46]:
# applying existing code to the full data

query = pd.read_csv("./data/all_queries.csv")

In [93]:
query.head()

Unnamed: 0,ga.searchKeyword,ga.searchStartPage,ga.searchAfterDestinationPage,ga.searchUniques,ga.avgSearchResultViews,ga.avgSearchDepth,ga.percentSearchRefinements,ga.searchDuration,ga.searchExitRate
0,crime,'/,'/data?search=crime,451,1.066519,0.0,0.0,62,0
1,business,'/,'/data?search=business,319,1.106583,0.0,0.0,5,0
2,311,'/,'/data?search=311,221,1.135747,0.0,0.0,7,0
3,crime,'/,'/browse/embed?Department-Metrics_Publishing-D...,200,1.07,4.54,12.149533,48646,0
4,streets,'/,'/data?search=streets,169,1.047337,0.0,0.0,3,0


In [78]:
# seeing the data

query_list =  list(query['ga.searchKeyword'].values)
query_list = [str(word).decode('ascii' ,"ignore" ).decode('utf-8','ignore') for word in query_list]

In [124]:
#query_list

In [90]:
# from collections import Counter
# Counter(query_list)

In [101]:
regex = "\d{1,4}.?\d{0,4}\s[a-zA-Z|\d+]{2,30}\s[a-zA-Z]{2,15}"

full = [re.findall(regex, i) for i in query_list
     if re.findall(regex, i)!= [] 
     if re.findall(regex, i)[0][:3] != '311'
     ]


In [102]:
len(full)

1845

In [105]:
# flatten full list 
full = sc.parallelize(full).flatMap(lambda row: row).collect()

In [106]:
full

[u'2010 census tracts',
 u'460 forms Schedule',
 u'1010 fitzgerald ave',
 u'1600 California street',
 u'1996-2013 city survey',
 u'2 hour parking',
 u'2 hour parking',
 u'2011 herrera campaign',
 u'2012 SFO Customer',
 u'2013 housing inventory',
 u'2598 Mission Street',
 u'405 howard st',
 u'5 thomas mellon',
 u'120 lake street',
 u'0 Beatrice Rd',
 u'19/2015 dashiell hammet',
 u'1 Embarcadero San',
 u'1 Hawkins Ln',
 u'1 Longview Court',
 u'1 Polk st',
 u'1 Tuscany Alley',
 u'1 hour parking',
 u'1 post st',
 u'1 scott street',
 u'1 south van',
 u'1 south vanness',
 u'1. BRIAN DUSSEAULT',
 u'10 gb dataset',
 u'10 glendale street',
 u'100 church street',
 u'100 delano ave',
 u'100 year flood',
 u'100 year flood',
 u'100 year storm',
 u'1000 Sutter Street',
 u'1000 brannan street',
 u'1000 howard st',
 u'1008 Santiago street',
 u'101 SAN ALESO',
 u'101 utah street',
 u'1023 vallejo street',
 u'1033 polk street',
 u'1040 howard st',
 u'1049 Market Street',
 u'1050 Sansome St',
 u'1051 de 

In [89]:
year_full = [str(j) for j in range(2000,2017)]

addresses_full = [i for i in full if i[0][:4] not in year ]
addresses_full

[[u'460 forms Schedule'],
 [u'1010 fitzgerald ave'],
 [u'1600 California street'],
 [u'1996-2013 city survey'],
 [u'2 hour parking'],
 [u'2 hour parking'],
 [u'2598 Mission Street'],
 [u'405 howard st'],
 [u'5 thomas mellon'],
 [u'120 lake street'],
 [u'0 Beatrice Rd'],
 [u'19/2015 dashiell hammet'],
 [u'1 Embarcadero San'],
 [u'1 Hawkins Ln'],
 [u'1 Longview Court'],
 [u'1 Polk st'],
 [u'1 Tuscany Alley'],
 [u'1 hour parking'],
 [u'1 post st'],
 [u'1 scott street'],
 [u'1 south van'],
 [u'1 south vanness'],
 [u'1. BRIAN DUSSEAULT'],
 [u'10 gb dataset'],
 [u'10 glendale street'],
 [u'100 church street'],
 [u'100 delano ave'],
 [u'100 year flood'],
 [u'100 year flood'],
 [u'100 year storm'],
 [u'1000 Sutter Street'],
 [u'1000 brannan street'],
 [u'1000 howard st'],
 [u'1008 Santiago street'],
 [u'101 SAN ALESO'],
 [u'101 utah street'],
 [u'1023 vallejo street'],
 [u'1033 polk street'],
 [u'1040 howard st'],
 [u'1049 Market Street'],
 [u'1050 Sansome St'],
 [u'1051 de haro'],
 [u'1051 gr

In [107]:
addresses_full = sc.parallelize(addresses_full).flatMap(lambda row: row).collect()

In [113]:
addresses_full

[u'460 forms Schedule',
 u'1010 fitzgerald ave',
 u'1600 California street',
 u'1996-2013 city survey',
 u'2 hour parking',
 u'2 hour parking',
 u'2598 Mission Street',
 u'405 howard st',
 u'5 thomas mellon',
 u'120 lake street',
 u'0 Beatrice Rd',
 u'19/2015 dashiell hammet',
 u'1 Embarcadero San',
 u'1 Hawkins Ln',
 u'1 Longview Court',
 u'1 Polk st',
 u'1 Tuscany Alley',
 u'1 hour parking',
 u'1 post st',
 u'1 scott street',
 u'1 south van',
 u'1 south vanness',
 u'1. BRIAN DUSSEAULT',
 u'10 gb dataset',
 u'10 glendale street',
 u'100 church street',
 u'100 delano ave',
 u'100 year flood',
 u'100 year flood',
 u'100 year storm',
 u'1000 Sutter Street',
 u'1000 brannan street',
 u'1000 howard st',
 u'1008 Santiago street',
 u'101 SAN ALESO',
 u'101 utah street',
 u'1023 vallejo street',
 u'1033 polk street',
 u'1040 howard st',
 u'1049 Market Street',
 u'1050 Sansome St',
 u'1051 de haro',
 u'1051 grant ave',
 u'1051 market st',
 u'106 Powell Street',
 u'106 grand view',
 u'1071 Alab

In [138]:
clean_list = sc.parallelize(query_list).filter(lambda word: word not in addresses_full).collect()

In [220]:
links = "((https?):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)"

https = [re.findall(links, i) for i in clean_list
     if re.findall(links, i)!= [] 
     if re.findall(links, i)!= "//" 
     ]
# http://stackoverflow.com/questions/6718633/python-regular-expression-again-match-url

In [221]:
# finding all the unique links that people have put into the search query

https = (sc.parallelize(https).flatMap(lambda word: word)
        .flatMap(lambda word: word)
        .filter(lambda word: word != '//')
        .filter(lambda word: word != 'https')
        .filter(lambda word: word != '''''')
        .collect())

In [222]:
https

[u'https://data.sfgov.org/Public-Health/Excessive-Rent-Burden-San-Francisco-CA/9wty-qwgq',
 u'https://data.sfgov.org/Public-Health/Restaurant-Scores/stya-26eb',
 u'http://googlewebmastercentral.blogspot.com/2014/11/helping-users-find-mobile-friendly-pages.html',
 u'http',
 u'http://www.bsis.ca.gov/forms_pubs/fire_fact.shtml',
 u'http',
 u'http://www.datasf.org/story.php?title=street-sweeper-schedule-and-route-',
 u'http',
 u'https://10.183.241.201/rgcertprintv2default.aspx',
 u'https://data.sfgov.org/Economy-and-Community/Off-Sale-Alcohol-Outlets-San-Francisco-CA/fIy-9zhp',
 u'https://data.sfgov.org/Economy-and-Community/Registered-Business-Locations-San-Francisco/g8m3-pdis',
 u'https://data.sfgov.org/Economy-and-Community/Registered-Business-Locations-San-Francisco/g8m3-pdis?',
 u'https://data.sfgov.org/d/ejmn-jyk6',
 u'https://data.sfgov.org/data?category=',
 u'https://data.sfgov.org/data?category=Transportation',
 u'https://extxfer.sfdph.org/food/',
 u'https://www.fiverr.com/kawsarh