Search Data Processing
=========================



In [20]:
import sys
sys.path.insert(0, 'helper_modules/')

In [21]:
import pandas as pd
import re
from search_terms import *
from enchant.checker import SpellChecker
import enchant
from spell_corrector import *

# for parallelization
from functools import partial
from rosetta.parallel.parallel_easy import map_easy

import warnings
warnings.filterwarnings('ignore')

## Import Search Data

In [4]:
# Read the full query export and give the two columns used below shorter names.
#search_data = pd.read_csv("./data/sample_openData_searchTerms_clean.csv") # sample data only
column_renames = {
    "ga.searchKeyword": "search_term",
    "ga.searchUniques": "num_unique_searches",
}
all_search_data_df = pd.read_csv("./data/all_queries.csv").rename(columns=column_renames)

In [5]:
all_search_data_df.shape

(116291, 9)

In [6]:
all_search_data_df.head()

Unnamed: 0,search_term,ga.searchStartPage,ga.searchAfterDestinationPage,num_unique_searches,ga.avgSearchResultViews,ga.avgSearchDepth,ga.percentSearchRefinements,ga.searchDuration,ga.searchExitRate
0,crime,'/,'/data?search=crime,451,1.066519,0.0,0.0,62,0
1,business,'/,'/data?search=business,319,1.106583,0.0,0.0,5,0
2,311,'/,'/data?search=311,221,1.135747,0.0,0.0,7,0
3,crime,'/,'/browse/embed?Department-Metrics_Publishing-D...,200,1.07,4.54,12.149533,48646,0
4,streets,'/,'/data?search=streets,169,1.047337,0.0,0.0,3,0


In [7]:
# Keep only the two columns needed downstream. The explicit `.copy()` makes
# this an independent frame, so the later column assignments on
# `search_terms_df` are unambiguous and do not trigger pandas'
# SettingWithCopyWarning.
search_terms_df = all_search_data_df[['search_term', 'num_unique_searches']].copy()
# Distinct raw search terms; the order is arbitrary because it goes through a set.
search_terms_list = list(set(all_search_data_df['search_term']))

In [8]:
search_terms_df.head()

Unnamed: 0,search_term,num_unique_searches
0,crime,451
1,business,319
2,311,221
3,crime,200
4,streets,169


In [9]:
print search_terms_list[0:50]

['sea temperature', '"Basemap Street Centerlines', 'tree coverage', '631 folsom st', 'farmers market', 'vilations', 'active business locations', 'Lyft drivers', 'lot LINE', 'lot and block numbers', 'traffic calming', 'salla vaerma-jadlos', 'INCIDENTS', 'community based organizations', 'city-owned', 'technology staff', 'pdf parcel', 'energy performance', 'historical weather data', 'street tress', 'nature veature', 'car traffic', 'electricity', 'sexual assault', 'survey 2014', 'survey 2015', 'consulate', 'Casey', 'sffd', 'traffic control camera location', 'business San Francisco', 'massage establishments', 'muni stop', 'zipcar', 'Bike crime data', 'socrata', 'san francisco parking', 'Tank hill', 'San Francisco Bay region', 'Existing SF Commercial Wireless Facilities', 'lobbyist payments', 'lyft driver', 'illegal immigrant', 'same sex marriage', 'census block shapefile', 'solar panel cost', 'city council candidate', 'wind data', 'parking regulations', 'narcotic']


## Light Processing of Search Terms

In [10]:
def _normalize_search_term(text):
    """Lower-case and trim a raw term, drop undecodable bytes, strip edge quotes."""
    cleaned = str(text).lower().strip()
    # Round-trip through UTF-8, ignoring bytes that cannot be decoded.
    cleaned = cleaned.decode('utf-8', 'ignore').encode("utf-8")
    # remove leading/trailing quotes
    return re.sub(r'^"|"$', '', cleaned)


search_terms_df['processed_search_term'] = search_terms_df.search_term.apply(_normalize_search_term)
                                        

In [11]:
search_terms_df.head()

Unnamed: 0,search_term,num_unique_searches,processed_search_term
0,crime,451,crime
1,business,319,business
2,311,221,311
3,crime,200,crime
4,streets,169,streets


In [12]:
# Sum the unique-search counts per normalized term, most searched first.
term_groups = search_terms_df.groupby(by='processed_search_term')[['num_unique_searches']]
search_terms_totals = (
    term_groups.sum()
               .sort_values('num_unique_searches', ascending=False)
               .reset_index()
)

In [31]:
search_terms_totals.shape

(18370, 3)

## Most popular search terms

In [13]:
search_terms_totals.head(20)

Unnamed: 0,processed_search_term,num_unique_searches
0,crime,2031
1,parking,862
2,business,846
3,streets,780
4,311,677
5,parcel,461
6,shapefile,458
7,street,458
8,population,428
9,restaurants,428


In [14]:
search_terms_totals.tail(10)

Unnamed: 0,processed_search_term,num_unique_searches
18360,fertility,1
18361,fertility rate,1
18362,festivals,1
18363,fetal,1
18364,fewer,1
18365,fiber optics,1
18366,fibre,1
18367,ficticious business name,1
18368,ficticious business names,1
18369,Ｓａｎ Ｆｒａｎｃｉｓｃｏ,1


## Each search term needs to be tagged with a quality category

Many search terms suggest that users do not understand the purpose of the website or how to use its search properly.
### Search Tags
* #### Good Quality Search
    * complete words or phrases
    * minor typos
* #### Bad Quality
    * Addresses
    * Dates
    * Zipcodes or just a string of numbers
    * General nonsense, e.g. ('></script><script>alert(1)</script>', '///', '16exc-3031')

## Implementing Spell Checker

In [22]:
# Load Spellchecker objects
# `d` is the en_US enchant dictionary that check_phrase2 queries word by word;
# `chkr` is the en_US SpellChecker that check_phrase loads text into and iterates.
d = enchant.Dict('en_US')
chkr = SpellChecker('en_US')

#from NER_people_location import *
from NER_spacy import *

def check_phrase(phrase):
    '''
    Count the misspelled words in a phrase.

    The phrase is converted to a string and loaded into the module-level
    enchant SpellChecker (`chkr`); every error the checker yields is tallied.

    Returns the number of errors found.
    '''
    chkr.set_text(str(phrase))
    return sum(1 for _ in chkr)

def check_phrase2(phrase):
    '''
    Count the words in `phrase` that the enchant dictionary `d` rejects.

    The phrase is split on any run of whitespace so that repeated spaces (or
    an empty phrase) never produce empty tokens; enchant's `Dict.check`
    raises on an empty string, which would otherwise make the whole phrase
    come back as "error".

    Returns the number of unrecognised words, or the string "error" if a
    dictionary lookup fails.
    '''
    phrase = phrase.strip()
    print(phrase)  # echo of the phrase being checked, kept from the original
    errors = 0
    for word in phrase.split():
        try:
            recognised = d.check(word)
        except Exception:
            # Keep the legacy "error" sentinel for callers, but do not swallow
            # KeyboardInterrupt/SystemExit the way a bare `except:` would.
            return "error"
        if not recognised:
            errors += 1
    return errors

In [23]:
print correction_phrase("home busienss")

home business


In [24]:
check_phrase("feminrrist technolrogy collective ")

2

In [32]:
# Store, for every normalized term, the error count returned by check_phrase.
search_terms_totals['spelling_errors'] = search_terms_totals['processed_search_term'].apply(check_phrase)

In [26]:
search_terms_totals.head()

Unnamed: 0,processed_search_term,num_unique_searches,spelling_errors
0,crime,2031,0
1,parking,862,0
2,business,846,0
3,streets,780,0
4,311,677,0


In [33]:
# Split the totals into terms with at least one flagged word and terms with none.
# `.loc` replaces the `.ix` indexer (deprecated, then removed in pandas 1.0);
# with a boolean mask both select the same rows.
search_spelling_errors = search_terms_totals.loc[search_terms_totals.spelling_errors > 0]
non_spelling_errors = search_terms_totals.loc[search_terms_totals.spelling_errors == 0]

# Create sample
# Copy the first 100 rows so that columns added to the sample later are
# written to an independent frame rather than to a slice of the totals.
search_spelling_errors = search_spelling_errors[0:100].copy()

In [34]:
non_spelling_errors.tail(10)

Unnamed: 0,processed_search_term,num_unique_searches,spelling_errors
18353,fern st,1,0
18354,ferry map,1,0
18355,ferry routes,1,0
18358,ferry treasure island,1,0
18360,fertility,1,0
18361,fertility rate,1,0
18362,festivals,1,0
18363,fetal,1,0
18364,fewer,1,0
18365,fiber optics,1,0


In [29]:
search_spelling_errors.head(10)

Unnamed: 0,processed_search_term,num_unique_searches,spelling_errors
6,shapefile,458,1
19,muni,313,1
42,bart,189,1
43,gis,186,1
51,san francisco,154,2
54,sfpd,146,1
55,shapefiles,146,1
64,sfmta,125,1
85,inclusionary,100,1
111,sfo,75,1


In [30]:
print len(search_spelling_errors)
print len(non_spelling_errors)

100
11223


In [35]:
# Store the output of `correction` for each flagged term.
flagged_terms = search_spelling_errors['processed_search_term']
search_spelling_errors['correction'] = flagged_terms.apply(correction)

In [36]:
search_spelling_errors.head(10)

Unnamed: 0,processed_search_term,num_unique_searches,spelling_errors,correction
6,shapefile,458,1,shapefile
19,muni,313,1,muni
42,bart,189,1,bart
43,gis,186,1,gis
51,san francisco,154,2,san francisco
54,sfpd,146,1,spy
55,shapefiles,146,1,shapefile
64,sfmta,125,1,soma
85,inclusionary,100,1,inclusionary
111,sfo,75,1,seo


In [36]:
#likely_spelling_errors = search_spelling_errors.ix[search_spelling_errors.num_unique_searches < 2].tail(100)

In [37]:
spelling_error_list = search_spelling_errors['processed_search_term'].tolist()

#spelling_error_list = spelling_error_list[0:100]

In [38]:
print spelling_error_list

['shapefile', 'muni', 'bart', 'gis', 'san francisco', 'sfpd', 'shapefiles', 'sfmta', 'inclusionary', 'sfo', 'streets of san francisco', 'topo', 'dem', 'json', 'landuse', 'sfpd incidents', 'basemap', 'census shapefile', 'csv', 'citylots', 'envista', 'orthophoto', 'sfmta bikeway network', 'lidar', 'cnn', 'popos', 'ortho', 'water bodies in san francisco', 'sffind neighborhoods', 'shoreline shapefile', 'shp', 'zipcode', 'inclusionary housing', 'hiv', 'uber', 'gps', 'wifi', 'geojson', 'mta', 'san francisco crime', 'sanfrancisco.gdb', 'lyft', 'airbnb', 'sffd', 'maher', 'eas', 'sfpark', 'pbc', 'bikeway', 'kml', 'stclines', 'sfopenbook', 'sfshore', 'sro', 'caltrain', 'taz', 'address locator', 'internet', 'dui', 'stormwater', 'basemap street centerlines', 'sanfrancisco', 'chinatown', 'presidio', 'dpw', 'api', 'realtor_neighborhoods', 'fte', 'oakland', 'zipcodes', 'sfusd', 'sfpd incident', 'open busienss', 'street of san francisco', 'dbi', 'berkeley', 'healthcare', 'gdp', 'apn', 'lgbt', '.shp', 

In [39]:
# %time search_spelling_errors_prob['google_results'] = search_spelling_errors_prob.processed_search_term.apply(lambda x: spell_check(x))

In [45]:
%time spelling_corrections = map_easy(correction_phrase, spelling_error_list, -1)

CPU times: user 8.93 ms, sys: 18.3 ms, total: 27.2 ms
Wall time: 2.59 s


In [46]:
search_spelling_errors['spelling_corrections'] = spelling_corrections

In [47]:
#search_spelling_errors.to_csv('corrected_spellng_errors.csv', index =False)

In [48]:
#search_spelling_errors.ix[search_spelling_errors.google_corrections == 'traffic']

In [49]:
search_spelling_errors.tail(50)

Unnamed: 0,processed_search_term,num_unique_searches,spelling_errors,spelling_corrections
376,stclines,26,1,stclines
378,sfopenbook,25,1,sfopenbook
380,sfshore,25,1,shore
387,sro,25,1,src
389,caltrain,25,1,captain
391,taz,25,1,tap
404,address locator,24,1,address locator
415,internet,23,1,internet
417,dui,23,1,dui
418,stormwater,23,1,stormwater


In [None]:
# NOTE(review): this cell is entirely commented out, so
# `search_spelling_errors_count` is never defined and the cells that use it
# afterwards fail with NameError. The draft also assigns its result to
# `search_terms_counts_df` but then reads `search_spelling_errors_count`, and it
# calls the old `DataFrame.sort`; reconcile these before re-enabling it.
# search_terms_counts_df = pd.DataFrame(search_terms_df.groupby(['processed_search_term','spelling_errors']).size())

# search_spelling_errors_count = search_spelling_errors_count.reset_index()
# search_spelling_errors_count.columns = ['processed_search_term','spelling_errors', 'count']
# search_spelling_errors_count = search_spelling_errors_count.sort('count', ascending=False)

In [50]:
len(search_spelling_errors_count)

NameError: name 'search_spelling_errors_count' is not defined

In [None]:
search_spelling_errors_count.head(30)

In [None]:
search_spelling_errors_count.tail(30)

In [None]:
# Rows whose count is below 5. `.loc` replaces the `.ix` indexer, which was
# removed in pandas 1.0; boolean-mask selection is unchanged.
search_spelling_errors_count.loc[search_spelling_errors_count['count'] < 5]

In [None]:
d.suggest('GIS')

#### For Testing of Regex

In [21]:
import re
# Heuristic patterns for tagging low-quality search terms.
# Raw strings keep the patterns literal, and `[0-9]+` states "one or more
# digits" directly.
dates_r = re.compile(r'[0-9]+/[0-9]+/[0-9]+')   # three slash-separated digit runs, e.g. 1/2/2015
numbers_r = re.compile(r'^[0-9]+$')             # digits only; a single digit also counts
html_r = re.compile(r'^<.*>$')                  # whole term wrapped in angle brackets
address_r = re.compile(r'[0-9].*[0-9] [a-z].*[a-z]')
# NOTE(review): address_r needs at least two digits before the space, so a
# one-digit house number such as "5 main st" is not matched - confirm whether
# that is intended.

In [22]:
address_r.match("1190 mission stree")

<_sre.SRE_Match at 0x1090dcac0>

In [None]:
processed_search_terms_list = list(search_terms_df['processed_search_term'])

In [None]:
print processed_search_terms_list[100:150]

In [None]:
# Tag every raw search term with the label returned by search_term_type.
search_terms_df['search_tag'] = search_terms_df['search_term'].apply(search_term_type)

In [None]:
search_terms_tagged = pd.read_csv("./processed_search_term_data/tagged_search_terms.csv")

In [None]:
search_terms_tagged.tail()

In [None]:
%time tokenized_queries_corrections = map_easy(search_term_type, processed_search_terms_list, -1 )

In [None]:
# Rows tagged "Number". `.loc` replaces the `.ix` indexer removed in pandas 1.0.
numbers = search_terms_tagged.loc[search_terms_tagged.search_tag == "Number"]

In [None]:
numbers.head()

In [None]:
#pd.DataFrame(numbers['processed_search_term'].value_counts()).head()

In [None]:
# Rows tagged "Date". `.loc` replaces the `.ix` indexer removed in pandas 1.0.
dates = search_terms_tagged.loc[search_terms_tagged.search_tag == "Date"]

In [None]:
print list(set(dates['search_term']))

In [None]:
# Rows tagged 'Link'. `.loc` replaces the `.ix` indexer removed in pandas 1.0.
links = search_terms_df.loc[search_terms_df.search_tag == 'Link']

In [None]:
print list(set(links['search_term']))

In [None]:
# Rows tagged 'Address'. `.loc` replaces the `.ix` indexer removed in pandas 1.0.
addresses = search_terms_df.loc[search_terms_df.search_tag == 'Address']

In [None]:
print list(set(addresses['search_term']))

In [None]:
# Rows tagged 'Search Term'. `.loc` replaces the `.ix` indexer removed in pandas 1.0.
search_terms = search_terms_df.loc[search_terms_df.search_tag == 'Search Term']

In [None]:
print list(set(search_terms['search_term']))

In [None]:
search_terms.to_csv("tagged_search_terms.csv", index=False)

In [None]:
# Reload the tagged terms from disk and pass each processed term through
# removePunctuation.
search_terms = pd.read_csv('tagged_search_terms.csv')
processed_terms = search_terms['processed_search_term']
search_terms['processed_search_term'] = processed_terms.apply(removePunctuation)

In [None]:
search_terms.head()

In [None]:
# Number of rows per processed term, held in a one-column frame named 'count'.
rows_per_term = search_terms.groupby(['processed_search_term'])['processed_search_term'].count()
search_count = pd.DataFrame(rows_per_term)
#search_count.reset_index()
search_count.columns = ['count']
#search_count.shape

In [None]:
# Move the group key back into a column, then order by frequency.
# `DataFrame.sort` has been removed from pandas; `sort_values` is the supported
# call and matches how the totals are sorted earlier in this notebook.
search_count = search_count.reset_index()
search_count = search_count.sort_values('count', ascending=False)

In [None]:
search_count.head()

In [None]:
# Keep terms that occur more than 10 times and save them to disk.
# `.loc` replaces the `.ix` indexer removed in pandas 1.0.
common_searches = search_count.loc[search_count['count'] > 10]
common_searches.to_csv('common_searches.csv', index=False)

In [None]:
common_searches.head()

In [None]:
common_searches.shape

In [None]:
common_search_list = list(common_searches['processed_search_term'])