# Autosuggestion Collection
This function handles the core process of collecting autosuggestion data from Google or Bing.

In [1]:
import requests
import urllib

# ----------------------------------------------------------------------------------------------------------------
# collect_autosuggestions
#
# parameters:
# "source" is either "google" or "bing"
# "tld" stands for "top level domain": for Google it is the domain suffix appended to "www.google." (e.g. "com", "co.uk", "ru"); for Bing it is sent as the country code. Two-letter country codes are listed here: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
# "lang" is the language of the suggestions returned, should be two letter codes from here: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# "query" is the query that you would like to see autocompleted
# ----------------------------------------------------------------------------------------------------------------

def collect_autosuggestions(source, tld, lang, query):
    """Fetch the autocomplete suggestions for ``query`` from Google or Bing.

    Parameters:
        source: "google" or "bing".
        tld: for Google, the domain suffix appended to ``www.google.``;
            for Bing, the value sent as the ``cc`` parameter.
        lang: sent to Google as ``hl``. It is not sent to Bing.
        query: the text to be autocompleted.

    Returns:
        Element 1 of the JSON array returned by the endpoint
        (presumably the list of suggestion strings -- confirm against the API).

    Raises:
        ValueError: if ``source`` is neither "google" nor "bing".
        requests.exceptions.HTTPError: if the endpoint answers with an error status.
    """
    if source == "google":
        # Some info on this api: http://shreyaschand.com/blog/2013/01/03/google-autocomplete-api/
        url = 'http://www.google.'+tld+'/complete/search?&client=firefox&%s' % (urllib.urlencode({'q': query, 'hl': lang}))
    elif source == "bing":
        # Note: for Bing the language is controlled by the tld, so the lang parameter will have no effect on its own
        url = 'http://api.bing.com/osjson.aspx?%s' % (urllib.urlencode({'query': query, 'cc': tld}))
    else:
        # Previously an unknown source fell through and failed later with an
        # UnboundLocalError on ``url``; fail early with a clear message instead.
        raise ValueError('source must be "google" or "bing", got %r' % (source,))

    r = requests.get(url)
    # Surface HTTP errors directly rather than as a confusing JSON decoding failure.
    r.raise_for_status()
    # The suggestions are returned untouched. The old loop that re-encoded each
    # item to ASCII only rebound its loop variable and never changed the list.
    return r.json()[1]

In [2]:
import pandas as pd
import datetime

#this function is used to get autosuggestions from a specified tld and language
def get_suggestions(tld, lang, filename, subjects, predicates):
    """Collect Bing and Google suggestions for every subject/predicate pair.

    Each query is built as "<subject> <predicate> " and sent to both engines
    through ``collect_autosuggestions``.

    Parameters:
        tld: passed through to ``collect_autosuggestions``.
        lang: passed through to ``collect_autosuggestions``.
        filename: path of the CSV file the combined table is written to.
        subjects: iterable of subject strings.
        predicates: iterable of predicate strings.

    Returns:
        A DataFrame with one "bing" and one "google" column. The two columns
        are aligned by position only: row N holds the N-th Bing suggestion and
        the N-th Google suggestion overall, which need not come from the same
        query. The shorter column is padded with NaN.
    """
    bing_all = []
    google_all = []

    #iterate through all subject+predicate combinations
    for s in subjects:
        for p in predicates:
            query = s + " " + p + " "
            bing_all.extend(collect_autosuggestions("bing", tld, lang, query))
            google_all.extend(collect_autosuggestions("google", tld, lang, query))

    # Build each column from the flat list. Unlike pd.concat over per-query
    # frames, this also works when no query was issued at all.
    bing_df = pd.DataFrame({"bing": bing_all})
    google_df = pd.DataFrame({"google": google_all})

    #call join on the larger of the two dataframes to ensure no data is lost
    if len(bing_df) > len(google_df):
        result_df = bing_df.join(google_df)
    else:
        result_df = google_df.join(bing_df)

    #turn dataframe into a csv
    result_df.to_csv(filename, encoding='utf-8')
    return result_df


# Query vocabulary. get_suggestions combines every subject with every predicate
# of the same list pair into one query of the form "<subject> <predicate> ".
en_subjects = ["USA", "America", "Barack Obama", "Obama", "The United States"]
en_predicates = ["should", "is doing", "should be", "is"]

ru_subjects = ["США","Америка","Барак Обама", "Обама", "Соединенные Штаты"]
ru_predicates = ["должны", "делают", "должны быть"] # 'is' doesn't have a russian equivalent

es_subjects = ["EE.UU.", "America", "Barack Obama", "Obama", "Estados Unidos"]
es_predicates = ["debería", "está haciendo", "debería ser", "es"]

fr_subjects = ["L'Amérique","Barack Obama", "Obama", "Etats-Unis"] #USA and United States have the same translation
fr_predicates = ["devrait","fait","devrait être","est"]

# Example run of get_suggestions: queries both engines with the Russian lists
# (tld "ru", lang "ru"), writes the table to ru_autosuggest.csv and keeps the
# returned DataFrame in result_df for the cells below.

result_df = get_suggestions("ru", "ru", "ru_autosuggest.csv", ru_subjects, ru_predicates)
# Alternative example: the English lists against the "com" domain.
#result_df = get_suggestions("com", "en", "en_autosuggest.csv", en_subjects, en_predicates)

print result_df

                                              google                      bing
0                                   сша должны китаю  сша должны быть разрушен
1                               сша должны сами себе  сша должны быть разрушен
2                                  сша должны россии                       NaN
3                          сша должны быть разрушены                       NaN
4                         сша должны быть уничтожены                       NaN
5                            сша должны китаю золото                       NaN
6                      сша должны отдать нам украину                       NaN
7                              сша должны казахстану                       NaN
8                                         сша должны                       NaN
9                                сша должны сдохнуть                       NaN
10         в сша делают обрезание всем новорожденным                       NaN
11                            в сша делают обрезание

In [3]:
import nltk

#open the files containing positive and negative terms, save to an array

def _read_term_list(path):
    """Return the lines of the text file at ``path`` as a list, line endings removed."""
    # The with-block closes the handle even if read() raises.
    with open(path, "r") as term_file:
        return term_file.read().splitlines()

# Sentiment dictionaries used by calculate_positivity / calculate_negativity.
positive_terms = _read_term_list('Data/positive_terms.txt')
negative_terms = _read_term_list('Data/negative_terms.txt')

In [4]:
import string 
import numpy as np
from nltk.tokenize import WhitespaceTokenizer

# Create a tokenizer from nltk which will create tokens based on the whitespace in between words
tokenizer = WhitespaceTokenizer()

# Shared helpers for normalize_review_text: a stop-word list and a stemmer.
# NOTE(review): only the English stop-word list is loaded, while the same
# pipeline is later applied to the Russian, Spanish and French suggestions --
# confirm that this is intended.
from nltk.corpus import stopwords
stopword_list = stopwords.words('english')
# NOTE(review): the Porter stemmer is presumably English-oriented; verify its
# effect on the non-English tokens.
porter = nltk.PorterStemmer()

#remove any punctuation from the autosuggestions
def remove_punctuation(text):
    """Return ``text`` with every punctuation symbol replaced by a single space.

    The symbols come from ``string.punctuation``. The apostrophe is the one
    exception and is left in place so that possessives and contractions
    survive.
    """
    for symbol in string.punctuation:
        if symbol == "'":
            # keep apostrophes untouched
            continue
        text = text.replace(symbol, " ")
    return text

#turn autosuggestion text into token array
def normalize_review_text(text):
    """Turn one suggestion string into a list of normalised tokens.

    A null value yields ``[""]`` (a single empty token). Otherwise the text is
    lower-cased, stripped of punctuation via ``remove_punctuation``, collapsed
    to single spaces and split on whitespace. Tokens found in
    ``stopword_list`` are dropped and the rest are stemmed with ``porter``.
    """
    if pd.isnull(text):
        return [""]

    cleaned = remove_punctuation(text.lower())
    # collapse runs of whitespace left behind by the punctuation replacement
    cleaned = " ".join(cleaned.split())

    stemmed = []
    for word in tokenizer.tokenize(cleaned):
        if word in stopword_list:
            continue
        stemmed.append(porter.stem(word))
    return stemmed

# Apply the function above to the text column
def normalize_dataframe(result_df):
    """Add a ``<engine>_tokens`` column for the "bing" and "google" columns.

    Each new column holds ``normalize_review_text`` applied to the matching
    suggestion column. The frame is modified in place and also returned.
    """
    for engine in ("bing", "google"):
        result_df[engine + "_tokens"] = result_df[engine].apply(normalize_review_text)
    return result_df

normalize_dataframe(result_df)
result_df

Unnamed: 0,google,bing,bing_tokens,google_tokens
0,сша должны китаю,сша должны быть разрушен,"[сша, должны, быть, разрушен]","[сша, должны, китаю]"
1,сша должны сами себе,сша должны быть разрушен,"[сша, должны, быть, разрушен]","[сша, должны, сами, себе]"
2,сша должны россии,,[],"[сша, должны, россии]"
3,сша должны быть разрушены,,[],"[сша, должны, быть, разрушены]"
4,сша должны быть уничтожены,,[],"[сша, должны, быть, уничтожены]"
5,сша должны китаю золото,,[],"[сша, должны, китаю, золото]"
6,сша должны отдать нам украину,,[],"[сша, должны, отдать, нам, украину]"
7,сша должны казахстану,,[],"[сша, должны, казахстану]"
8,сша должны,,[],"[сша, должны]"
9,сша должны сдохнуть,,[],"[сша, должны, сдохнуть]"


In [5]:
def calculate_positivity(text):
    """Return the fraction of tokens in ``text`` that appear in ``positive_terms``.

    Parameters:
        text: a list of tokens.

    Returns:
        A float between 0.0 and 1.0. An empty token list scores 0.0; without
        that guard the division below raised ZeroDivisionError.
    """
    num_tokens = len(text)
    if num_tokens == 0:
        # e.g. a suggestion made up only of stop words leaves no tokens
        return 0.0
    num_positive_tokens = sum(1 for t in text if t in positive_terms)
    # The positivity score is the fraction of tokens that were found in the positive dictionary
    return float(num_positive_tokens) / float(num_tokens)


def calculate_negativity(text):
    """Return the fraction of tokens in ``text`` that appear in ``negative_terms``.

    Parameters:
        text: a list of tokens.

    Returns:
        A float between 0.0 and 1.0. An empty token list scores 0.0; without
        that guard the division below raised ZeroDivisionError.
    """
    num_tokens = len(text)
    if num_tokens == 0:
        # e.g. a suggestion made up only of stop words leaves no tokens
        return 0.0
    num_negative_tokens = sum(1 for t in text if t in negative_terms)
    # The negativity score is the fraction of tokens that were found in the negative dictionary
    return float(num_negative_tokens) / float(num_tokens)

#add sentiment data into the dataframe and csv
def get_sentiment(result_df, filename):
    """Score every token list and write the scored table to ``filename``.

    Four score columns are added to ``result_df`` in place (positivity and
    negativity for Bing and for Google). A copy with the columns grouped per
    engine is then written as UTF-8 CSV. Nothing is returned.
    """
    score_specs = [
        ("bing_positivity_score", "bing_tokens", calculate_positivity),
        ("google_positivity_score", "google_tokens", calculate_positivity),
        ("bing_negativity_score", "bing_tokens", calculate_negativity),
        ("google_negativity_score", "google_tokens", calculate_negativity),
    ]
    for score_column, token_column, scorer in score_specs:
        result_df[score_column] = result_df[token_column].apply(scorer)

    # column order of the exported file: all Bing columns first, then all Google columns
    export_columns = [
        "bing", "bing_tokens", "bing_positivity_score", "bing_negativity_score",
        "google", "google_tokens", "google_positivity_score", "google_negativity_score",
    ]
    ordered_df = result_df[export_columns]
    ordered_df.to_csv(filename, encoding='utf-8')
    

In [6]:
#print the avg. positivity/negativity sentiment of bing and google autosuggestions
def print_results(result_df):
    #bing scores
    total_score = 0
    for i in result_df["bing_positivity_score"]:
        total_score += i

    bing_pos_avg = total_score/result_df["bing"].count()
    print "Bing positivity avg:",bing_pos_avg

    total_score = 0
    for i in result_df["bing_negativity_score"]:
        total_score += i

    bing_neg_avg = total_score/result_df["bing"].count()
    print "Bing negativity avg:",bing_neg_avg

    #google scores
    total_score = 0
    for i in result_df["google_positivity_score"]:
        total_score += i

    google_pos_avg = total_score/result_df["google"].count()
    print "Google positivity avg:",google_pos_avg

    total_score = 0
    for i in result_df["google_negativity_score"]:
        total_score += i

    google_neg_avg = total_score/result_df["google"].count()
    print "Google negativity avg:",google_neg_avg


In [7]:
from nltk.probability import FreqDist

#return the 25 most common tokens from the autosuggestions, excluding the (stemmed) English subject words used to build the queries
def get_most_common_en(result_df):
    bing_tokens_total = []
    for t in result_df["bing_tokens"]:
        bing_tokens_total += t

    bing_tokens_total = [x for x in bing_tokens_total if x != "obama" and x !="america" and x !="usa" \
                        and x != "barack" and x !="unit" and x != "state" and x!=""]    

    bing_frequency_distribution = FreqDist(bing_tokens_total)
    bing_common = bing_frequency_distribution.most_common(25)

    print "Bing Tokens FreqDist: ", bing_common

    google_tokens_total = []
    for t in result_df["google_tokens"]:
        google_tokens_total += t

    google_tokens_total = [x for x in google_tokens_total if x != "obama" and x !="america" and x !="usa" \
                        and x != "barack" and x !="unit" and x != "state" and x!=""]    

    google_frequency_distribution = FreqDist(google_tokens_total)
    google_common = google_frequency_distribution.most_common(25)

    print "\nGoogle Tokens FreqDist: ", google_common
    return [bing_common, google_common]

#use this for non-english languages since stemming will not occur; just compare to subject array
def get_most_common(result_df, subjects):
    bing_tokens_total = []
    for t in result_df["bing_tokens"]:
        bing_tokens_total += t

    bing_tokens_total = [x for x in bing_tokens_total if x not in subjects]    

    bing_frequency_distribution = FreqDist(bing_tokens_total)
    bing_common = bing_frequency_distribution.most_common(25)

    print "Bing Tokens FreqDist: ", bing_common

    google_tokens_total = []
    for t in result_df["google_tokens"]:
        google_tokens_total += t

    google_tokens_total = [x for x in google_tokens_total if x not in subjects]    

    google_frequency_distribution = FreqDist(google_tokens_total)
    google_common = google_frequency_distribution.most_common(25)

    print "\nGoogle Tokens FreqDist: ", google_common
    return [bing_common, google_common]



In [85]:
print "\nRussia"
result_df = get_suggestions("ru", "ru", "ru_autosuggestions.csv", ru_subjects, ru_predicates)
normalize_dataframe(result_df)
get_sentiment(result_df, "ru_autosuggestions.csv")
print_results(result_df)
#print result_df
ru_common = get_most_common(result_df, ru_subjects)

#now get various "most common" arrays from different tlds and languages and see which tokens appear the most
print "\nUSA"
usa_df = get_suggestions("com", "en", "usa_autosuggestions.csv", en_subjects, en_predicates)
normalize_dataframe(usa_df)
get_sentiment(usa_df, "usa_autosuggestions.csv")
print_results(usa_df)
#print usa_df
usa_common = get_most_common_en(usa_df)

print "\nUK"
uk_df = get_suggestions("co.uk", "en", "uk_autosuggestions.csv", en_subjects, en_predicates)
normalize_dataframe(uk_df)
get_sentiment(uk_df, "uk_autosuggestions.csv")
print_results(uk_df)
#print uk_df
uk_common = get_most_common_en(uk_df)

print "\nMexico"
mx_df = get_suggestions("mx", "es", "mx_autosuggestions.csv", es_subjects, es_predicates)
normalize_dataframe(mx_df)
get_sentiment(mx_df, "mx_autosuggestions.csv")
print_results(mx_df)
#print mx_df
mx_common = get_most_common(mx_df, es_subjects)

print "\nCanada (FR)"
ca_df = get_suggestions("ca", "fr", "ca_autosuggestions.csv", fr_subjects, fr_predicates)
normalize_dataframe(ca_df)
get_sentiment(ca_df, "ca_autosuggestions.csv")
print_results(ca_df)
#print ca_df
ca_common = get_most_common(ca_df, fr_subjects)

print "\nCanada (EN)"
ca_en_df = get_suggestions("ca", "en", "ca_en_autosuggestions.csv", en_subjects, en_predicates)
normalize_dataframe(ca_en_df)
get_sentiment(ca_en_df, "ca_en_autosuggestions.csv")
print_results(ca_en_df)
#print ca_en_df
ca_en_common = get_most_common_en(ca_en_df)


Russia
Bing positivity avg: 0.0
Bing negativity avg: 0.0
Google positivity avg: 0.0323076923077
Google negativity avg: 0.00769230769231
Bing Tokens FreqDist:  [('', 63), (u'\u0441\u0448\u0430', 2), (u'\u0440\u0430\u0437\u0440\u0443\u0448\u0435\u043d', 2), (u'\u0431\u044b\u0442\u044c', 2), (u'\u0434\u043e\u043b\u0436\u043d\u044b', 2)]

Google Tokens FreqDist:  [(u'make', 30), (u'\u0441\u0448\u0430', 22), (u'obama', 20), (u'\u0434\u043e\u043b\u0436\u043d\u044b', 15), (u'barack', 10), (u'\u0434\u043e\u043b\u0436\u043d\u0430', 10), (u'\u0430\u043c\u0435\u0440\u0438\u043a\u0430', 10), (u'\u0434\u0435\u043b\u0430\u044e\u0442', 10), (u'\u0431\u044b\u0442\u044c', 8), (u'america', 8), (u'fun', 7), (u'\u0432', 5), (u'\u043e\u0431\u0440\u0435\u0437\u0430\u043d\u0438\u0435', 4), (u'\u043a\u0438\u0442\u0430\u044e', 4), (u'\u0447\u0442\u043e', 3), (u'\u0443\u043d\u0438\u0447\u0442\u043e\u0436\u0435\u043d\u044b', 3), (u'\u0440\u0430\u0437\u0440\u0443\u0448\u0435\u043d\u044b', 3), (u'trump', 3), (u'\



In [86]:
#used to generate charts
import matplotlib
import matplotlib.pyplot as plt

#get font support for non-English characters
# Reset every rc setting first so the two overrides below start from the defaults.
matplotlib.rcdefaults()
# Select the generic 'fantasy' family, then replace its font list with the four
# fonts below (presumably chosen because they contain the Cyrillic and accented
# glyphs used in the tick labels -- confirm on the target machine).
matplotlib.rcParams['font.family'] = 'fantasy'
matplotlib.rcParams['font.fantasy'] = 'Times New Roman','Arial','Tahoma','Calibri'

#adapted from code found here: http://cs.smith.edu/dftwiki/index.php/MatPlotLib_Tutorial_1#Adding_String_Labels_for_X_Values
def visualize(common, clr):
    """Show one bar chart of term frequencies for Bing and one for Google.

    Parameters:
        common: ``[bing_common, google_common]``, each a list of
            ``(term, count)`` pairs.
        clr: bar colour passed to ``plt.bar``.

    Terms that are not ``unicode`` objects are left out of the charts.
    """
    charts = (
        (common[0], 'Bing Term Frequency'),
        (common[1], 'Google Term Frequency'),
    )
    for term_counts, ylabel in charts:
        #omit invalid terms
        valid = [(term, count) for (term, count) in term_counts if isinstance(term, unicode)]
        _plot_term_bars(valid, ylabel, clr)


def _plot_term_bars(term_counts, ylabel, clr):
    """Draw and show one bar chart for a list of ``(term, count)`` pairs."""
    width = 1
    x = np.arange(1, len(term_counts) + 1)
    counts = [count for (term, count) in term_counts]
    terms = [term for (term, count) in term_counts]

    # The ticks below sit at x + width/2, i.e. they assume each bar starts at x.
    # That is only the default alignment in older matplotlib releases, so it is
    # requested explicitly to keep the labels centred under the bars everywhere.
    plt.bar(x, counts, width, color=clr, align='edge')
    plt.ylabel(ylabel)
    plt.xticks(x + width/2.0, terms)
    locs, tick_labels = plt.xticks()
    # terms are long, so print them vertically
    plt.setp(tick_labels, rotation=90)
    plt.show()

#plot results
# one (most-common lists, bar colour) pair per region, plotted in this order
all_results = [
    (ru_common, "#ff0000"),
    (usa_common, "#0000ff"),
    (uk_common, "#33ff99"),
    (mx_common, "#ffff00"),
    (ca_common, "#dd8855"),
    (ca_en_common, "#aabbcc"),
]
for region_common, bar_colour in all_results:
    visualize(region_common, bar_colour)