### Defining functions from Google Translate API: detect and translate

In [181]:
#DISCLAIMER: THIS ENTIRE CODE BELOW WAS FOUND HERE: https://gist.github.com/jseabold/1473363

import os
import urlparse
import urllib
import urllib2
import httplib2
import gzip
import json
from httplib2 import FileCache
from urllib2 import HTTPRedirectHandler, HTTPDefaultErrorHandler, HTTPError

api = 'AIzaSyCn_yOR28P6LR1jQGRtgICWuBHRANPjRpg'

languages = ["af", "sq", "ar","be", "bg", "ca", "zh-CN", "zh-TW", "hr",
             "cs", "da", "nl", "en", "et", "tl", "fi", "fr", "gl", "de",
             "el", "iw", "hi", "hu", "is", "id", "ga", "it", "ja", "ko",
             "lv", "lt", "mk", "ms", "mt", "no", "fa", "pl", "pt", "ro",
             "ru", "sr", "sk", "sl", "es", "sw", "sv", "th", "tr", "uk",
             "vi", "cy", "yi"]


def _validate_language(lang):
    if lang in languages:
        return True
    return False

### Custom G-Zipped Cache ###

def save_cached_key(path, value):
    f = gzip.open(path, 'wb')
    f.write(value)
    f.close()

def load_cached_key(key):
    f = gzip.open(key)
    retval = f.read()
    f.close()
    return retval

class ZipCache(FileCache):
    def __init__(self, cache='.cache'): #TODO: allow user configurable?
        super(ZipCache, self).__init__(cache)

    def get(self, key):
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        retval = None
        try:
            retval = load_cached_key(cacheFullPath)
        except IOError:
            pass
        return retval

    def set(self, key, value):
        retval = None
        cacheFullPath = os.path.join(self.cache, self.safe(key))
        save_cached_key(cacheFullPath, value)

        
### Error Handlers ###

class DefaultErrorHandler(HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        result = HTTPError(req.get_full_url(), code, msg, headers, fp)
        result.status = code
        return result


class RedirectHandler(HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        result = HTTPRedirectHandler.http_error_301(self, req, fp, code,
                        msg, headers)
        result.status = code
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        results = HTTPRedirectHandler.http_error_302(self, req, fp, code,
                        msg, headers)
        results.status = code
        return result
    
### Translator Class ###

class GoogleTranslator(object):
    """
    Google Translator object.
    Examples
    --------
    translator = GoogleTranslator()
    results1 = translator.translate("Einen schönen Tag allerseits")
    # try 2 at a time
    results2 = translator.translate(["Einen schönen Tag allerseits",
                                     "Ich nehme an"])
    # try detect
    results3 = translator.detect("Einen schönen Tag allerseits")
    # try to detect 2 at a time
    results4 = translator.detect(["Einen schönen Tag allerseits",
                                     "Ich nehme an"])
    """
    def __init__(self):
        #NOTE: caching is done on etag not expiry
        self.cache_control = 'max-age='+str(7 * 24 * 60 * 60)
        self.connection = httplib2.Http(ZipCache())
        self._opener = urllib2.build_opener(DefaultErrorHandler,
                                            RedirectHandler)
        self.base_url = "https://www.googleapis.com/language/translate/v2/"

    def _urlencode(self, params):
        """
        Rewrite urllib.urlencode to handle string input verbatim
        """
        params = "&".join(map("=".join,params))
        return params

    def _build_uri(self, extra_url, params):
        params = [('key', api)] + params
        params = self._urlencode(params)
        url = "%s?%s" % (urlparse.urljoin(self.base_url, extra_url), params)
        if len(url) > 2000: # for GET requests only, POST is 5K
            print('len of url',len(url))
            raise ValueError("Query is too long. URL can only be 2000 "
                             "characters")
        return url

    def _fetch_data(self, url):
        connection = self.connection
        resp, content = connection.request(url, headers={'user-agent' : api,
                            'cache-control' : self.cache_control})
        #DEBUG
        #if resp.fromcache:
        #   print "Using from the cache"
        return content

    def _sanitize_query(self, query):
        if isinstance(query, (list,tuple)):
            query = zip('q' * len(query), map(urllib.quote,query))
        else:
            query = [('q',urllib.quote(query))]
        return query

    def _decode_json(self, response):
        """
        Assumes that response only holds one result
        """
        json_data = json.loads(response)
        try:
            data = json_data["data"]
            if 'translations' in data:
                return data['translations']
            elif 'detections' in data:
                return data['detections']
        except:
            if 'error' in json_data:
                return json_data["error"]


    def detect(self, query):
        """
        Try to detect the language of a word, phrase, or list of either.
        Parameters
        ----------
        query : str or iterable
            Query or list of queries to translate
        Returns
        -------
        List of dictionaries for each query
        """
        query = self._sanitize_query(query)
        url = self._build_uri(extra_url='detect/', params=query)
        content = self._fetch_data(url)
        # going to have json, decode it first
        return self._decode_json(content)

    def translate(self, query, target="en", source="", _dirty=False):
        """
        Translate a query.
        Parameters
        ----------
        query : str or iterable
            Query or list of queries to translate
        target : str
            Language to translate into.
        source : str, optional
            Language of the source text, if known. Will be auto-detected
            if an empty string is passed.
        dirty : bool
            This is not intended to be used by users. It is here to avoid
            infinite recursion if the query returns an error because the
            language can't be detected.
        Returns
        -------
        List of dictionaries for each query
        Notes
        -----
        If the language can't be detected for a word an attempt is made
        to detect the language of the word and resubmit the query. If a
        list of words to translate is given and an error is encountered,
        it is assumed that the list of words all have the same source language
        when resubmitted.
        """
        try:
            assert _validate_language(target)
        except:
            raise ValueError("target language %s is not valid" % target)
        newquery = self._sanitize_query(query)
        params = [('key', api), ('target' , target)]
        if source:
            try:
                assert _validate_language(target)
            except:
                raise ValueError("source language %s is not valid" % target)
            params += ["source", source]
        params += newquery
        url = self._build_uri("", params)
        content = self._fetch_data(url)
        results = self._decode_json(content)

        if "errors" in results and not _dirty:
            if results['message'] == 'Bad language pair: {0}':
                # try to detect language and resubmit query
                source = self.detect(query)
                source = source[0]['language']
                return self.translate(query, target, source, True)

        return results


#do_run = raw_input("Running this costs money. Key 1 to continue: ")
#if int(do_run) != 1:
#    raise ValueError("Exiting")
#translator = GoogleTranslator()

#results1 = translator.detect("Mi casa es tu casa. No me digas que no vienes.")
#print(results1[0])

### Getting Language of each individual review

In [199]:
import numpy as np 
import pandas as pd
import time

reviewFile='Asheville-reviews.csv'
data = pd.read_csv(reviewFile, sep=',', names=['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'])
#print data.shape
reviews = data['comments']
#print reviews[:10]

listing_ids = data['listing_id']
ids = data['id']
reviewer_ids = data['reviewer_id']
dates = data['date']
reviewer_names = data['reviewer_name']
comments = data['comments']

s = 17745
e = 17751

test_set = reviews[s:e]
listing_ids_set = listing_ids[s:e]
print(listing_ids_set)
ids_set = ids[s:e]
reviewer_ids_set = reviewer_ids[s:e]
dates_set = dates[s:e]
reviewer_names_set = reviewer_names[s:e]
comments_set = comments[s:e]

c = s

language_details = []

#header1 = ['listing_id,id,reviewer_id,date,reviewer_name,comment,isReliable,confidence,language,language_encoded,english_review']
header1 = ["listing_id", "id", "reviewer_id", "date", "reviewer_name", "comment", "isReliable", "confidence", "language", "language_encoded", "english_review"]

for i in test_set:
    
    #print c
    #adding 1 and I skip the header
    listing_id = listing_ids_set.loc[c]
    id_ = ids_set.loc[c]
    reviewer_id = reviewer_ids_set.loc[c]
    date = dates_set.loc[c]
    reviewer_name = reviewer_names_set.loc[c]
    comment = comments_set.loc[c]
    c = c+1
    
    
    if (c%500==0):
        print c
    #print(c)
    
    #MAX LENGTH OF STRING IS 1294 -max length of url is 2000, and the string is appended to a 706 character url-
    #print('Length of string:',len(i))
    
    if len(i)>1294:
        sentence_string = i[0:1293]
    else:
        sentence_string = i
    
    #print sentence
    #print sentence.sentiment
    
    works = True
    
    #new = i.encode('ascii','replace')
    new2 = sentence_string.decode('utf-8','replace')
    
    #print(sentence_string)
    
    time.sleep(0.12)
    translator = GoogleTranslator()
    results5 = translator.detect(sentence_string)
    #print(results5)
    isReliable = results5[0][0]["isReliable"]
    #print('isReliable:', isReliable)
    confidence = results5[0][0]["confidence"]
    #print('confidence:', confidence)
    language = results5[0][0]["language"]
    language_encoded = language.encode('ascii','replace')
    #print('language:', language_encoded)
    if language_encoded != 'en':
        translation = translator.translate(comment)
        english_review = translation[0]["translatedText"]
    else:
        english_review = comment
    
    listing_language_detail = [listing_id, id_, reviewer_id, date, reviewer_name, comment, isReliable, confidence, language, language_encoded, english_review]
    language_details.append(listing_language_detail)

    
print(language_details)
print('FINISH')

17745    4182500
17746    4182500
17747    4182500
17748    4182500
17749    4182500
17750    4182500
Name: listing_id, dtype: object
[['4182500', '38890668', '23934272', '2015-07-19', 'Kelli', 'Very sweet and welcoming family and very comfortable home. Accommodations were just as described. I would definitely stay here my next trip to Asheville.  ', False, 1, u'en', 'en', 'Very sweet and welcoming family and very comfortable home. Accommodations were just as described. I would definitely stay here my next trip to Asheville.  '], ['4182500', '39711496', '39170594', '2015-07-25', 'Tom', 'Svitlana was very flexible with our uncertain arrival time and welcomed us warmly.  We did not see her at all for the two nights we stayed at her home, however we were also gone much of the time also.\r\n\r\nThe space was clean and inviting with its own bathroom, and was located just inside the front door so we could come and go as we pleased without disturbing our hosts -- just perfect for us!', False,

### Save original data plus translations into csv

In [200]:
import csv

with open("language_info_english_sample.csv","a") as file:
        writer = csv.writer(file, delimiter=",")
        writer.writerow(header1)
        writer.writerows(language_details)

### Takes all translated reviews and gets sentiment

In [201]:
import numpy as np 
import pandas as pd
from textblob import TextBlob

#NEED TO CLEAN FOLLOWING ERRORS - TEXT BLOB CRASHES WITH SPECIAL CHARACTERS IN OTHER LANGUAGES
#ROW 9696 HAS THIS ISSUE

reviewFile='language_info_english_sample.csv'
data = pd.read_csv(reviewFile, sep=',', names=header1)
#print data.shape

reviews = data['english_review']
#print reviews[:10]


test_set = reviews[:]
#print test_set
c = 0
for i in test_set:
    c = c+1
    print(c)
    sentence_blob = TextBlob(i)
    sentence_string = i

    a = sentence_blob.sentiment
    print i
    print "polarity", a.polarity
    print "subjectivity", a.subjectivity

1
english_review
polarity 0.0
subjectivity 0.0
2
Very sweet and welcoming family and very comfortable home. Accommodations were just as described. I would definitely stay here my next trip to Asheville.  
polarity 0.24375
subjectivity 0.58625
3
Svitlana was very flexible with our uncertain arrival time and welcomed us warmly.  We did not see her at all for the two nights we stayed at her home, however we were also gone much of the time also.

The space was clean and inviting with its own bathroom, and was located just inside the front door so we could come and go as we pleased without disturbing our hosts -- just perfect for us!
polarity 0.370833333333
subjectivity 0.7
4
Svitlana and her husband were very warm and welcoming. I enjoyed meeting them and feeling at home during my stay. Svitlana was very sweet and shared tea and cheescake with me, and I very much enjoyed out conversation!The house is about 10 minutes from downtown, but very close to Biltmore Village. 
polarity 0.487
subj