In [2]:
import requests
from bs4 import BeautifulSoup
import logging
import random
import multiprocessing
import math
import pandas as pd
import os
import sys
from time import perf_counter
from logging import info, error, warning
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait
import time

# Route log records to stdout and enable everything from INFO level upwards.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Log the CPU count reported by multiprocessing; para_gtranslate() derives
# its thread count from the same value.
info(f"Core count: {multiprocessing.cpu_count()}.")

# URL that gtranslate() sends its GET requests to.
base_url = "https://translate.google.com/m"

INFO:root:Core count: 96.


In [13]:
# Pool of browser User-Agent header values; get_random_useragent() returns
# one of these entries per call.
user_agents = [
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36"
]


def get_random_useragent():
    """
    Pick one entry of the module-level user_agents list at random.

    @return: a single user agent string taken from user_agents.
    """
    # randrange(n) draws from 0..n-1, i.e. every valid index of the list.
    return user_agents[random.randrange(len(user_agents))]


def gtranslate(text):
    """
    Translate a text by requesting the page at base_url and parsing the
    returned HTML.

    The text is sent as the 'q' query parameter of a GET request, using a
    user agent from get_random_useragent(). The translation is read from the
    first 'div' with class 't0' or, when there is none, from the first 'div'
    with class 'result-container'.

    When the server answers with status 429 the function sleeps for 30
    minutes and sends the request again; this repeats until a different
    status is received.

    @param: text: the text to translate.
    @return: the translated text, or None when the final response status is
             not 200 or when no result element is present in the page.
    """
    # Retry in a loop rather than by recursion so that a long run of 429
    # responses cannot grow the call stack.
    while True:
        user_agent = get_random_useragent()
        response = requests.get(base_url,
                                params={"q": text},
                                headers={"user-agent": user_agent})

        if response.status_code != 429:
            break

        error(f"To many requests {response.status_code}.")
        info("Script encountered status 429: to requests. Waitin for 30 minutes to resume")
        time.sleep(1800)
        info("Script resumes after 30 minutes")

    if response.status_code != 200:
        error(f"Received status: {response.status_code}.")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    element = soup.find("div", {"class": "t0"})
    if element is None:
        element = soup.find("div", {"class": "result-container"})

    # Read the text only when an element was found. The earlier check was
    # inverted: it discarded every found translation and called get_text()
    # on None when nothing was found.
    if element is None:
        return None
    return element.get_text()


def batch_gtranslate(texts):
    """
    Translate a list of texts one after another with gtranslate and log how
    long the whole batch took.

    @param: texts: list of texts that need to be translated
    @return: list with one gtranslate result per input text, in input order.
    """
    started_at = perf_counter()
    outcomes = [gtranslate(item) for item in texts]
    elapsed_time = perf_counter() - started_at

    info(f"batch_translate: text_count: {len(texts)}, took: {elapsed_time:.2f} seconds.")
    return outcomes


def para_gtranslate(texts, min_batch_size=6):
    """
    Translate texts in parallel: the list is cut into consecutive batches and
    every batch is handed to batch_gtranslate on its own thread.

    The number of threads never exceeds twice the CPU count reported by
    multiprocessing.cpu_count().

    @param: texts: list of texts to translate
    @param: min_batch_size: smallest number of texts given to one thread.
            The batch size only grows beyond this when there are more texts
            than thread_count*min_batch_size. Default is 6.
    @return: list of translated texts, in the same order as 'texts'.
    """
    if len(texts) == 0:
        # Nothing to do; ThreadPoolExecutor rejects a worker count of 0.
        return []

    cpu_count = multiprocessing.cpu_count()
    thread_count = cpu_count*2
    info(f"Running parallel gtranslate with cpu count: {cpu_count}, and threads: {thread_count} available.")

    # Few texts: keep the minimum batch size and start fewer threads.
    #   Example: 32 threads and 14 texts -> three batches of 6, 6 and 2 texts.
    # Many texts: spread them over the available threads.
    #   Example: 32 threads and 256 texts -> 32 batches of 8 texts.
    batch_size = max(min_batch_size, math.ceil(len(texts)/thread_count))
    # Derive the count from the final batch size so that no thread is started
    # for an empty slice past the end of the list.
    batch_count = math.ceil(len(texts)/batch_size)

    info(f"Start translation with batch_count: {batch_count}, batch_size: {batch_size}.")
    with ThreadPoolExecutor(batch_count) as executor:
        futures = [
            executor.submit(batch_gtranslate, texts[start:start+batch_size])
            for start in range(0, len(texts), batch_size)
        ]

    # Leaving the with-block shuts the executor down and waits for all
    # submitted batches, so every future is finished here.
    result = []
    for future in futures:
        result.extend(future.result())

    return result


def translate_column(df, column_name):
    """
    Translate one column of a dataframe and store the translations in a
    column called 'processed_texts'.

    The column is added to the dataframe that was passed in; the same
    object is returned.

    @param: df: this is a pandas dataframe.
    @param: column_name: name of the column whose values are translated.
    @return: the dataframe, now containing the 'processed_texts' column.
    """
    source_texts = df[column_name].tolist()
    translations = pd.Series(para_gtranslate(source_texts))

    # Assign the raw values so the result is placed by position, not by index.
    df["processed_texts"] = translations.values
    return df


def translate_dataset(dirpath, overwrite_cache=True):
    """
    This function translates a dataset of hydrated tweets.
    A dataset is considered a directory containing .csv files.
    Each file is read into a pandas dataframe, its 'full_text' column is
    translated with translate_column, and the result is written under the
    same file name to a directory called 'processed'. The processed
    directory is a sibling of the dataset directory and is created when it
    does not exist yet.

    With overwrite_cache=False existing results are reused: if both
    hydrated/A.csv and processed/A.csv exist then A.csv is skipped, while a
    file 'B.csv' that only exists in the hydrated directory is translated
    and written to the processed directory.

    @param: dirpath: The path to the dataset. The dataset must be a directory.
    @param: overwrite_cache: When True (the default) every file is translated
            and an existing processed file is overwritten. When False, files
            that already exist in the processed directory are skipped.
    @return: True once all files have been handled.
    @raise: FileNotFoundError: when dirpath does not exist or is not a directory.
    """
    # isdir() is False both for a missing path and for a path that is not a
    # directory, so one check covers both cases.
    if not os.path.isdir(dirpath):
        raise FileNotFoundError(f"Path '{dirpath}' does not exist or is not a directory.")

    # Normalise first: with a trailing separator dirname() would return the
    # dataset directory itself and 'processed' would end up inside it.
    parentdir = os.path.dirname(os.path.normpath(dirpath))
    # create a path to directory that is a sibling of 'dirpath' variable.
    processeddir = os.path.join(parentdir, "processed")
    # create a directory called 'processed' for the processed dataset if it does not yet exist.
    if not os.path.exists(processeddir):
        os.mkdir(processeddir)
        info(f"Created directory: {processeddir}.")

    filenames = [file for file in os.listdir(dirpath) if file.endswith(".csv")]
    for file in filenames:
        filepath = os.path.join(dirpath, file)
        processed_filepath = os.path.join(processeddir, file)
        if not overwrite_cache and os.path.exists(processed_filepath):
            info(f"File already exists in: {processed_filepath}; skipping.")
            continue

        df = pd.read_csv(filepath)
        df = translate_column(df, column_name="full_text")
        df.to_csv(processed_filepath)
        info(f"Created new processed file in: {processed_filepath}.")

    return True

In [75]:
def test_gtranslate():
    text = "Liever te dik in de kist dan een feestje gemist."
    gtranslate(text)


def test_batch_gtranslate():
    texts = [
        "Liever te dik in de kist dan een feestje gemist.",
        "Hallo welt",
        "Buenos dias"
    ]
    batch_gtranslate(texts)


def test_para_gtranslate():
    texts = [
        "Liever te dik in de kist dan een feestje gemist.",
        "Hallo welt",
        "Buenos dias"
    ]
    result = para_gtranslate(texts, min_batch_size=2)
    print(result)

In [159]:
test_gtranslate()

ERROR:root:To many requests 429.
INFO:root:Script encountered status 429: to requests. Waitin for 30 minutes to resume
INFO:root:Script resumes after 30 minutes


In [77]:
test_para_gtranslate()

['Better to be too fat in the coffin than to miss a party.', 'Hello World', 'good morning']


In [160]:
# Load one example CSV from the hydrated data-set directory.
filepath = os.path.join("data-sets",
                        "examples",
                        "hydrated",
                        "output2020_02_lg.csv")
df = pd.read_csv(filepath)

# Show how many rows were loaded.
print(len(df))

# Values of the 'full_text' column as a plain Python list.
texts = df.full_text.tolist()


# Time the parallel translation of every text in the file.
start = perf_counter()
translated_texts = para_gtranslate(texts)
elapsed_time = perf_counter()-start
print(f"Batch translation took: {elapsed_time:.2f} seconds.")

21109
INFO:root:Running parallel gtranslate with cpu count: 96, and threads: 192 availabel.
INFO:root:Start translation with batch_count: 192, batch_size: 110.


  exec(code_obj, self.user_global_ns, self.user_ns)


INFO:root:batch_translate: text_count: 110, took: 446.22 seconds.
INFO:root:batch_translate: text_count: 110, took: 449.62 seconds.
INFO:root:batch_translate: text_count: 110, took: 453.16 seconds.
INFO:root:batch_translate: text_count: 110, took: 459.81 seconds.
INFO:root:batch_translate: text_count: 110, took: 460.71 seconds.
INFO:root:batch_translate: text_count: 110, took: 461.06 seconds.
INFO:root:batch_translate: text_count: 99, took: 441.30 seconds.
INFO:root:batch_translate: text_count: 110, took: 463.83 seconds.
INFO:root:batch_translate: text_count: 110, took: 463.80 seconds.
INFO:root:batch_translate: text_count: 110, took: 463.76 seconds.
INFO:root:batch_translate: text_count: 110, took: 463.27 seconds.
INFO:root:batch_translate: text_count: 110, took: 465.90 seconds.
INFO:root:batch_translate: text_count: 110, took: 465.25 seconds.
INFO:root:batch_translate: text_count: 110, took: 468.36 seconds.
INFO:root:batch_translate: text_count: 110, took: 469.81 seconds.
INFO:root:b

In [127]:
# Load one example CSV from the hydrated data-set directory.
filepath = os.path.join("data-sets",
                        "examples",
                        "hydrated",
                        "output2020_02_lg.csv")
df = pd.read_csv(filepath)

# Only the first 50 'full_text' values are used for this timing run.
texts = df.full_text[0:50].tolist()

# Time the sequential (single-threaded) translation of that sample.
start = perf_counter()
translated_texts = batch_gtranslate(texts)
elapsed_time = perf_counter()-start
print(f"Batch translation took: {elapsed_time:.2f} seconds.")

INFO:root:batch_translate: text_count: 50, took: 5.45 seconds.
Batch translation took: 5.46 seconds.


In [None]:
# Relative path to the 'hydrated' directory of the 'Lopez1' data-set.
# NOTE(review): datasetdir is not referenced by any other cell visible here.
datasetdir = os.path.join("data-sets", "Lopez1", "hydrated")

In [None]:
# Translate every .csv file of the examples data-set. With
# overwrite_cache=False, files that already exist in the processed
# directory are skipped instead of being translated again.
dirpath = os.path.join("data-sets", "examples", "hydrated")
translate_dataset(dirpath=dirpath, overwrite_cache=False)

INFO:root:File already exists in: data-sets/examples/processed/output2020_02_sm.csv; skipping.
INFO:root:Running parallel gtranslate with cpu count: 96, and threads: 192 availabel.
INFO:root:Start translation with batch_count: 192, batch_size: 110.


  translate_dataset(dirpath=dirpath, overwrite_cache=False)


ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
INFO:root:Script encountered status 429: to requests. Waitin for 30 minutes to resume
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
INFO:root:Script encountered status 429: to requests. Waitin for 30 minutes to resume
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
ERROR:root:To many requests 429.
INFO:root:Script encountered status 